In [5]:
import datetime
import psycopg2 # Postgresql
import pandas as pd # Ez table management
from dotenv import load_dotenv # Load .env strings
import os # console commands
import boto3 # AWS querier

In [13]:
# Note: .env file should be in the top-level directory of the repo
load_dotenv(dotenv_path="./.env")

# Database connection settings (read by queryDB)
DB_NAME = os.environ.get("DB_NAME")
DB_HOST = os.environ.get("DB_HOST")
DB_USER = os.environ.get("DB_USER")
DB_PASSWORD = os.environ.get("DB_PASSWORD")

# AWS credentials
A_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
A_SEC = os.environ.get("AWS_SECRET_ACCESS_KEY")

# S3 bucket name and the annotations folder inside it
S3_BUCKET = os.environ.get('AWS_S3_BUCKET_NAME')
SRC_IMG_FOLDER = os.environ.get('AWS_S3_BUCKET_ANNOTATIONS_FOLDER')

# Sanity check: a value of None here means the .env file was not picked up
print(S3_BUCKET)

lubomirstanchev


In [14]:
# Connect to database using .env variables
def queryDB(query, params=None):
    """Run a SQL query against the Postgres database and return a DataFrame.

    A new connection is opened for every call using the DB_NAME, DB_USER,
    DB_PASSWORD and DB_HOST values loaded from .env (port 5432).

    Parameters
    ----------
    query : str
        SQL statement to execute.
    params : list, tuple or dict, optional
        Values bound to the placeholders in ``query``; passed through to
        ``pd.read_sql_query`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The result set of the query.

    Raises
    ------
    Whatever ``psycopg2.connect`` or ``pd.read_sql_query`` raises; the
    exception is propagated to the caller after the connection is closed.
    """
    conn = psycopg2.connect(database=DB_NAME,
                            user=DB_USER,
                            password=DB_PASSWORD,
                            host=DB_HOST,
                            port="5432")
    try:
        # Use pandas to read queries into a dataframe
        return pd.read_sql_query(query, conn, params=params)
    finally:
        # Always release the connection, even when the query fails;
        # otherwise every failed call leaks one open Postgres session.
        conn.close()

# Remove old csv of s3Images

In [15]:
# Remove old s3Images
# On a fresh checkout (or a second run) the file is not there yet; that is
# fine, the goal is only that no stale listing is left behind.
try:
    os.remove('./csv/s3Images.csv')
except FileNotFoundError:
    pass

# Create a CSV of the image names in Lubo's S3 bucket

In [16]:
# Write the key of every object directly under the 'test/' folder of the S3
# bucket to ./csv/s3Images.csv, one key per line.
S3_IMAGE_PREFIX = 'test/'

# The output directory may not exist on a fresh checkout
os.makedirs('./csv', exist_ok=True)

# Connect to AWS with the credentials loaded from .env
session = boto3.Session(aws_access_key_id=A_KEY, aws_secret_access_key=A_SEC)
# Access simple storage and the bucket configured in .env (AWS_S3_BUCKET_NAME)
s3 = session.resource('s3')
bucket = s3.Bucket(S3_BUCKET)

# Mode 'w' truncates any previous listing, so re-running this cell can never
# append duplicate keys to a stale file.
with open('./csv/s3Images.csv', 'w') as fd:
    # Delimiter='/' restricts the listing to keys directly under the prefix
    for obj in bucket.objects.filter(Delimiter='/', Prefix=S3_IMAGE_PREFIX):
        # Skip the folder placeholder key itself (presumably what the old
        # "skip the first object" check was for). Comparing the key is safe
        # even when no placeholder exists, whereas skipping by position would
        # silently drop the first real image in that case.
        if obj.key == S3_IMAGE_PREFIX:
            continue
        fd.write(obj.key + '\n')

# Get all annotations from database

In [17]:
# Load every row of the annotations table together with the filename of the
# video it belongs to. The LEFT JOIN keeps annotations whose videoid has no
# matching row in videos; filename is null for those rows.
annotation_rows = queryDB('''
    Select
        annotations.*, videos.filename
    FROM
        annotations
    LEFT JOIN
        videos
    ON
        videos.id=videoid
''')

# Read in s3 image names into memory

In [20]:
# Load the S3 key listing from ./csv/s3Images.csv. The path must be the fixed
# name the listing is written to; a timestamped name changes every minute and
# would never point at the file this notebook produces.
s3Images = pd.read_csv("./csv/s3Images.csv", header=None)
# Remove extra path info in s3 images names: keep only the part after the
# last '/', so the names can be compared with the image columns of the
# annotations.
s3Images = s3Images[0].str.rsplit('/', n=1).str[-1]

# Number of annotations in database

In [21]:
# (number of rows, number of columns) of the annotations frame
annotation_rows.shape

(2340907, 29)

# Drop annotations whose images already exist in the S3 bucket

In [22]:
# Keep an annotation when at least one of its two image names (`image` or
# `imagewithbox`) does not appear in the S3 listing.
annotation_rows_no_s3img = annotation_rows[
    ~annotation_rows.image.isin(s3Images)
    | ~annotation_rows.imagewithbox.isin(s3Images)
]

# Number of annotations whose images are missing from the S3 bucket

In [23]:
# shape[0] is the row count of the filtered frame, i.e. one per annotation
# row, not one per distinct image file.
print(f"There are {annotation_rows_no_s3img.shape[0]} missing images")

There are 2340907 missing images


# Explore info about missing images

In [24]:
# Number of filtered annotation rows per userid, most frequent first
annotation_rows_no_s3img.userid.value_counts()

32     2298413
15       20947
12        8139
11        6382
6         5023
13        1453
16         337
4           79
5           40
171         28
569         17
7           13
14          10
9           10
8            7
125          3
10           3
27           2
272          1
Name: userid, dtype: int64

In [25]:
# Number of filtered annotation rows per conceptid, most frequent first
annotation_rows_no_s3img.conceptid.value_counts()

2136    365957
347     362462
383     308351
236     279797
1948    192183
         ...  
382          1
415          1
2600         1
1601         1
3255         1
Name: conceptid, Length: 119, dtype: int64

In [26]:
# Number of filtered annotation rows per dateannotated value, most frequent first
annotation_rows_no_s3img.dateannotated.value_counts()

2019-08-20    356362
2019-08-17    324156
2019-08-16    313185
2019-08-18    292059
2019-08-19    290595
               ...  
2019-09-20         1
2019-04-23         1
2019-09-18         1
2019-05-20         1
2019-06-25         1
Name: dateannotated, Length: 270, dtype: int64

In [42]:
# Column names of the filtered frame (these are the columns written to the CSV)
annotation_rows_no_s3img.columns

Index(['id', 'videoid', 'userid', 'conceptid', 'timeinvideo', 'x1', 'y1', 'x2',
       'y2', 'videowidth', 'videoheight', 'dateannotated', 'image',
       'imagewithbox', 'comment', 'unsure', 'originalid', 'framenum', 'speed',
       'verifieddate', 'verifiedby', 'priority', 'oldconceptid', 'oldx1',
       'oldy1', 'oldx2', 'oldy2', 'tracking_flag', 'filename'],
      dtype='object')

# Save missing images to csv

In [43]:
# This is used for generation of missing images.
# The file name carries the time of export (e.g. "Aug 20 14:05") so that
# exports made at different times get different names.
current_time = format(datetime.datetime.now(), "%b %d %H:%M")
annotation_rows_no_s3img.to_csv(
    path_or_buf=f"./csv/annotations_no_s3img {current_time}.csv",
    index=False,
)