In [None]:
import datetime
import psycopg2 # Postgresql
import pandas as pd # Ez table management
from dotenv import load_dotenv # Load .env strings
import os # console commands
import boto3 # AWS querier

In [None]:
# Settings are read from a .env file that lives in the folder containing this
# repo: a folder called video-annotation-project, which holds the
# video-annotation-tool and misc-scripts repos.
load_dotenv(dotenv_path="../.env")

# S3 bucket name and annotations folder
S3_BUCKET = os.environ.get('AWS_S3_BUCKET_NAME')
SRC_IMG_FOLDER = os.environ.get('AWS_S3_BUCKET_ANNOTATIONS_FOLDER')

# PostgreSQL connection settings
DB_NAME = os.environ.get("DB_NAME")
DB_HOST = os.environ.get("DB_HOST")
DB_USER = os.environ.get("DB_USER")
DB_PASSWORD = os.environ.get("DB_PASSWORD")

# AWS credentials
A_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
A_SEC = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Echo the configured bucket as a quick check that the .env file was found
print(S3_BUCKET)

lubomirstanchev


In [None]:
# Connect to database using .env variables
def queryDB(query, params=None):
    """Run a SQL query against the project database and return a DataFrame.

    A new PostgreSQL connection is opened for every call using the DB_*
    settings loaded from the .env file (port 5432). The connection is always
    closed, including when the query itself raises.

    Args:
        query: SQL text to execute.
        params: Optional query parameters, passed through unchanged to
            ``pd.read_sql_query``.

    Returns:
        pandas.DataFrame holding the query result.

    Raises:
        Any exception raised by ``psycopg2.connect`` or
        ``pd.read_sql_query`` propagates to the caller.
    """
    conn = psycopg2.connect(database=DB_NAME,
                            user=DB_USER,
                            password=DB_PASSWORD,
                            host=DB_HOST,
                            port="5432")
    try:
        # Use pandas to read the query result into a dataframe
        return pd.read_sql_query(query, conn, params=params)
    finally:
        # Release the connection even if the query fails
        conn.close()

# Get all annotations from database

In [None]:
# Every annotation column plus the filename of the video it belongs to.
# The LEFT JOIN keeps annotations that have no matching row in videos.
ANNOTATIONS_WITH_FILENAME_SQL = '''
    Select
        annotations.*, videos.filename
    FROM
        annotations
    LEFT JOIN
        videos
    ON
        videos.id=videoid
'''
annotation_rows = queryDB(ANNOTATIONS_WITH_FILENAME_SQL)
# Display (rows, columns); the row count is the number of annotations returned
annotation_rows.shape

# Create a CSV of the image names in Lubo's S3 bucket

In [None]:
# Timestamp embedded in the CSV filename; the cell that reads the CSV back
# reuses this variable, so it must stay defined here.
current_time = datetime.datetime.now().strftime("%b %d %H:%M")

# open() does not create missing folders, so make sure ./csv exists
os.makedirs('./csv', exist_ok=True)

# Connect to AWS and get a handle on the configured bucket
session = boto3.Session(aws_access_key_id=A_KEY, aws_secret_access_key=A_SEC)
s3 = session.resource('s3')
bucket = s3.Bucket(S3_BUCKET)

# Write mode (not append) so re-running this cell within the same minute
# replaces the listing instead of duplicating every key.
with open(f'./csv/s3Images {current_time}.csv', 'w') as fd:
    # List the objects directly under the test/ folder
    for obj in bucket.objects.filter(Delimiter='/', Prefix='test/'):
        # Skip the folder placeholder key ("test/") by name rather than by
        # position, so a real image is never dropped when no placeholder exists.
        if obj.key.endswith('/'):
            continue
        # One S3 key per line
        fd.write(obj.key + '\n')

# Read the S3 image names into memory

In [None]:
# Load the key listing written to ./csv; the file has no header row
s3_keys = pd.read_csv(f"./csv/s3Images {current_time}.csv", header=None)
# Split each key on '/' and keep the second component, dropping the folder part
key_parts = s3_keys[0].str.split('/', expand=True)
s3Images = key_parts[1].rename(0)
s3Images.shape

# Remove annotations whose images already exist in the S3 bucket

In [None]:
# True for annotations whose image name appears in the S3 listing
image_in_s3 = annotation_rows["image"].isin(s3Images)
# Keep only the annotations whose image was not found in the bucket
annotation_rows_no_s3img = annotation_rows[~image_in_s3]

# Number of images that are missing from the s3 bucket

In [None]:
# One row per annotation whose image is absent from the S3 listing
missing_count = len(annotation_rows_no_s3img)
print(f"There are {missing_count} missing images")

# Explore info about missing images

In [None]:
# Number of missing-image annotations per userid value
annotation_rows_no_s3img["userid"].value_counts()

In [None]:
# Number of missing-image annotations per conceptid value
annotation_rows_no_s3img["conceptid"].value_counts()

In [None]:
# Number of missing-image annotations per distinct dateannotated value
annotation_rows_no_s3img["dateannotated"].value_counts()

In [None]:
# List the columns available on the missing-image annotations
annotation_rows_no_s3img.columns

# Save missing images to csv

In [None]:
# The saved CSV is the input for generating the missing images.
# A fresh timestamp is taken here, so the filename reflects when this cell ran.
current_time = datetime.datetime.now().strftime("%b %d %H:%M")
missing_csv_path = f"./csv/annotations_no_s3img {current_time}.csv"
# index=False keeps the DataFrame index out of the file
annotation_rows_no_s3img.to_csv(missing_csv_path, index=False)