In [1]:
#import libraries
import db_ops
import pandas as pd

Connecting to the PostgreSQL database...
Connection successful


## Load file with images, parse csv and fix some of the problems

In [4]:
df = pd.read_excel('./data/final_output_with_images.xlsx')
df = df.groupby('image_url', as_index=False).first() # Remove all duplicate images
df.sort_values(by = 'id', inplace= True)             # Sort by ID
df = df.rename(columns={'id':'olm_id'})              # Rename ID column to olm_id to not confuse in the DB
print(f"Total {df['image_url'].count()} images. {df['image_url'].nunique()} of them are unique")

Total 27388 images. 27388 of them are unique


In [8]:
#Subset a df that only has image metadata, we will deal with catregories later
pictures_df = df[['olm_id', 'verification', 'phone', 'date_taken', 'date_uploaded', 'lat', 'lon', 'picked up', 'address', 'total_litter', 'image_url']]

In [9]:
# Replace imgae url with the correct s3 uri
pictures_df['image_url'] = pictures_df['image_url'].apply(lambda x: 's3://olm-pics-s3/' + x.split('.com/')[-1])
pictures_df = pictures_df.rename(columns={'image_url': 's3_uri', 'picked up':'picked_up'})

# Convert picked up to bool
pictures_df = pictures_df.rename(columns={'picked up':'picked_up'})
pictures_df['picked_up'] = pictures_df['picked_up'].apply(lambda x: 1 if x == 'Yes' else 0).astype(bool)

# Convert total litter into an integer
pictures_df['total_litter'] = pictures_df['total_litter'].fillna(0)
pictures_df['total_litter'] = pictures_df['total_litter'].astype(int)

# Sanitize address string
pictures_df['address'] = pictures_df['address'].apply(lambda x: x.replace("'", ""))
pictures_df.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pictures_df['image_url'] = pictures_df['image_url'].apply(lambda x: 's3://olm-pics-s3/' + x.split('.com/')[-1])


Unnamed: 0,olm_id,verification,phone,date_taken,date_uploaded,lat,lon,picked_up,address,total_litter,s3_uri
32,4222,2,SAMSUNG-SM-J727A,12/12/17 11:27,1/25/18 17:59,29.456944,-98.4225,False,"Warrior and Family Support Center, George Beac...",4,s3://olm-pics-s3/112/Pftq4550WzgZA5euhgIEsLHGs...
21,4223,2,SAMSUNG-SM-J727A,12/11/17 15:04,1/25/18 17:59,29.565833,-98.593889,False,"Silicon Drive, San Antonio, Bexar County, Texa...",3,s3://olm-pics-s3/112/HKXafwXvrpBQjoWC9aPtSY8lT...
6,4227,2,Unknown,1/25/18 18:24,1/25/18 18:24,29.455317,-98.424719,False,"Binz-Engleman Road, San Antonio, Bexar County,...",5,s3://olm-pics-s3/112/69M58FGWNVp1ZTObF0YmkMfGN...


In [11]:
# Create stage table for images 
db_ops.connection.rollback()
query = """

drop table if exists image;

CREATE TABLE "image"(image_id SERIAL PRIMARY KEY
                    ,olm_id INTEGER
                    ,verification SMALLINT
                    ,phone VARCHAR(30)
                    ,date_taken TIMESTAMP
                    ,date_uploaded TIMESTAMP
                    ,lat NUMERIC(15,12)
                    ,lon NUMERIC(15,12)
                    ,picked_up BOOL
                    ,address VARCHAR(255)
                    ,total_litter SMALLINT
                    ,s3_uri VARCHAR(255)
);

"""
db_ops.cursor.execute(query)
db_ops.connection.commit()

In [23]:
# Insert the df to this table
# Create a list of tupples from the dataframe values
tuples = [tuple(x) for x in pictures_df.to_numpy()]
# Comma-separated dataframe columns
cols = ','.join(list(pictures_df.columns))
# SQL quert to execute
query = "INSERT INTO %s(%s) VALUES(%%s,%%s,%%s, %%s,%%s,%%s,%%s,%%s,%%s,%%s,%%s)" % ("image", cols)
cursor = db_ops.connection.cursor()
cursor.executemany(query, tuples)
db_ops.connection.commit()


## Brands table

In [3]:
melt_df = pd.melt(df, id_vars=['A'], value_vars=['B', 'C'])

INSERT INTO brand (brand_id,name,score) VALUES ('1','Coca Cola','10');
INSERT INTO brand (brand_id,name,score) VALUES ('2','Corona','20');
INSERT INTO brand (brand_id,name,score) VALUES ('3','Miller','30');
INSERT INTO brand (brand_id,name,score) VALUES ('4','Budweiser','40');
INSERT INTO brand (brand_id,name,score) VALUES ('5','LoveIs','50');
INSERT INTO brand (brand_id,name,score) VALUES ('6','Snapple','100');
