In [1]:
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect


# Make sure database is created in Postgres using Schema.sql file

In [2]:
from secrets import username, password


# Read in and put into a database the Kroger products

In [3]:
# Load ../data/Kroger_clean.csv; later cells derive both the Kroger milk rows
# and the Kroger price history from this frame.
csv_file = "../data/Kroger_clean.csv"
kroger_data_df = pd.read_csv(filepath_or_buffer=csv_file)
# kroger_data_df.head()


# Read in and put into a database HEB Products

In [4]:
# Load the HEB product export and the per-product milk type lookup.
path = "../data/heb_clean.csv"
heb_data_df = pd.read_csv(path)

milk_type_file = "../data/heb_milk_vs_types.csv"
milk_type_df = pd.read_csv(milk_type_file)

# Shape the HEB rows like the Milk table: drop the price/coupon columns,
# attach the type from the lookup (the merge suffixes the two clashing
# 'type' columns as type_x / type_y), and tag every row with the HEB
# store number.
heb_milk = (
    heb_data_df
    .drop(columns=['coupon', 'name', 'price', 'uomSalePrice', 'simple_type'])
    .merge(milk_type_df, on='id')
    .rename(columns={'id': 'product_id', 'brand': 'Brand'})
    .assign(Store_Number=1111111)
    .drop(columns=['type_x', 'types'])
    .rename(columns={'type_y': 'type'})
)
# heb_milk

# Stamp the raw HEB frame with today's date; the price-history cell reads it.
# pd.datetime was deprecated in pandas 1.0 and removed in 2.0, so use
# pd.Timestamp, which yields the same datetime.date value.
today = pd.Timestamp.now().date()
heb_data_df['Date'] = today
# heb_data_df.head()


  today = pd.datetime.now().date()


# Create dataframe to match Milk database table. Merge HEB and Kroger milk dataframes.

In [5]:
# Kroger products carry no feature list, so a constant placeholder is used.
features = 'none'

# Lookup of Kroger product id -> milk type, keyed like the Kroger export.
kroger_milk_type_file = "../data/kroger_milk_vs_types.csv"
kroger_milk_type_df = pd.read_csv(kroger_milk_type_file).rename(columns={'id': 'productId'})

# Shape the Kroger rows like the Milk table: drop description, price and
# date columns, attach the type, and align column names with the HEB frame.
milk_df = (
    kroger_data_df
    .drop(columns=['description', 'price.regular', 'price.promo', 'Date'])
    .assign(features=features)
    .merge(kroger_milk_type_df, on='productId')
    .drop(columns=['types'])
    .rename(columns={
        "productId": "product_id",
        "brand": "Brand",
        "categories": "category",
        'Store_Id': 'Store_Number',
    })
)

# Stack Kroger and HEB products into one frame.
complete_milk = pd.concat([milk_df, heb_milk])
# complete_milk


# Create and populate the Store dataframe for the Store database table. This will be automated once more stores are added in future iterations of the project.

In [6]:
# One row per store; the three tuples are positional, so entry i of each
# tuple describes the same store.
store_id = 3400312, 1111111
zipcode = 77007, 77007
name = "Kroger", "HEB"

store_df = pd.DataFrame({
    "Store_Number": store_id,
    'Store_Zipcode': zipcode,
    'Store_Name': name,
})
# store_df.head()

# Read in and create dataframe for milk types. Formatted for database table.



In [7]:
# Load the milk type summary and attach a display name to each row.
# NOTE(review): the names are assigned by position, so this assumes the CSV
# holds exactly these ten types in this row order — confirm if the file changes.
milk_type_file = "../data/milk_types.csv"
milk_type_df = (
    pd.read_csv(milk_type_file)
    .assign(Full_Name=(
        'Almond-Milk',
        'Chocolate-Milk',
        'Reduced-Fat-Milk',
        'Lactose-Free-Milk',
        'Oat-Milk',
        'Organic-Milk',
        'Other',
        'Powder-Milk',
        'Protein-Product',
        'Whole',
    ))
    .rename(columns={'count_id': 'count'})
)
# milk_type_df

# Connect to local database

In [8]:
# Percent-encode the credentials so characters such as "@", ":" or "/" in the
# username or password cannot corrupt the URL that SQLAlchemy parses.
rds_connection_string = (
    f"{quote_plus(str(username))}:{quote_plus(str(password))}@localhost:5432/milk_db"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

# Check for tables

In [9]:
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list the tables.
inspect(engine).get_table_names()

['Store', 'Milk', 'Milk_Type', 'Price_History']

# Use drop duplicates to only add updated data to a dataframe that is then loaded into the database

In [10]:
# Fetch the rows already in "Store" for comparison with store_df; the id
# column is dropped because store_df has no such column.
temp_store = (
    pd.read_sql_query('select * from "Store"', con=engine)
    .drop(columns='id')
)
# temp_store

In [11]:
# Keep only the rows of store_df that are not yet in the database.
# concat(...).drop_duplicates(keep=False) is a symmetric difference: it also
# keeps rows that exist only in the database, which would then be appended a
# second time. A left anti-join on the shared columns keeps new rows only.
db_store = (
    store_df
    .drop_duplicates()
    .merge(temp_store, how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only', list(store_df.columns)]
    .reset_index(drop=True)
)
# db_store.head()

# Use pandas to load csv converted DataFrame into database

In [12]:
# Append the rows in db_store to the "Store" table; the DataFrame index is
# not written.
db_store.to_sql(
    name='Store',
    con=engine,
    if_exists='append',
    index=False,
)

# Confirm data has been added by querying the table

In [13]:
# Show the first rows of "Store" to confirm the insert.
pd.read_sql_query(
    'select * from "Store"',
    con=engine,
).head()

Unnamed: 0,id,Store_Number,Store_Name,Store_Zipcode
0,1,3400312,Kroger,77007
1,2,1111111,HEB,77007


# Same process as above for store table is followed for milk type.

In [14]:
# Rows already stored in "Milk_Type", without the id column that
# milk_type_df does not have.
temp_type = pd.read_sql_query('select * from "Milk_Type"', con=engine)
temp_type = temp_type.drop(columns='id')

# Keep only the rows of milk_type_df that are not yet in the database.
# concat(...).drop_duplicates(keep=False) is a symmetric difference: it also
# keeps rows that exist only in the database, which would then be appended a
# second time. A left anti-join on the shared columns keeps new rows only.
db_type = (
    milk_type_df
    .drop_duplicates()
    .merge(temp_type, how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only', list(milk_type_df.columns)]
    .reset_index(drop=True)
)
# db_type

In [15]:
# Append the rows in db_type to the "Milk_Type" table; the DataFrame index
# is not written.
db_type.to_sql(
    name='Milk_Type',
    con=engine,
    if_exists='append',
    index=False,
)

### Confirm data has been added by querying the table

In [16]:
# Display the "Milk_Type" table to confirm the insert.
pd.read_sql_query(
    'select * from "Milk_Type"',
    con=engine,
)

Unnamed: 0,id,type,Full_Name,count
0,1,almond,Almond-Milk,44
1,2,chocolate,Chocolate-Milk,43
2,3,fat,Reduced-Fat-Milk,51
3,4,lactose,Lactose-Free-Milk,19
4,5,oat,Oat-Milk,13
5,6,organic,Organic-Milk,10
6,7,other,Other,47
7,8,powder,Powder-Milk,3
8,9,protein,Protein-Product,9
9,10,whole,Whole,21


# Retrieve the Store data and merge it with the milk data to set the Store ID; the next cell then drops the columns not required in the Milk table.

In [17]:
# Read every stored store so each milk row can pick up its database id.
# The previous .head() capped this lookup at five stores, so milk rows for
# any later store would be silently dropped by the inner merge below.
stores2_df = pd.read_sql_query('select * from "Store"', con=engine)
complete_milk = complete_milk.merge(stores2_df, on='Store_Number')
# complete_milk.head()

In [18]:
# Remove the store columns brought in by the merge; only the store's id
# column is kept on the milk rows.
store_only_columns = ['Store_Name', 'Store_Zipcode', 'Store_Number']
complete_milk = complete_milk.drop(columns=store_only_columns)
# complete_milk.head()

In [19]:
# The id column came from the Store table; rename it to Store_ID, the column
# name shown for the Milk table.
complete_milk = complete_milk.rename(
    columns={'id': 'Store_ID'},
)
# complete_milk.head()

In [20]:
# Lookup of milk type -> database id, without the descriptive columns.
type2_df = (
    pd.read_sql_query('select * from "Milk_Type"', con=engine)
    .drop(columns=['Full_Name', 'count'])
)

# Replace the textual type on each milk row with the id from "Milk_Type".
complete_milk = (
    complete_milk
    .merge(type2_df, on='type')
    .drop(columns=['type'])
    .rename(columns={'id': 'Type_ID'})
)
# complete_milk


# Same process as above is followed to insert Milk data from dataframe into Milk table of database

In [21]:
# Fetch the rows already in "Milk" for comparison with complete_milk; the id
# column is dropped before the comparison.
temp_milk = (
    pd.read_sql_query('select * from "Milk"', con=engine)
    .drop(columns='id')
)
# temp_milk

In [22]:
# Keep only the rows of complete_milk that are not yet in the database.
# concat(...).drop_duplicates(keep=False) is a symmetric difference: products
# that exist only in the database (e.g. no longer in the CSV exports) would
# be appended again on every run. A left anti-join keeps new rows only.
db_milk = (
    complete_milk
    .drop_duplicates()
    .merge(temp_milk, how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only', list(complete_milk.columns)]
    .reset_index(drop=True)
)
# db_milk.head()

In [23]:
# Append the rows in db_milk to the "Milk" table; the DataFrame index is not
# written.
db_milk.to_sql(
    name='Milk',
    con=engine,
    if_exists='append',
    index=False,
)

In [24]:
# Display the "Milk" table to confirm the insert.
pd.read_sql_query(
    'select * from "Milk"',
    con=engine,
)

Unnamed: 0,id,product_id,Brand,features,size,category,image,Store_ID,Type_ID,name
0,1,1111040101,Kroger,none,1 gal,['Dairy'],https://www.kroger.com/product/images/xlarge/f...,1,10,Kroger® Vitamin D Whole Milk
1,2,1111040601,Kroger,none,1/2 gal,['Dairy'],https://www.kroger.com/product/images/xlarge/f...,1,10,Kroger® Vitamin D Whole Milk
2,3,1111042850,Simple Truth Organic,none,0.5 gal,"['Dairy', 'Natural & Organic']",https://www.kroger.com/product/images/xlarge/f...,1,10,Simple Truth Organic® Whole Milk
3,4,1111042908,Simple Truth Organic,none,1 gal,"['Dairy', 'Natural & Organic']",https://www.kroger.com/product/images/xlarge/f...,1,10,Simple Truth Organic™ Whole Milk
4,5,81326702000,A2 Milk,none,59 fl oz,['Dairy'],https://www.kroger.com/product/images/xlarge/f...,1,10,A2 Ultra-Pasteurized Whole Milk
...,...,...,...,...,...,...,...,...,...,...
598,599,3981542,Iconic,,,Dairy & Eggs/Milk,https://images.heb.com/is/image/HEBGrocery/prd...,2,9,Kids Vanilla Vacay Protein Drink
599,600,3981541,Iconic,,,Dairy & Eggs/Milk,https://images.heb.com/is/image/HEBGrocery/prd...,2,9,Kids Fruity Fiesta Protein Drink
600,601,1290139,Nestle,,56.3 oz,Dairy & Eggs/Milk,https://images.heb.com/is/image/HEBGrocery/prd...,2,8,NIDO Kinder 1+ Powdered Milk Beverage
601,602,1148619,Nestle,[' LOW SODIUM'],12.6 oz,Dairy & Eggs/Milk,https://images.heb.com/is/image/HEBGrocery/prd...,2,8,NIDO Kinder 1+ Powdered Milk Beverage


# Milk data is pulled from the database Milk table and merged with the price history. Unneeded columns are dropped and the remaining ones are renamed to match the Price History table. This sets the product ID in the price history.

In [25]:
# HEB prices: the coupon column is used as the sale price.
heb_price = heb_data_df[['price', 'coupon', 'id', 'Date']].rename(
    columns={'Date': 'date', 'coupon': 'saleprice', 'id': 'product_id'}
)

# Kroger prices, renamed to the same column names as the HEB frame.
price_history_df = (
    kroger_data_df[['Date', 'price.regular', 'price.promo', 'productId']]
    .copy()
    .rename(columns={
        'Date': 'date',
        'price.regular': 'price',
        'price.promo': 'saleprice',
        'productId': 'product_id',
    })
)

# Stack both stores and parse the date column into datetimes.
price_history_df = (
    pd.concat([price_history_df, heb_price])
    .assign(date=lambda combined: pd.to_datetime(combined['date']))
)
# price_history_df

In [26]:
# Reduce the stored "Milk" rows to the columns left after dropping the
# descriptive ones, then attach them to each price row by the store-level
# product_id.
milk_descriptive_columns = [
    'Brand', 'size', 'image', 'Store_ID', 'Type_ID', 'category', 'features', 'name',
]
milk2_df = (
    pd.read_sql_query('select * from "Milk"', con=engine)
    .drop(columns=milk_descriptive_columns)
)
merged_price = price_history_df.merge(milk2_df, on='product_id')
# merged_price

In [27]:
# Swap the store-level product_id for the "Milk" table id, which takes over
# the product_id column name.
merged_price = (
    merged_price
    .drop(columns=['product_id'])
    .rename(columns={'id': 'product_id'})
)
# merged_price

# The Price History dataframe is then added to the Price History table. Rows already present in the table are filtered out first, because inserting duplicates would trigger an error and stop the update process.

In [28]:
# Rows already stored in "Price_History".
temp_history = pd.read_sql_query('select * from "Price_History"', con=engine)
# merged_price['date'] is datetime64; parse the stored dates the same way so
# that equal dates compare equal.
# NOTE(review): assumes the stored dates are timezone-naive — confirm against the schema.
temp_history['date'] = pd.to_datetime(temp_history['date'])

# Keep only the price rows that are not yet in the database.
# concat(...).drop_duplicates(keep=False) is a symmetric difference: history
# rows from earlier dates exist only in the database and would be appended
# again on every run. A left anti-join keeps new rows only.
db_history = (
    merged_price
    .drop_duplicates()
    .merge(temp_history, how='left', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'left_only', list(merged_price.columns)]
    .reset_index(drop=True)
)
# db_history

In [29]:
# Append the rows in db_history to the "Price_History" table; the DataFrame
# index is not written.
db_history.to_sql(
    name='Price_History',
    con=engine,
    if_exists='append',
    index=False,
)

In [30]:
# Display the "Price_History" table to confirm the insert.
pd.read_sql_query(
    'select * from "Price_History"',
    con=engine,
)

Unnamed: 0,product_id,date,saleprice,price
0,1,2021-02-08,0.0,2.99
1,78,2021-02-08,0.0,1.99
2,79,2021-02-08,0.0,2.99
3,2,2021-02-08,0.0,1.99
4,80,2021-02-08,0.0,2.99
...,...,...,...,...
598,190,2021-02-10,0.0,3.07
599,77,2021-02-10,0.0,3.07
600,191,2021-02-10,0.0,3.07
601,296,2021-02-10,0.0,3.28
