## Tannis McCartney
## May 26, 2022

### This notebook merges the orders and products dataframes (that have already been wrangled and checked)

## Contents
### 01 Import libraries
### 02 Import orders and products data
### 03 Merge dataframes
### 04 Export merged dataframe

# 01 Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02 Import orders and products data

In [2]:
# Project folder path as a string.
# Set the INSTACART_PROJECT_DIR environment variable to run this notebook from another
# machine or folder; when it is not set, the original local folder below is used.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'C:\Users\tmmcc\Google Drive\Data Analytics Bootcamp\4 Python Fundamentals for Data Analysts\05-2022 Instacart Basket Analysis',
)

In [3]:
# Load the combined orders data (created in 4.6 Part 1, data types changed in Part 2)
orders_file = os.path.join(path, '02 Data', 'Prepared data', 'orders_combined.pkl')
df_orders = pd.read_pickle(orders_file)
# Preview the first rows
df_orders.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,True,196,1,0
1,2539329,1,1,2,8,,True,14084,2,0
2,2539329,1,1,2,8,,True,12427,3,0
3,2539329,1,1,2,8,,True,26088,4,0
4,2539329,1,1,2,8,,True,26405,5,0


In [4]:
# Check the shape of df_orders as (number of rows, number of columns)
df_orders.shape

(32434489, 10)

In [5]:
# Load the prepared products dataset
products_file = os.path.join(path, '02 Data', 'Prepared data', 'products.pkl')
df_prods = pd.read_pickle(products_file)
# Preview the first rows
df_prods.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [6]:
# Check the shape of df_prods as (number of rows, number of columns)
df_prods.shape

(49688, 5)

# 03 Merge dataframes

In [8]:
# Join orders to products on the shared product_id key.
# how='inner' is the pandas default, written out here: only product_ids present in
# both dataframes are kept. indicator=True adds a '_merge' column recording the
# source of each row.
df_merged = df_orders.merge(
    df_prods,
    how='inner',
    on='product_id',
    indicator=True,
)
# Preview the first rows of the merged dataframe
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,both


In [9]:
# Count the rows per value of the '_merge' indicator column.
# NOTE: the merge above is an inner join (the pandas default), which keeps only keys
# found in both dataframes, so '_merge' can only be 'both' here. Rows that failed to
# match cannot show up in this count; they have to be detected another way
# (e.g. by comparing row counts with df_orders).
df_merged['_merge'].value_counts()

both          32433030
left_only            0
right_only           0
Name: _merge, dtype: int64

In [10]:
# Check the shape of df_merged (rows, columns) so its row count can be compared with df_orders
df_merged.shape

(32433030, 15)

#### df_merged has 1459 fewer rows than the orders dataframe (32,434,489 − 32,433,030).

In [14]:
# Select the order rows whose product_id has no counterpart in df_prods;
# these are the observations the merge leaves out
known_product_ids = df_prods['product_id']
has_known_product = df_orders['product_id'].isin(known_product_ids)
df_test = df_orders[~has_known_product]
# Preview the first omitted rows
df_test.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered
2963,7099,27,63,3,10,1.0,False,6799,1,0
3205,1837192,27,80,2,8,6.0,False,6799,9,1
45603,2915432,298,1,4,14,,True,6799,2,0
45634,613874,298,3,1,12,14.0,False,6799,2,1
73858,690386,479,1,3,17,,True,6799,7,0


In [15]:
# Count how many of the omitted rows in df_test belong to each product_id
df_test['product_id'].value_counts()

6799     1978
26519      51
Name: product_id, dtype: int64

In [18]:
# Look up both product IDs found in df_test (6799 and 26519) in df_prods in one step.
# An empty result confirms that neither ID exists in the products data.
df_prods.loc[df_prods['product_id'].isin([6799, 26519])]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices


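#### The rows omitted from df_merged belong to two product_ids (6799 and 26519) that are missing from df_prods. The client may have this information. For now, nothing can be done.

Note: these two product_ids account for 1,978 + 51 = 2,029 omitted rows, which is more than the net difference of 1,459 rows between df_orders and df_merged. The inner merge must therefore also have added 570 rows, which suggests that some product_id values occur more than once in df_prods. This should be checked before the merged data is used.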

In [19]:
# Store the number of omitted rows per missing product_id (taken from df_test) in its own object
missing_products = df_test['product_id'].value_counts()

In [20]:
# Copy missing_products to the system clipboard so it can be pasted elsewhere.
# NOTE(review): this needs a clipboard on the machine running the kernel; presumably it
# fails on a headless/remote server — confirm before running there.
missing_products.to_clipboard()

# 04 Export merged dataframe

In [21]:
# Remove the _merge indicator column before exporting.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running the
# cell again leaves df_merged unchanged instead of raising a KeyError.
df_merged = df_merged.drop(columns=['_merge'], errors='ignore')

In [22]:
# Export df_merged to pickle.
# The folder name is spelled 'Prepared data', exactly as in the import cells, so the
# export resolves to the same folder on case-sensitive filesystems as well.
df_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared data', 'orders_products_merged.pkl'))