# 4.6 Combining & Exporting Data Part 2

### This script contains the following points:
#### 1. Import resources
#### 2. Combine orders_products_combined with products.csv

### 1. Import resources

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Project root folder used for every import and export in this notebook.
# Set the INSTACART_PROJECT_DIR environment variable to run the notebook on
# another machine; when it is unset or empty, the original local folder is used.
path = os.environ.get('INSTACART_PROJECT_DIR') or r'C:\Users\zhoux\Documents\CF-Data Analyst Program\DA-Immersion\10-2023 Instacart Basket Analysis'

##### Import orders_products_combined.pkl

In [3]:
# Load the combined orders/products dataframe from the 'Prepared Data' folder.
# Pickle files can execute code when loaded, so only read pickles you trust.
op_combined_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
df_op_combined = pd.read_pickle(op_combined_pkl)

In [4]:
# Check the shape (rows, columns) of the combined orders/products dataframe
df_op_combined.shape

(32434489, 10)

In [5]:
# Preview the first rows to confirm the columns loaded as expected
df_op_combined.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered
0,2539329,1,prior,1,2,8,,196,1,0
1,2539329,1,prior,1,2,8,,14084,2,0
2,2539329,1,prior,1,2,8,,12427,3,0
3,2539329,1,prior,1,2,8,,26088,4,0
4,2539329,1,prior,1,2,8,,26405,5,0


##### Import products_checked.csv

In [6]:
# Load the checked products table from the 'Prepared Data' folder.
# index_col=False tells pandas not to use the first CSV column as the index.
products_csv = os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv')
df_prod = pd.read_csv(products_csv, index_col=False)

In [7]:
# Check the shape (rows, columns) of the products dataframe
df_prod.shape

(49672, 5)

In [8]:
# Preview the first rows of the products dataframe
df_prod.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


### 2. Combine orders_products_combined with products.csv

The only column the two dataframes have in common is 'product_id', so we will use it as the merge key.
Because we don't yet know how completely the keys match, we will first run an outer join (how = 'outer') with the merge indicator switched on to see the full picture.

In [9]:
# Outer join on the shared key so unmatched rows from either side are kept.
# indicator=True adds a '_merge' column recording whether each row came from
# the left dataframe only, the right dataframe only, or both.
df_op_merged = pd.merge(
    df_op_combined,
    df_prod,
    how='outer',
    on='product_id',
    indicator=True,
)

In [10]:
# Preview the outer-join result, including the '_merge' flag column
df_op_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329.0,1.0,prior,1.0,2.0,8.0,,196,1.0,0.0,Soda,77.0,7.0,9.0,both
1,2398795.0,1.0,prior,2.0,3.0,7.0,15.0,196,1.0,1.0,Soda,77.0,7.0,9.0,both
2,473747.0,1.0,prior,3.0,3.0,12.0,21.0,196,1.0,1.0,Soda,77.0,7.0,9.0,both
3,2254736.0,1.0,prior,4.0,4.0,7.0,29.0,196,1.0,1.0,Soda,77.0,7.0,9.0,both
4,431534.0,1.0,prior,5.0,4.0,15.0,28.0,196,1.0,1.0,Soda,77.0,7.0,9.0,both


In [11]:
# Count how many rows fall into each merge category (both / left_only / right_only)
df_op_merged['_merge'].value_counts()

_merge
both          32404859
left_only        30200
right_only          11
Name: count, dtype: int64

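The merge flag shows that the dataframes do not match completely: 30,200 order rows ('left_only') have a product_id with no match in the products data, and 11 product rows ('right_only') have no match in the orders data.
For this project, however, we only need fully matched data, so we will leave out the 'left_only' and 'right_only' rows by using an inner join.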

In [12]:
# Inner join: keep only rows whose product_id is present in both dataframes.
# This reassigns df_op_merged, replacing the outer-join result created earlier.
# NOTE(review): in the outer join, both + left_only = 32,404,859 + 30,200 =
# 32,435,059, which is 570 more than the 32,434,489 rows in df_op_combined.
# That suggests df_prod contains repeated product_id values that duplicate some
# order rows -- verify with df_prod['product_id'].duplicated().sum().
df_op_merged = df_op_combined.merge(
    df_prod,
    how='inner',
    on='product_id',
    indicator=True,
)

In [13]:
# Preview the inner-join result; every row should be flagged 'both'
df_op_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


In [14]:
# The '_merge' flag was only needed to inspect the join, so remove it before
# exporting. errors='ignore' keeps this cell safe to re-run: once the column is
# gone, a second run leaves the dataframe unchanged instead of raising KeyError.
df_op_merged = df_op_merged.drop(columns='_merge', errors='ignore')
df_op_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices
0,2539329,1,prior,1,2,8,,196,1,0,Soda,77,7,9.0
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0


##### Export as .pkl

In [15]:
# Save the merged dataframe as a pickle in the 'Prepared Data' folder.
merged_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl')
df_op_merged.to_pickle(merged_pkl)