## 01. Import Libraries

In [1]:
import pandas as pd
import numpy as np 
import os

## 02. Import Data Sets

In [2]:
path = r'D:\02.2022_Instacart Basket Analysis'

In [3]:
df_ords_prior = pd.read_csv(os.path.join(path,'02 Data','Original Data','order_products__prior.csv'),index_col = False)

In [4]:
df_ords_prior.columns

Index(['order_id', 'product_id', 'add_to_cart_order', 'reordered'], dtype='object')

In [5]:
df_ords_prior.shape

(32434489, 4)

In [6]:
df_ords_c = pd.read_csv(os.path.join(path,'02 Data','Prepared Data','orders_checked.csv'),index_col = 0)

  mask |= (ar1 == a)


In [7]:
df_ords_c.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_time_of_day', 'days_since_prior_order', 'first_order'],
      dtype='object')

In [8]:
df_ords_c.shape

(3421083, 7)

## 03. Consistency Check

Orders and Products csv files have already gone through the consistency check. I would check the consistency of order_products_prior.csv here.

### (1.) Check for mixed-type data

In [10]:
for col in df_ords_prior.columns.tolist():
    weird = (df_ords_prior[[col]].applymap(type) != df_ords_prior[[col]].iloc[0].apply(type)).any(axis = 1)
    if len (df_ords_prior[weird]) > 0:
        print (col + ' has mixed type data')
    else:
        print (col + ' is uniform')

order_id is uniform
product_id is uniform
add_to_cart_order is uniform
reordered is uniform


No mixed type data is found.

### (2.) Check for missing value

In [12]:
df_ords_prior.isnull().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

No missing value is found.

### (3.) Check for duplicates

In [18]:
df_ords_prior_dups= df_ords_prior[df_ords_prior.duplicated()]

In [19]:
df_ords_prior.dups

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


No duplicate is found.

In [15]:
df_ords_prior.shape

(32434489, 4)

In [19]:
df_ords_prior_check.to_pickle(os.path.join(path,'02 Data','Prepared Data','df_ords_prior_check.pkl'))

## 04. Merging Instacart Data

In [20]:
## Change datatypes due to memory shortage
df_ords_c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   orders_day_of_week      int64  
 4   order_time_of_day       int64  
 5   days_since_prior_order  float64
 6   first_order             bool   
dtypes: bool(1), float64(1), int64(5)
memory usage: 186.0 MB


In [21]:
df_ords_prior_check.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 1.2 GB


In [22]:
df_ords_c['order_id']=df_ords_c['order_id'].astype('int32')

In [23]:
df_ords_c['user_id']=df_ords_c['user_id'].astype('int32')

In [24]:
df_ords_c['order_number']=df_ords_c['order_number'].astype('int8')

In [25]:
df_ords_c['orders_day_of_week']=df_ords_c['orders_day_of_week'].astype('int8')

In [26]:
df_ords_c['order_time_of_day']=df_ords_c['order_time_of_day'].astype('int8')

In [27]:
df_ords_c['days_since_prior_order']=df_ords_c['days_since_prior_order'].astype('float16')

In [28]:
df_ords_c['first_order']=df_ords_c['first_order'].astype('object')

In [29]:
df_ords_prior_check['product_id']=df_ords_prior_check['product_id'].astype('int32')

In [30]:
df_ords_prior_check['reordered']=df_ords_prior_check['reordered'].astype('int8')

In [31]:
# Check the output
df_ords_c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   orders_day_of_week      int8   
 4   order_time_of_day       int8   
 5   days_since_prior_order  float16
 6   first_order             object 
dtypes: float16(1), int32(2), int8(3), object(1)
memory usage: 94.6+ MB


In [32]:
df_ords_prior_check.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int32
 2   add_to_cart_order  int64
 3   reordered          int8 
dtypes: int32(1), int64(2), int8(1)
memory usage: 897.0 MB


In [33]:
df_ords_prior_check.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [34]:
df_ords_c.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,first_order
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False


In [35]:
df_ords_prior_check.shape

(32434489, 4)

In [36]:
df_ords_c.shape

(3421083, 7)

In [37]:
# Merge df_ords_c and df_ords_prior with inner Join
# df_merged_large = df_ords_c.merge(df_ords_prior, on = 'order_id', indicator = True) 

In [38]:
# Merge df_ords_c and df_ords_prior with outer Join to see the merging percentage
df_merged_large_outer = df_ords_c.merge(df_ords_prior_check, on = 'order_id', how = 'outer', indicator = True)

In [39]:
# Check the merging result 
df_merged_large_outer.shape

(32640698, 11)

In [40]:
# Can see how many rows are inner join, left join or right join
df_merged_large_outer['_merge'].value_counts()

both          32434489
left_only       206209
right_only           0
Name: _merge, dtype: int64

In [41]:
# Choose only rows with inner join   
df_merge_large = df_merged_large_outer.loc[df_merged_large_outer['_merge'] == 'both']

In [42]:
df_merge_large.shape

(32434489, 11)

In [43]:
df_merge_large.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,True,196.0,1.0,0.0,both
1,2539329,1,1,2,8,,True,14084.0,2.0,0.0,both
2,2539329,1,1,2,8,,True,12427.0,3.0,0.0,both
3,2539329,1,1,2,8,,True,26088.0,4.0,0.0,both
4,2539329,1,1,2,8,,True,26405.0,5.0,0.0,both


In [44]:
## Save df_merged_large to pickle files
df_merge_large .to_pickle(os.path.join(path,'02 Data','Prepared Data','orders_products_combined.pkl'))

## 05. Answer Questions

### (1.) In a new notebook, import the orders_products_combined dataframe from the pickle file you just saved.

In [46]:
orders_products_combined = pd.read_pickle(os.path.join(path,'02 Data','Prepared Data','orders_products_combined.pkl'))

In [47]:
orders_products_combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32434489 entries, 0 to 32640696
Data columns (total 11 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int32   
 1   user_id                 int32   
 2   order_number            int8    
 3   orders_day_of_week      int8    
 4   order_time_of_day       int8    
 5   days_since_prior_order  float16 
 6   first_order             object  
 7   product_id              float64 
 8   add_to_cart_order       float64 
 9   reordered               float64 
 10  _merge                  category
dtypes: category(1), float16(1), float64(3), int32(2), int8(3), object(1)
memory usage: 1.6+ GB


### (2.) Check the shape of the imported dataframe (it should be the same as the one you exported—always check!).

In [49]:
orders_products_combined.shape

(32434489, 11)

It is the same as the exported dataframe.

In [50]:
orders_products_combined.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,True,196.0,1.0,0.0,both
1,2539329,1,1,2,8,,True,14084.0,2.0,0.0,both
2,2539329,1,1,2,8,,True,12427.0,3.0,0.0,both
3,2539329,1,1,2,8,,True,26088.0,4.0,0.0,both
4,2539329,1,1,2,8,,True,26405.0,5.0,0.0,both


### (3.) Determine a suitable way to combine the orders_products_combined dataframe with your products data set. Make sure you’re using your wrangled, cleaned, and deduped products data set stored in your “Prepared Data” folder from the previous Exercise’s task.

### (4.) Confirm the results of the merge using the merge flag.

In [53]:
products = pd.read_csv(os.path.join(path,'02 Data','Prepared Data','products_checked.csv'), index_col = 0)

In [54]:
## Change Datatype due to memory shortage
products.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49670 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49670 non-null  int64  
 1   product_name   49670 non-null  object 
 2   aisle_id       49670 non-null  int64  
 3   department_id  49670 non-null  int64  
 4   prices         49670 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 2.3+ MB


In [55]:
products['product_id'] = products['product_id'].astype('int32')

In [56]:
products['aisle_id'] = products['aisle_id'].astype('int8')

In [57]:
products['department_id'] = products['department_id'].astype('int8')

In [58]:
products['prices'] = products['prices'].astype('float16')

In [59]:
# Check the result
products.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49670 entries, 0 to 49692
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49670 non-null  int32  
 1   product_name   49670 non-null  object 
 2   aisle_id       49670 non-null  int8   
 3   department_id  49670 non-null  int8   
 4   prices         49670 non-null  float16
dtypes: float16(1), int32(1), int8(2), object(1)
memory usage: 1.1+ MB


In [60]:
products.shape

(49670, 5)

In [61]:
products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.800781
1,2,All-Seasons Salt,104,13,9.296875
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.300781


In [62]:
orders_products_combined.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,True,196.0,1.0,0.0,both
1,2539329,1,1,2,8,,True,14084.0,2.0,0.0,both
2,2539329,1,1,2,8,,True,12427.0,3.0,0.0,both
3,2539329,1,1,2,8,,True,26088.0,4.0,0.0,both
4,2539329,1,1,2,8,,True,26405.0,5.0,0.0,both


In [63]:
# Check the merging percentage
orders_products_combined['_merge'].value_counts()

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

In [64]:
orders_products_combined.shape

(32434489, 11)

In [65]:
# Drop redundent column 
orders_products_combined = orders_products_combined.drop(columns = ['_merge'])

In [66]:
orders_products_combined.shape

(32434489, 10)

In [67]:
# Merge orders_products_combined with prodcuts through outer join
order_product_merge_outer = orders_products_combined.merge(products, on = 'product_id', how = 'outer', indicator = True)

In [68]:
# Check the merging result
order_product_merge_outer.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329.0,1.0,1.0,2.0,8.0,,True,196.0,1.0,0.0,Soda,77.0,7.0,9.0,both
1,2398795.0,1.0,2.0,3.0,7.0,15.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,both
2,473747.0,1.0,3.0,3.0,12.0,21.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,both
3,2254736.0,1.0,4.0,4.0,7.0,29.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,both
4,431534.0,1.0,5.0,4.0,15.0,28.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,both


In [69]:
order_product_merge_outer.shape

(32435070, 15)

In [70]:
# Check the merging percentage
order_product_merge_outer['_merge'].value_counts()

both          32399732
left_only        35327
right_only          11
Name: _merge, dtype: int64

In [71]:
# Choose only rows with inner join
order_product_merge_end= order_product_merge_outer[order_product_merge_outer['_merge'] == 'both']

In [72]:
# Check the changing number of rows and columns
order_product_merge_end.shape

(32399732, 15)

In [73]:
order_product_merge_end.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_time_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329.0,1.0,1.0,2.0,8.0,,True,196.0,1.0,0.0,Soda,77.0,7.0,9.0,both
1,2398795.0,1.0,2.0,3.0,7.0,15.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,both
2,473747.0,1.0,3.0,3.0,12.0,21.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,both
3,2254736.0,1.0,4.0,4.0,7.0,29.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,both
4,431534.0,1.0,5.0,4.0,15.0,28.0,False,196.0,1.0,1.0,Soda,77.0,7.0,9.0,both


### (5.) Export the newly created dataframe as orders_products_merged in a suitable format (taking into consideration the size).

### (6.) Ensure your notebooks and Instacart project folder are organized and that comments and section headings have been used throughout your code. All your exported data files should be effectively labelled and stored in your “Data” folder.

In [76]:
order_product_merge_end.to_pickle(os.path.join(path,'02 Data','Prepared Data','order_product_merge_f.pkl'))