# NOTEBOOK 2 - Merger of ORDERS_PRODUCTS with PRODUCTS

DATASETS:
- ORDERS_PRODUCTS = df_ORDS_large - merged dataset with 32.640.698 rows and 9 columns
- PRODUCTS = df_PRODS_clean - wrangled dataset with 49.677 rows and 5 columns

CONTENTS:
1. Data types
2. Merge
3. Missing values

In [21]:
# Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [22]:
# For the merge purposes I am working on csv files

df_ORDS_large = pd.read_csv(r'D:\CareerFoundry\Data Immersion\Task 4\Instacart Basket Analysis_2021-07-22\02 Data\Prepared Data\df_ORDS_large_wrangled_final.csv')

In [23]:
df_PRODS_clean = pd.read_csv(r'D:\CareerFoundry\Data Immersion\Task 4\Instacart Basket Analysis_2021-07-22\02 Data\Prepared Data\df_PRODS_wrangled_final.csv')

    1. Data types

In [24]:
df_ORDS_large.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32640698 entries, 0 to 32640697
Data columns (total 9 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   order_day_of_the_week   int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
 6   product_id              float64
 7   add_to_cart_order       float64
 8   reordered               float64
dtypes: float64(4), int64(5)
memory usage: 2.2 GB


In [25]:
df_PRODS_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49677 entries, 0 to 49676
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49677 non-null  int64  
 1   product_name   49677 non-null  object 
 2   aisle_id       49677 non-null  int64  
 3   department_id  49677 non-null  int64  
 4   prices         49677 non-null  float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.9+ MB


In [28]:
# Change "product_id" data type into float

df_PRODS_clean['product_id'] = df_PRODS_clean['product_id'].astype('float64')

In [29]:
df_PRODS_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49677 entries, 0 to 49676
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     49677 non-null  float64
 1   product_name   49677 non-null  object 
 2   aisle_id       49677 non-null  int64  
 3   department_id  49677 non-null  int64  
 4   prices         49677 non-null  float64
dtypes: float64(2), int64(2), object(1)
memory usage: 1.9+ MB


    2. Merge

In [30]:
# Merge with df_ORDS_large

df_ORDS_PRODS_merged = df_ORDS_large.merge(df_PRODS_clean, on = 'product_id', how = 'outer', indicator = True)

In [31]:
df_ORDS_PRODS_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32642461 entries, 0 to 32642460
Data columns (total 14 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                float64 
 1   user_id                 float64 
 2   order_number            float64 
 3   order_day_of_the_week   float64 
 4   order_hour_of_day       float64 
 5   days_since_prior_order  float64 
 6   product_id              float64 
 7   add_to_cart_order       float64 
 8   reordered               float64 
 9   product_name            object  
 10  aisle_id                float64 
 11  department_id           float64 
 12  prices                  float64 
 13  _merge                  category
dtypes: category(1), float64(12), object(1)
memory usage: 3.4+ GB


In [32]:
df_ORDS_PRODS_merged['_merge'].value_counts()

both          32406041
left_only       236409
right_only          11
Name: _merge, dtype: int64

In [33]:
df_ORDS_PRODS_merged.drop(columns = ['_merge'], inplace = True)

In [34]:
df_ORDS_PRODS_merged.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_the_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices
0,2539329.0,1.0,1.0,2.0,8.0,,196.0,1.0,0.0,Soda,77.0,7.0,9.0
1,2398795.0,1.0,2.0,3.0,7.0,15.0,196.0,1.0,1.0,Soda,77.0,7.0,9.0
2,473747.0,1.0,3.0,3.0,12.0,21.0,196.0,1.0,1.0,Soda,77.0,7.0,9.0
3,2254736.0,1.0,4.0,4.0,7.0,29.0,196.0,1.0,1.0,Soda,77.0,7.0,9.0
4,431534.0,1.0,5.0,4.0,15.0,28.0,196.0,1.0,1.0,Soda,77.0,7.0,9.0


    3. Missing values

In [35]:
df_ORDS_PRODS_merged.isnull().sum()

order_id                       11
user_id                        11
order_number                   11
order_day_of_the_week          11
order_hour_of_day              11
days_since_prior_order    2078218
product_id                 206209
add_to_cart_order          206220
reordered                  206220
product_name               236409
aisle_id                   236409
department_id              236409
prices                     236409
dtype: int64

In [36]:
# Check the missing values in "order_id" column

df_NaN_order_id = df_ORDS_PRODS_merged[df_ORDS_PRODS_merged['order_id'].isnull() == True]

In [37]:
df_NaN_order_id

# it looks like those 11 products have never been ordered by any user yet

Unnamed: 0,order_id,user_id,order_number,order_day_of_the_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices
32642450,,,,,,,3630.0,,,Protein Granola Apple Crisp,57.0,14.0,2.6
32642451,,,,,,,3718.0,,,Wasabi Cheddar Spreadable Cheese,21.0,16.0,12.0
32642452,,,,,,,7045.0,,,Unpeeled Apricot Halves in Heavy Syrup,88.0,13.0,6.8
32642453,,,,,,,25383.0,,,Chocolate Go Bites,61.0,19.0,1.8
32642454,,,,,,,27499.0,,,Non-Dairy Coconut Seven Layer Bar,100.0,21.0,15.0
32642455,,,,,,,36233.0,,,Water With Electrolytes,100.0,21.0,10.4
32642456,,,,,,,37703.0,,,Ultra Sun Blossom Liquid 90 loads Fabric Enhan...,75.0,17.0,14.3
32642457,,,,,,,43725.0,,,Sweetart Jelly Beans,100.0,21.0,8.1
32642458,,,,,,,45971.0,,,12 Inch Taper Candle White,101.0,17.0,9.8
32642459,,,,,,,46625.0,,,Single Barrel Kentucky Straight Bourbon Whiskey,31.0,7.0,1.7


In [38]:
# Check the missing values in "days_since_prior_order" using crosstab as most likely those relate to all the first orders

crosstab_DSPO = pd.crosstab(df_ORDS_PRODS_merged['days_since_prior_order'], df_ORDS_PRODS_merged['order_number'], dropna = False)

In [39]:
crosstab_DSPO.to_clipboard()

# I am not sure if I am correct but in a dataset with 32.642.461 rows, where each rows represent a product attached to each order_id and user_id, 
# 2.078.218 products can be labeled as sold within first order of each user because this is the number of missing values in "days_since_prior_order". 

In [19]:
df_ORDS_PRODS_merged.to_csv(r'D:\CareerFoundry\Data Immersion\Task 4\Instacart Basket Analysis_2021-07-22\02 Data\Prepared Data\df_ORDS_PRODS_merged.csv', index = False)

In [20]:
df_ORDS_PRODS_merged.to_pickle(r'D:\CareerFoundry\Data Immersion\Task 4\Instacart Basket Analysis_2021-07-22\02 Data\Prepared Data\df_ORDS_PRODS_merged.pkl')

# For continuation refer to notebook 3_Final_Instacart