## Tannis McCartney
## May 26, 2022

### This notebook goes through importing, wrangling, and checking the orders and prior_orders data in preparation for merging with the products dataframe. 

## Contents
### 01 Import libraries
### 02 Import orders data
### 03 Data wrangling orders data
### 04 Consistency checks on orders data
### 05 Change orders data types to reduce memory usage
### 06 Import prior orders data
### 07 Data wrangling prior orders data
### 08 Consistency checks on prior orders data
### 09 Change orders data types to reduce memory usage
### 10 Merge orders and prior orders dataframes
### 11 Export merged dataframe

# 01 Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02 Import orders data

In [2]:
# Turn project folder path into a string
path = r'C:\Users\tmmcc\Google Drive\Data Analytics Bootcamp\4 Python Fundamentals for Data Analysts\05-2022 Instacart Basket Analysis'

In [3]:
# Import orders.csv to df
df_orders = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), index_col=False)
df_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [4]:
# Check shape of df_orders
df_orders.shape

(3421083, 7)

# 03 Wrangling orders data

In [5]:
# Drop the eval_set column from the orders dataframe
df_orders = df_orders.drop(columns = ['eval_set'])

In [6]:
#Rename order_dow column in the orders dataframe
df_orders.rename(columns={'order_dow':'orders_day_of_week'}, inplace=True)
df_orders.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [None]:
# Change data type of orders_id column (in the orders dataframe) to string
df_orders['order_id'] = df_orders['order_id']=df_orders['order_id'].astype('str')
df_orders['order_id'].dtype

# 04 Consistency checks on orders dataframe

In [7]:
# Check for mixed types in the orders dataframe
for col in df_orders.columns.tolist():
    weird = (df_orders[[col]].applymap(type) != df_orders[[col]].iloc[0].apply(type)).any(axis=1)
    if len (df_orders[weird]) > 0:
        print(col)

#### No mixed-type data was found in the orders dataframe.

In [8]:
# Check for missing values
df_orders.isnull().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

#### The days_since_prior_order column has 206209 null values. These could be customers who ordered the day the dataset was generated. Others could be customers who have accounts but have not placed any orders yet.

In [9]:
# Create crosstab to compare days_since_prior_order and order_number
crosstab = pd.crosstab(df_orders['days_since_prior_order'], df_orders['order_number'], dropna = False)

In [10]:
# Export crosstab to clipboard for examination in Excel
crosstab.to_clipboard()

#### Where the days_since_prior_order value is 0, the order_number is 1. These are new customers who have only placed one order. A new column to flag these customers as new customers, will be added. A flag of True means days_since_prior_order is NaN because it is a new customer.

In [11]:
# Add new variable to orders dataframe to flag customers who are on their first order.
df_orders_new = df_orders
df_orders_new['new_customer']=np.where(df_orders_new['days_since_prior_order'].isnull(), True, False)
df_orders_new.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False


In [12]:
# Look for full duplicates in the products dataframe
df_dups_ords = df_orders_new[df_orders_new.duplicated()]
df_dups_ords

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer


#### There are no duplicates in df_orders

In [13]:
# Descriptive statistics on df_orders
df_orders.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


#### The descriptive statistics seem reasonable.

In [14]:
# Check the shape of orders dataframe after consistency checks
df_orders.shape

(3421083, 7)

# 05 Change orders data types to reduce memory usage

In [15]:
# Check data types and memory usage for df_orders
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   orders_day_of_week      int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
 6   new_customer            bool   
dtypes: bool(1), float64(1), int64(5)
memory usage: 159.9 MB


In [16]:
# Change data types for df_orders
df_orders['order_id'] = df_orders['order_id'].astype('int32')
df_orders['user_id'] = df_orders['user_id'].astype('int32')
df_orders['order_number']=df_orders['order_number'].astype('int8')
df_orders['orders_day_of_week']=df_orders['orders_day_of_week'].astype('int8')
df_orders['order_hour_of_day']=df_orders['order_hour_of_day'].astype('int8')
df_orders['days_since_prior_order']=df_orders['days_since_prior_order'].astype('float16')

In [17]:
# Recheck data types and memory usage of df_ordersd
df_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 7 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   orders_day_of_week      int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
 6   new_customer            bool   
dtypes: bool(1), float16(1), int32(2), int8(3)
memory usage: 45.7 MB


#### The memory usage has been reduced from 159.9Mb to 45.7 Mb

In [18]:
# Check the descriptive statistics of df_orders
df_orders.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,
std,987581.7,59533.72,17.73316,2.046829,4.226088,0.0
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


# 06 Import prior orders

In [19]:
# Import new previous orders data set
df_ords_prior = pd.read_csv(os.path.join(path, '02 Data', 'Original data', 'order_products_prior.csv'), index_col=False)
df_ords_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [20]:
# Check shape of df_ords_prior
df_ords_prior.shape

(32434489, 4)

# 07 Wrangling prior orders dataframe

#### No wrangling is needed for prior orders dataframe

# 08 Consistency checks on prior orders

In [21]:
# Check for mixed types in the prior orders dataframe
for col in df_ords_prior.columns.tolist():
    weird = (df_ords_prior[[col]].applymap(type) != df_ords_prior[[col]].iloc[0].apply(type)).any(axis=1)
    if len (df_ords_prior[weird]) > 0:
        print(col)

#### There are no mixed types in the prior orders dataframe

In [22]:
# Check for missing observations in prior orders dataframe
df_ords_prior.isnull().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

#### There is no missing data in the prior orders dataframe

In [23]:
# Look for full duplicates in the prior orders dataframe
df_dups2 = df_ords_prior[df_ords_prior.duplicated()]
df_dups2

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


#### There are no duplicates in the prior orders dataframe

In [24]:
# Check descriptive statistics on prior orders dataframe
df_ords_prior.describe()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0
mean,1710749.0,25576.34,8.351076,0.5896975
std,987300.7,14096.69,7.126671,0.4918886
min,2.0,1.0,1.0,0.0
25%,855943.0,13530.0,3.0,0.0
50%,1711048.0,25256.0,6.0,1.0
75%,2565514.0,37935.0,11.0,1.0
max,3421083.0,49688.0,145.0,1.0


#### The descriptive statistics seem reasonable.

In [29]:
# Check the shape of df_ords_prior
df_ords_prior.shape

(32434489, 4)

# 09 Change orders data types to reduce memory usage

In [25]:
# Check data types and memory usage for df_ords_prior
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB


In [26]:
# Change data types for df_ords_prior
df_ords_prior['order_id'] = df_ords_prior['order_id'].astype('int32')
df_ords_prior['product_id'] = df_ords_prior['product_id'].astype('int32')
df_ords_prior['add_to_cart_order'] = df_ords_prior['add_to_cart_order'].astype('int32')
df_ords_prior['reordered'] = df_ords_prior['reordered'].astype('int8')

In [27]:
# Recheck data types and memory usage of df_ords_prior
df_ords_prior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int32
 1   product_id         int32
 2   add_to_cart_order  int32
 3   reordered          int8 
dtypes: int32(3), int8(1)
memory usage: 402.1 MB


#### The memory usage has been reduced from 989.8Mb to 402.1Mb 

In [28]:
# Check the descriptive stats of df_ords_prior
df_ords_prior.describe()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0
mean,1710749.0,25576.34,8.351076,0.5896975
std,987300.7,14096.69,7.126671,0.4918886
min,2.0,1.0,1.0,0.0
25%,855943.0,13530.0,3.0,0.0
50%,1711048.0,25256.0,6.0,1.0
75%,2565514.0,37935.0,11.0,1.0
max,3421083.0,49688.0,145.0,1.0


# 10 Merge orders and prior orders dataframes

In [33]:
# Merge dataframes based on order_id
df_merged_large = df_orders.merge(df_ords_prior, on = 'order_id', indicator=True)
df_merged_large.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,True,196,1,0,both
1,2539329,1,1,2,8,,True,14084,2,0,both
2,2539329,1,1,2,8,,True,12427,3,0,both
3,2539329,1,1,2,8,,True,26088,4,0,both
4,2539329,1,1,2,8,,True,26405,5,0,both


In [34]:
# Check for a full match
df_merged_large['_merge'].value_counts()

both          32434489
left_only            0
right_only           0
Name: _merge, dtype: int64

In [35]:
# Remove _merge column 
df_merged_large = df_merged_large.drop(columns=['_merge'])
df_merged_large.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,True,196,1,0
1,2539329,1,1,2,8,,True,14084,2,0
2,2539329,1,1,2,8,,True,12427,3,0
3,2539329,1,1,2,8,,True,26088,4,0
4,2539329,1,1,2,8,,True,26405,5,0


In [36]:
# Check shape of df_merged_large
df_merged_large.shape

(32434489, 10)

# 11 Export merged dataframe

In [37]:
# Export data to pkl
df_merged_large.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_combined.pkl'))