In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
# Project root folder. It is read from the INSTACART_PROJECT_PATH environment variable when
# that is set; otherwise the original local folder is used, so existing runs are unaffected
# while the notebook can also be pointed at a copy of the project on another machine.
path = os.environ.get('INSTACART_PROJECT_PATH', r'/home/scruffy/anaconda_projects/Instacart Basket Analysis/')
# Import data to dataframes
# ords: orders table read from 'Prepared Data/orders_checked.csv'
ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'))
# ords_prods: order/product lines read from 'Original Data/orders_products_prior.csv'
ords_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders_products_prior.csv'))

# Check outputs

In [3]:
# Show the dimensions of ords_prods followed by its first rows
print(ords_prods.shape, ords_prods.head(), sep='\n')

(32434489, 4)
   order_id  product_id  add_to_cart_order  reordered
0         2       33120                  1          1
1         2       28985                  2          1
2         2        9327                  3          0
3         2       45918                  4          1
4         2       30035                  5          0


In [4]:
# Display the column labels of the ords DataFrame
ords.columns

Index(['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order', 'no_prior_orders'],
      dtype='object')

In [5]:
# Remove the eval_set and no_prior_orders columns from ords.
# The result is reassigned (rather than using inplace=True) and errors='ignore' is passed so
# that re-running this cell after the columns are already gone does not raise a KeyError.
ords = ords.drop(columns=['eval_set', 'no_prior_orders'], errors='ignore')
# Report the remaining columns, their dtypes and the memory usage
ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int64  
 1   user_id                 int64  
 2   order_number            int64  
 3   order_dow               int64  
 4   order_hour_of_day       int64  
 5   days_since_prior_order  float64
dtypes: float64(1), int64(5)
memory usage: 156.6 MB


# Change data types to conserve memory

In [7]:
# Change types for ords data set

# Target dtype for each column of ords
ords_dtypes = {
    'order_id': 'int32',
    'user_id': 'int32',
    'order_number': 'int8',
    'order_dow': 'int8',
    'order_hour_of_day': 'int8',
    'days_since_prior_order': 'float16',
}

# Narrowing an integer column with astype() wraps around silently when a value does not fit,
# so check that every integer column lies inside the range of its target dtype first.
for col, dtype in ords_dtypes.items():
    target = np.dtype(dtype)
    if np.issubdtype(target, np.integer):
        bounds = np.iinfo(target)
        if ords[col].min() < bounds.min or ords[col].max() > bounds.max:
            raise ValueError(f"Column '{col}' has values outside the {dtype} range")

# A single astype() call with a mapping converts all listed columns at once
ords = ords.astype(ords_dtypes)

In [8]:
# Check result: print the column dtypes and memory usage of ords after the type conversion
ords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3421083 entries, 0 to 3421082
Data columns (total 6 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_id                int32  
 1   user_id                 int32  
 2   order_number            int8   
 3   order_dow               int8   
 4   order_hour_of_day       int8   
 5   days_since_prior_order  float16
dtypes: float16(1), int32(2), int8(3)
memory usage: 42.4 MB


In [9]:
# Check ords_prods info: print its column dtypes and memory usage
ords_prods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB


In [10]:
# Change types for ords prods data set

# Target dtype for each column of ords_prods
ords_prods_dtypes = {
    'product_id': 'int32',
    'reordered': 'int8',
    'add_to_cart_order': 'int32',
    'order_id': 'int32',
}

# Narrowing an integer column with astype() wraps around silently when a value does not fit,
# so check that every column lies inside the range of its target dtype first.
for col, dtype in ords_prods_dtypes.items():
    bounds = np.iinfo(np.dtype(dtype))
    if ords_prods[col].min() < bounds.min or ords_prods[col].max() > bounds.max:
        raise ValueError(f"Column '{col}' has values outside the {dtype} range")

# A single astype() call with a mapping converts all listed columns at once
ords_prods = ords_prods.astype(ords_prods_dtypes)

In [11]:
# Check result
ords_prods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int32
 1   product_id         int32
 2   add_to_cart_order  int32
 3   reordered          int8 
dtypes: int32(3), int8(1)
memory usage: 402.1 MB


# Merge DataFrames

In [13]:
# Merge DataFrames
# Inner join on order_id (the pandas default, spelled out here): only order_ids present in
# both tables are kept, so the '_merge' flag added by indicator=True can only read 'both'.
# validate='one_to_many' makes pandas raise a MergeError if order_id is duplicated in ords,
# which would otherwise silently multiply the product rows.
ords_prods_merged = ords.merge(
    ords_prods,
    on='order_id',
    how='inner',
    validate='one_to_many',
    indicator=True,
)

In [14]:
# Check result: print the first rows of the merged DataFrame
print(ords_prods_merged.head())

   order_id  user_id  order_number  order_dow  order_hour_of_day  \
0   2539329        1             1          2                  8   
1   2539329        1             1          2                  8   
2   2539329        1             1          2                  8   
3   2539329        1             1          2                  8   
4   2539329        1             1          2                  8   

   days_since_prior_order  product_id  add_to_cart_order  reordered _merge  
0                     NaN         196                  1          0   both  
1                     NaN       14084                  2          0   both  
2                     NaN       12427                  3          0   both  
3                     NaN       26088                  4          0   both  
4                     NaN       26405                  5          0   both  


In [15]:
# Count how many rows carry each value of the merge flag
print(ords_prods_merged['_merge'].value_counts())
# Dimensions of the merged DataFrame
print(ords_prods_merged.shape)
# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
ords_prods_merged.info()

_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64
(32434489, 10)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 10 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int32   
 1   user_id                 int32   
 2   order_number            int8    
 3   order_dow               int8    
 4   order_hour_of_day       int8    
 5   days_since_prior_order  float16 
 6   product_id              int32   
 7   add_to_cart_order       int32   
 8   reordered               int8    
 9   _merge                  category
dtypes: category(1), float16(1), int32(4), int8(4)
memory usage: 711.4 MB
None


# Export data to csv

In [17]:
# Drop _merge flag and export data to csv
# Reassigning with errors='ignore' keeps the cell re-runnable once the flag column is gone
ords_prods_merged = ords_prods_merged.drop(columns=['_merge'], errors='ignore')
# NOTE: CSV stores no dtype information, so the reduced dtypes have to be re-applied
# when this file is read back in.
ords_prods_merged.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merged.csv'), index=False)