In [95]:
import pandas as pd
import numpy as np
import seaborn as sns
from file_utils import load_dataframes

In [97]:
def int_converter(x):
    """Convert ``x`` to an ``int``, or return ``np.nan`` if it cannot be converted.

    The value goes through ``float`` first, so numeric strings such as
    ``"3.0"`` or ``"3.7"`` are accepted; the fractional part is truncated
    toward zero by ``int``.

    Parameters
    ----------
    x : object
        Value to convert (number, numeric string, ...).

    Returns
    -------
    int or float
        The truncated integer, or ``np.nan`` when ``x`` is not convertible.
    """
    try:
        return int(float(x))
    except (ValueError, TypeError, OverflowError):
        # ValueError:    non-numeric strings, and NaN (int(nan) is rejected).
        # TypeError:     None and other objects float() does not accept.
        # OverflowError: +/-inf, which float() accepts but int() cannot represent.
        return np.nan

In [98]:
# Read the item catalogue keyed by (item_id, alley_id); the literal string
# 'unknown' is parsed as missing. The 'Unnamed: 0' column is discarded in
# the same expression.
items = (
    pd.read_csv('data/items.csv', index_col=['item_id', 'alley_id'], na_values='unknown')
    .drop(columns='Unnamed: 0')
)

In [99]:
# info() prints its summary directly; the first rows and the index are then
# written as plain text, one after the other.
items.info()
print(items.head(), items.index, sep='\n')

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 49147 entries, (1.0, 61.0) to (49688.0, 73.0)
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   item_name  47607 non-null  object 
 1   category   47622 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.4+ MB
                                                          item_name  category
item_id alley_id                                                             
1.0     61.0                             Chocolate Sandwich Cookies      19.0
2.0     104.0                                      All-Seasons Salt      13.0
3.0     94.0                   Robust Golden Unsweetened Oolong Tea       7.0
4.0     38.0      Smart Ones Classic Favorites Mini Rigatoni Wit...       1.0
5.0     5.0                               Green Chile Anytime Sauce      13.0
MultiIndex([(    1.0,  61.0),
            (    2.0, 104.0),
            (    3.0,  94.0),
            (    4.0,  3

In [100]:
# Read the category lookup table indexed by category_id and discard the
# 'Unnamed: 0' column in the same expression.
categories = (
    pd.read_csv('data/categories.csv', index_col='category_id')
    .drop(columns='Unnamed: 0')
)

In [101]:
# info() prints its dtype / non-null summary directly.
categories.info()
# Only the last expression of a cell is echoed, so a bare describe() in the
# middle of the cell is computed and then silently discarded. Print it
# explicitly so the summary actually appears in the output.
print(categories.describe())
# Last expression: shown as the cell's result.
categories.index

<class 'pandas.core.frame.DataFrame'>
Index: 21 entries, 1 to 21
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  21 non-null     object
dtypes: object(1)
memory usage: 336.0+ bytes


Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
       21],
      dtype='int64', name='category_id')

In [102]:
# Read the alley lookup table, indexed by alley_id.
alley_inventory = pd.read_csv(
    'data/alley_inventory.csv',
    index_col='alley_id',
)

In [103]:
# Print the column/dtype summary, then show the first rows as the cell's result.
alley_inventory.info()
alley_inventory.head()

<class 'pandas.core.frame.DataFrame'>
Index: 134 entries, 1 to 134
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   alley   134 non-null    object
dtypes: object(1)
memory usage: 2.1+ KB


Unnamed: 0_level_0,alley
alley_id,Unnamed: 1_level_1
1,prepared soups salads
2,specialty cheeses
3,energy granola bars
4,instant foods
5,marinades meat preparation


In [147]:
# Read the raw CSV with pandas' default index; the literal string 'unknown'
# is parsed as missing.
transaction_items__prior = pd.read_csv(
    'data/transaction_items__prior.csv',
    na_values='unknown',
)

In [148]:
# Clean-up pipeline: discard the 'Unnamed: 0' column, drop rows whose
# transaction_id is missing, then key the frame by (transaction_id, item_id).
transaction_items__prior = (
    transaction_items__prior
    .drop(columns='Unnamed: 0')
    .dropna(subset=['transaction_id'])
    .set_index(['transaction_id', 'item_id'])
)

In [149]:
# Print the index, column dtype and memory-usage summary of transaction_items__prior.
transaction_items__prior.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 31059342 entries, (2.0, 33120.0) to (3421083.0, 5020.0)
Data columns (total 2 columns):
 #   Column             Dtype  
---  ------             -----  
 0   add_to_cart_order  float64
 1   previous_bought    float64
dtypes: float64(2)
memory usage: 801.3 MB


In [143]:
# Preview the first rows of transaction_items__prior.
transaction_items__prior.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,add_to_cart_order,previous_bought
transaction_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,33120.0,1.0,1.0
2.0,28985.0,2.0,1.0
2.0,45918.0,4.0,1.0
2.0,30035.0,5.0,0.0
2.0,17794.0,6.0,1.0
2.0,40141.0,7.0,1.0
2.0,1819.0,8.0,1.0
2.0,43668.0,9.0,0.0
3.0,33754.0,1.0,1.0
3.0,24838.0,2.0,1.0


In [150]:
# Read the raw CSV with pandas' default index; the literal string 'unknown'
# is parsed as missing.
transaction_items__train = pd.read_csv(
    'data/transaction_items__train.csv',
    na_values='unknown',
)

In [151]:
# Clean-up pipeline: discard the 'Unnamed: 0' column, drop rows whose
# transaction_id is missing, then key the frame by (transaction_id, item_id).
transaction_items__train = (
    transaction_items__train
    .drop(columns='Unnamed: 0')
    .dropna(subset=['transaction_id'])
    .set_index(['transaction_id', 'item_id'])
)

In [152]:
# Print the index, non-null count, dtype and memory-usage summary of transaction_items__train.
transaction_items__train.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1325747 entries, (1.0, 49302.0) to (3421070.0, 4724.0)
Data columns (total 2 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   add_to_cart_order  1284160 non-null  float64
 1   previous_bought    1283667 non-null  float64
dtypes: float64(2)
memory usage: 36.7 MB


In [153]:
# Preview the first rows of transaction_items__train.
transaction_items__train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,add_to_cart_order,previous_bought
transaction_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,49302.0,1.0,1.0
1.0,11109.0,2.0,1.0
1.0,10246.0,3.0,0.0
1.0,49683.0,4.0,0.0
1.0,43633.0,5.0,1.0


In [157]:
# Read the raw CSV with pandas' default index; the literal string 'unknown'
# is parsed as missing.
transactions = pd.read_csv(
    'data/transactions.csv',
    na_values='unknown',
)

In [158]:
# Clean-up pipeline: discard the 'Unnamed: 0' column, drop rows whose
# transaction_id is missing, then key the frame by (transaction_id, customer_id).
transactions = (
    transactions
    .drop(columns='Unnamed: 0')
    .dropna(subset=['transaction_id'])
    .set_index(['transaction_id', 'customer_id'])
)

In [159]:
# Print the index, column dtype and memory-usage summary of transactions.
transactions.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3204781 entries, (2539329.0, 1.0) to (272231.0, 206209.0)
Data columns (total 5 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   eval_set                object 
 1   transaction_number      float64
 2   day_of_week             float64
 3   time_of_day             float64
 4   days_since_prior_order  float64
dtypes: float64(4), object(1)
memory usage: 245.3+ MB


In [160]:
# Preview the first rows of transactions.
transactions.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,eval_set,transaction_number,day_of_week,time_of_day,days_since_prior_order
transaction_id,customer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2539329.0,1.0,prior,1.0,2.0,8.0,
2398795.0,1.0,prior,2.0,3.0,7.0,15.0
473747.0,1.0,prior,3.0,3.0,12.0,21.0
2254736.0,1.0,prior,4.0,4.0,7.0,29.0
431534.0,1.0,prior,5.0,4.0,15.0,28.0
