# Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style

# Loading the Dataset

In [3]:
# Load the pre-cleaned transactions; the first CSV column is used as the row index
df = pd.read_csv(filepath_or_buffer='../00_Data/Rec_Sys_precleaned.csv', index_col=0)

# Preview the first five rows
df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID,Gender,...,Income,Zipcode,Customer Segment,Product Name,Description,Category,Brand,Unit Price,Num_word_text,Num_word_cat
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850,female,...,Medium,84306,Middle class,"3 1/2""W x 20""D x 20""H Funston Craftsman Smooth...",Our Rustic Collection is an instant classic. O...,Home Improvement|Hardware|Brackets and Angle I...,Ekena Milwork,199.11,129.0,9.0
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850,female,...,Medium,84306,Middle class,Awkward Styles Shamrock Flag St. Patrick's Day...,Our St Patrick's Day Collection is perfect for...,Clothing|Men|Mens T-Shirts & Tank Tops|Mens Gr...,Awkward Styles,23.95,150.0,7.0
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850,female,...,Medium,84306,Middle class,Ebe Men Black Rectangle Half Rim Spring Hinge ...,Count on EBE for all of your eye correction ne...,Health|Home Health Care|Daily Living Aids,Eye Buy Express,26.99,176.0,5.0
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,17850,female,...,Medium,84306,Middle class,MightySkins Skin Decal Wrap Compatible with Ap...,Mightyskins are removable vinyl skins for prot...,Electronics|Electronics Learning Center|Ads Fr...,Mightyskins,14.99,118.0,6.0
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850,female,...,Medium,84306,Middle class,awesome since 1948 - 69th birthday gift t-shir...,awesome since 1948 - 69th birthday gift t-shir...,Clothing|Men|Mens T-Shirts & Tank Tops|Mens T-...,Shirtinvaders,49.33,20.0,6.0


In [4]:
# Dimensions of the loaded frame as a (rows, columns) tuple
df.shape

(245898, 21)

# Preparation: Binary Dataset

Main Steps
1. Aggregate all User - Item Interactions & Create Flag Column for Purchases
2. Split into Train & Test Data

## 1. User - Item Aggregation & Flag Column

In [5]:
# Build the binary user-item interaction table: one row per distinct
# (StockCode, CustomerID) pair that occurs in the transactions.
# NOTE(review): no cancellation filter is applied here — this assumes cancelled
# invoices were already removed from the precleaned file; confirm upstream.
df_binary = (
    df.groupby(['StockCode', 'CustomerID'])
    # size() yields one entry per observed pair; the count itself is not kept
    .size()
    # Turn the group keys back into regular columns and name the value column
    .reset_index(name='purchased')
    # Implicit feedback: every observed pair is flagged as purchased (1),
    # regardless of how many times it was bought
    .assign(purchased=1)
)

# Check first five rows
df_binary.head()

Unnamed: 0,StockCode,CustomerID,purchased
0,10002,12451,1
1,10002,12510,1
2,10002,12583,1
3,10002,12637,1
4,10002,12673,1


In [6]:
# Sanity check: the flag column should be constant, i.e. min == max == 1 and std == 0
df_binary['purchased'].describe()

count    172701.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: purchased, dtype: float64

## 2. Split into Train & Test Data

In [7]:
# Load the Python Splitter
from recommenders.datasets.python_splitters import python_stratified_split

In [8]:
# Stratified 80/20 split of the interaction table, with a fixed seed so the
# split is reproducible. filter_by='user' together with min_rating=10 is
# expected to drop users with fewer than 10 interactions before splitting
# (see the recommenders splitter documentation).
train, test = python_stratified_split(
    df_binary,
    ratio=0.8,
    filter_by='user',
    min_rating=10,
    col_user='CustomerID',
    col_item='StockCode',
    seed=1,
)

In [9]:
# Sanity check: number of distinct users and items in each split.
# If a count differs between train and test, some users or items ended up
# in only one of the two sets.
for count_label, split_frame, id_column in (
    ("Users Train: \t", train, 'CustomerID'),
    ("Items Train: \t", train, 'StockCode'),
    ("Users Test: \t", test, 'CustomerID'),
    ("Items Test: \t", test, 'StockCode'),
):
    print(count_label, split_frame[id_column].nunique())

Users Train: 	 3011
Items Train: 	 2349
Users Test: 	 3011
Items Test: 	 2307


In [10]:
# Align the item catalogue of both splits: keep only the StockCodes that occur
# in train AND in test. Only items are aligned here; users are not filtered.

# Distinct item IDs per split
train_items, test_items = (
    set(split['StockCode'].unique()) for split in (train, test)
)

# Items present in both splits
common_items = train_items & test_items

# Drop every row whose item is missing from the other split
train, test = (
    split.loc[split['StockCode'].isin(common_items)] for split in (train, test)
)

In [12]:
# Sanity check after the item filter: distinct users and items per split.
# The item counts of train and test should now be equal.
for count_label, split_frame, id_column in (
    ("Users Train: \t", train, 'CustomerID'),
    ("Items Train: \t", train, 'StockCode'),
    ("Users Test: \t", test, 'CustomerID'),
    ("Items Test: \t", test, 'StockCode'),
):
    print(count_label, split_frame[id_column].nunique())

Users Train: 	 3011
Items Train: 	 2307
Users Test: 	 3011
Items Test: 	 2307


In [13]:
# Report the (rows, columns) shape of each split
for shape_label, split_frame in (
    ("Shape of Train:", train),
    ("Shape of Test: \t", test),
):
    print(shape_label, split_frame.shape)

Shape of Train: (137627, 3)
Shape of Test: 	 (34505, 3)


In [14]:
# Persist both splits as CSV. The row index is written as the first, unnamed
# column (to_csv default), so read these files back with index_col=0.
train.to_csv(path_or_buf='../00_Data/rec_sys_binary_train.csv')
test.to_csv(path_or_buf='../00_Data/rec_sys_binary_test.csv')