# Loading Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style

# Loading the Dataset

In [2]:
# Load the pre-cleaned transactions; the file's first column becomes the row index
df = pd.read_csv(
    '../00_Data/Rec_Sys_precleaned.csv',
    index_col=0,
)

# Preview the top five rows
df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID,Gender,...,Income,Zipcode,Customer Segment,Product Name,Description,Category,Brand,Unit Price,Num_word_text,Num_word_cat
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.2,ExpressAir,30.12,17850,female,...,Medium,84306,Middle class,"3 1/2""W x 20""D x 20""H Funston Craftsman Smooth...",Our Rustic Collection is an instant classic. O...,Home Improvement|Hardware|Brackets and Angle I...,Ekena Milwork,199.11,129.0,9.0
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850,female,...,Medium,84306,Middle class,Awkward Styles Shamrock Flag St. Patrick's Day...,Our St Patrick's Day Collection is perfect for...,Clothing|Men|Mens T-Shirts & Tank Tops|Mens Gr...,Awkward Styles,23.95,150.0,7.0
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850,female,...,Medium,84306,Middle class,Ebe Men Black Rectangle Half Rim Spring Hinge ...,Count on EBE for all of your eye correction ne...,Health|Home Health Care|Daily Living Aids,Eye Buy Express,26.99,176.0,5.0
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.3,Regular Air,15.22,17850,female,...,Medium,84306,Middle class,MightySkins Skin Decal Wrap Compatible with Ap...,Mightyskins are removable vinyl skins for prot...,Electronics|Electronics Learning Center|Ads Fr...,Mightyskins,14.99,118.0,6.0
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850,female,...,Medium,84306,Middle class,awesome since 1948 - 69th birthday gift t-shir...,awesome since 1948 - 69th birthday gift t-shir...,Clothing|Men|Mens T-Shirts & Tank Tops|Mens T-...,Shirtinvaders,49.33,20.0,6.0


In [3]:
# Dimensions of the loaded table as a (rows, columns) tuple
df.shape

(245898, 21)

# Preparation: Binary Dataset

Main Steps
1. Aggregate all User - Item Interactions & Create Flag Column for Purchases
2. Split into Train & Test Data

## 1. User - Item Aggregation & Flag Column

In [4]:
# Build the implicit-feedback table: one row per (StockCode, CustomerID) pair
# that occurs at least once in the transactions.
# `size()` counts the rows of each group directly, which avoids aggregating the
# grouping key 'CustomerID' onto itself and removes the separate rename and
# in-place index reset.
df_binary = (
    df.groupby(['StockCode', 'CustomerID'])
    .size()
    .reset_index(name='purchased')
)

# The binary dataset only records *that* a pair was purchased, so every
# observed pair gets the flag 1 regardless of how often it was bought.
df_binary['purchased'] = 1

# Check first five rows
df_binary.head()

Unnamed: 0,StockCode,CustomerID,purchased
0,10002,12451,1
1,10002,12510,1
2,10002,12583,1
3,10002,12637,1
4,10002,12673,1


In [5]:
# Sanity check: the flag was set to 1 everywhere, so min, max and mean must all be 1
df_binary['purchased'].describe()

count    172701.0
mean          1.0
std           0.0
min           1.0
25%           1.0
50%           1.0
75%           1.0
max           1.0
Name: purchased, dtype: float64

## 2. Split into Train & Test Data

In [6]:
# Load the Python Splitter
from recommenders.datasets.python_splitters import python_stratified_split

In [7]:
# Stratified 80/20 split of the binary interactions with a fixed seed.
# NOTE(review): min_rating=10 with filter_by='user' presumably drops users with
# fewer than 10 interactions before splitting - confirm against the recommenders docs.
train, test = python_stratified_split(
    df_binary,
    ratio=0.8,
    min_rating=10,
    filter_by='user',
    col_user='CustomerID',
    col_item='StockCode',
    seed=1,
)

In [8]:
# Sanity check: distinct users and items in each partition. If a count differs
# between train and test, some users/items occur in only one of the two sets.
for label, frame, column in (
    ("Users Train: \t", train, 'CustomerID'),
    ("Items Train: \t", train, 'StockCode'),
    ("Users Test: \t", test, 'CustomerID'),
    ("Items Test: \t", test, 'StockCode'),
):
    print(label, frame[column].nunique())

Users Train: 	 3011
Items Train: 	 2349
Users Test: 	 3011
Items Test: 	 2307


In [9]:
# Keep only items that occur in BOTH partitions (users are not filtered here)

# Distinct item codes per partition
train_items = set(pd.unique(train['StockCode']))
test_items = set(pd.unique(test['StockCode']))

# Items shared by train and test
common_items = train_items & test_items

# Drop every row whose item is missing from the other partition
train = train.loc[train['StockCode'].isin(common_items)]
test = test.loc[test['StockCode'].isin(common_items)]

In [10]:
# Sanity check after the item filter: distinct users and items per partition
for label, frame, column in (
    ("Users Train: \t", train, 'CustomerID'),
    ("Items Train: \t", train, 'StockCode'),
    ("Users Test: \t", test, 'CustomerID'),
    ("Items Test: \t", test, 'StockCode'),
):
    print(label, frame[column].nunique())

Users Train: 	 3011
Items Train: 	 2307
Users Test: 	 3011
Items Test: 	 2307


In [11]:
# (rows, columns) of both partitions
for label, frame in (("Shape of Train:", train), ("Shape of Test: \t", test)):
    print(label, frame.shape)

Shape of Train: (137627, 3)
Shape of Test: 	 (34505, 3)


In [12]:
# Persist both partitions as CSV (pandas writes the row index as well by default)
for frame, path in (
    (train, '../00_Data/rec_sys_binary_train.csv'),
    (test, '../00_Data/rec_sys_binary_test.csv'),
):
    frame.to_csv(path)

# Preparation: Pseudo-Rating Dataset

Main Steps

1. Aggregate all User - Item Interactions & Count Purchases per User - Item Pair
2. Increase value of repurchased items
3. Insert negative value for non-purchased popular items
4. Split into Train & Test Data

## 1. Aggregate Count of Unique User & Item Interactions

In [13]:
# Count how many transaction rows exist for every (StockCode, CustomerID) pair.
# `size()` returns the number of rows per group directly, so there is no need to
# aggregate the grouping key 'CustomerID' onto itself, rename the result and
# reset the index in place.
df_ratings = (
    df.groupby(['StockCode', 'CustomerID'])
    .size()
    .reset_index(name='purchased')
)

# Check first five rows
df_ratings.head()

Unnamed: 0,StockCode,CustomerID,purchased
0,10002,12451,1
1,10002,12510,1
2,10002,12583,1
3,10002,12637,1
4,10002,12673,1


In [14]:
# How many user-item pairs were purchased more than once?
int((df_ratings['purchased'] > 1).sum())

39191

## 2. Increase Value of Re-Purchased Items

In [15]:
# Cap the purchase count at 2: values <= 1 stay as they are, anything larger becomes 2.
# `clip(upper=2)` is the vectorised equivalent of a row-wise
# `apply(lambda x: x if x <= 1 else 2)` and keeps the integer dtype.
df_ratings['purchased'] = df_ratings['purchased'].clip(upper=2)

# Check Distribution
df_ratings['purchased'].value_counts()

1    133510
2     39191
Name: purchased, dtype: int64

In [16]:
# Check a random sample of 10 rows (no random_state is passed, so the rows differ between runs)
df_ratings.sample(10)

Unnamed: 0,StockCode,CustomerID,purchased
13004,21088,16145,1
87564,22640,14505,1
161038,84832,16425,1
112595,22961,17126,2
137529,23298,15152,2
140567,23328,14911,2
119818,23054,14016,1
6085,20727,16153,1
43447,21974,14755,1
66655,22382,17921,1


## 3. Insert negative value for non-purchased popular items

In [17]:
# Count the number of transaction rows per item in the initial dataset.
# `size()` gives the rows per group directly instead of counting the grouping
# key 'StockCode' against itself and renaming / resetting afterwards.
df_items = (
    df.groupby('StockCode')
    .size()
    .reset_index(name='Purchases')
)

# Show the five items with the most purchases
df_items.sort_values(by='Purchases', ascending=False).head()

Unnamed: 0,StockCode,Purchases
2274,85123A,1565
979,22423,1318
1929,47566,1195
2267,85099B,1184
2167,84879,1008


### 3.1. Top 2 Percent

In [18]:
# Purchase count at the 98th percentile over all items
top2_cutoff = df_items['Purchases'].quantile(0.98)

# Items at or above that cut-off
df_top2_items = df_items.loc[df_items['Purchases'] >= top2_cutoff]

# Number of top items
len(df_top2_items)

48

In [19]:
# Distinct codes of the top items as an array
top2_perc_items = pd.unique(df_top2_items['StockCode'])

# Distinct customers present in df_ratings as an array
all_users = pd.unique(df_ratings['CustomerID'])

In [20]:
# Expected number of rows of the full user x top-item cross product built in the next step
top2_perc_items.size * all_users.size

144528

In [21]:
# Build every (user, top item) combination.
# Each user is repeated once per top item while the item array is cycled once
# per user - the same row order a flattened 'ij' meshgrid produces.
n_users, n_items = len(all_users), len(top2_perc_items)

top_potential_purchases = pd.DataFrame({
    'CustomerID': np.repeat(all_users, n_items),
    'StockCode': np.tile(top2_perc_items, n_users),
})

# Check shape: the row count should be len(all_users) * len(top2_perc_items), i.e. 144528
top_potential_purchases.shape

(144528, 2)

In [22]:
# Give every candidate pair the pseudo-rating -2 in a new 'purchased' column
top_potential_purchases = top_potential_purchases.assign(purchased=-2)

# Sanity check
top_potential_purchases.head()

Unnamed: 0,CustomerID,StockCode,purchased
0,12451,20724,-2
1,12451,20726,-2
2,12451,20727,-2
3,12451,20728,-2
4,12451,20914,-2


In [23]:
# Anti-join: keep only the (CustomerID, StockCode) candidates from
# top_potential_purchases that have no matching pair in df_ratings.
pair_keys = ['StockCode', 'CustomerID']

popular_non_bought = (
    top_potential_purchases
    # Left join; the '_merge' indicator records where each row was found
    .merge(df_ratings, how='left', on=pair_keys, indicator=True)
    # 'left_only' rows exist in top_potential_purchases but not in df_ratings
    .loc[lambda merged: merged['_merge'] == 'left_only']
    # Drop the indicator and the 'purchased' column contributed by df_ratings
    .drop(columns=['_merge', 'purchased_y'])
    # Renumber the remaining rows from 0
    .reset_index(drop=True)
    # The candidates' own 'purchased' column was suffixed by the merge; restore its name
    .rename(columns={'purchased_x': 'purchased'})
)

# Check shape after the anti-join
popular_non_bought.shape


(126326, 3)

In [24]:
# Append the non-bought popular pairs to the ratings.
# De-duplicating on (StockCode, CustomerID) makes this cell idempotent: on a clean
# run nothing is removed (df_ratings holds one row per pair and the anti-join only
# kept pairs absent from it), while re-running the cell no longer appends the
# same negative rows a second time.
df_ratings = (
    pd.concat([df_ratings, popular_non_bought], ignore_index=True)
    .drop_duplicates(subset=['StockCode', 'CustomerID'], keep='first')
    .reset_index(drop=True)
)

# Check sample
df_ratings.sample(10)

Unnamed: 0,StockCode,CustomerID,purchased
20747,21251,12527,1
9543,20974,16184,1
89438,22664,15810,1
106166,22890,17576,2
128133,23192,13735,1
140505,23327,12952,1
144406,23433,14298,1
21001,21260,12578,1
6737,20749,13934,1
166150,85035B,15311,2


### 3.2. Top 3-5 Percent

In [25]:
# Purchase counts at the 95th and 98th percentile over all items
item_purchases = df_items['Purchases']
lower_cutoff = item_purchases.quantile(0.95)
upper_cutoff = item_purchases.quantile(0.98)

# Items from the 95th percentile (inclusive) up to the 98th percentile (exclusive)
in_band = (item_purchases >= lower_cutoff) & (item_purchases < upper_cutoff)
df_top_3_5_items = df_items.loc[in_band]

# Number of items in that band
len(df_top_3_5_items)

70

In [26]:
# Distinct codes of the band's items as an array
top3_5_perc_items = pd.unique(df_top_3_5_items['StockCode'])

# Distinct customers present in df_ratings as an array
all_users = pd.unique(df_ratings['CustomerID'])

# Expected number of rows of the full user x item cross product built in the next step
top3_5_perc_items.size * all_users.size

210770

In [27]:
# Build every (user, top 3-5 percent item) combination.
# Each user is repeated once per item while the item array is cycled once per
# user - the same row order a flattened 'ij' meshgrid produces.
n_users, n_items = len(all_users), len(top3_5_perc_items)

top_3_5_potential_purchases = pd.DataFrame({
    'CustomerID': np.repeat(all_users, n_items),
    'StockCode': np.tile(top3_5_perc_items, n_users),
})

# Check shape: the row count should be len(all_users) * len(top3_5_perc_items), i.e. 210770
top_3_5_potential_purchases.shape

(210770, 2)

In [28]:
# Give every candidate pair the pseudo-rating -1 in a new 'purchased' column
top_3_5_potential_purchases = top_3_5_potential_purchases.assign(purchased=-1)

# Sanity check
top_3_5_potential_purchases.head()

Unnamed: 0,CustomerID,StockCode,purchased
0,12451,20685,-1
1,12451,20712,-1
2,12451,20719,-1
3,12451,20723,-1
4,12451,20972,-1


In [29]:
# Anti-join: keep only the (CustomerID, StockCode) candidates from
# top_3_5_potential_purchases that have no matching pair in df_ratings.
pair_keys = ['StockCode', 'CustomerID']

popular_non_bought_3_5 = (
    top_3_5_potential_purchases
    # Left join; the '_merge' indicator records where each row was found
    .merge(df_ratings, how='left', on=pair_keys, indicator=True)
    # 'left_only' rows exist in top_3_5_potential_purchases but not in df_ratings
    .loc[lambda merged: merged['_merge'] == 'left_only']
    # Drop the indicator and the 'purchased' column contributed by df_ratings
    .drop(columns=['_merge', 'purchased_y'])
    # Renumber the remaining rows from 0
    .reset_index(drop=True)
    # The candidates' own 'purchased' column was suffixed by the merge; restore its name
    .rename(columns={'purchased_x': 'purchased'})
)

# Check shape after the anti-join
popular_non_bought_3_5.shape


(192977, 3)

In [30]:
# Append the non-bought top 3-5 percent pairs to the ratings.
# De-duplicating on (StockCode, CustomerID) makes this cell idempotent: on a clean
# run nothing is removed (the anti-join only kept pairs absent from df_ratings),
# while re-running the cell no longer appends the same negative rows a second time.
df_ratings = (
    pd.concat([df_ratings, popular_non_bought_3_5], ignore_index=True)
    .drop_duplicates(subset=['StockCode', 'CustomerID'], keep='first')
    .reset_index(drop=True)
)

# Check sample
df_ratings.sample(10)

Unnamed: 0,StockCode,CustomerID,purchased
406704,23245,12770,-1
376640,20719,13764,-1
93346,22713,12377,1
145826,35914,16919,1
79219,22555,12465,1
196318,22470,14524,-2
485630,22652,15341,-1
288751,22699,17101,-2
66132,22380,15062,1
11279,21035,16556,1


In [31]:
# Check how often each pseudo-rating value occurs
df_ratings['purchased'].value_counts()

-1    192977
 1    133510
-2    126326
 2     39191
Name: purchased, dtype: int64

## 4. Split into Train & Test Data

In [32]:
# Load the stratified splitter
from recommenders.datasets.python_splitters import python_stratified_split

# Stratified 80/20 split of the pseudo-ratings with a fixed seed.
# NOTE(review): min_rating=10 with filter_by='user' presumably drops users with
# fewer than 10 interactions before splitting - confirm against the recommenders docs.
train_rat, test_rat = python_stratified_split(
    df_ratings,
    ratio=0.8,
    min_rating=10,
    filter_by='user',
    col_user='CustomerID',
    col_item='StockCode',
    seed=1,
)

In [33]:
# Distinct users and items in the test and train partitions
for label, frame, column in (
    ("Users in Test:", test_rat, 'CustomerID'),
    ("Users in Train:", train_rat, 'CustomerID'),
    ("Items in Test:", test_rat, 'StockCode'),
    ("Items in Train:", train_rat, 'StockCode'),
):
    print(label, frame[column].nunique())

Users in Test: 3011
Users in Train: 3011
Items in Test: 2327
Items in Train: 2349


In [34]:
# Keep only items that occur in BOTH partitions (users are not filtered here)

# Distinct item codes per partition
train_items_rat = set(pd.unique(train_rat['StockCode']))
test_items_rat = set(pd.unique(test_rat['StockCode']))

# Items shared by train and test
common_items_rat = train_items_rat & test_items_rat

# Drop every row whose item is missing from the other partition
train_rat = train_rat.loc[train_rat['StockCode'].isin(common_items_rat)]
test_rat = test_rat.loc[test_rat['StockCode'].isin(common_items_rat)]

In [35]:
# Distinct users and items in the test and train partitions after the item filter
for label, frame, column in (
    ("Users in Test:", test_rat, 'CustomerID'),
    ("Users in Train:", train_rat, 'CustomerID'),
    ("Items in Test:", test_rat, 'StockCode'),
    ("Items in Train:", train_rat, 'StockCode'),
):
    print(label, frame[column].nunique())

Users in Test: 3011
Users in Train: 3011
Items in Test: 2327
Items in Train: 2327


In [36]:
# Number of rows in each partition
for label, frame in (("Rows in train:", train_rat), ("Rows in test:", test_rat)):
    print(label, len(frame))

Rows in train: 393307
Rows in test: 98407


In [37]:
# Persist both partitions as CSV (pandas writes the row index as well by default)
for frame, path in (
    (train_rat, '../00_Data/rec_sys_ratings_train.csv'),
    (test_rat, '../00_Data/rec_sys_ratings_test.csv'),
):
    frame.to_csv(path)