# Loading necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.style

# Loading Dataset

In [33]:
# Load the pre-cleaned retail data; the file's first column is used as the DataFrame index
PRECLEANED_CSV = '../00_Data/online_retailed_precleaned.csv'
df = pd.read_csv(PRECLEANED_CSV, index_col=0)

In [34]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(397656, 8)

In [35]:
# Preview the first five rows to eyeball column names and value formats
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,3249,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,4048,United Kingdom
1,536365,2649,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,4048,United Kingdom
2,536365,2855,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,4048,United Kingdom
3,536365,2803,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,4048,United Kingdom
4,536365,2802,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,4048,United Kingdom


# Preparation: Binary Dataset

**Main Steps**

1. Remove all cancelled orders
2. Create a flag column for purchased items
3. Split into Train & Test Data

## 1. Removing Cancelled Orders

In [36]:
# Filter out rows with cancelled orders (InvoiceNo starts with 'C').
# The mask is built on a string view of the column so that numeric or missing
# invoice numbers cannot produce NaN entries, which would make `~` fail.
is_cancelled = df['InvoiceNo'].astype(str).str.startswith('C')

# Take an explicit copy so later column assignments operate on an independent
# frame instead of a slice of the original (avoids SettingWithCopyWarning).
df = df.loc[~is_cancelled].copy()

In [37]:
# Check shape after removal: only the row count should change, the columns stay the same
df.shape

(389326, 8)

## 2. Create a Flag Column for Purchased Items

In [38]:
# Create column 'purchased' and assign the value 1 for each row.
# `assign` returns a new frame rather than writing into the filtered slice,
# so no SettingWithCopyWarning is raised and re-running the cell is harmless.
df = df.assign(purchased=1)

In [39]:
# Check the first 5 rows to confirm the new 'purchased' column is present
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,purchased
0,536365,3249,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,4048,United Kingdom,1
1,536365,2649,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1
2,536365,2855,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,4048,United Kingdom,1
3,536365,2803,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1
4,536365,2802,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,4048,United Kingdom,1


In [51]:
# Sanity check: number of distinct users and items in the full dataset
for label, column in (("Users: \t", 'CustomerID'), ("Items: \t", 'StockCode')):
    print(label, df[column].nunique())

Users: 	 3697
Items: 	 2743


## 3. Split into Train & Test Data

In [45]:
# Load the stratified splitter (python_stratified_split) from the recommenders package
from recommenders.datasets.python_splitters import python_stratified_split

In [52]:
# Stratified 80/20 split with a fixed seed for reproducibility.
# NOTE(review): per the recommenders docs, filter_by='user' with min_rating=10
# should drop users with fewer than 10 rows before splitting — confirm.
train, test = python_stratified_split(
    df,
    ratio=0.8,
    filter_by='user',
    min_rating=10,
    col_user='CustomerID',
    col_item='StockCode',
    seed=1,
)

In [61]:
# Sanity check: number of distinct users and items in the training split
for label, column in (("Users: \t", 'CustomerID'), ("Items: \t", 'StockCode')):
    print(label, train[column].nunique())

Users: 	 3694
Items: 	 2729


In [58]:
# Sanity check: number of distinct users and items in the test split
for label, column in (("Users: \t", 'CustomerID'), ("Items: \t", 'StockCode')):
    print(label, test[column].nunique())

Users: 	 3694
Items: 	 2729


In [59]:
# Ensure that the same items AND users are in both sets.
# For each key column, keep only the values that occur in both train and test.
# Dropping rows for one column can remove the last occurrence of a value in
# the other column (e.g. a user whose only test items were not shared), so the
# passes are repeated until neither frame loses any more rows.
while True:
    rows_before = len(train) + len(test)

    for key_col in ('StockCode', 'CustomerID'):
        # Values of this column present in both splits
        shared_values = set(train[key_col].unique()) & set(test[key_col].unique())

        # Restrict both splits to rows whose value is in the shared set
        train = train[train[key_col].isin(shared_values)]
        test = test[test[key_col].isin(shared_values)]

    # A full pass that removed nothing means both columns are aligned
    if len(train) + len(test) == rows_before:
        break

In [64]:
# Write both splits to CSV next to the source data (the index is written as the first column)
for split_frame, target_path in (
    (train, '../00_Data/online_retail_train.csv'),
    (test, '../00_Data/online_retail_test.csv'),
):
    split_frame.to_csv(target_path)
