In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from collections import defaultdict
import random
import csv

In [2]:
# Read the ratings table; a comma is already read_csv's default separator.
file_name = 'df_modcloth.csv'
df = pd.read_csv(file_name)

In [3]:
# Preview the first rows (head() shows five by default) to see the column layout.
df.head()

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
3,7443,De,4,2010-02-13 08:00:00+00:00,,,,Small,Dresses,,2012,0
4,7443,tasha,4,2010-02-18 08:00:00+00:00,,,Small,Small,Dresses,,2012,0


In [4]:
# (number of rows, number of columns) of the raw table.
df.shape

(99893, 12)

In [5]:
# Show the first record as a plain dict so every field name and value is visible.
row = df.iloc[0]
print(row.to_dict())

{'item_id': 7443, 'user_id': 'Alex', 'rating': 4, 'timestamp': '2010-01-21 08:00:00+00:00', 'size': nan, 'fit': nan, 'user_attr': 'Small', 'model_attr': 'Small', 'category': 'Dresses', 'brand': nan, 'year': 2012, 'split': 0}


In [6]:
# Distinct values of every column, gathered in one pass over the column names.
# Bracket indexing is required here: `df.size` would be the DataFrame attribute,
# not the 'size' column.
(
    items, users, ratings, sizes, fits, user_attrs,
    model_attrs, categories, brands, years, splits,
) = (
    df[column].unique().tolist()
    for column in (
        'item_id', 'user_id', 'rating', 'size', 'fit', 'user_attr',
        'model_attr', 'category', 'brand', 'year', 'split',
    )
)

In [7]:
# Cardinalities first ...
for label, count in (
    ("items", len(items)),
    ("user", len(users)),
    ("brands", len(brands)),
):
    print(f"number of {label}: {count}")

# ... then the full value list of each field (brands is printed both ways).
for label, values in (
    ("brands", brands),
    ("ratings", ratings),
    ("sizes", sizes),
    ("fits", fits),
    ("user_attrs", user_attrs),
    ("model_attrs", model_attrs),
    ("categories", categories),
    ("years", years),
    ("splits", splits),
):
    print(f"{label}: {values}")

number of items: 1020
number of user: 44784
number of brands: 32
brands: [nan, 'ModCloth', 'Retrolicious', 'Steve Madden', 'Ryu', 'Chi Chi London', 'Out of Print', 'Kin Ship', 'Jack by BB Dakota', 'Pink Martini', 'Miss Candyfloss', 'Emily and Fin', 'Daisey Natives', 'Hell Bunny', 'Banned', 'Sugarhill Boutique', 'Wrangler', 'Wendy Bird', 'Pepaloves', 'Collectif', 'Compania Fantastica', 'Closet London', 'Eliza J', 'BB Dakota', "Alice's Pig", 'Louche', "Effie's Heart", 'Miss Patina', 'Mata Traders', "Rolla's", 'Yumi', 'Blue Platypus']
ratings: [4, 3, 5, 2, 1]
sizes: [nan, 1.0, 2.0, 3.0, 7.0, 4.0, 6.0, 5.0, 8.0, 0.0]
fits: [nan, 'Just right', 'Slightly small', 'Very small', 'Slightly large', 'Very large']
user_attrs: ['Small', nan, 'Large']
model_attrs: ['Small', 'Small&Large']
categories: ['Dresses', 'Outerwear', 'Bottoms', 'Tops']
years: [2012, 2010, 2011, 2013, 2014, 2016, 2015, 2018, 2017, 2019]
splits: [0, 2, 1]


In [8]:
# How many rows carry no brand information?
brand_list = df['brand'].tolist()
n_missing_brand = sum(1 for brand in brand_list if pd.isna(brand))
print(f"total number of data: {len(brand_list)}")
print(f"number of data which doesn't have brand info: {n_missing_brand}")

total number of data: 99893
number of data which doesn't have brand info: 73980


In [9]:
# Count missing identifiers: user_id first, then item_id.
user_list = df['user_id'].to_list()
item_list = df['item_id'].to_list()

for column, values in (("user_id", user_list), ("item_id", item_list)):
    n_missing = sum(1 for value in values if pd.isna(value))
    print(f"total number of data: {len(values)}")
    print(f"number of nan {column}: {n_missing}")


total number of data: 99893
number of nan user_id: 1
total number of data: 99893
number of nan item_id: 0


In [10]:
# Fixed vocabularies for the small categorical columns, numbered from 1.
fit_to_idx = {
    name: position
    for position, name in enumerate(
        ['Just right', 'Slightly small', 'Very small', 'Slightly large', 'Very large'],
        start=1,
    )
}
user_attr_to_idx = {name: position for position, name in enumerate(['Small', 'Large'], start=1)}
model_attr_to_idx = {name: position for position, name in enumerate(['Small', 'Small&Large'], start=1)}
category_to_idx = {
    name: position
    for position, name in enumerate(['Dresses', 'Outerwear', 'Bottoms', 'Tops'], start=1)
}


def _index_from_one(values):
    """Number the non-null entries of `values` 1, 2, 3, ... in the order given."""
    kept = (value for value in values if not pd.isna(value))
    return {value: position for position, value in enumerate(kept, start=1)}


# Brand names, user ids and item ids all get consecutive indices starting at 1;
# NaN entries are skipped (for brands, index 0 stands for NaN).
brand_to_idx = _index_from_one(brands)
user_id_to_idx = _index_from_one(users)
item_id_to_idx = _index_from_one(items)

# Bucket every rating record under the user who wrote it.
# Each record is the tuple
#   (item_idx, rating, size_idx, fit_idx, user_attr_idx, model_attr_idx,
#    category_idx, brand_idx, year_idx, split_idx)
# (A per-item grouping with user_idx in the first slot is not built.)
data_per_user = defaultdict(list)


def _encode(value, table):
    """Look `value` up in `table`; a missing (NaN) value is encoded as 0."""
    return 0 if pd.isna(value) else table[value]


for _, row in df.iterrows():
    item, user = row['item_id'], row['user_id']
    # A record without both identifiers cannot be grouped, so it is dropped.
    if pd.isna(item) or pd.isna(user):
        continue

    size = row['size']
    record = (
        item_id_to_idx[item],
        row['rating'],
        0 if pd.isna(size) else int(size) + 1,    # size s is stored as s + 1; 0 means missing
        _encode(row['fit'], fit_to_idx),
        _encode(row['user_attr'], user_attr_to_idx),
        _encode(row['model_attr'], model_attr_to_idx),
        _encode(row['category'], category_to_idx),
        _encode(row['brand'], brand_to_idx),
        row['year'] - 2010,                       # years are stored as an offset from 2010
        row['split'],
    )
    data_per_user[user_id_to_idx[user]].append(record)


In [11]:
# Rank users from most to fewest records; equal counts are ordered by user
# index, also descending, because whole (count, user) tuples are compared.
user_buy_counts = sorted(
    ((len(datas), user) for user, datas in data_per_user.items()),
    reverse=True,
)

# The 100 lowest-ranked users go entirely into the unseen test set, which
# guarantees that the test data contains users absent from training.
unseen_test_users = {user for _, user in user_buy_counts[-100:]}
print(f'number of users in unseen test set: {len(unseen_test_users)}')

number of users in unseen test set: 100


In [12]:
# Per-user split of every user that is not held out as an unseen test user.
# The counts are fractions of the user's total number of records, floored:
#   test  = floor(25%),  validation = floor(15%),  training = everything else.
# Users with few records therefore contribute little or nothing to the
# test/validation sets but always keep at least one training record.
# NOTE(review): confirm that 25% / 15% / remainder is the intended split.
TEST_FRACTION = 0.25
VALID_FRACTION = 0.15
SPLIT_SEED = 42

# With both fractions non-negative and summing to less than 1, the two floored
# counts can never use up all of a user's records, so the training part of a
# non-empty record list is never empty and no fallback is needed.
assert TEST_FRACTION >= 0 and VALID_FRACTION >= 0, "split fractions must be non-negative"
assert TEST_FRACTION + VALID_FRACTION < 1, "test + validation fractions must leave room for training data"

# A dedicated seeded generator makes the split identical on every run without
# touching the state of the global `random` module.
split_rng = random.Random(SPLIT_SEED)

unseen_test_set_per_user = {}
seen_test_set_per_user = {}
train_set_per_user = {}
valid_set_per_user = {}

for user, datas in data_per_user.items():
    if user in unseen_test_users:
        unseen_test_set_per_user[user] = datas[:]
        continue

    # Shuffle a copy: data_per_user keeps its original order, so re-running
    # this cell reproduces exactly the same split.
    shuffled = datas[:]
    split_rng.shuffle(shuffled)

    n_data = len(shuffled)
    n_test = int(n_data * TEST_FRACTION)
    n_valid = int(n_data * VALID_FRACTION)

    # Names are prefixed with `user_` so they do not collide with the
    # notebook-level `train_set` / `valid_set` lists built further down.
    user_test = shuffled[:n_test]
    user_valid = shuffled[n_test:n_test + n_valid]
    user_train = shuffled[n_test + n_valid:]

    train_set_per_user[user] = user_train

    # Only users that actually received validation/test records are recorded.
    if user_valid:
        valid_set_per_user[user] = user_valid

    if user_test:
        seen_test_set_per_user[user] = user_test


In [13]:
# Rating statistics over the union of the training and validation splits:
#   * the global mean rating,
#   * the mean rating of every user,
#   * the mean rating of every item.
# NOTE(review): validation ratings are part of these means; if the means are
# later attached as features to validation rows they include the target being
# predicted — confirm this is intended.

ratings_train_valid = []
users_train_valid = set()
items_train_valid = set()

user_ratings_train_valid = defaultdict(list)
item_ratings_train_valid = defaultdict(list)

# Training records first, then validation records (one loop instead of two
# copies of the same body).
for split_per_user in (train_set_per_user, valid_set_per_user):
    for user, datas in split_per_user.items():
        users_train_valid.add(user)
        for d in datas:
            item, rating = d[0], d[1]
            items_train_valid.add(item)
            ratings_train_valid.append(rating)
            user_ratings_train_valid[user].append(rating)
            item_ratings_train_valid[item].append(rating)

avg_rating_train_valid = np.mean(ratings_train_valid)

# Comprehensions keep their loop variables local, so the notebook-level
# `ratings` list (the distinct rating values) is not overwritten by this cell.
user_avg_rating_train_valid = {
    user_key: np.mean(user_values)
    for user_key, user_values in user_ratings_train_valid.items()
}
item_avg_rating_train_valid = {
    item_key: np.mean(item_values)
    for item_key, item_values in item_ratings_train_valid.items()
}


In [14]:
# Number of distinct users, then distinct items, present in train+valid.
print(len(users_train_valid), len(items_train_valid), sep='\n')

44683
1019


In [18]:
# One flat, initially empty record list per split.
unseen_test_set, seen_test_set, train_set, valid_set = [], [], [], []

# each data tuple should be (item_idx, user_idx, rating, size_idx, fit_idx, user_attr_idx, model_attr_idx, category_idx, brand_idx, year_idx, split_idx, user_avg_rating, item_avg_rating)

def add_data_to_list(dataset_per_user, dataset_list):
    """Flatten per-user records into `dataset_list`, adding the user index and two mean ratings.

    Every record stored under a user is appended as
    ``(record[0], user) + record[1:] + (user_avg_rating, item_avg_rating)``.
    A user (or item) that is not in the train+valid user (item) set receives
    the global train+valid mean rating in place of a mean of its own.

    Args:
        dataset_per_user: mapping from user index to a list of record tuples
            whose first element is the item index.
        dataset_list: list that is appended to in place.

    Returns:
        None.
    """
    for user_key, records in dataset_per_user.items():
        for record in records:
            item_key = record[0]

            user_mean = (
                user_avg_rating_train_valid[user_key]
                if user_key in users_train_valid
                else avg_rating_train_valid
            )
            item_mean = (
                item_avg_rating_train_valid[item_key]
                if item_key in items_train_valid
                else avg_rating_train_valid
            )

            dataset_list.append((item_key, user_key) + record[1:] + (user_mean, item_mean))

# Flatten each per-user split into its flat record list.
for per_user_split, flat_split in (
    (unseen_test_set_per_user, unseen_test_set),
    (seen_test_set_per_user, seen_test_set),
    (train_set_per_user, train_set),
    (valid_set_per_user, valid_set),
):
    add_data_to_list(per_user_split, flat_split)

In [16]:
# Compact re-indexing of the users and items present in train+valid: the new
# indices start at 1 and follow the iteration order of the respective set.
# Index 0 is kept for any user/item that does not occur in train+valid.
user_idx_old_to_new = {
    old_idx: new_idx for new_idx, old_idx in enumerate(users_train_valid, start=1)
}
item_idx_old_to_new = {
    old_idx: new_idx for new_idx, old_idx in enumerate(items_train_valid, start=1)
}

In [19]:
# each data tuple should be (item_idx, user_idx, rating, size_idx, fit_idx, user_attr_idx, model_attr_idx, category_idx, brand_idx, year_idx, split_idx, user_avg_rating, item_avg_rating)
def reindex(dataset_list):
    """Replace, in place, the item and user indices at the front of every record.

    Position 0 (item) and position 1 (user) of each tuple are translated
    through the old-to-new index maps; an index that is not in the train+valid
    item/user set becomes 0.  The remaining fields are kept as they are.

    Not idempotent: a second call on the same list would treat the already
    translated indices as old ones and translate them again.

    Args:
        dataset_list: list of record tuples, modified in place.

    Returns:
        None.
    """
    for position, record in enumerate(dataset_list):
        item_old, user_old = record[0], record[1]

        item_new = item_idx_old_to_new[item_old] if item_old in items_train_valid else 0
        user_new = user_idx_old_to_new[user_old] if user_old in users_train_valid else 0

        dataset_list[position] = (item_new, user_new) + record[2:]

# Translate the indices of every split in place.
# Run this only once per freshly built list: reindex() is not idempotent.
for split_records in (train_set, valid_set, unseen_test_set, seen_test_set):
    reindex(split_records)

In [20]:
# Shuffle the rows of every split with a dedicated seeded generator, so that
# for the same input lists the resulting row order is the same on every run.
# The global `random` module state is left untouched.
SHUFFLE_SEED = 42
shuffle_rng = random.Random(SHUFFLE_SEED)

for split_records in (train_set, valid_set, unseen_test_set, seen_test_set):
    shuffle_rng.shuffle(split_records)

# Sizes in the order: train, valid, unseen-user test, seen-user test.
print(len(train_set))
print(len(valid_set))
print(len(unseen_test_set))
print(len(seen_test_set))

84811
4547
100
10434


In [21]:
# check if there are unseen users/items in different datasets
def check_num_unseen(dataset):
    """Print how many records of `dataset` have index 0 in the user or the item slot.

    Position 1 of a record is compared against 0 for the user count and
    position 0 for the item count.  The figures are numbers of records, not
    numbers of distinct users or items.

    Args:
        dataset: sequence of record tuples.
    """
    unseen_user_rows = sum(1 for record in dataset if record[1] == 0)
    unseen_item_rows = sum(1 for record in dataset if record[0] == 0)
    print(f'num of unseen users: {unseen_user_rows}, num of unseen items: {unseen_item_rows}')

# Report the unseen-user / unseen-item counts of every split under its own heading.
for heading, split_records in (
    ("in train set", train_set),
    ("in valid set", valid_set),
    ("in unseen test set", unseen_test_set),
    ("in seen test set", seen_test_set),
):
    print(heading)
    check_num_unseen(split_records)

in train set
num of unseen users: 0, num of unseen items: 0
in valid set
num of unseen users: 0, num of unseen items: 0
in unseen test set
num of unseen users: 100, num of unseen items: 0
in seen test set
num of unseen users: 0, num of unseen items: 2


In [22]:
# write the preprocessed train set file, validation set file, and test set files
# test set = seen (user) test set + unseen (user) test set

# Header row written at the top of every output file.
columns = (
    "item_idx user_idx rating size_idx fit_idx "
    "user_attr_idx model_attr_idx category_idx "
    "brand_idx year_idx split_idx user_avg_rating item_avg_rating"
).split()


def write_data_to_csv(csv_file_name, datasets):
    """Write the header row followed by the rows of each dataset to a CSV file.

    Args:
        csv_file_name: path of the file to create (an existing file is overwritten).
        datasets: iterable of row collections; their rows are written one
            dataset after the other, in the order given.
    """
    with open(csv_file_name, mode='w', newline='') as out_file:
        out = csv.writer(out_file)
        out.writerow(columns)
        out.writerows(record for dataset in datasets for record in dataset)
            
# Output file names.
train_file = 'train_set.csv'
valid_file = 'valid_set.csv'
seen_test_file = 'seen_test_set.csv'
unseen_test_file = 'unseen_test_set.csv'
test_file = 'test_set.csv'

# Each file receives one split, except the combined test file, which holds the
# seen-user test rows followed by the unseen-user test rows.
for target_file, parts in (
    (train_file, [train_set]),
    (valid_file, [valid_set]),
    (seen_test_file, [seen_test_set]),
    (unseen_test_file, [unseen_test_set]),
    (test_file, [seen_test_set, unseen_test_set]),
):
    write_data_to_csv(target_file, parts)