In [56]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from collections import defaultdict

In [2]:
# Load the ModCloth data from a comma-separated file in the working directory.
file_name = "df_modcloth.csv"
df = pd.read_csv(file_name, sep=",")

In [3]:
# Preview the first five rows to get a feel for the columns.
df.head(n=5)

Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
3,7443,De,4,2010-02-13 08:00:00+00:00,,,,Small,Dresses,,2012,0
4,7443,tasha,4,2010-02-18 08:00:00+00:00,,,Small,Small,Dresses,,2012,0


In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(99893, 12)

In [18]:
# Show the first record as a plain dict (column name -> value).
row = df.iloc[0]
print(row.to_dict())

{'item_id': 7443, 'user_id': 'Alex', 'rating': 4, 'timestamp': '2010-01-21 08:00:00+00:00', 'size': nan, 'fit': nan, 'user_attr': 'Small', 'model_attr': 'Small', 'category': 'Dresses', 'brand': nan, 'year': 2012, 'split': 0}


In [24]:
# Collect the distinct values of every column of interest so their domains
# can be inspected and reused by later cells.
def _unique_values(column):
    """Return the distinct values of df[column] as a plain Python list."""
    return df[column].unique().tolist()


items = _unique_values('item_id')
users = _unique_values('user_id')
ratings = _unique_values('rating')
sizes = _unique_values('size')
fits = _unique_values('fit')
user_attrs = _unique_values('user_attr')
model_attrs = _unique_values('model_attr')
categories = _unique_values('category')
brands = _unique_values('brand')
years = _unique_values('year')
splits = _unique_values('split')

In [28]:
# Report the cardinality of the large columns and the full value set of the
# small ones, one line per field.
summary_lines = [
    f"number of items: {len(items)}",
    f"number of user: {len(users)}",
    f"number of brands: {len(brands)}",
    f"brands: {brands}",
    f"ratings: {ratings}",
    f"sizes: {sizes}",
    f"fits: {fits}",
    f"user_attrs: {user_attrs}",
    f"model_attrs: {model_attrs}",
    f"categories: {categories}",
    f"years: {years}",
    f"splits: {splits}",
]
print("\n".join(summary_lines))

number of items: 1020
number of user: 44784
number of brands: 32
brands: [nan, 'ModCloth', 'Retrolicious', 'Steve Madden', 'Ryu', 'Chi Chi London', 'Out of Print', 'Kin Ship', 'Jack by BB Dakota', 'Pink Martini', 'Miss Candyfloss', 'Emily and Fin', 'Daisey Natives', 'Hell Bunny', 'Banned', 'Sugarhill Boutique', 'Wrangler', 'Wendy Bird', 'Pepaloves', 'Collectif', 'Compania Fantastica', 'Closet London', 'Eliza J', 'BB Dakota', "Alice's Pig", 'Louche', "Effie's Heart", 'Miss Patina', 'Mata Traders', "Rolla's", 'Yumi', 'Blue Platypus']
ratings: [4, 3, 5, 2, 1]
sizes: [nan, 1.0, 2.0, 3.0, 7.0, 4.0, 6.0, 5.0, 8.0, 0.0]
fits: [nan, 'Just right', 'Slightly small', 'Very small', 'Slightly large', 'Very large']
user_attrs: ['Small', nan, 'Large']
model_attrs: ['Small', 'Small&Large']
categories: ['Dresses', 'Outerwear', 'Bottoms', 'Tops']
years: [2012, 2010, 2011, 2013, 2014, 2016, 2015, 2018, 2017, 2019]
splits: [0, 2, 1]


In [None]:
# Count how many rows have no brand information.
# Missing values are detected with Series.isna() instead of an `is np.nan`
# identity test: identity only matches the np.nan singleton object, so it
# silently misses NaN values that are represented by any other float object.
brand_list = df['brand'].tolist()
missing_brand_count = int(df['brand'].isna().sum())
print(f"total number of data: {len(brand_list)}")
print(f"number of data which doesn't have brand info: {missing_brand_count}")

total number of data: 99893
number of data which doesn't have brand info: 73980


In [54]:
# Count missing user_id values.
# Series.isna() is used instead of an `is np.nan` identity test: identity only
# matches the np.nan singleton object and is not a reliable missing-value check.
user_list = df['user_id'].to_list()
missing_user_count = int(df['user_id'].isna().sum())
print(f"total number of data: {len(user_list)}")
print(f"number of nan user_id: {missing_user_count}")

# Count missing item_id values.
# For a numeric column, to_list() builds new Python floats, so an identity
# comparison with np.nan could never be True; isna() detects NaN correctly.
item_list = df['item_id'].to_list()
missing_item_count = int(df['item_id'].isna().sum())
print(f"total number of data: {len(item_list)}")
print(f"number of nan item_id: {missing_item_count}")


total number of data: 99893
number of nan user_id: 1
total number of data: 99893
number of nan item_id: 0


In [None]:
# Hand-written encodings for the low-cardinality categorical columns.
# Every index starts at 1.
fit_to_idx = {'Just right': 1, 'Slightly small': 2, 'Very small': 3, 'Slightly large': 4, 'Very large': 5}
user_attr_to_idx = {'Small': 1, 'Large': 2}
model_attr_to_idx = {'Small': 1, 'Small&Large': 2}
category_to_idx = {'Dresses': 1, 'Outerwear': 2, 'Bottoms': 3, 'Tops': 4}


def _index_from_one(values):
    """Map each non-missing value to a 1-based index, in iteration order.

    Args:
        values: iterable of hashable scalars; may contain missing entries.

    Returns:
        dict mapping value -> position (1, 2, 3, ...). Missing entries, as
        judged by pd.notna, are skipped and consume no index.
    """
    # pd.notna is used instead of `is not np.nan`: identity only matches the
    # np.nan singleton and would let other NaN float objects through as keys.
    present = (value for value in values if pd.notna(value))
    return {value: position for position, value in enumerate(present, start=1)}


# map brand name to idx, idx for nan is 0 (NaN itself gets no entry here)
brand_to_idx = _index_from_one(brands)

# map user_id to idx, starting from 1
user_id_to_idx = _index_from_one(users)

# map item_id to idx, starting from 1
item_id_to_idx = _index_from_one(items)

# Containers for the rows grouped by user and by item.
# TODO: they are created empty here; this cell does not fill them.
data_per_user = defaultdict(list)
data_per_item = defaultdict(list)

# TODO (planned split, not implemented in this cell):
# sort users by the number of items they purchased,
# put the last 100 users' data in test set to ensure there are unseen users in test set.

# for each remaining user, 20% data in test set, 10% data in validation set, 70% data in training set




44783
