In [3]:
import gzip
from collections import defaultdict
import math
import numpy as np
import pandas as pd
import string
import random
import string
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as MSE
import json

In [4]:
import warnings
warnings.filterwarnings("ignore")

In [5]:
data = []
f = gzip.open('renttherunway_final_data.json.gz', 'r')

ratings = []
ages = []
weights = []
heights = []

for l in f:
    try:
        x = eval(l)
        
        # Data cleaning: convert height, weight, and rating to integer types
        x['rating'] = int(x['rating'])
        ratings.append(x['rating'])

        if 'age' in x:
            x['age'] = int(x['age'])
            ages.append(x['age'])
        if 'weight' in x:
            x['weight'] = int(x['weight'][:-3])
            weights.append(x['weight'])
        if 'height' in x:
            feet = int(x['height'][0])
            inches = int(x['height'][-3:-1].strip(" "))
            x['height'] = (feet * 12) + inches
            heights.append(x['height'])

        data.append(x)
    except NameError:
        continue

average_rating = sum(ratings) / len(ratings)

average_age = sum(ages) / len(ages)
average_weight = sum(weights) / len(weights)
average_height = sum(heights) / len(heights)

print(average_rating, average_age, average_height, average_weight)

9.092371481123546 33.86995503986883 65.31070730244805 137.3936494261715


In [6]:
data[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': 137,
 'rating': 10,
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': 68,
 'size': 14,
 'age': 28,
 'review_date': 'April 20, 2016'}

In [7]:
len(data)

192462

In [8]:
# insert EDA here

In [9]:
# Linear regression analysis

def train_and_predict_model(feature_fn):
  # Training/test/validation sets - 70% train, 15% valid, 15% test
  X = [feature_fn(d) for d in data]
  y = [d['rating'] for d in data]

  Xtrain = X[:7 * len(X) // 10]
  ytrain = y[:7 * len(y) // 10]
  Xvalid = X[7 * len(X) // 10: int(8.5 * len(X) // 10)]
  yvalid = y[7 * len(y) // 10: int(8.5 * len(y) // 10)]
  Xtest = X[int(8.5 * len(X) // 10):]
  ytest = y[int(8.5 * len(y) // 10):]

  # Model creation
  model = linear_model.LinearRegression(fit_intercept=False)
  model.fit(Xtrain, np.matrix(ytrain).T)

  # Model validation
  ans = model.predict(Xvalid)
  actual = ans.T[0]

  mse = MSE(yvalid, actual)
  return mse

In [10]:
# Baseline rating model - predict the average rating for every prediction
def feature_base(datum):
  return [1, average_rating]

In [11]:
mse_base = train_and_predict_model(feature_base)
mse_base

2.0101764219048124

In [12]:
# Feature enginering #1: use the number of exclamation marks in review text to predict rating
def feature_1(datum):
  if 'review_text' in datum:
    return [1, datum['review_text'].count('!')]
  return [1, 0]

In [13]:
mse_1 = train_and_predict_model(feature_1)
mse_1

1.9444446124699253

In [14]:
# Feature enginering #2: use user attributes such as height, weight, and age along with item size to predict ratings of item 
def feature_2(datum):
  weight = datum['weight'] if 'weight' in datum else average_weight
  height = datum['height'] if 'height' in datum else average_height
  age = datum['age'] if 'age' in datum else average_age
  size = datum['size']
  return [1, weight, height, age, size]  

In [15]:
mse_2 = train_and_predict_model(feature_2)
mse_2

2.0062008791177117

In [16]:
# Feature enginering #3: use user attributes along with review text exclamations
def feature_3(datum):
  weight = datum['weight'] if 'weight' in datum else average_weight
  height = datum['height'] if 'height' in datum else average_height
  age = datum['age'] if 'age' in datum else average_age
  size = datum['size']
  excl_count = datum['review_text'].count('!') if 'review_text' in datum else 0
  return [1, weight, height, age, size, excl_count]  

In [17]:
mse_3 = train_and_predict_model(feature_3)
mse_3

1.942181367154325

In [18]:
mses_linreg = [mse_base, mse_1, mse_2, mse_3]
mses_linreg

[2.0101764219048124, 1.9444446124699253, 2.0062008791177117, 1.942181367154325]

In [19]:
# Interaction prediction
dataTrain = data[:7*len(data) // 10]
dataValid = data[7*len(data) // 10 : int(8.5 * len(data) // 10)]
dataTest = data[int(8.5 * len(data) // 10):]

usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
ratingDict = {} # To retrieve a rating for a specific user/item pair

trainingRatings = []

for d in dataTrain:
    user, item, rating = d['user_id'], d['item_id'], d['rating']
    
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
    ratingDict[(user, item)] = rating
    
    trainingRatings.append(rating)
    
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)
    
ratingMean = sum(trainingRatings) / len(trainingRatings)

In [20]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    
    return numer / denom

In [21]:
def predictRating(user, item):
    ratings = []
    sims = []
    
    for d in reviewsPerUser[user]:
        j = d['item_id']
        
        if j == item:
            continue
        
        ratings.append(ratingDict[(user, j)] - itemAverages[j])
        sims.append(Jaccard(usersPerItem[item], usersPerItem[j]))
    
    if sum(sims) > 0:
        weightedRatings = [(x * y) for x, y in zip(ratings, sims)]
        return itemAverages[item] + sum(weightedRatings) / sum(sims)
    else:
        return ratingMean

simPredictions = [predictRating(d['user_id'], d['item_id']) for d in dataValid]
labels = [d['rating'] for d in dataValid]

mse = MSE(simPredictions, labels)
mse

2.2100620042373786

In [22]:
# Beta gamma fuck shit
dataTrain = data[:7*len(data) // 10]
dataValid = data[7*len(data) // 10 :]

ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)

for d in dataTrain:
    u, i, r = d['user_id'], d['item_id'], d['rating']
    
    ratingsPerUser[u].append((i,r))
    ratingsPerItem[i].append((u,r))

trainRatings = [d['rating'] for d in dataTrain]
globalAverage = sum(trainRatings) * 1.0 / len(trainRatings)

In [23]:
betaU = {}
betaI = {}
for u in ratingsPerUser:
    betaU[u] = 0

for i in ratingsPerItem:
    betaI[i] = 0

alpha = globalAverage

In [24]:
def iterate(lamb):
    newAlpha = 0
    for d in dataTrain:
        u, i, r = d['user_id'], d['item_id'], d['rating']
        newAlpha += r - (betaU[u] + betaI[i])
    alpha = newAlpha / len(dataTrain)
    for u in ratingsPerUser:
        newBetaU = 0
        for i,r in ratingsPerUser[u]:
            newBetaU += r - (alpha + betaI[i])
        betaU[u] = newBetaU / (lamb + len(ratingsPerUser[u]))
    for i in ratingsPerItem:
        newBetaI = 0
        for u,r in ratingsPerItem[i]:
            newBetaI += r - (alpha + betaU[u])
        betaI[i] = newBetaI / (lamb + len(ratingsPerItem[i]))
    mse = 0
    for d in dataTrain:
        u, i, r = d['user_id'], d['item_id'], d['rating']
        prediction = alpha + betaU[u] + betaI[i]
        mse += (r - prediction)**2
    regularizer = 0
    for u in betaU:
        regularizer += betaU[u]**2
    for i in betaI:
        regularizer += betaI[i]**2
    mse /= len(dataTrain)
    return mse, mse + lamb*regularizer

In [25]:
mse,objective = iterate(1)
newMSE,newObjective = iterate(1)
iterations = 2

In [26]:
while iterations < 10 or objective - newObjective > 0.0001:
    mse, objective = newMSE, newObjective
    newMSE, newObjective = iterate(10)
    iterations += 1
    print("Objective after "
        + str(iterations) + " iterations = " + str(newObjective))
    print("MSE after "
        + str(iterations) + " iterations = " + str(newMSE))

Objective after 3 iterations = 23311.166874083257
MSE after 3 iterations = 1.5622401007624203
Objective after 4 iterations = 22939.439119740124
MSE after 4 iterations = 1.5635516616259384
Objective after 5 iterations = 22920.76285641952
MSE after 5 iterations = 1.5635794588707819
Objective after 6 iterations = 22918.607186913378
MSE after 6 iterations = 1.563521470547877
Objective after 7 iterations = 22918.82083238304
MSE after 7 iterations = 1.5634710974168786
Objective after 8 iterations = 22919.77964959742
MSE after 8 iterations = 1.5634322438593167
Objective after 9 iterations = 22921.007971392628
MSE after 9 iterations = 1.563402567564283
Objective after 10 iterations = 22922.28293229403
MSE after 10 iterations = 1.5633798023686516


In [27]:
validMSE = 0
for d in dataValid:
    u, i, r = d['user_id'], d['item_id'], d['rating']

    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if i in betaI:
        bi = betaI[i]
    prediction = alpha + bu + bi
    validMSE += (r - prediction) ** 2

validMSE /= len(dataValid)
print("Validation MSE = " + str(validMSE))

Validation MSE = 1.9081931152299059


In [39]:
# Proposed model: combines interaction data and feature data
# Idea: if the user and item have never been seen before in the dataset,
# make a prediction using the best linear regression model we examined.

feature_fn = feature_3

# Train the model
X = [feature_fn(d) for d in dataTrain]
y = [d['rating'] for d in dataTrain]

# Model creation
model = linear_model.LinearRegression(fit_intercept=False)
model.fit(X, np.matrix(y).T)

c = 0

# Combine with interaction data
proposedValidMSE = 0
for d in dataValid:
    u, i, r = d['user_id'], d['item_id'], d['rating']

    bu = 0
    bi = 0
    if u in betaU:
        bu = betaU[u]
    if i in betaI:
        bi = betaI[i]
    
    if not bi and not bu:
        c += 1
        prediction = model.predict([feature_fn(d)]).T[0]
    else:
        prediction = alpha + bu + bi
    proposedValidMSE += (r - prediction) ** 2

proposedValidMSE /= len(dataValid)
print("Validation MSE = " + str(proposedValidMSE))

Validation MSE = [1.90818754]


In [40]:
proposedValidMSE[0] - validMSE[0]

0.0