In [1]:
import gzip
from collections import defaultdict
import math
import numpy as np
import pandas as pd
import string
import random
import string
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error as MSE
import json

In [2]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by
# the libraries used below - consider narrowing the filter to a category.
warnings.filterwarnings("ignore")

In [33]:
# Load the review records and collect per-field values for computing averages.
data = []

ratings = []
ages = []
weights = []
heights = []

# Open the archive in a context manager so the gzip handle is always closed,
# including when a record fails to parse with an unexpected exception.
with gzip.open('renttherunway_final_data.json.gz', 'r') as f:
    for l in f:
        try:
            # SECURITY: eval() executes whatever expression the line contains,
            # so this is only acceptable for a trusted data file. It is kept
            # (rather than swapped for json.loads) because the NameError
            # handler below relies on it to decide which records are skipped
            # - presumably lines holding bare tokens such as null.
            x = eval(l)

            # Data cleaning: convert rating, age, weight and height to integers
            x['rating'] = int(x['rating'])
            ratings.append(x['rating'])

            if 'age' in x:
                x['age'] = int(x['age'])
                ages.append(x['age'])
            if 'weight' in x:
                # Drop the trailing 3-character unit suffix, e.g. '137lbs' -> 137
                x['weight'] = int(x['weight'][:-3])
                weights.append(x['weight'])
            if 'height' in x:
                # Height looks like 5' 8": first character is feet, the one or
                # two characters before the closing quote are inches.
                feet = int(x['height'][0])
                inches = int(x['height'][-3:-1].strip(" "))
                x['height'] = (feet * 12) + inches
                heights.append(x['height'])

            data.append(x)
        except NameError:
            # Skip records that eval() cannot evaluate because of an unknown name.
            continue

average_rating = sum(ratings) / len(ratings)

# These averages are the fallback values used when a record lacks the field.
average_age = sum(ages) / len(ages)
average_weight = sum(weights) / len(weights)
average_height = sum(heights) / len(heights)

print(average_rating, average_age, average_height, average_weight)

9.092371481123546 33.86995503986883 65.31070730244805 137.3936494261715


In [4]:
# Display the first record kept by the loading cell as a schema sanity check.
data[0]

{'fit': 'fit',
 'user_id': '420272',
 'bust size': '34d',
 'item_id': '2260466',
 'weight': '137lbs',
 'rating': 10,
 'rented for': 'vacation',
 'review_text': "An adorable romper! Belt and zipper were a little hard to navigate in a full day of wear/bathroom use, but that's to be expected. Wish it had pockets, but other than that-- absolutely perfect! I got a million compliments.",
 'body type': 'hourglass',
 'review_summary': 'So many compliments!',
 'category': 'romper',
 'height': '5\' 8"',
 'size': 14,
 'age': '28',
 'review_date': 'April 20, 2016'}

In [5]:
# Number of records kept after loading (records skipped on NameError are excluded).
len(data)

192462

In [None]:
# insert EDA here

In [36]:
# Linear regression analysis

def train_and_predict_model(feature_fn):
  """Fit a no-intercept linear regression and return its validation MSE.

  Features are built by applying `feature_fn` to every record in the global
  `data` list; the target is each record's 'rating'. Records are split in
  their existing order (no shuffling): the first 70% are used for training,
  the next 15% for validation, and the final 15% are left untouched as a
  held-out test portion.

  Args:
    feature_fn: callable mapping one record (dict) to a list of numeric
      features. It must supply its own constant term, because the model is
      fitted with fit_intercept=False.

  Returns:
    Mean squared error of the model's predictions on the validation slice.
  """
  X = [feature_fn(d) for d in data]
  y = [d['rating'] for d in data]

  # Split boundaries, computed once and shared by X and y.
  train_end = 7 * len(X) // 10
  valid_end = int(8.5 * len(X) // 10)

  Xtrain, ytrain = X[:train_end], y[:train_end]
  Xvalid, yvalid = X[train_end:valid_end], y[train_end:valid_end]

  # Model creation. The target is passed as a plain 1-D array: np.matrix is
  # discouraged by NumPy and rejected by recent scikit-learn releases.
  model = linear_model.LinearRegression(fit_intercept=False)
  model.fit(Xtrain, np.asarray(ytrain))

  # Model validation: with a 1-D target, predict() already returns a 1-D array.
  predictions = model.predict(Xvalid)
  return MSE(yvalid, predictions)

In [29]:
# Baseline rating model - every record gets the same feature vector, so the
# model built on it makes one constant prediction
def feature_base(datum):
  """Return the constant features [1, average_rating]; `datum` is not read."""
  constant_features = [1, average_rating]
  return constant_features

In [49]:
# Validation MSE of the constant-feature baseline.
mse_base = train_and_predict_model(feature_base)
mse_base

2.0101764219048124

In [51]:
# Feature engineering #1: use the number of exclamation marks in review text to predict rating
def feature_1(datum):
  """Return [1, n], where n counts '!' in the review text (0 when it is absent)."""
  review = datum.get('review_text', '')
  return [1, review.count('!')]

In [53]:
# Validation MSE using the exclamation-mark count feature.
mse_1 = train_and_predict_model(feature_1)
mse_1

1.9444446124699253

In [54]:
# Feature engineering #2: use user attributes such as height, weight, and age along with item size to predict ratings of item
def feature_2(datum):
  """Return [1, weight, height, age, size] for one record.

  A missing weight, height or age falls back to the corresponding dataset
  average; 'size' is read directly and must be present.
  """
  return [
    1,
    datum.get('weight', average_weight),
    datum.get('height', average_height),
    datum.get('age', average_age),
    datum['size'],
  ]

In [55]:
# Validation MSE using the user-attribute and item-size features.
mse_2 = train_and_predict_model(feature_2)
mse_2

2.0062008791177117

In [56]:
# Feature engineering #3: use user attributes along with review text exclamations
def feature_3(datum):
  """Return [1, weight, height, age, size, exclamation_count] for one record.

  A missing weight, height or age falls back to the corresponding dataset
  average; a missing review text counts as zero exclamation marks; 'size'
  is read directly and must be present.
  """
  features = [
    1,
    datum.get('weight', average_weight),
    datum.get('height', average_height),
    datum.get('age', average_age),
    datum['size'],
  ]
  features.append(datum.get('review_text', '').count('!'))
  return features

In [57]:
# Validation MSE using user attributes, item size and exclamation-mark count together.
mse_3 = train_and_predict_model(feature_3)
mse_3

1.942181367154325

In [58]:
# Validation MSEs side by side, in order: baseline, feature_1, feature_2, feature_3.
mses_linreg = [mse_base, mse_1, mse_2, mse_3]
mses_linreg

[2.0101764219048124, 1.9444446124699253, 2.0062008791177117, 1.942181367154325]