In [1]:
import json
from matplotlib import pyplot as plt
from collections import defaultdict
from sklearn import linear_model, metrics
from scipy import optimize
import numpy as np
import random
import gzip
import math

In [2]:
def parse(path):
  """Lazily yield one evaluated record per line of a gzip-compressed file.

  Args:
    path: Location of the gzip file. Each line must contain a single
      expression that `eval` can evaluate (for example a dict literal).

  Yields:
    The object obtained by evaluating each line, in file order.
  """
  # The context manager closes the file once the generator is exhausted or
  # closed; the handle was previously left open.
  with gzip.open(path, 'r') as g:
    for l in g:
      # SECURITY: eval() runs whatever code the line contains, so only feed
      # this function trusted files. Bare names inside a line (e.g. `null`)
      # are resolved against this module's globals at evaluation time.
      # NOTE(review): if the file is strict JSON, json.loads would be a
      # safer parser -- confirm the file format before switching.
      yield eval(l)

# parse() evaluates each line with eval(); binding `null` lets a bare `null`
# in the file evaluate to None.
null = None

dataset = []
count = 0  # number of rated records collected so far

for l in parse("renttherunway_final_data.json.gz"):
  # Records whose rating is missing are skipped entirely.
  if l["rating"] == null:
    continue
  dataset.append(l)
  count += 1
  # Stop reading as soon as 100000 rated records have been kept.
  if count >= 100000:
    break


In [3]:
# The first 80000 records are used for fitting; the remainder is held out.
dataTrain, dataTest = dataset[:80000], dataset[80000:]

# Report both split sizes (one per line) and show one raw record.
print(len(dataTrain), len(dataTest), sep="\n")
print(dataset[10])

80000
20000
{'fit': 'small', 'user_id': '185966', 'bust size': '34b', 'item_id': '1077123', 'weight': '135lbs', 'rating': '8', 'rented for': 'party', 'review_text': "The dress arrived with a small hole in the beading on the front but wasn't too noticeable. Glad I was able to get two sizes because the 4 was a little tight and would've made for an uncomfortable night of dancing! ", 'body type': 'athletic', 'review_summary': "It was fun to wear a dress I wouldn't normally buy! ", 'category': 'dress', 'height': '5\' 3"', 'size': 12, 'age': '33', 'review_date': 'January 2, 2018'}


In [4]:
def MSE(predictions, labels):
    """Return the mean squared error between predictions and labels.

    Args:
        predictions: Iterable of predicted numeric values.
        labels: Iterable of true numeric values, aligned with `predictions`.

    Returns:
        The mean of the element-wise squared differences.

    Raises:
        ValueError: If the two inputs differ in length or are empty.
    """
    # Materialise both inputs so their lengths can be compared.
    predictions = list(predictions)
    labels = list(labels)
    # zip() would silently drop the unmatched tail and report a wrong score.
    if len(predictions) != len(labels):
        raise ValueError(
            "predictions and labels must have the same length (got %d and %d)"
            % (len(predictions), len(labels))
        )
    # An empty input has no defined mean; fail with a clear message.
    if not predictions:
        raise ValueError("cannot compute MSE of empty inputs")
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

# Regression targets: each record's 'rating' field converted to a float.
Ytrain = [float(review['rating']) for review in dataTrain]
Ytest = [float(review['rating']) for review in dataTest]

## 1. Estimates a rating from the number of times the exclamation mark (!) symbol is used in the review

In [5]:
def feature1(data):
    """Build the feature vector [1, number of '!' in the review text].

    Args:
        data: Record with a 'review_text' string.

    Returns:
        A two-element list: the constant 1 (intercept term) followed by the
        exclamation-mark count as a float.
    """
    exclamations = data['review_text'].count('!')
    return [1, float(exclamations)]

# One feature1 row per record, for each split.
Xtrain = list(map(feature1, dataTrain))
Xtest = list(map(feature1, dataTest))

# Least-squares fit on the training rows; only theta is used below.
theta, residuals, rank, s = np.linalg.lstsq(Xtrain, Ytrain, rcond=None)

# Apply the fitted coefficients to the test rows and report the error.
predictions = np.dot(Xtest, theta)
print("MSE: %s" % MSE(predictions, Ytest))

MSE: 1.9583700211654784


## 2. Estimates a rating from the number of times the exclamation mark (!) symbol is used in the review and the review summary

In [6]:
def feature2(data):
    """Build [1, '!' count in review text, '!' count in review summary].

    Args:
        data: Record with 'review_text' and 'review_summary' strings.

    Returns:
        A three-element list: the constant 1 (intercept term) followed by
        the two exclamation-mark counts as floats.
    """
    text_marks = data['review_text'].count('!')
    summary_marks = data['review_summary'].count('!')
    return [1, float(text_marks), float(summary_marks)]

# One feature2 row per record, for each split.
Xtrain = list(map(feature2, dataTrain))
Xtest = list(map(feature2, dataTest))

# Least-squares fit on the training rows; only theta is used below.
theta, residuals, rank, s = np.linalg.lstsq(Xtrain, Ytrain, rcond=None)

# Apply the fitted coefficients to the test rows and report the error.
predictions = np.dot(Xtest, theta)
print("MSE: %s" % MSE(predictions, Ytest))

MSE: 1.9106472689289316


## 3. Estimates a rating from the length of the review and the number of times the exclamation mark (!) symbol is used in the review and the review summary

In [7]:
def feature3(data):
    """Build [1, review length, '!' count in text, '!' count in summary].

    Args:
        data: Record with 'review_text' and 'review_summary' strings.

    Returns:
        A four-element list: the constant 1 (intercept term), the character
        length of the review text, and the two exclamation-mark counts, with
        every value after the intercept stored as a float.
    """
    review = data['review_text']
    text_length = len(review)
    text_marks = review.count('!')
    summary_marks = data['review_summary'].count('!')
    return [1, float(text_length), float(text_marks), float(summary_marks)]

# One feature3 row per record, for each split.
Xtrain = list(map(feature3, dataTrain))
Xtest = list(map(feature3, dataTest))

# Least-squares fit on the training rows; only theta is used below.
theta, residuals, rank, s = np.linalg.lstsq(Xtrain, Ytrain, rcond=None)

# Apply the fitted coefficients to the test rows and report the error.
predictions = np.dot(Xtest, theta)
print("MSE: %s" % MSE(predictions, Ytest))

MSE: 1.9029322010328265


## 4. Train a model that fits a polynomial function to estimate ratings based on our ‘!’ feature

In [8]:
def feature4(datum, deg):
    """Build polynomial features of the review's '!' count up to degree `deg`.

    Args:
        datum: Record with a 'review_text' string.
        deg: Highest power of the count to include.

    Returns:
        [1, val, val**2, ..., val**deg], where val is the number of '!'
        characters in the review text as a float. For deg < 1 only the
        constant term [1] is returned.
    """
    val = float(datum['review_text'].count('!'))
    # The constant 1 is the intercept term; powers 1..deg follow it. The range
    # is empty when deg < 1, so the linear term is no longer emitted for a
    # degree-0 request.
    return [1] + [val ** power for power in range(1, deg + 1)]

# Test-set MSE for each polynomial degree, in the order the degrees are tried.
mses = []
for deg in range(2,6):
    # Polynomial feature rows of the given degree for both splits.
    Xtrain = [feature4(x, deg) for x in dataTrain]
    Xtest = [feature4(x, deg) for x in dataTest]

    # Least-squares fit on the training rows, then score on the test rows.
    theta, residuals, rank, s = np.linalg.lstsq(Xtrain,Ytrain,rcond=None)
    predictions = np.dot(Xtest, theta)
    # Record the score; the list was previously created but never filled.
    mses.append(MSE(predictions, Ytest))
    print("MSE for count('!') with degree " + str(deg) + " :" + str(mses[-1]))

MSE for count('!') with degree 2 :1.94793280030395
MSE for count('!') with degree 3 :1.9417690198764075
MSE for count('!') with degree 4 :1.9370901166116563
MSE for count('!') with degree 5 :1.9344976330643133
