In [None]:
import gzip
import math
import numpy
import sklearn
import string
import pandas as pd
from collections import defaultdict
from sklearn import linear_model
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Directory holding the cleaned train/valid/test splits. Change this one
# constant to point the notebook at a different copy of the data.
DATA_DIR = "/content/drive/Shareddrives/CSE258/clean_data"

train_path = DATA_DIR + "/train.csv"
valid_path = DATA_DIR + "/valid.csv"
test_path = DATA_DIR + "/test.csv"

In [None]:
# Load the training split. The validation and test splits are read later,
# in the cells that turn them into feature vectors.
train_df = pd.read_csv(train_path)

In [None]:
# Count how often each non-stopword token occurs across the training reviews.
wordCount = defaultdict(int)
punctuation = set(string.punctuation)
# Reach the corpus through the nltk package instead of the bare imported
# name: this cell rebinds `stopwords` to a plain set, so calling
# `stopwords.words(...)` again on a re-run would raise AttributeError.
stopwords = set(nltk.corpus.stopwords.words("english"))

# Iterate the review column directly; iterrows() would build a full Series
# for every row just to read one field.
for review in train_df['review']:
  # NOTE: str() turns a non-string cell into text, so a missing review
  # (presumably NaN after read_csv) is counted as the literal token "nan".
  text = str(review)
  cleaned = ''.join(c for c in text.lower() if c not in punctuation)
  for w in cleaned.split():
    if w not in stopwords:
      wordCount[w] += 1

# (count, word) pairs, most frequent first. Whole tuples are compared, so
# equal counts are ordered by word in reverse alphabetical order.
counts = sorted(((n, w) for w, n in wordCount.items()), reverse=True)

In [None]:
# Vocabulary: the words from the first 1000 (count, word) pairs in `counts`.
words = [word for _, word in counts[:1000]]

In [None]:
# Map each vocabulary word to its column index in the feature vector.
wordId = {word: position for position, word in enumerate(words)}
# Set view of the vocabulary.
wordSet = set(words)

In [None]:
def feature(datum):
  """Turn one review string into a bag-of-words feature vector.

  The text is lower-cased, characters in `punctuation` are removed, and the
  result is split on whitespace.

  Args:
    datum: review text as a `str`.

  Returns:
    A list of `len(words) + 1` numbers: position `wordId[w]` holds how many
    times vocabulary word `w` occurs in the review, and the final position
    holds the total number of tokens in the review.
  """
  feat = [0]*(len(words)+1)
  cleaned = ''.join(c for c in datum.lower() if c not in punctuation)
  tokens = cleaned.split()  # split once; reused for the length feature below

  for w in tokens:
    # Dict lookup instead of scanning the `words` list for every token.
    # No separate stopword check is needed: the vocabulary is built from
    # counts that already skip stopwords.
    idx = wordId.get(w)
    if idx is not None:
      feat[idx] += 1

  feat[-1] = len(tokens)

  return feat

In [None]:
# Training design matrix and targets, read straight from the columns.
# iterrows() would build a Series per row and does not preserve column
# dtypes; column access avoids both.
# str() keeps non-string cells (e.g. missing reviews) usable by feature().
X = [feature(str(review)) for review in train_df['review']]
Y = train_df['rating'].tolist()

In [None]:
valid_df = pd.read_csv(valid_path)

# Validation design matrix and targets, read straight from the columns.
# iterrows() would build a Series per row and does not preserve column
# dtypes; column access avoids both.
# str() keeps non-string cells (e.g. missing reviews) usable by feature().
valid_X = [feature(str(review)) for review in valid_df['review']]
valid_Y = valid_df['rating'].tolist()

In [None]:
test_df = pd.read_csv(test_path)

# Test design matrix and targets, read straight from the columns.
# iterrows() would build a Series per row and does not preserve column
# dtypes; column access avoids both.
# str() keeps non-string cells (e.g. missing reviews) usable by feature().
test_X = [feature(str(review)) for review in test_df['review']]
test_Y = test_df['rating'].tolist()

In [None]:
def MSE(predictions, labels):
  """Return the mean squared error between two paired sequences.

  The sequences are paired with zip(), so any surplus items in the longer
  one are ignored. Empty input raises ZeroDivisionError.
  """
  squared_errors = []
  for predicted, actual in zip(predictions, labels):
    squared_errors.append((predicted - actual) ** 2)
  return sum(squared_errors) / len(squared_errors)

In [None]:
# Sweep the ridge regularisation strength and keep the model that does best
# on the validation split. Previously `clf` was simply whichever alpha ran
# last, so the coefficient inspection further down did not reflect the
# validation-selected model.
best_alpha = None
best_valid_MSE = float('inf')
best_clf = None

for alpha in [5, 4.5, 4, 3.5, 3, 2.5, 2, 1.5, 1, 0.1, 0.01]:
  clf = linear_model.Ridge(alpha = alpha, fit_intercept=True)
  clf.fit(X, Y)

  Y_pred = clf.predict(X)
  train_MSE = MSE(Y_pred, Y)

  valid_Y_pred = clf.predict(valid_X)
  valid_MSE = MSE(valid_Y_pred, valid_Y)

  # Test MSE is reported for information only; it is not used for selection.
  test_Y_pred = clf.predict(test_X)
  test_MSE = MSE(test_Y_pred, test_Y)

  print("=========================")
  print("Alpha: ", alpha)
  print("Train MSE: ", train_MSE)
  print("Valid MSE: ", valid_MSE)
  print("Test MSE: ", test_MSE)

  # Strict comparison: on ties the earlier alpha in the list is kept.
  if valid_MSE < best_valid_MSE:
    best_alpha = alpha
    best_valid_MSE = valid_MSE
    best_clf = clf

# Leave `clf` bound to the validation-selected model for the cells below.
# If no alpha produced a comparable validation MSE, `clf` stays as the last fit.
if best_clf is not None:
  clf = best_clf
print("=========================")
print("Selected alpha: ", best_alpha)

Train MSE:  1.5820089697422464
Valid MSE:  1.5816573078626341
Test MSE:  1.6020241280475243
Train MSE:  1.5820089302658586
Valid MSE:  1.5816589097239118
Test MSE:  1.6020265749951508
Train MSE:  1.5820088949290916
Valid MSE:  1.5816605161679012
Test MSE:  1.602029026662673
Train MSE:  1.582008863735834
Valid MSE:  1.5816621271985822
Test MSE:  1.6020314830541043
Train MSE:  1.5820088366897613
Valid MSE:  1.581663742819921
Test MSE:  1.6020339441734073
Train MSE:  1.5820088137944106
Valid MSE:  1.5816653630358926
Test MSE:  1.60203641002453
Train MSE:  1.582008795053514
Valid MSE:  1.5816669878505027
Test MSE:  1.6020388806114645
Train MSE:  1.5820087804708174
Valid MSE:  1.5816686172677057
Test MSE:  1.6020413559381972
Train MSE:  1.5820087700500138
Valid MSE:  1.581670251291531
Test MSE:  1.6020438360086944
Train MSE:  1.5820087617925134
Valid MSE:  1.5816732041557826
Test MSE:  1.6020483121026436
Train MSE:  1.58200876170979
Valid MSE:  1.5816735002647295
Test MSE:  1.60204876055900

In [None]:
# Pair every vocabulary word with its learned weight and order the pairs
# from largest to smallest coefficient. The final coefficient is skipped:
# it belongs to the review-length feature, not to a word.
coef = clf.coef_
wordsCoef = sorted(zip(coef[:-1], words), reverse=True)

In [None]:
# Ten (coefficient, word) pairs with the largest coefficients; wordsCoef is
# sorted in descending order.
wordsCoef[:10]

[(0.3800203674465144, 'nan'),
 (0.31394229536930823, 'glad'),
 (0.2787621780643798, 'amazing'),
 (0.2675841286770794, 'highly'),
 (0.26026877726257786, 'satisfied'),
 (0.25725503873408784, 'fantastic'),
 (0.24261042665048332, 'love'),
 (0.2244220575581366, 'best'),
 (0.22313142444752718, 'loves'),
 (0.22068679345473605, 'awesome')]

In [None]:
# Ten (coefficient, word) pairs with the smallest coefficients, i.e. the
# tail of the descending list.
wordsCoef[-10:]

[(-0.2777942097321673, 'worked'),
 (-0.28890371151045396, 'return'),
 (-0.2936391652289254, 'piece'),
 (-0.3093405590364681, 'stay'),
 (-0.32194412339713796, 'returned'),
 (-0.32368781778625144, 'worse'),
 (-0.3703865328949903, 'disappointed'),
 (-0.3781283234670061, 'poor'),
 (-0.4344605377841999, 'horrible'),
 (-0.790016162569127, 'worst')]

In [None]:
# Dump every (coefficient, word) pair to file.txt, one line per pair in the
# form "<coefficient>\t<word>". The encoding is set explicitly so that
# non-ASCII tokens from the reviews are written the same way on every
# platform instead of depending on the locale default.
with open("file.txt", 'w', encoding="utf-8") as output:
    for c, w in wordsCoef:
        output.write(str(c) + '\t' + w + '\n')

In [None]:
# String form of the first (largest-coefficient) entry of wordsCoef.
str(wordsCoef[0])

"(0.3800203674465144, 'nan')"

In [None]:
# Coefficient of the last feature column, which feature() fills with the
# review's token count.
coef[-1]

0.0015649162936596896