## Question 7:

In [1]:
import json
import numpy as np
import random
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import linear_model
from collections import defaultdict

In [2]:
def parseData(fname):
    for l in open(fname):
        yield eval(l)

In [3]:
path = "/home/cui/Projects/PycharmProjects/CSE-158/data/train_Category.json"

In [4]:
dataset = list(parseData(path))

In [5]:
random.shuffle(dataset)

In [6]:
y_training = [d['rating'] for d in dataset[:10000]]
y_validation = [d['rating'] for d in dataset[10000:20000]]
y_test = [d['rating'] for d in dataset[20000:30000]]

In [7]:
def MSE(predictions, targets):
    return ((predictions - targets) ** 2).mean()

In [8]:
def function(ngrams, removePunctuation, tfidf, wordCounts):
    punctuation = set(string.punctuation)
    corpus = []
    max_features = 20000
    
    if removePunctuation:
        for d in dataset[:30000]:
            text = d['review_text']
            text = text.lower()
            text = [c for c in text if not (c in punctuation)]
            text = ''.join(text)
            corpus.append(text)
    else:
        for d in dataset[:30000]:
            tmp = d['review_text']
            tmp = tmp.lower()
            text = []
            for c in tmp:
                if c in punctuation:
                    text.append(" ")
                    text.append(c)
                    text.append(" ")
                else:
                    text.append(c)
            text = ''.join(text)
            corpus.append(text)
    
    X = []
    
    # if parameter tfidf is True, using tf-idf vectorizer 
    if tfidf:
        vec = TfidfVectorizer(ngram_range=(ngrams, ngrams), max_features=max_features)
        X = vec.fit_transform(corpus)
        X = X.toarray()
    
    # if parameter wordCounts is True, using count vectorizer
    if wordCounts:
        vec = CountVectorizer(ngram_range=(ngrams, ngrams), max_features=max_features)
        X = vec.fit_transform(corpus)
        X = X.toarray()
    
    X_training = X[:10000]
    X_validation = X[10000:20000]
    X_test = X[20000:30000]
    
    MSE_list = []
    regularization = [0.01, 0.1, 1, 10, 100]
    for r in regularization:
        clf = linear_model.Ridge(r, fit_intercept=False)
        clf.fit(X_training, y_training)
        predictions = clf.predict(X_validation)
        MSE_list.append((r, MSE(predictions, y_validation)))
    
        del clf
        
    return MSE_list, X_training, X_validation, X_test

In [9]:
def MSE_testSet(MSE_list, X_training, X_validation, X_test):
    r = min(MSE_list, key = lambda x:x[1])[0]

    clf = linear_model.Ridge(r, fit_intercept=False)
    clf.fit(X_training, y_training)
    predictions = clf.predict(X_test)
    
    mse = round(MSE(predictions, y_test), 3)

    print ("The MSE of the test set is {:.3f}.".format(mse))
    
    return mse

In [10]:
performance = []

### 1. Unigrams & Removing punctuation & tfidf scores

In [11]:
MSE_list, X_training, X_validation, X_test = function(1, True, True, False)

In [12]:
MSE_list

[(0.01, 4.231921387378723),
 (0.1, 2.530514873651722),
 (1, 1.98077786370894),
 (10, 2.26540381409375),
 (100, 3.3078669950698605)]

In [13]:
performance.append(("Unigrams, remove punctuation and using tf-idf scores", MSE_testSet(MSE_list, X_training, X_validation, X_test)))

The MSE of the test set is 1.940.


### 2. Unigrams & Removing punctuation & Word counts

In [14]:
MSE_list, X_training, X_validation, X_test = function(1, True, False, True)

In [15]:
MSE_list

[(0.01, 100.56290076827047),
 (0.1, 54.292731348707385),
 (1, 22.805745018853557),
 (10, 10.85740713173832),
 (100, 7.388132286034386)]

In [16]:
performance.append(("Unigrams, remove punctuation and using word counts", MSE_testSet(MSE_list, X_training, X_validation, X_test)))

The MSE of the test set is 7.116.


### 3. Unigrams & Preserving punctuation & tfidf scores

In [17]:
MSE_list, X_training, X_validation, X_test = function(1, False, True, False)

In [18]:
MSE_list

[(0.01, 4.316604607819873),
 (0.1, 2.5358068742106763),
 (1, 1.9652322732523957),
 (10, 2.2433897060794266),
 (100, 3.2837694424099646)]

In [19]:
performance.append(("Unigrams, preserve punctuation and using tf-idf scores", MSE_testSet(MSE_list, X_training, X_validation, X_test)))

The MSE of the test set is 1.932.


### 4. Unigrams & Preserving punctuation & Word counts

In [20]:
MSE_list, X_training, X_validation, X_test = function(1, False, False, True)

In [21]:
MSE_list

[(0.01, 104.49586690530538),
 (0.1, 55.371576744287566),
 (1, 22.85794513506134),
 (10, 10.999730770757104),
 (100, 7.454292447464862)]

In [22]:
performance.append(("Unigrams, preserve punctuation and using word counts", MSE_testSet(MSE_list, X_training, X_validation, X_test)))

The MSE of the test set is 7.220.


###  5. Bigrams & Removing punctuation & tfidf scores

In [23]:
MSE_list, X_training, X_validation, X_test = function(2, True, True, False)

In [24]:
MSE_list

[(0.01, 4.896307534843344),
 (0.1, 3.8012622958526743),
 (1, 3.0363105832826585),
 (10, 3.6130739101107743),
 (100, 6.587131482229626)]

In [25]:
performance.append(("Bigrams, remove punctuation and using tf-idf scores", MSE_testSet(MSE_list, X_training, X_validation, X_test)))

The MSE of the test set is 2.889.


### 6. Bigrams & Removing punctuation & Word counts

In [26]:
MSE_list, X_training, X_validation, X_test = function(2, True, False, True)

In [27]:
MSE_list

[(0.01, 61.28342585489422),
 (0.1, 33.54553164355451),
 (1, 19.63217253144386),
 (10, 10.230048234295362),
 (100, 7.455529943741971)]

In [28]:
performance.append(("Bigrams, remove punctuation and using word counts", MSE_testSet(MSE_list, X_training, X_validation, X_test)))

The MSE of the test set is 7.127.


### 7. Bigrams & Preserving punctuation & tfidf scores

In [29]:
MSE_list, X_training, X_validation, X_test = function(2, False, True, False)

In [30]:
MSE_list

[(0.01, 4.863034411156802),
 (0.1, 3.758555054253984),
 (1, 3.002637063205719),
 (10, 3.5770305692365634),
 (100, 6.538793974606483)]

In [31]:
performance.append(("Bigrams, preserve punctuation and using tf-idf scores", MSE_testSet(MSE_list, X_training, X_validation, X_test)))

The MSE of the test set is 2.856.


### 8. Bigrams & Preserving punctuation & Word Counts

In [32]:
MSE_list, X_training, X_validation, X_test = function(2, False, False, True)

In [33]:
MSE_list

[(0.01, 61.10764171329383),
 (0.1, 34.60855769334343),
 (1, 19.747199837154405),
 (10, 10.290871247045763),
 (100, 7.420340762065661)]

In [34]:
performance.append(("Bigrams, preserve punctuation and using word counts", MSE_testSet(MSE_list, X_training, X_validation, X_test)))

The MSE of the test set is 7.089.


In [35]:
for i in performance:
    print(i[0] + ": " + str(i[1]))

print("The best performance on test set is using " + min(performance, key = lambda x:x[1])[0])

Unigrams, remove punctuation and using tf-idf scores: 1.94
Unigrams, remove punctuation and using word counts: 7.116
Unigrams, preserve punctuation and using tf-idf scores: 1.932
Unigrams, preserve punctuation and using word counts: 7.22
Bigrams, remove punctuation and using tf-idf scores: 2.889
Bigrams, remove punctuation and using word counts: 7.127
Bigrams, preserve punctuation and using tf-idf scores: 2.856
Bigrams, preserve punctuation and using word counts: 7.089
The best performance on test set is using Unigrams, preserve punctuation and using tf-idf scores
