In [None]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# for auto-reloading external modules
%load_ext autoreload
%autoreload 2

In [None]:
# Preprocessing: pull in the project helpers and load the training split.
from util import preprocessData, createFeatures

# preprocessData returns a pair, unpacked here as (ratings, reviews).
ratings, reviews = preprocessData('sportsTrain.csv')

In [None]:
# Building a dictionary:
# maps every whitespace-separated token seen in the training reviews
# to the number of times it occurs.
dic = {}
for review in reviews:
    for word in review.split():
        if word in dic:
            dic[word] += 1
        else:
            dic[word] = 1
# Report the number of distinct tokens.
print(len(dic))

In [None]:
# Keep only the words that occur more than `occurenceThreshold` times.
occurenceThreshold = 500

# indexToWord[i] is the i-th kept word (in dictionary iteration order);
# wordToIndex is the inverse mapping, word -> position in indexToWord.
indexToWord = [key for key, value in dic.items() if value > occurenceThreshold]
wordToIndex = {key: position for position, key in enumerate(indexToWord)}

n = len(wordToIndex)  # vocabulary size
print(len(wordToIndex), len(indexToWord))

In [None]:
# Validation split: load it, report how many ratings it holds, and build
# its features using the vocabulary derived from the training data.
ratingsVal, reviewsVal = preprocessData('sportsDev.csv')
print(len(ratingsVal))
# NOTE(review): the last argument is presumably an example count (the same
# position receives trainSize in the training cells) - confirm in util.
Xval, yval = createFeatures(reviewsVal, ratingsVal, wordToIndex, 10000)

In [None]:
# Test split: load it, report how many ratings it holds, and build its
# features using the vocabulary derived from the training data.
ratingsTest, reviewsTest = preprocessData('sportsTest.csv')
print(len(ratingsTest))
# NOTE(review): the last argument is presumably an example count (the same
# position receives trainSize in the training cells) - confirm in util.
Xtest, ytest = createFeatures(reviewsTest, ratingsTest, wordToIndex, 10000)

In [None]:
# Creating a more balanced training set: draw the same number of reviews
# from every rating class.
import random

# Fixed seed so the sampled subset (and any later use of the `random`
# module, such as shuffling) is reproducible across kernel restarts.
random.seed(0)

max_sample = 80000  # upper bound on the number of reviews drawn per class
num_classes = 5

# classes[k] collects the positions of the reviews whose rating is k + 1.
# Assumes ratings are integers in 1..num_classes - TODO confirm against
# preprocessData (a rating of 0 would silently land in the last bucket).
classes = [[] for _ in range(num_classes)]
for i in range(len(ratings)):
    r = ratings[i]
    classes[r - 1].append(i)

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the draw at the size of the rarest class.
# Using one common size for all classes keeps the result balanced.
per_class = min(max_sample, min(len(members) for members in classes))
if per_class < max_sample:
    print('Warning: smallest class has only %d examples; sampling %d per class '
          'instead of %d' % (per_class, per_class, max_sample))

indices_balanced = []
for members in classes:
    indices_balanced += random.sample(members, per_class)

In [None]:
# indices_balanced was assembled one class after another, so shuffle it in
# place before using it to select the training examples.
random.shuffle(indices_balanced)

# NOTE(review): this rebinds `reviews` and `ratings` to the sampled subset,
# so the cell is not idempotent - re-running it (or the sampling cell) without
# reloading the data indexes into the already-reduced lists.
reviews = list(map(reviews.__getitem__, indices_balanced))
ratings = list(map(ratings.__getitem__, indices_balanced))

In [None]:
# Sanity check: number of training examples left after balancing
# (one entry per index in indices_balanced).
print(len(ratings))

In [None]:
from sklearn.svm import LinearSVC
from util import evalModel

# Learning curve for LinearSVC: for each training-set size, fit a fresh
# model and record accuracy / F1 on both the training data it was fitted on
# and the validation split.
trainSizes = [1000, 5000, 10000, 30000, 50000, 70000, 100000]
trainAccs, valAccs = [], []
trainF1s, valF1s = [], []

for trainSize in trainSizes:
    # Feature matrix and labels for this training-set size.
    X, y = createFeatures(reviews, ratings, wordToIndex, trainSize)

    # Train a linear model. After the loop, svmModel is the one fitted for
    # the last (largest) entry of trainSizes.
    svmModel = LinearSVC(dual=False, max_iter=5000)
    svmModel.fit(X, y)

    # evalModel returns a triple; the third element is not needed here.
    valAcc, valF1, _ = evalModel(svmModel.predict(Xval), yval)
    trainAcc, trainF1, _ = evalModel(svmModel.predict(X), y)

    trainAccs.append(trainAcc)
    valAccs.append(valAcc)
    trainF1s.append(trainF1)
    valF1s.append(valF1)

# Plot one figure per metric: training curve in blue, validation in red.
curves = [
    ('Learning Curve Using Accuracy', 'Accuracy', trainAccs, valAccs),
    ('Learning Curve Using F1 Score', 'Average F1 Score', trainF1s, valF1s),
]
for title, ylabel, trainScores, valScores in curves:
    plt.title(title)
    plt.plot(trainSizes, trainScores, 'bo-', label='training')
    plt.plot(trainSizes, valScores, 'ro-', label='validation')
    plt.xlabel('Number of Training Examples')
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

In [None]:
# Score the most recently trained model (svmModel) on the test split and
# print accuracy, F1 score and the confusion matrix returned by evalModel.
(testAcc, testF1, confM) = evalModel(svmModel.predict(Xtest), ytest)
print(testAcc)
print(testF1)
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use the builtin directly.
print(confM.astype(int))

In [None]:
from sklearn.linear_model import SGDClassifier
from util import evalModel

# Learning curve for a linear model trained with SGD (squared hinge loss).
# Training sets larger than maxDataSize are featurized and fitted one
# chunk at a time instead of in a single call.
trainSizes = [30000, 50000, 70000, 100000, 200000, 400000]
trainAccs = []
valAccs = []
trainF1s = []
valF1s = []
maxDataSize = 100000  # largest number of examples featurized in one call

for trainSize in trainSizes:
    if trainSize > maxDataSize:
        # Build ONE estimator and keep fitting it: warm_start only reuses
        # the previous solution when fit() is called again on the same
        # object. Creating a new classifier per chunk (as before) discarded
        # everything learned from earlier chunks, so the model was in effect
        # trained on the last chunk only.
        svmModel = SGDClassifier(max_iter=1000, loss='squared_hinge',
                                 warm_start=True, random_state=0)
        # Only whole chunks are used; a trainSize that is not a multiple of
        # maxDataSize loses its remainder (unchanged from the original).
        for i in range(trainSize // maxDataSize):
            chunk = slice(i * maxDataSize, (i + 1) * maxDataSize)
            (X, y) = createFeatures(reviews[chunk], ratings[chunk],
                                    wordToIndex, maxDataSize)
            svmModel.fit(X, y)
    else:
        (X, y) = createFeatures(reviews, ratings, wordToIndex, trainSize)
        # random_state makes the stochastic optimizer reproducible.
        svmModel = SGDClassifier(max_iter=1000, loss='squared_hinge',
                                 random_state=0).fit(X, y)

    (valAcc, valF1, _) = evalModel(svmModel.predict(Xval), yval)
    # For chunked runs X, y hold only the last chunk, so the training
    # scores below are measured on that chunk, not on all trainSize examples.
    (trainAcc, trainF1, _) = evalModel(svmModel.predict(X), y)
    trainAccs.append(trainAcc)
    valAccs.append(valAcc)
    trainF1s.append(trainF1)
    valF1s.append(valF1)

# Plot one figure per metric: training curve in blue, validation in red.
curves = [
    ('Learning Curve Using Accuracy', 'Accuracy', trainAccs, valAccs),
    ('Learning Curve Using F1 Score', 'Average F1 Score', trainF1s, valF1s),
]
for title, ylabel, trainScores, valScores in curves:
    plt.title(title)
    plt.plot(trainSizes, trainScores, 'bo-', label='training')
    plt.plot(trainSizes, valScores, 'ro-', label='validation')
    plt.xlabel('Number of Training Examples')
    plt.ylabel(ylabel)
    plt.legend()
    plt.show()

In [None]:
# Score the most recently trained model (svmModel) on the test split and
# print accuracy, F1 score and the confusion matrix returned by evalModel.
(testAcc, testF1, confM) = evalModel(svmModel.predict(Xtest), ytest)
print(testAcc)
print(testF1)
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so use the builtin directly.
print(confM.astype(int))