In [118]:
import nltk
from nltk.tokenize import RegexpTokenizer, wordpunct_tokenize, sent_tokenize
import os
import operator
from nltk.corpus import stopwords
import numpy as np

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [119]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xizheng/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### 3.1

In [120]:
all_text = ''

In [121]:
files= os.listdir('training/neg') #get all file names under the training dataset

for file in files: 
    f = open('training/neg/'+file, mode = 'r')
    article = f.read()
    #all_text.append(article.lower()) # lowercase all the words
    all_text = all_text + article.lower()
    f.close()

In [122]:
files= os.listdir('training/pos') #get all file names under the training dataset

for file in files: 
    f = open('training/pos/'+file, mode = 'r')
    article = f.read()
    #all_text.append(article.lower()) # lowercase all the words
    all_text = all_text + article.lower()
    f.close()

In [123]:
def tokenit(text):
    text_token = wordpunct_tokenize(text)
    return text_token

In [124]:
Allwords = tokenit(all_text)

In [125]:
Allwords_dic = {}
for i in Allwords:
    if i in Allwords_dic:
        Allwords_dic[i] += 1
    if i not in Allwords_dic:
        Allwords_dic[i] = 1
Allwords_dic = dict(sorted(Allwords_dic.items(), key=operator.itemgetter(1),reverse=True))


In [126]:
#use stop words to get rid of some meaningless words
stopwords = set(stopwords.words('english'))
VOC_withstopwords = list(Allwords_dic.keys())[:600]
VOC = []
for i in VOC_withstopwords:
    if i not in stopwords:
        VOC.append(i)  #this is the vocabulary used later

In [127]:
words_count_feature = [] #this is X
Sentiment = [] #this is y

In [128]:
files= os.listdir('training/neg') #get all file names under the training dataset

for file in files: 
    f = open('training/neg/'+file, mode = 'r')
    article = f.read()
    f.close()
    windows = tokenit(article.lower())
    temp_feature = []
    for i in VOC:
        if i in windows:
            temp_feature.append(1)
        else:
            temp_feature.append(0)
    words_count_feature.append(temp_feature)
    Sentiment.append(1)

In [129]:
files= os.listdir('training/pos') #get all file names under the training dataset

for file in files: 
    f = open('training/pos/'+file, mode = 'r')
    article = f.read()
    f.close()
    windows = tokenit(article.lower())
    temp_feature = []
    for i in VOC:
        if i in windows:
            temp_feature.append(1)
        else:
            temp_feature.append(0)
    words_count_feature.append(temp_feature)
    Sentiment.append(0)

In [130]:
def ten_fold_multilayer(X, y, hidden_layer,activation_func):
    X, y = shuffle(X, y, random_state=42)
    ini = 0
    accuracy = []
    for i in range(10):
        
        TrainingX = X[0:ini]+X[ini+999:len(X)]
        Trainingy = y[0:ini]+y[ini+999:len(X)]
        testX = X[ini:ini+999]
        testy = y[ini:ini+999]
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=hidden_layer, activation = activation_func,random_state = 1)
        clf.fit(TrainingX, Trainingy)
        y_pred = clf.predict(testX)
        accuracy.append(accuracy_score(testy, y_pred))
    print('With hidden layer ' +str(hidden_layer)+', using activation function '+activation_func
          +', the average accuracy to predict sentiment score is '+ str(sum(accuracy)/len(accuracy)))

In [148]:
Hidden_Layers = [(50,10),(250,10),(600,10),(1000,10)]
Activation_Func = ['relu','identity','tanh']

In [149]:
for i in Hidden_Layers:
    for j in Activation_Func:
        ten_fold_multilayer(words_count_feature, Sentiment, i,j)

With hidden layer (50, 10), using activation function relu, the average accuracy to predict sentiment score is 0.912912912912913
With hidden layer (50, 10), using activation function identity, the average accuracy to predict sentiment score is 0.914914914914915
With hidden layer (50, 10), using activation function tanh, the average accuracy to predict sentiment score is 0.907907907907908
With hidden layer (250, 10), using activation function relu, the average accuracy to predict sentiment score is 0.922922922922923
With hidden layer (250, 10), using activation function identity, the average accuracy to predict sentiment score is 0.914914914914915
With hidden layer (250, 10), using activation function tanh, the average accuracy to predict sentiment score is 0.9119119119119119
With hidden layer (600, 10), using activation function relu, the average accuracy to predict sentiment score is 0.9219219219219219
With hidden layer (600, 10), using activation function identity, the average accura

#### 3.2

In [152]:
clf3_2 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(250,10), activation = 'relu',random_state = 1)
clf3_2.fit(words_count_feature, Sentiment)
y_pred_3_2 = clf3_2.predict(words_count_feature)
accuracy_3_2 = accuracy_score(Sentiment, y_pred_3_2)

In [153]:
accuracy_3_2


1.0

#### 3.3

In [156]:
from gensim.models import KeyedVectors

In [157]:
google_vecs = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [182]:
Avgwordembeddings = []

In [183]:
files= os.listdir('training/neg') #get all file names under the training dataset

for file in files: 
    f = open('training/neg/'+file, mode = 'r')
    article = f.read()
    f.close()
    windows = tokenit(article.lower())
    temp_feature = []
    for i in windows:
        if i in google_vecs.vocab:
            temp_feature.append(google_vecs[i])
        else:
            temp_feature.append(np.zeros(google_vecs.vector_size))
    Avgwordembeddings.append(np.mean(temp_feature,axis = 0))

In [190]:
files= os.listdir('training/pos') #get all file names under the training dataset

for file in files: 
    f = open('training/pos/'+file, mode = 'r')
    article = f.read()
    f.close()
    windows = tokenit(article.lower())
    temp_feature = []
    for i in windows:
        if i in google_vecs.vocab:
            temp_feature.append(google_vecs[i])
        else:
            temp_feature.append(np.zeros(google_vecs.vector_size))
    Avgwordembeddings.append(np.mean(temp_feature,axis = 0))

In [200]:
# We can still use the Sentiment list as y since we follow the same order as one hot encoding to build average
#word embeddings' X
for i in Hidden_Layers:
    for j in Activation_Func:
        ten_fold_multilayer(Avgwordembeddings, Sentiment, i,j)

With hidden layer (50, 10), using activation function relu, the average accuracy to predict sentiment score is 0.922922922922923
With hidden layer (50, 10), using activation function identity, the average accuracy to predict sentiment score is 0.9189189189189191
With hidden layer (50, 10), using activation function tanh, the average accuracy to predict sentiment score is 0.922922922922923
With hidden layer (250, 10), using activation function relu, the average accuracy to predict sentiment score is 0.922922922922923
With hidden layer (250, 10), using activation function identity, the average accuracy to predict sentiment score is 0.9169169169169169
With hidden layer (250, 10), using activation function tanh, the average accuracy to predict sentiment score is 0.9239239239239241
With hidden layer (600, 10), using activation function relu, the average accuracy to predict sentiment score is 0.9219219219219219
With hidden layer (600, 10), using activation function identity, the average accu

In [283]:
clf3_3 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(1000,10), activation = 'relu',random_state = 1)
clf3_3.fit(Avgwordembeddings, Sentiment)
y_pred_3_3 = clf3_3.predict(Avgwordembeddings)
accuracy_3_3 = accuracy_score(Sentiment, y_pred_3_3)

In [284]:
accuracy_3_3

0.9455

#### 3.4

In [148]:
summary_list = []

In [149]:
files= os.listdir('training/neg') #get all file names under the training dataset

for file in files: 
    f = open('training/neg/'+file, mode = 'r')
    article = f.read()
    f.close()
    summary_list.append(article)

In [150]:
files= os.listdir('training/pos') #get all file names under the training dataset

for file in files: 
    f = open('training/pos/'+file, mode = 'r')
    article = f.read()
    f.close()
    windows = tokenit(article.lower())
    summary_list.append(article)

In [151]:
vectorizer = TfidfVectorizer()
train_tfidf = vectorizer.fit_transform(summary_list)
tfidf_feature = train_tfidf.toarray()

In [152]:
def svd_vector(fv, n_c):
    svd_transformer = TruncatedSVD(n_components=n_c, n_iter=7, random_state=42)
    svd = svd_transformer.fit_transform(fv,Sentiment)
    return svd

In [153]:
number_of_components = [300, 600, 900]
Hidden_Layers2 = [(250,10),(600,10)]
Activation_Func2 = ['relu']

In [156]:
for h in number_of_components:
    print('with '+str(h)+' components:')
    for i in Hidden_Layers2:
        for j in Activation_Func2:
            tempX = list(svd_vector(tfidf_feature,h))
            ten_fold_multilayer(tempX, Sentiment, i,j)

with 300 components:
With hidden layer (250, 10), using activation function relu, the average accuracy to predict sentiment score is 0.9309309309309308
With hidden layer (600, 10), using activation function relu, the average accuracy to predict sentiment score is 0.9419419419419419
with 600 components:
With hidden layer (250, 10), using activation function relu, the average accuracy to predict sentiment score is 0.924924924924925
With hidden layer (600, 10), using activation function relu, the average accuracy to predict sentiment score is 0.92992992992993
with 900 components:
With hidden layer (250, 10), using activation function relu, the average accuracy to predict sentiment score is 0.937937937937938
With hidden layer (600, 10), using activation function relu, the average accuracy to predict sentiment score is 0.9319319319319319


In [157]:
clf3_4 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(600,10), activation = 'relu',random_state = 1)
clf3_4.fit(list(svd_vector(tfidf_feature,300)), Sentiment)
y_pred_3_4 = clf3_4.predict(list(svd_vector(tfidf_feature,300)))
accuracy3_4 = accuracy_score(Sentiment, y_pred_3_4)

In [158]:
accuracy3_4

0.9897

#### 3.5

In [167]:
svd_transformer1 = TruncatedSVD(n_components=300, n_iter=7, random_state=42)
svd1 = svd_transformer1.fit(tfidf_feature)

In [190]:
topic = np.array(vectorizer.get_feature_names())
weight = np.argsort(svd1.singular_values_).flatten()[::-1]

In [191]:
n = 5
top_n = topic[weight][:n]

In [193]:
top_n

array(['00', '000', '0000000', '00a', '00am'], dtype='<U32')

In [194]:
svd1.components_

array([[ 7.73255211e-03,  1.32318325e-03,  5.91089788e-05, ...,
         3.14515864e-05,  3.14515864e-05,  3.14515864e-05],
       [ 1.14342345e-02,  1.61863908e-03, -7.61384861e-05, ...,
        -2.65806449e-05, -2.65806449e-05, -2.65806449e-05],
       [ 8.56233157e-03,  3.73887052e-03,  3.17068405e-05, ...,
        -2.80741789e-04, -2.80741789e-04, -2.80741789e-04],
       ...,
       [ 5.44553562e-02,  1.60506269e-04, -7.72609053e-04, ...,
        -9.88369769e-04, -9.88369769e-04, -9.88369769e-04],
       [ 1.70943091e-02,  2.90847610e-03, -6.70595420e-04, ...,
         1.29130783e-03,  1.29130783e-03,  1.29130783e-03],
       [-1.58436542e-03, -9.30359719e-04, -2.70084618e-03, ...,
         1.78089013e-03,  1.78089013e-03,  1.78089013e-03]])

#### 3.6

In [209]:
test_vector = []

In [210]:
files_test= os.listdir('test/') #get all file names under the training dataset

for file in files_test: 
    f = open('test/'+file, mode = 'r')
    article = f.read()
    
    f.close()
    windows = tokenit(article.lower())
    temp_feature = []
    for i in VOC:
        if i in windows:
            temp_feature.append(1)
        else:
            temp_feature.append(0)
    test_vector.append(temp_feature)

In [211]:
clf3_2 = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(250,10), activation = 'relu',random_state = 1)
clf3_2.fit(words_count_feature, Sentiment)
y_pred_3_6 = clf3_2.predict(test_vector)


In [212]:
len(test_vector)

495

In [213]:
pos_test = []
neg_test = []

In [214]:
for i in range(len(y_pred_3_6)):
    if y_pred_3_6[i] ==1:
        neg_test.append(files_test[i])
    if y_pred_3_6[i] ==0:
        pos_test.append(files_test[i])

In [215]:
with open('pos.txt', 'w') as f:
    for i in pos_test:
        f.writelines(i + '\n')

In [216]:
with open('neg.txt', 'w') as f:
    for i in neg_test:
        f.writelines(i + '\n')