# Task2b Sentiment Analysis of Movie Reviews

In [1]:
# read MPQA lexicon (all positive and negative lexicon)
mpqa_dict = {}
with open('MPQA.tff', 'r') as f:
    all_lines = f.readlines()
    for line in all_lines:
        attrs = line.split(' ')
        word_attr = attrs[2]
        polarity_attr = attrs[-1]
        word = word_attr.split('=')[1]
        polarity = polarity_attr.split('=')[1]
        polarity = polarity.replace('\n', '')
        if polarity != 'positive' and polarity != 'negative':
            continue
        mpqa_dict.update({word: polarity})
        
print(mpqa_dict)   



In [2]:
# load corpus data
neg_data = []
pos_data = []
with open('corpus2/rt-polarity.neg', 'r', encoding='latin-1') as f:
    neg_all_lines = f.readlines()

for line in neg_all_lines:
    neg_data.append((line, 'neg'))

with open('corpus2/rt-polarity.pos', 'r', encoding='latin-1') as f:
    pos_all_lines = f.readlines()
    
for line in pos_all_lines:
    pos_data.append((line, 'pos'))

## Baseline: Count pos and neg words in a review

In [3]:
total = len(neg_data) + len(pos_data)
lexicon_cnt = []

correct_pos = 0
for review in pos_data:
    words = review[0].split(' ')
    pos_cnt = 0
    neg_cnt = 0
    for word in words:
        polarity = mpqa_dict.get(word)
        if polarity == 'positive':
            pos_cnt += 1
        elif polarity == 'negative':
            neg_cnt += 1
    if neg_cnt < pos_cnt:
        correct_pos += 1
    lexicon_cnt.append((pos_cnt, neg_cnt))

correct_neg = 0
for review in neg_data:
    words = review[0].split(' ')
    pos_cnt = 0
    neg_cnt = 0
    for word in words:
        polarity = mpqa_dict.get(word)
        if polarity == 'positive':
            pos_cnt += 1
        elif polarity == 'negative':
            neg_cnt += 1
    if neg_cnt > pos_cnt:
        correct_neg += 1
    lexicon_cnt.append((pos_cnt, neg_cnt))

correct = correct_neg + correct_pos
print('neg accuracy: ' + str(correct_neg/len(neg_data)))
print('pos accuracy: ' + str(correct_pos/len(pos_data)))
print('correct: ' + str(correct))
print('total:' + str(total))
print('overall accuracy: ' + str(correct/total))

neg accuracy: 0.41399362220971675
pos accuracy: 0.6126430313262052
correct: 5473
total:10662
overall accuracy: 0.513318326767961


## Bag of Words Machine Learning Approach

In [12]:
# build bag of words features
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
vectorizer = CountVectorizer()
corpus = pos_all_lines + neg_all_lines
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()
print("number of features: " + str(len(feature_names)))
review_array = np.array(X.toarray())
print(review_array)

number of features: 18330
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [13]:
print(feature_names)



In [14]:
# build labels
review_labels = []
for i in range(5331):
    review_labels.append(1)

for i in range(5331):
    review_labels.append(-1)

review_labels = np.array(review_labels)
print(review_labels)

[ 1  1  1 ... -1 -1 -1]


In [15]:
# kfold BOW classification
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

logistic_accuracies = []
nb_accuracies = []
kf = KFold(5, True)
for train_index, test_index in kf.split(review_array):
    # get split data and labels
    x_train, x_test = review_array[train_index], review_array[test_index]
    y_train, y_test = review_labels[train_index], review_labels[test_index]
    # logisticRegression classifier
    logistic_clf = LogisticRegression(max_iter=200)
    logistic_clf.fit(x_train, y_train)
    logistic_accuracy = logistic_clf.score(x_test, y_test)
    print("logistic accuracy:" + str(logistic_accuracy))
    logistic_accuracies.append(logistic_accuracy)
    # Naive Bayes classifier
    nb_clf = MultinomialNB()
    nb_clf.fit(x_train, y_train)
    nb_accuracy = nb_clf.score(x_test, y_test)
    print("nb accuracy:" + str(nb_accuracy))
    nb_accuracies.append(nb_accuracy)

logistic_overall_acc = 0
for acc in logistic_accuracies:
    logistic_overall_acc += acc

logistic_overall_acc /= len(logistic_accuracies)
print("logistic overall accuracy: " + str(logistic_overall_acc))

nb_overall_acc = 0
for acc in nb_accuracies:
    nb_overall_acc += acc

nb_overall_acc /= len(nb_accuracies)
print("bayes overall accuracy: " + str(nb_overall_acc))
    

logistic accuracy:0.7712142522269104
nb accuracy:0.776840131270511
logistic accuracy:0.7759024847632443
nb accuracy:0.7796530707923113
logistic accuracy:0.7565666041275797
nb accuracy:0.775328330206379
logistic accuracy:0.7603189493433395
nb accuracy:0.7772045028142589
logistic accuracy:0.7617260787992496
nb accuracy:0.7725140712945591
logistic overall accuracy: 0.7651456738520648
bayes overall accuracy: 0.7763080212756039


## Expand Features Set
* add feature: how many words in a review come from positive lexicon
* add feature: how many words in a review come from negative lexicon
* filter feature: remove numbers
* (even worse): change word count ebedding to tfidf
* (not effective): dealing with negation

In [16]:
# build bag of words features
# from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = CountVectorizer()
corpus = pos_all_lines + neg_all_lines
for i in range(len(corpus)):
    for j in range(10):
        corpus[i] = corpus[i].replace(str(j), '')
#     words = corpus[i].split(" ")
#     new_review = []
#     ii = 0
#     while ii < len(words)-1:
#         new_review.append(words[ii])
#         if words[ii] == "not" or "n't" in words[ii] or words[ii] == "no" or words[ii] == 'never':
#             j = ii+1
#             while(j < len(words)-1):
#                 words[j] = "NOT_" + words[j]
#                 j += 1
#                 new_review.append(words[j])
#         ii += 1
#     new_review = ' '.join(new_review)
#     corpus[i] = new_review
X = vectorizer.fit_transform(corpus)
feature_names = vectorizer.get_feature_names()
print("number of features: " + str(len(feature_names)))
review_array = X.toarray().tolist()
for i in range(len(review_array)):
    review_array[i].append(lexicon_cnt[i][0])
    review_array[i].append(lexicon_cnt[i][1])
review_array = np.array(review_array)
print(review_array)

number of features: 18156
[[0 0 0 ... 0 2 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 3]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 2 2]]


In [17]:
print(feature_names)



In [18]:
# build labels
review_labels = []
for i in range(5331):
    review_labels.append(1)

for i in range(5331):
    review_labels.append(-1)

review_labels = np.array(review_labels)
print(review_labels)

[ 1  1  1 ... -1 -1 -1]


In [19]:
# kfold BOW classification
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

logistic_accuracies = []
nb_accuracies = []
kf = KFold(5, True)
for train_index, test_index in kf.split(review_array):
    # get split data and labels
    x_train, x_test = review_array[train_index], review_array[test_index]
    y_train, y_test = review_labels[train_index], review_labels[test_index]
    # logisticRegression classifier
    logistic_clf = LogisticRegression(max_iter=250)
    logistic_clf.fit(x_train, y_train)
    logistic_accuracy = logistic_clf.score(x_test, y_test)
    print("logistic accuracy:" + str(logistic_accuracy))
    logistic_accuracies.append(logistic_accuracy)
    # Naive Bayes classifier
    nb_clf = MultinomialNB()
    nb_clf.fit(x_train, y_train)
    nb_accuracy = nb_clf.score(x_test, y_test)
    print("nb accuracy:" + str(nb_accuracy))
    nb_accuracies.append(nb_accuracy)

logistic_overall_acc = 0
for acc in logistic_accuracies:
    logistic_overall_acc += acc

logistic_overall_acc /= len(logistic_accuracies)
print("logistic overall accuracy: " + str(logistic_overall_acc))

nb_overall_acc = 0
for acc in nb_accuracies:
    nb_overall_acc += acc

nb_overall_acc /= len(nb_accuracies)
print("bayes overall accuracy: " + str(nb_overall_acc))

logistic accuracy:0.7824660103141116
nb accuracy:0.8012189404594467
logistic accuracy:0.7623066104078763
nb accuracy:0.7716830754805438
logistic accuracy:0.7865853658536586
nb accuracy:0.7940900562851783
logistic accuracy:0.7664165103189493
nb accuracy:0.7908067542213884
logistic accuracy:0.7734521575984991
nb accuracy:0.7823639774859287
logistic overall accuracy: 0.774245330898619
bayes overall accuracy: 0.7880325607864972
