In [2]:
import nltk
from nltk import FreqDist
import re
import pandas as pd

df = pd.read_csv('movie_review_one.csv')
texts = df['Phrase'].str.lower().to_list()
labels = df['Sentiment'].astype(str).to_list()
test_train = df['label'].to_list()
word_tokens = texts


In [3]:
def print_eval_measures(gold, predicted):
    # get a list of labels
    labels = ["0","1","2","3","4"]
    # these lists have values for each label
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        print("Processing label :{}".format(lab))
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        recall = TP / (TP + FP)  if(TP!=0 and FP!=0) else 0
        precision = TP / (TP + FN) if(TP!=0 and FN!=0) else 0
        recall_list.append(recall)
        precision_list.append(precision)
        if( recall != 0 and precision !=0):
            F1_list.append( 2 * (recall * precision) / (recall + precision))
        else:
            F1_list.append(0)

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]),
              "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

    cm = nltk.ConfusionMatrix(gold, predicted)
    #print(cm.pretty_format(sort_by_count=False, truncate=9))
    # or show the results as percentages
    print(cm.pretty_format(sort_by_count=False,values_in_chart=True, show_percents=True, truncate=9))

In [4]:
def run_nb(featuresets, test_train):
    train = []
    test = []
    for t,f in  zip(test_train,featuresets):
        if(t == "train"):
            train.append(f)
        else:
            test.append(f)
    classifier = nltk.NaiveBayesClassifier.train(train)
    predicted=classifier.classify_many([fs for (fs, l) in test])
    gold =[]
    for f,l in test:
        gold.append(l)
    print("length predicted:{}, gold:{}".format(len(predicted),len(gold)))
    print_eval_measures(gold,predicted)

    correct = [l == r for  l, r in zip(gold, predicted)]
    accuracy = sum(correct) / len(correct)
    print("Accuracy: {}".format(accuracy))
    #print("Top 30 Features")
    #classifier.show_most_informative_features(30)
    return  classifier
    # evaluate the accuracy of the classifier
    #accuracy=nltk.classify.accuracy(classifier, test)
    #print("Accuracy: {}".format(accuracy))

    # the accuracy result may vary since we randomized the documents

    # show which features of classifier are most informative
def explain_model(classifier):
    classifier.show_most_informative_features(30)

#sen_tokens = [nltk.sent_tokenize(text) for text in texts]
#word_tokens = [nltk.word_tokenize(sent) for sent in sen_tokens]

#nltkstopwords = nltk.corpus.stopwords.words('english')
#morestopwords = ['ii','eh',"'",'?','*',"'ye",'ye','us','could','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve", "n't"]

#stopwords = nltkstopwords + morestopwords

In [5]:
def run_cross_validation_nb(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    print('Folds: {} , Each fold size:{}'.format(num_folds,subset_size))
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        predicted=classifier.classify_many([fs for (fs, l) in test_this_round])
        gold =[]
        for f,l in test_this_round:
            gold.append(l)
        print_eval_measures(predicted,gold)

        correct = [l == r for  l, r in zip(gold, predicted)]
        accuracy_this_round = sum(correct) / len(correct)
        print("Accuracy: {}".format(accuracy_this_round))
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [32]:
lancaster = nltk.LancasterStemmer()
def sentiment_features(word):
    #org_word=word
    word=lancaster.stem(word)
    length = len(word)
    features = {
                "c1":word[0],
                "c2":word[1]  if(length>1) else "",
                "c3":word[2]  if(length>2) else "" ,
                #"c4":word[3]  if(length>3) else "" ,
                #"c5":word[4]  if(length>4) else "" ,
                #"c6":word[5]  if(length>5) else "" ,
                #"c7":word[6]  if(length>6) else "" ,
                #"c8":word[7]  if(length>7) else "" ,
                #"c9":word[8]  if(length>8) else "" ,
                #"c10":word[9]  if(length>9) else "" ,
                #"bi1":word[0:1]  if(length>1) else "" ,
                #"bi2":word[1:2]  if(length>1) else "" ,
                #"m0":word[round(length/2)-1] if(length>7) else "",
                #"m1":word[round(length/2)] if(length>7) else "",
                #"m2":word[round(length/2)+1] if(length>7) else "",
                "l1":word[-1],
                "l2":word[-2]  if(length>2) else "",
                "l3":word[-3]  if(length>3) else "",
                "l4":word[-4]  if(length>4) else "",
                #"l5":word[-5]  if(length>5) else "",
                #"bil1":word[-1:],
                #"bil2":word[length-2:length-1],
                #"pos":nltk.pos_tag([word])[0][1],
                #"len":length
                }
    # tag = nltk.pos_tag([org_word])[0][1]
    # tag_type=""
    # if tag.startswith('N'): tag_type='Noun'
    # if tag.startswith('V'): tag_type='Verb'
    # if tag.startswith('J'): tag_type='Adj'
    # if tag.startswith('R'): tag_type='Adverb'
    # if tag.startswith('M'): tag_type='Modal'
    # if tag.startswith('D'): tag_type='Determiner'
    # if tag.startswith('TO'): tag_type='TO'
    # if tag.startswith('W'): tag_type='Wh'
    # if tag.startswith('UH'): tag_type='Interjection'
    # if tag.startswith('CC'): tag_type='Corconjunction'
    # if tag.startswith('CD'): tag_type='Card'
    # features['tag_type'] = tag_type
    # features['Noun'] = True if tag.startswith('N') else False
    # features['Verb'] = True if tag.startswith('V') else False
    # features['Adj'] = True if tag.startswith('J') else False
    # features['Adverb'] = True if tag.startswith('R') else False
    # features['Modal'] = True if tag.startswith('M') else False
    # features['Determiner'] = True if tag.startswith('D') else False
    # features['TO'] = True if tag.startswith('TO') else False
    # features['Interjections'] = True if tag.startswith('UH') else False
    # features['Wh'] = True if tag.startswith('W') else False
    # features['Interjection'] = True if tag.startswith('UH') else False
    # features['Corconjunction'] = True if tag.startswith('CC') else False
    # features['CardNum'] = True if tag.startswith('CD') else False
    return features


# get features sets for a document, including keyword features and category feature
featuresets_sen1 = [(sentiment_features(text), label) for text, label in zip(word_tokens, labels)]

print(featuresets_sen1[0])
nb_sen=run_nb(featuresets_sen1,test_train)
#explain_model(nb_uni)
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 5

#run_cross_validation_nb(num_folds, featuresets_sen1)

({'c1': 'p', 'c2': 'r', 'c3': 'o', 'l1': 'd', 'l2': 'e', 'l3': 's', 'l4': 's'}, '2')
length predicted:4960, gold:4960
Processing label :0
Processing label :1
Processing label :2
Processing label :3
Processing label :4
	Precision	Recall		F1
0 	      0.000      0.000      0.000
1 	      0.064      0.319      0.107
2 	      0.966      0.748      0.843
3 	      0.010      0.158      0.020
4 	      0.030      0.080      0.044
  |      0      1      2      3      4 |
--+------------------------------------+
0 |     <.>  0.1%   1.1%      .   0.0% |
1 |   0.0%  <0.7%> 10.7%   0.1%      . |
2 |   0.4%   1.2% <71.8%>  0.5%   0.4% |
3 |      .   0.2%  11.2%  <0.1%>  0.1% |
4 |      .   0.1%   1.2%   0.0%  <0.0%>|
--+------------------------------------+
(row = reference; col = test)

Accuracy: 0.7268145161290323


In [33]:

featuresets_sen_all = [(sentiment_features(text), label) for text, label in zip(word_tokens, labels)]

classifier = nltk.NaiveBayesClassifier.train(featuresets_sen_all)
predicted=classifier.classify_many([fs for (fs, l) in featuresets_sen_all])
gold =[]
for f,l in featuresets_sen_all:
    gold.append(l)
print("length predicted:{}, gold:{}".format(len(predicted),len(gold)))
print_eval_measures(gold,predicted)

correct = [l == r for  l, r in zip(gold, predicted)]
accuracy = sum(correct) / len(correct)
print("Accuracy: {}".format(accuracy))

length predicted:16531, gold:16531
Processing label :0
Processing label :1
Processing label :2
Processing label :3
Processing label :4
	Precision	Recall		F1
0 	      0.005      0.032      0.009
1 	      0.097      0.390      0.155
2 	      0.972      0.752      0.848
3 	      0.019      0.324      0.035
4 	      0.041      0.158      0.065
  |      0      1      2      3      4 |
--+------------------------------------+
0 |  <0.0%>  0.1%   1.1%   0.0%   0.0% |
1 |      .  <1.1%> 10.4%   0.1%   0.0% |
2 |   0.2%   1.4% <72.2%>  0.4%   0.2% |
3 |   0.0%   0.2%  11.1%  <0.2%>  0.1% |
4 |      .   0.0%   1.2%   0.0%  <0.1%>|
--+------------------------------------+
(row = reference; col = test)

Accuracy: 0.7356481761538927


In [164]:
import math
df_all = pd.read_csv('movie_review.csv')
texts_all = df_all['Phrase'].str.lower().to_list()
labels_all = df_all['Sentiment'].astype(str).to_list()
word_tokens_all = [nltk.word_tokenize(t) for t in texts_all]
featuresets_sen_all = [(sentiment_features(text), label) for text, label in zip(word_tokens, labels)]
classifier_all = nltk.NaiveBayesClassifier.train(featuresets_sen_all)
predicted=[]
gold= []
for word_token,label in zip(word_tokens_all,labels_all):
    if not len(word_token)==0 :
        predicted_text=classifier_all.classify_many([sentiment_features(w) for w in word_token])
        predicted_avg = math.ceil(sum([int(word) for word in predicted_text]) / len(word_token))
        predicted.append(str(predicted_avg))
        gold.append(label)

print_eval_measures(gold,predicted)

correct = [l == r for  l, r in zip(gold, predicted)]
accuracy = sum(correct) / len(correct)
print("Accuracy: {}".format(accuracy))

Processing label :0
Processing label :1
Processing label :2
Processing label :3
Processing label :4
	Precision	Recall		F1
0 	      0.003      0.185      0.006
1 	      0.026      0.232      0.046
2 	      0.943      0.517      0.667
3 	      0.072      0.320      0.118
4 	      0.004      0.165      0.007
  |      0      1      2      3      4 |
--+------------------------------------+
0 |  <0.0%>  0.1%   4.2%   0.2%   0.0% |
1 |   0.0%  <0.4%> 16.2%   0.8%   0.0% |
2 |   0.0%   1.1% <48.1%>  1.7%   0.1% |
3 |   0.0%   0.2%  19.3%  <1.5%>  0.0% |
4 |      .   0.0%   5.3%   0.5%  <0.0%>|
--+------------------------------------+
(row = reference; col = test)

Accuracy: 0.5010028258543243


In [None]:
featuresets_sen1 = [(sentiment_features(text), label) for text, label in zip(word_tokens, labels)]

print(featuresets_sen1[0])
nb_sen=run_nb(featuresets_sen1,test_train)
#explain_model(nb_uni)
# perform the cross-validation on the featuresets with word features and generate accuracy
num_folds = 5

#run_cross_validation_nb(num_folds, featuresets_sen1)