In [61]:
import nltk
from nltk import FreqDist
import re
import pandas as pd

# Load the labelled review sentences and lower-case the text before tokenising.
df = pd.read_csv('movie_review_fullsentence.csv')
texts = df['Phrase'].str.lower().to_list()
# Sentiment labels are kept as strings ("0".."4" is what print_eval_measures expects).
labels = df['Sentiment'].astype(str).to_list()
# One flag per row; run_nb treats the value "train" as training data and
# everything else as test data.
test_train = df['label'].to_list()
word_tokens = [nltk.word_tokenize(t) for t in texts]

# NOTE(review): the NLTK English stopword list is loaded but not used below;
# only the hand-written list is applied.
nltkstopwords = nltk.corpus.stopwords.words('english')
morestopwords = ['sha','wo','y',"'s","'d","'ll","'t","'m","'re",
                 "'ve",".",",",'``', '--','-lrb-', '-rrb-',"?",
                 "`",":","...","'","-",
                 'a', 'and', 'of', 'it', 'to', 'is', 'that', 'in', 'on',
                 'at', 'ar', 'has',"film","movie","the"]

stopwords = morestopwords
# Set membership avoids scanning the whole list once per token.
stopword_set = set(stopwords)
lancaster = nltk.LancasterStemmer()
# Filtering happens on the raw token, stemming afterwards.
# NOTE(review): because of that order, the entry 'ar' only removes the literal
# token "ar"; stems that come out as 'ar' are still kept.
stopped_word_tokens = [
    [lancaster.stem(w) for w in words if w not in stopword_set]
    for words in word_tokens
]

# From here on word_tokens means "filtered and stemmed tokens per sentence".
word_tokens = stopped_word_tokens


In [3]:
def print_eval_measures(gold, predicted):
    """Print per-label precision, recall and F1, then a percentage confusion matrix.

    Args:
        gold: sequence of label strings.
        predicted: sequence of label strings, indexed in parallel with ``gold``
            (it must be at least as long).

    The label set is fixed to "0".."4". Any ratio whose denominator is zero
    (for example a label that never occurs, or one with no true positives)
    is reported as 0.0 instead of raising ZeroDivisionError.

    NOTE(review): relative to this function's own parameter names,
    TP / (TP + FP) is conventionally precision and TP / (TP + FN) is recall,
    i.e. the two local names below are exchanged. The formulas are left as
    they were because the callers in this notebook invoke
    ``print_eval_measures(predicted, gold)`` with the arguments reversed, which
    cancels the exchange in the printed table. The same reversal means the
    confusion matrix receives the predictions as its reference argument.
    Correcting this needs the function and its callers changed together.
    """
    # get a list of labels
    labels = ["0","1","2","3","4"]
    # these lists have values for each label
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
        # guard every division: an absent or never-matched label yields 0.0
        recall = TP / (TP + FP) if (TP + FP) else 0.0
        precision = TP / (TP + FN) if (TP + FN) else 0.0
        recall_list.append(recall)
        precision_list.append(precision)
        denominator = recall + precision
        F1_list.append(2 * (recall * precision) / denominator if denominator else 0.0)

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]),
              "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

    cm = nltk.ConfusionMatrix(gold, predicted)
    # show the results as percentages
    print(cm.pretty_format(sort_by_count=False,values_in_chart=True, show_percents=True, truncate=9))

In [69]:
def run_nb(featuresets, test_train):
    """Train a Naive Bayes classifier on the "train" rows and evaluate on the rest.

    ``featuresets`` holds (feature_dict, label) pairs and ``test_train`` holds
    one flag per pair: pairs flagged "train" are used for training, every
    other pair is used for testing. Prints the evaluation table, the accuracy
    and the 30 most informative features, then returns the trained classifier.
    """
    train, test = [], []
    for flag, labelled in zip(test_train, featuresets):
        bucket = train if flag == "train" else test
        bucket.append(labelled)

    classifier = nltk.NaiveBayesClassifier.train(train)
    predicted = classifier.classify_many([feats for feats, _ in test])
    gold = [label for _, label in test]
    # predictions are passed first, matching how the notebook calls this helper
    print_eval_measures(predicted, gold)

    matches = [g == p for g, p in zip(gold, predicted)]
    accuracy = sum(matches) / len(matches)
    print("Accuracy: {}".format(accuracy))
    #print("Top 30 Features")
    classifier.show_most_informative_features(30)
    return classifier
    # evaluate the accuracy of the classifier
    #accuracy=nltk.classify.accuracy(classifier, test)
    #print("Accuracy: {}".format(accuracy))

    # the accuracy result may vary since we randomized the documents

    # show which features of classifier are most informative
def explain_model(classifier, n=30):
    """Print the ``n`` most informative features of ``classifier``.

    Args:
        classifier: any object exposing ``show_most_informative_features(n)``.
        n: how many features to show; defaults to 30, the value that used to
            be hard-coded.
    """
    classifier.show_most_informative_features(n)

#sen_tokens = [nltk.sent_tokenize(text) for text in texts]
#word_tokens = [nltk.word_tokenize(sent) for sent in sen_tokens]

#nltkstopwords = nltk.corpus.stopwords.words('english')
#morestopwords = ['ii','eh',"'",'?','*',"'ye",'ye','us','could','would','might','must','need','sha','wo','y',"'s","'d","'ll","'t","'m","'re","'ve", "n't"]

#stopwords = nltkstopwords + morestopwords

In [5]:
def run_cross_validation_nb(num_folds, featuresets):
    """Run k-fold cross-validation of a Naive Bayes classifier and print the scores.

    ``featuresets`` is sliced into ``num_folds`` consecutive folds of
    ``int(len(featuresets)/num_folds)`` items. Each fold is held out once
    while the classifier is trained on everything else; the evaluation table
    and accuracy are printed per fold, followed by the mean accuracy.
    """
    subset_size = int(len(featuresets)/num_folds)
    print('Folds: {} , Each fold size:{}'.format(num_folds,subset_size))
    fold_accuracies = []
    for fold in range(num_folds):
        start = fold * subset_size
        stop = start + subset_size
        held_out = featuresets[start:stop]
        training = featuresets[:start] + featuresets[stop:]

        # fit on the remaining folds, predict the held-out one
        classifier = nltk.NaiveBayesClassifier.train(training)
        predicted = classifier.classify_many([feats for feats, _ in held_out])
        gold = [label for _, label in held_out]
        print_eval_measures(predicted, gold)

        matches = [g == p for g, p in zip(gold, predicted)]
        fold_accuracy = sum(matches) / len(matches)
        print("Accuracy: {}".format(fold_accuracy))
        print(fold, fold_accuracy)
        fold_accuracies.append(fold_accuracy)
    # average over all folds
    print('mean accuracy', sum(fold_accuracies) / num_folds)

In [70]:

# flatten the per-sentence token lists into one corpus-wide token list
all_words_list = [word for text in word_tokens for word in text]
all_words = nltk.FreqDist(all_words_list)
# keep the 1000 most frequent tokens as the unigram vocabulary
word_items = all_words.most_common(1000)
word_features = [word for (word,count) in word_items]
# preview the first 50 vocabulary entries
print(word_features[:50])

def unigram_features(document, word_features):
    """Map every vocabulary word to a presence flag for one document.

    Returns a dict with one ``'F_<word>'`` key per entry of ``word_features``
    whose value is True when that word occurs in ``document``.
    """
    present = set(document)
    return {'F_{}'.format(term): (term in present) for term in word_features}

# build one (feature_dict, label) pair per sentence from the unigram vocabulary
featuresets_uni1 = [(unigram_features(text, word_features), label) for text, label in zip(word_tokens, labels)]

# train and evaluate using the split flags read from the CSV's `label` column
nb_uni=run_nb(featuresets_uni1,test_train)
#explain_model(nb_uni)
# fold count for the cross-validation call below (currently disabled)
num_folds = 5

#run_cross_validation_nb(num_folds, featuresets_uni1)

['as', 'but', 'with', 'for', 'thi', 'an', 'it', 'you', 'on', "n't", 'be', 'not', 'lik', 'mor', 'by', 'about', 'ar', 'al', 'hav', 'from', 'than', 'mak', 'act', 'his', 'ev', 'i', 'so', 'if', 'story', 'or', 'charact', 'what', 'most', 'ther', 'out', 'direct', 'real', 'who', 'tim', 'just', 'too', 'doe', 'into', 'com', 'up', 'wil', 'work', 'us', 'good', 'comedy']
	Precision	Recall		F1
0 	      0.379      0.283      0.324
1 	      0.425      0.459      0.441
2 	      0.294      0.276      0.285
3 	      0.434      0.483      0.457
4 	      0.428      0.409      0.418
  |      0      1      2      3      4 |
--+------------------------------------+
0 |  <3.6%>  3.1%   1.4%   1.0%   0.3% |
1 |   4.8% <11.9%>  6.2%   3.8%   1.3% |
2 |   2.0%   5.0%  <5.3%>  4.2%   1.5% |
3 |   1.4%   4.8%   5.1% <13.2%>  5.8% |
4 |   0.7%   1.1%   1.2%   5.2%  <6.1%>|
--+------------------------------------+
(row = reference; col = test)

Accuracy: 0.400390625
Most Informative Features
                   F_bad =

In [52]:

# rebuild the corpus-wide token list and its frequency distribution for this cell
all_words_list = [word for text in word_tokens for word in text]
all_words = nltk.FreqDist(all_words_list)
# get the 2000 most frequently appearing keywords in the corpus
# word_items_all = all_words.most_common(16000)
# word_features_all = [word for (word,count) in word_items_all]
# print(word_features_all[:50])
#
# def unigram_features(document, word_features):
#     document_words = set(document)
#     features = {}
#     for word in word_features:
#         features['F_{}'.format(word)] = (word in document_words)
#     return features
#
# # get features sets for a document, including keyword features and category feature
# featuresets_uni2 = [(unigram_features(text, word_features_all), label) for text, label in zip(word_tokens, labels)]
#
#nb_uni2=run_nb(featuresets_uni2,test_train)
#explain_model(nb_uni)
# convert raw counts to relative frequencies; this overwrites the counts
# stored in all_words in place, so all_words no longer holds counts afterwards
total = all_words.N()
for word in all_words:
    all_words[word] /= float(total)

import plotly.express as px
# NOTE(review): the name moby_df_norm looks like a leftover from another
# notebook; the frame holds this corpus's (token, relative frequency) rows
moby_df_norm=pd.DataFrame(all_words.items(), columns=['tokens', 'frequency'])
# bar chart of the 50 tokens with the highest relative frequency
fig=px.bar(moby_df_norm.nlargest(50,'frequency'),x='tokens',y='frequency', width=1000,
           title="Normalized Frequency Distribution of Tokens")
fig.show()
# fold count for the cross-validation call below (currently disabled)
num_folds = 5

#run_cross_validation_nb(num_folds, featuresets_uni1)

In [64]:
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

# rebuild the corpus-wide token list and frequency distribution
all_words_list = [word for text in word_tokens for word in text]
all_words = nltk.FreqDist(all_words_list)
# keep the 1000 most frequent tokens as the unigram vocabulary
word_items = all_words.most_common(1000)
word_features = [word for (word,count) in word_items]
# preview the first 50 corpus tokens
print(all_words_list[:50])
# create the bigram finder on all the words in sequence
# NOTE(review): the finder is fed the flattened corpus, so a pair can join the
# last token of one sentence with the first token of the next, and no
# frequency filter is applied before ranking
finder = BigramCollocationFinder.from_words(all_words_list)

# define the top 500 bigrams using the chi squared measure
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)
print(bigram_features[:50])


# define features that include words as before
# add the most frequent significant bigrams
# this function takes the list of words in a document as an argument and returns a feature dictionary
# it depends on the variables word_features and bigram_features
def bigram_document_features(document, word_features, bigram_features):
    """Build unigram and bigram presence features for one document.

    Args:
        document: list of tokens for one sentence.
        word_features: vocabulary words; each yields a ``'V_<word>'`` flag.
        bigram_features: (first, second) token pairs; each yields a
            ``'B_<first>_<second>'`` flag.

    Returns:
        dict mapping feature name to True/False.
    """
    document_words = set(document)
    # nltk.bigrams yields lazily; testing ``in`` against the generator consumes
    # it, so after the first miss every later bigram would be reported absent.
    # Materialise the pairs once so each membership test sees all of them.
    document_bigrams = set(nltk.bigrams(document))
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)
    return features

# use this function to create one (feature_dict, label) pair per sentence
bigram_featuresets = [(bigram_document_features(text, word_features, bigram_features), label) for text, label in zip(word_tokens, labels)]
# number of features for document 0
print(len(bigram_featuresets[0][0].keys()))

# features in document 0
print(bigram_featuresets[0][0])

# train and evaluate on the unigram + bigram features
nb_bi=run_nb(bigram_featuresets,test_train)
#explain_model(nb_uni)
# fold count for the cross-validation call below (currently disabled)
num_folds = 5

#run_cross_validation_nb(num_folds, bigram_featuresets)

['pumpkin', 'struts', 'about', 'with', 'cour', 'pin', 'it', 'huckst', 'lapel', 'whil', 'yellow', 'streak', 'mil', 'wid', 'dec', 'it', 'back', 'mak', 'no', 'mistak', 'iv', 'xtc', 'piano', 'teach', 'not', 'an', 'easy', 'i', 'was', 'hop', 'would', 'be', 'sleazy', 'fun', 'but', 'was', 'neith', 'word', 'com', 'mind', 'whil', 'watch', 'er', 'rohm', 'tribut', 'cour', 'scot', 'lady', 'paint', 'for']
[("'30s", "'40s"), ("'80s", 'hatosy'), ('10-course', 'banquet'), ('18-year-old', 'mistress'), ('1950', 'dor'), ('1959', 'godzill'), ('1986', 'harlem'), ('50-something', 'lovebird'), ('75-minute', 'sampl'), ('\\', '*'), ('a.e.w', 'mason'), ('abbot', 'ernest'), ('abd', 'malik'), ('abderrahm', 'sissako'), ('achero', 'mana'), ('acr', 'haut'), ('admin', 'compl'), ('adry', 'brody'), ('age-wis', 'caviezel'), ('alain', 'choquart'), ('alchem', 'transmogr'), ('all-night', 'tequil'), ('american-russ', 'armageddon'), ('ann', 'mougl'), ('arithmet', 'camera'), ('artless', 'sytl'), ('as-it', 'thinks-it-'), ('as-n

In [65]:
# rebuild the corpus-wide token list and frequency distribution
all_words_list = [word for text in word_tokens for word in text]
all_words = nltk.FreqDist(all_words_list)
# keep the 1000 most frequent tokens as the unigram vocabulary
word_items = all_words.most_common(1000)
word_features = [word for (word,count) in word_items]
# preview the first 50 corpus tokens
print(all_words_list[:50])
# create the bigram finder on all the words in sequence
# (relies on BigramCollocationFinder and bigram_measures from the bigram cell)
finder = BigramCollocationFinder.from_words(all_words_list)

# define the top 500 bigrams using the chi squared measure
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)
print(bigram_features[:50])

def POS_features(document, word_features, bigram_features):
    """Build unigram, bigram and coarse part-of-speech count features.

    Args:
        document: list of tokens for one sentence.
        word_features: vocabulary words; each yields a ``'contains(<word>)'`` flag.
        bigram_features: (first, second) token pairs; each yields a
            ``'B_<first>_<second>'`` flag.

    Returns:
        dict with the presence flags plus the integer counts ``'nouns'``,
        ``'verbs'``, ``'adjectives'`` and ``'adverbs'``, taken from tags that
        start with N, V, J and R respectively.
    """
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    # nltk.bigrams yields lazily; testing ``in`` against the generator consumes
    # it, so after the first miss every later bigram would be reported absent.
    document_bigrams = set(nltk.bigrams(document))
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

# define feature sets using this function
POS_featuresets = [(POS_features(text, word_features,bigram_features), label) for text, label in zip(word_tokens, labels)]

# number of features for document 0
print(len(POS_featuresets[0][0].keys()))

# the first sentence (raw lower-cased text; the features below were computed
# from the stopword-filtered, stemmed tokens of that sentence)
print(texts[0])
# the pos tag features for this sentence
print('num nouns', POS_featuresets[0][0]['nouns'])
print('num verbs', POS_featuresets[0][0]['verbs'])
print('num adjectives', POS_featuresets[0][0]['adjectives'])
print('num adverbs', POS_featuresets[0][0]['adverbs'])


# NOTE(review): this rebinds nb_bi, the name the bigram cell used for its classifier
nb_bi=run_nb(POS_featuresets,test_train)
#explain_model(nb_uni)
# fold count for the cross-validation call below (currently disabled)
num_folds = 5

#run_cross_validation_nb(num_folds, POS_featuresets)

['pumpkin', 'struts', 'about', 'with', 'cour', 'pin', 'it', 'huckst', 'lapel', 'whil', 'yellow', 'streak', 'mil', 'wid', 'dec', 'it', 'back', 'mak', 'no', 'mistak', 'iv', 'xtc', 'piano', 'teach', 'not', 'an', 'easy', 'i', 'was', 'hop', 'would', 'be', 'sleazy', 'fun', 'but', 'was', 'neith', 'word', 'com', 'mind', 'whil', 'watch', 'er', 'rohm', 'tribut', 'cour', 'scot', 'lady', 'paint', 'for']
[("'30s", "'40s"), ("'80s", 'hatosy'), ('10-course', 'banquet'), ('18-year-old', 'mistress'), ('1950', 'dor'), ('1959', 'godzill'), ('1986', 'harlem'), ('50-something', 'lovebird'), ('75-minute', 'sampl'), ('\\', '*'), ('a.e.w', 'mason'), ('abbot', 'ernest'), ('abd', 'malik'), ('abderrahm', 'sissako'), ('achero', 'mana'), ('acr', 'haut'), ('admin', 'compl'), ('adry', 'brody'), ('age-wis', 'caviezel'), ('alain', 'choquart'), ('alchem', 'transmogr'), ('all-night', 'tequil'), ('american-russ', 'armageddon'), ('ann', 'mougl'), ('arithmet', 'camera'), ('artless', 'sytl'), ('as-it', 'thinks-it-'), ('as-n

In [32]:
# nn_tagger=nltk.PerceptronTagger()
# taggedtextPerceptron = [nn_tagger.tag(t) for t in word_tokens]
# taggedtextPerceptron[0]

[('now', 'RB'),
 ('here', 'RB'),
 ("'s", 'VBZ'),
 ('a', 'DT'),
 ('sadistic', 'JJ'),
 ('bike', 'NN'),
 ('flick', 'NN'),
 ('that', 'WDT'),
 ('would', 'MD'),
 ('have', 'VB'),
 ('made', 'VBN'),
 ('vittorio', 'NN'),
 ('de', 'IN'),
 ('sica', 'FW'),
 ('proud', 'NN'),
 ('.', '.')]

In [67]:
# rebuild the corpus-wide token list and frequency distribution
all_words_list = [word for text in word_tokens for word in text]

all_words = nltk.FreqDist(all_words_list)
# keep the 1000 most frequent tokens as the unigram vocabulary
word_items = all_words.most_common(1000)
word_features = [word for (word,count) in word_items]
# preview the first 50 corpus tokens (no bigram finder is built in this cell)
print(all_words_list[:50])

def POS_features2(document, word_features):
    """Build unigram, bigram and extended part-of-speech count features.

    Args:
        document: list of tokens for one sentence.
        word_features: vocabulary words; each yields a ``'contains(<word>)'`` flag.

    The bigram list is not a parameter: the module-level ``bigram_features``
    must already be defined when this function is called.

    Returns:
        dict with the presence flags, one ``'B_<first>_<second>'`` flag per
        module-level bigram, and twelve integer tag counts keyed as listed in
        ``tag_prefix_by_feature`` below.
    """
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)

    # nltk.bigrams yields lazily; testing ``in`` against the generator consumes
    # it, so after the first miss every later bigram would be reported absent.
    document_bigrams = set(nltk.bigrams(document))
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)

    # feature name -> tag prefix that increments it; order fixes the key order
    # of the returned dict.
    # NOTE(review): 'numInterjections' and 'numInterjection' both count 'UH'
    # tags, so they always carry the same value. Both keys are kept so the
    # feature set stays identical for existing callers.
    tag_prefix_by_feature = [
        ('nouns', 'N'),
        ('verbs', 'V'),
        ('adjectives', 'J'),
        ('adverbs', 'R'),
        ('numModal', 'M'),
        ('numDeterminer', 'D'),
        ('numTO', 'TO'),
        ('numInterjections', 'UH'),
        ('numWh', 'W'),
        ('numInterjection', 'UH'),
        ('numCorconjunction', 'CC'),
        ('numCardNum', 'CD'),
    ]
    counts = {name: 0 for name, _ in tag_prefix_by_feature}
    for (word, tag) in tagged_words:
        for name, prefix in tag_prefix_by_feature:
            if tag.startswith(prefix):
                counts[name] += 1
    features.update(counts)
    return features

# define feature sets using this function
# (POS_features2 also reads the module-level bigram_features from an earlier cell)
POS_featuresets2 = [(POS_features2(text, word_features), label) for text, label in zip(word_tokens, labels)]

# number of features for document 0
print(len(POS_featuresets2[0][0].keys()))

# the (feature_dict, label) pair of the first sentence
print(POS_featuresets2[0])
# the pos tag features for this sentence
print('num nouns', POS_featuresets2[0][0]['nouns'])
print('num verbs', POS_featuresets2[0][0]['verbs'])
print('num adjectives', POS_featuresets2[0][0]['adjectives'])
print('num adverbs', POS_featuresets2[0][0]['adverbs'])


# train and evaluate on the extended POS-count features
nb_pos_2=run_nb(POS_featuresets2,test_train)
#explain_model(nb_uni)
# fold count for the cross-validation call below (currently disabled)
num_folds = 5

#run_cross_validation_nb(num_folds, POS_featuresets2)


['pumpkin', 'struts', 'about', 'with', 'cour', 'pin', 'it', 'huckst', 'lapel', 'whil', 'yellow', 'streak', 'mil', 'wid', 'dec', 'it', 'back', 'mak', 'no', 'mistak', 'iv', 'xtc', 'piano', 'teach', 'not', 'an', 'easy', 'i', 'was', 'hop', 'would', 'be', 'sleazy', 'fun', 'but', 'was', 'neith', 'word', 'com', 'mind', 'whil', 'watch', 'er', 'rohm', 'tribut', 'cour', 'scot', 'lady', 'paint', 'for']
1512
({'contains(as)': False, 'contains(but)': False, 'contains(with)': True, 'contains(for)': False, 'contains(thi)': False, 'contains(an)': False, 'contains(it)': True, 'contains(you)': False, 'contains(on)': False, "contains(n't)": False, 'contains(be)': False, 'contains(not)': False, 'contains(lik)': False, 'contains(mor)': False, 'contains(by)': False, 'contains(about)': True, 'contains(ar)': False, 'contains(al)': False, 'contains(hav)': False, 'contains(from)': False, 'contains(than)': False, 'contains(mak)': False, 'contains(act)': False, 'contains(his)': False, 'contains(ev)': False, 'cont

In [68]:
# rebuild the corpus-wide token list and frequency distribution
all_words_list = [word for text in word_tokens for word in text]

all_words = nltk.FreqDist(all_words_list)
# keep the 1000 most frequent tokens as the unigram vocabulary
word_items = all_words.most_common(1000)
word_features = [word for (word,count) in word_items]
# preview the first 50 corpus tokens
print(all_words_list[:50])


nn_tagger=nltk.PerceptronTagger()

# tag every sentence's (filtered, stemmed) tokens
# NOTE(review): despite the name, taggedtextStanford is produced by
# nltk.PerceptronTagger, not by a Stanford tagger
taggedtextStanford = [nn_tagger.tag(t) for t in word_tokens]
taggedtextStanford[0]
# keep only the tag of each (token, tag) pair, one list per sentence
tags_only=[]
for tag in taggedtextStanford:
    pos = [tup[1] for tup in tag]
    tags_only.append(pos)
tags_only[0]

# collect the distinct tags, most frequent first (at most 2500 are kept)
all_tags_list = [tag for tags in tags_only for tag in tags]
all_tags = nltk.FreqDist(all_tags_list)
tag_items = all_tags.most_common(2500)
tag_features = [word for (word,count) in tag_items]
# create the bigram finder on all the words in sequence

def POS_features3(document, tags, word_features,tag_features):
    """Build per-tag count, unigram and bigram features for one document.

    Args:
        document: list of tokens for one sentence.
        tags: the part-of-speech tags of that sentence, one per token.
        word_features: vocabulary words; each yields a ``'contains(<word>)'`` flag.
        tag_features: tag names; each yields a ``'count(<tag>)'`` integer
            (0 when the tag does not occur in ``tags``).

    The bigram list is not a parameter: the module-level ``bigram_features``
    must already be defined when this function is called.

    Returns:
        dict of tag counts, word presence flags and ``'B_<first>_<second>'``
        bigram presence flags, in that order.
    """
    document_words = set(document)
    tagged_words_dist = nltk.FreqDist(tags)
    features = {}
    for tag in tag_features:
        # tags that never occur in this sentence count as 0
        features['count({})'.format(tag)] = tagged_words_dist.get(tag, 0)

    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)

    # nltk.bigrams yields lazily; testing ``in`` against the generator consumes
    # it, so after the first miss every later bigram would be reported absent.
    document_bigrams = set(nltk.bigrams(document))
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)

    return features

# define feature sets using this function, pairing each sentence's tokens with its tags
# (POS_features3 also reads the module-level bigram_features from an earlier cell)
POS_featuresets3 = [(POS_features3(word,tag,word_features, tag_features), label) for word,tag, label in zip(word_tokens,tags_only, labels)]

# number of features for document 0
print(len(POS_featuresets3[0][0].keys()))

# the (feature_dict, label) pair of the first sentence
print(POS_featuresets3[0])

# train and evaluate on the per-tag count features
nb_pos_3=run_nb(POS_featuresets3,test_train)
#explain_model(nb_uni)
# fold count for the cross-validation call below (currently disabled)
num_folds = 5

# NOTE(review): the disabled call below names POS_featuresets2, not the
# POS_featuresets3 built in this cell
#run_cross_validation_nb(num_folds, POS_featuresets2)

['pumpkin', 'struts', 'about', 'with', 'cour', 'pin', 'it', 'huckst', 'lapel', 'whil', 'yellow', 'streak', 'mil', 'wid', 'dec', 'it', 'back', 'mak', 'no', 'mistak', 'iv', 'xtc', 'piano', 'teach', 'not', 'an', 'easy', 'i', 'was', 'hop', 'would', 'be', 'sleazy', 'fun', 'but', 'was', 'neith', 'word', 'com', 'mind', 'whil', 'watch', 'er', 'rohm', 'tribut', 'cour', 'scot', 'lady', 'paint', 'for']
1537
({'count(NN)': 8, 'count(JJ)': 2, 'count(IN)': 2, 'count(RB)': 0, 'count(VB)': 0, 'count(VBP)': 0, 'count(PRP)': 2, 'count(DT)': 0, 'count(NNS)': 1, 'count(VBD)': 1, 'count(CC)': 0, 'count(VBZ)': 0, 'count(MD)': 0, 'count(PRP$)': 0, 'count(VBN)': 0, 'count(RP)': 1, 'count(JJS)': 0, 'count(WP)': 0, 'count(CD)': 0, 'count(VBG)': 0, 'count(WRB)': 0, 'count(FW)': 0, 'count(JJR)': 0, 'count(WDT)': 0, 'count(RBS)': 0, 'count(NNP)': 0, 'count(:)': 0, 'count(.)': 0, 'count(RBR)': 0, 'count(PDT)': 0, 'count($)': 0, 'count(POS)': 0, 'count(TO)': 0, 'count(#)': 0, 'count(UH)': 0, 'count(EX)': 0, 'count(S

In [50]:
# look up the value stored under the key 'RB' in all_words
# NOTE(review): every visible cell builds all_words from word tokens, not POS
# tags, and this cell's execution count is lower than the cells above; the
# saved output presumably came from an earlier kernel state. Confirm on a
# fresh Restart & Run All.
print(all_words.get('RB'))

9649
