In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pickle
from collections import namedtuple

from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from wordcloud import WordCloud
from nltk.corpus import stopwords

from Corpora import MovieReviewCorpus
from Lexicon import SentimentLexicon
from Statistics import SignTest
from Classifiers import NaiveBayesText, SVMText
from Extensions import SVMDoc2Vec
from Constants import PUNCTUATION



In [3]:
# default font size for all matplotlib text in the figures produced below
plt.rc('font', size=20) 

In [4]:
# whether to use corpus pickles
# NOTE(review): in the visible part of the notebook only the commented-out
# corpus-loading cell reads this flag, so it currently has no effect.
use_pickles = False

In [5]:
# Folder that every fig.savefig(...) call in this notebook writes into.
plot_dir = 'plots'
# savefig does not create missing directories, so make sure the folder exists
# on a fresh checkout; exist_ok keeps re-running this cell harmless.
os.makedirs(plot_dir, exist_ok=True)

# Load Corpus

In [6]:
# # retrieve corpus
# corpus_pickle = "corpus.pkl"
# if use_pickles and os.path.isfile(corpus_pickle):
#     with open(corpus_pickle, 'rb') as f:
#         corpus = pickle.load(f)
# else:
#     corpus=MovieReviewCorpus(stemming=False)
#     if os.path.isfile(corpus_pickle):
#         os.remove(corpus_pickle)
#     with open(corpus_pickle, 'wb') as f:
#         pickle.dump(corpus, f)

In [7]:
# Build four views of the review corpus, all with stemming disabled:
#   corpus_tag       - default loader settings (the "tagged reviews" in the headings below)
#   corpus_txt       - use_txt=True (the "text reviews")
#   corpus_txt_upper - use_txt=True, lower_case=False (the "uppercase text reviews")
#   corpus_txt_token - use_txt=True, tokenise=True (the "tokenised text reviews")
# NOTE(review): the exact effect of each flag lives in Corpora.MovieReviewCorpus - confirm there.
corpus_tag=MovieReviewCorpus(stemming=False)
corpus_txt=MovieReviewCorpus(stemming=False,use_txt=True)
corpus_txt_upper=MovieReviewCorpus(stemming=False,use_txt=True,lower_case=False)
corpus_txt_token=MovieReviewCorpus(stemming=False,use_txt=True,tokenise=True)

Identified 1000 POS files to be processed
Identified 1000 NEG files to be processed
Processing POS files
Processing NEG files
Identified 1000 POS files to be processed
Identified 1000 NEG files to be processed
Processing POS files
Processing NEG files
Identified 1000 POS files to be processed
Identified 1000 NEG files to be processed
Processing POS files
Processing NEG files
Identified 1000 POS files to be processed
Identified 1000 NEG files to be processed
Processing POS files
Processing NEG files


# Sign Test

In [8]:
# use sign test for all significance testing
signTest=SignTest()

# Sentiment Lexicon

In [None]:
print("--- classifying reviews using sentiment lexicon  ---")

# read in lexicon
lexicon=SentimentLexicon()

In [None]:
# Tabular view of the lexicon: transpose so each entry becomes a row, then name
# the former index 'word' and the two positional fields 'magnitude' and 'polarity'.
lexicon_df = (
    pd.DataFrame(lexicon.lexicon)
    .T
    .reset_index()
    .rename(columns={'index': 'word', 0: 'magnitude', 1: 'polarity'})
)
lexicon_df

## Question 0.1

### Using the tagged reviews

In [None]:
# On average a review contains more positive than negative lexicon words
# (~7.13 more positive than negative per review).
# To compensate for that bias, use a threshold of roughly the bias itself so
# that a review is harder to classify as positive.
threshold=8

# token-only scoring: weak and strong entries carry the same weight
lexicon.classify(corpus_tag.reviews,threshold,magnitude=False,weak_polarity=1,strong_polarity=1)
# NOTE: token_preds / magnitude_preds are reassigned by each of the three lexicon
# cells (tagged, text, tokenised text); later cells see whichever one ran last.
token_preds=lexicon.predictions
print(f"token-only results: {lexicon.getAccuracy():.5f}")

# magnitude scoring: strong entries count twice as much as weak ones
lexicon.classify(corpus_tag.reviews,threshold,magnitude=True,weak_polarity=1,strong_polarity=2)
magnitude_preds=lexicon.predictions
print(f"magnitude results: {lexicon.getAccuracy():.5f}")

### Using the text reviews

In [None]:
# Same experiment as for the tagged reviews, run on the plain-text corpus.
# The threshold of 8 is reused to offset the positive-word bias
# (~7.13 more positive than negative lexicon words per review).
threshold=8

# token-only scoring: weak and strong entries carry the same weight
lexicon.classify(corpus_txt.reviews,threshold,magnitude=False,weak_polarity=1,strong_polarity=1)
# NOTE: overwrites the token_preds / magnitude_preds stored by the previous lexicon cell
token_preds=lexicon.predictions
print(f"token-only results: {lexicon.getAccuracy():.5f}")

# magnitude scoring: strong entries count twice as much as weak ones
lexicon.classify(corpus_txt.reviews,threshold,magnitude=True,weak_polarity=1,strong_polarity=2)
magnitude_preds=lexicon.predictions
print(f"magnitude results: {lexicon.getAccuracy():.5f}")

### Using the tokenised text reviews

In [None]:
# Same experiment again, run on the tokenised plain-text corpus.
# The threshold of 8 is reused to offset the positive-word bias
# (~7.13 more positive than negative lexicon words per review).
threshold=8

# token-only scoring: weak and strong entries carry the same weight
lexicon.classify(corpus_txt_token.reviews,threshold,magnitude=False,weak_polarity=1,strong_polarity=1)
# NOTE: when the notebook is run top to bottom, these are the token_preds /
# magnitude_preds that the sign test under "Question 0.2" compares.
token_preds=lexicon.predictions
print(f"token-only results: {lexicon.getAccuracy():.5f}")

# magnitude scoring: strong entries count twice as much as weak ones
lexicon.classify(corpus_txt_token.reviews,threshold,magnitude=True,weak_polarity=1,strong_polarity=2)
magnitude_preds=lexicon.predictions
print(f"magnitude results: {lexicon.getAccuracy():.5f}")

### Grid Searches

#### Polarity

In [None]:
# Sweep the decision threshold for the token-only (polarity) lexicon classifier.
lexicon_thresholds = np.arange(-20, 20)
lexicon_pol_gs_df = pd.DataFrame({'threshold': lexicon_thresholds})


def lexicon_pol_gs_func(x):
    """Classify the tagged reviews with row ``x``'s threshold and return the accuracy."""
    lexicon.classify(
        corpus_tag.reviews,
        x.threshold,
        magnitude=False,
        weak_polarity=1,
        strong_polarity=1,
    )
    return lexicon.getAccuracy()


lexicon_pol_gs_df['result'] = lexicon_pol_gs_df.apply(lexicon_pol_gs_func, axis=1)

In [None]:
lexicon_pol_gs_df.sort_values('result', ascending=False).head(3)

In [None]:
plt.plot(lexicon_pol_gs_df.threshold, lexicon_pol_gs_df.result)

#### Magnitude

In [None]:
# Grid search over threshold x (weak, strong) weight pairs for the magnitude classifier.
lexicon_thresholds = np.arange(20)
lexicon_polarities = np.array([
    [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [2, 5], [3, 5],
])
# the cross join yields one row per (threshold, weight pair) combination
lexicon_mag_gs_df = pd.merge(
    pd.DataFrame({'threshold': lexicon_thresholds}),
    pd.DataFrame(lexicon_polarities, columns=['weak_polarity', 'strong_polarity']),
    how='cross',
)


def lexicon_mag_gs_func(x):
    """Classify the tagged reviews with row ``x``'s settings and return the accuracy."""
    lexicon.classify(
        corpus_tag.reviews,
        x.threshold,
        magnitude=True,
        weak_polarity=x.weak_polarity,
        strong_polarity=x.strong_polarity,
    )
    return lexicon.getAccuracy()


lexicon_mag_gs_df['result'] = lexicon_mag_gs_df.apply(lexicon_mag_gs_func, axis=1)

In [None]:
lexicon_mag_gs_df.loc[lexicon_mag_gs_df['threshold'] == 8].sort_values('result', ascending=False).head(10)

In [None]:
lexicon_mag_gs_df.sort_values('result', ascending=False).head(10)

In [None]:
# Threshold sweep for the magnitude classifier with fixed weights weak=1, strong=2,
# over the same threshold range as the polarity sweep so both curves can share a plot.
lexicon_thresholds = np.arange(-20,20)
lexicon_mag_gs_1_2_df = pd.DataFrame(lexicon_thresholds, columns=['threshold'])
# Own name for this scorer: it used to be called lexicon_pol_gs_func, which silently
# replaced the polarity scorer defined earlier with one that has different behaviour.
def lexicon_mag_gs_1_2_func(x):
    """Classify the tagged reviews with row ``x``'s threshold (magnitude mode, weights 1/2) and return the accuracy."""
    lexicon.classify(corpus_tag.reviews,x.threshold,magnitude=True,weak_polarity=1,strong_polarity=2)
    return lexicon.getAccuracy()
lexicon_mag_gs_1_2_df['result'] = lexicon_mag_gs_1_2_df.apply(lexicon_mag_gs_1_2_func,axis=1)

In [None]:
# Accuracy as a function of threshold for both lexicon classifiers, saved to plot_dir.
fig, ax = plt.subplots(1, figsize=(10, 10))
ax.plot(lexicon_pol_gs_df.threshold, lexicon_pol_gs_df.result, label='Polarity')
ax.plot(lexicon_mag_gs_1_2_df.threshold, lexicon_mag_gs_1_2_df.result, label='Magnitude')
ax.set(xlabel='Threshold', ylabel='Prediction Accuracy')
ax.legend()
fig.savefig(
    os.path.join(plot_dir, 'lexicon_thresholds.jpeg'),
    pad_inches=0.2,
    bbox_inches='tight',
)

## Question 0.2

In [None]:
# Sign test between the token-only and magnitude lexicon predictions
# (both taken from the most recently executed lexicon classification cell).
p_value = signTest.getSignificance(token_preds, magnitude_preds)
if p_value < 0.05:
    significance = "significant"
else:
    significance = "not significant"
print(f"magnitude lexicon results are {significance} with respect to token-only")

# Naive Bayes

## Question 1.0

#### Using the tagged reviews

In [None]:
# Unsmoothed unigram Naive Bayes on the tagged corpus: fit on the train split,
# then report accuracy on the train split and on the held-out test split.
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_tag.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_tag.train, verbose=False)
print(f"Training accuracy without smoothing: {NB.getAccuracy():.5f}")
# presumably the number of documents on which both classes scored equally - confirm in Classifiers
print(f"Number of ties: {NB.ties}")
NB.test(corpus_tag.test, verbose=False)
print(f"Test Accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB without smoothing using 10-fold cross-evaluation ---")
# reuses the NB object (and its settings) created in the previous cell;
# the fold count is decided inside crossValidate
NB.crossValidate(corpus_tag)
# keep the predictions of this run so they can be compared with other systems later
nb_tag_non_smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

#### Using uppercase text reviews

In [None]:
# Unsmoothed unigram Naive Bayes on the case-preserving text corpus: fit on the
# train split, then report accuracy on the train and held-out test splits.
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt_upper.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_upper.train, verbose=False)
print(f"Training accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_upper.test, verbose=False)
print(f"Test Accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB without smoothing using 10-fold cross-evaluation ---")
# reuses the NB object (and its settings) created in the previous cell
NB.crossValidate(corpus_txt_upper)
# keep the predictions of this run so they can be compared with other systems later
nb_txt_upper_non_smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

#### Using the text reviews

In [None]:
# Unsmoothed unigram Naive Bayes on the plain-text corpus: fit on the train
# split, then report accuracy on the train and held-out test splits.
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt.train, verbose=False)
print(f"Training accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt.test, verbose=False)
print(f"Test Accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB without smoothing using 10-fold cross-evaluation ---")
# reuses the NB object (and its settings) created in the previous cell
NB.crossValidate(corpus_txt)
# keep the predictions of this run so they can be compared with other systems later
nb_txt_non_smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

#### Using the tokenised text reviews

In [None]:
# Unsmoothed unigram Naive Bayes on the tokenised text corpus: fit on the train
# split, then report accuracy on the train and held-out test splits.
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt_token.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token.train, verbose=False)
print(f"Training accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token.test, verbose=False)
print(f"Test Accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB without smoothing using 10-fold cross-evaluation ---")
# reuses the NB object (and its settings) created in the previous cell
NB.crossValidate(corpus_txt_token)
# baseline predictions; the lexicon-vocabulary sign test further down compares against these
nb_txt_token_non_smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

### Document and Word Probabilities

In [None]:
# Fresh unsmoothed model fitted on the tagged training split; its prior and
# conditional probabilities are inspected in the cells that follow.
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_tag.train)

#### Document Prior

In [None]:
NB.prior

#### Word Probabilities

##### Tags

In [None]:
# Same construction and training call as the cell under "Document and Word Probabilities";
# repeated here so this subsection can be run on its own.
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_tag.train)

In [None]:
# Conditional probabilities as a table with one row per vocabulary entry.
# NOTE(review): the rename only produces a 'word' column if reset_index() emits a
# 'level_0' column (i.e. NB.condProb is keyed so that the frame gets an unnamed
# MultiIndex) - confirm against NaiveBayesText.condProb.
nb_word_freq = pd.DataFrame(NB.condProb).reset_index().rename(columns={'level_0': 'word'})
# nb_word_freq

In [None]:
# nb_word_freq.sort_values('POS', ascending=False).head(5)

In [None]:
# nb_word_freq.sort_values('NEG', ascending=False).head(5)

In [None]:
def cloud_title(x):
    """Return the human-readable class name used in word-cloud titles.

    The label is matched case-insensitively, so "POS", "Pos" and "pos" all map
    to "Positive". Every other value, including non-string input, maps to
    "Negative", exactly as the original exact-match version did.
    """
    # isinstance guard keeps the old "anything else is Negative" behaviour for non-strings
    return "Positive" if isinstance(x, str) and x.upper() == "POS" else "Negative"

In [None]:
# Side-by-side word clouds (one per class) weighted by the values in NB.condProb
# for the model trained on the tagged reviews.
fig, ax = plt.subplots(1, 2, figsize=(20, 10))

for i, sent in enumerate(['POS', 'NEG']):
    # highest-weighted words of this class first
    nb_word_freq_top = nb_word_freq.sort_values(sent, ascending=False)
    # word -> weight mapping consumed by WordCloud
    nb_word_freq_top_dict = nb_word_freq_top.set_index('word')[sent].to_dict()
    wordcloud = WordCloud(
        background_color='white',
        collocations=False,
    ).generate_from_frequencies(nb_word_freq_top_dict)
    panel = ax[i]
    panel.imshow(wordcloud)
    panel.set_title(f'Class: {cloud_title(sent)} Reviews')
    panel.set_xticks([0])
    panel.set_xticklabels([])
    panel.set_yticks([])
fig.savefig(
    os.path.join(plot_dir, 'nb_tag_frequency_wordcloud.jpeg'),
    pad_inches=0.2,
    bbox_inches='tight',
)

##### Text

In [None]:
# Refit the unsmoothed model on the plain-text training split and rebuild the
# per-word table. This overwrites the nb_word_freq built from the tagged corpus.
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt.train)
nb_word_freq = pd.DataFrame(NB.condProb).reset_index().rename(columns={'level_0': 'word'})

In [None]:
# Side-by-side word clouds (one per class) weighted by the values in NB.condProb
# for the model trained on the plain-text reviews.
fig, ax = plt.subplots(1, 2, figsize=(20,10))

for i, sent in enumerate(['POS', 'NEG']):
    # highest-weighted words of this class first
    nb_word_freq_top = nb_word_freq.sort_values(sent, ascending=False)
    # word -> weight mapping consumed by WordCloud
    nb_word_freq_top_dict = nb_word_freq_top[['word', sent]].set_index('word').to_dict()[sent]
    wordcloud = WordCloud(background_color='white', collocations=False).generate_from_frequencies(nb_word_freq_top_dict)
    ax[i].imshow(wordcloud)
    # Pass the class label unchanged: sent.title() turned 'POS' into 'Pos', which
    # used to fail the == "POS" comparison and titled BOTH panels "Negative".
    ax[i].set_title(f'Class: {cloud_title(sent)} Reviews')
    ax[i].set_xticks([0])
    ax[i].set_xticklabels([])
    ax[i].set_yticks([])
fig.savefig(os.path.join(plot_dir, 'nb_txt_frequency_wordcloud.jpeg'), pad_inches=0.2, bbox_inches='tight')

##### Tokenised Text

In [None]:
# Refit the unsmoothed model on the tokenised text training split and rebuild the
# per-word table. From here on nb_word_freq refers to the tokenised-text vocabulary,
# which the punctuation / stop-word / lexicon filters below start from.
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt_token.train)
nb_word_freq = pd.DataFrame(NB.condProb).reset_index().rename(columns={'level_0': 'word'})

In [None]:
# Side-by-side word clouds (one per class) weighted by the values in NB.condProb
# for the model trained on the tokenised text reviews.
fig, ax = plt.subplots(1, 2, figsize=(20,10))

for i, sent in enumerate(['POS', 'NEG']):
    # highest-weighted words of this class first
    nb_word_freq_top = nb_word_freq.sort_values(sent, ascending=False)
    # word -> weight mapping consumed by WordCloud
    nb_word_freq_top_dict = nb_word_freq_top[['word', sent]].set_index('word').to_dict()[sent]
    wordcloud = WordCloud(background_color='white', collocations=False).generate_from_frequencies(nb_word_freq_top_dict)
    ax[i].imshow(wordcloud)
    # Pass the class label unchanged: sent.title() turned 'POS' into 'Pos', which
    # used to fail the == "POS" comparison and titled BOTH panels "Negative".
    ax[i].set_title(f'Class: {cloud_title(sent)} Reviews')
    ax[i].set_xticks([0])
    ax[i].set_xticklabels([])
    ax[i].set_yticks([])
fig.savefig(os.path.join(plot_dir, 'nb_txt_token_frequency_wordcloud.jpeg'), pad_inches=0.2, bbox_inches='tight')

### Exclude punctuation

In [None]:
# Drop every row whose word is one of the PUNCTUATION entries.
# nb_word_freq is whatever the most recent cell assigned - the tokenised-text
# table when the notebook is run top to bottom.
nb_word_freq_punct = nb_word_freq.loc[~nb_word_freq['word'].isin(PUNCTUATION)]
nb_word_freq_punct

In [None]:
nb_word_freq_punct.sort_values('POS', ascending=False).head(5)

In [None]:
nb_word_freq_punct.sort_values('NEG', ascending=False).head(5)

In [None]:
# Side-by-side word clouds (one per class) after removing punctuation entries.
fig, ax = plt.subplots(1, 2, figsize=(20,10))

for i, sent in enumerate(['POS', 'NEG']):
    # highest-weighted words of this class first
    nb_word_freq_top = nb_word_freq_punct.sort_values(sent, ascending=False)
    # word -> weight mapping consumed by WordCloud
    nb_word_freq_top_dict = nb_word_freq_top[['word', sent]].set_index('word').to_dict()[sent]
    wordcloud = WordCloud(background_color='white', collocations=False).generate_from_frequencies(nb_word_freq_top_dict)
    ax[i].imshow(wordcloud)
    # Pass the class label unchanged: sent.title() turned 'POS' into 'Pos', which
    # used to fail the == "POS" comparison and titled BOTH panels "Negative".
    ax[i].set_title(f'Class: {cloud_title(sent)} Reviews')
    ax[i].set_xticks([0])
    ax[i].set_xticklabels([])
    ax[i].set_yticks([])
fig.savefig(os.path.join(plot_dir, 'nb_frequency_punct_wordcloud.jpeg'), pad_inches=0.2, bbox_inches='tight')

In [None]:
# Tokenised text corpus restricted to the punctuation-free vocabulary.
# tokenise=True was missing here although the allowed vocabulary comes from the
# tokenised-text model and the sibling filtered corpora (stop words, lexicon) all
# set it; without it the corpus is not the "token" variant its name promises.
corpus_txt_token_punct=MovieReviewCorpus(stemming=False,use_txt=True,tokenise=True,allowed_vocab=set(nb_word_freq_punct['word'].unique()))

In [None]:
# Unsmoothed unigram Naive Bayes on the punctuation-filtered corpus: fit on the
# train split, then report accuracy on the train and held-out test splits.
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt_token_punct.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token_punct.train, verbose=False)
print(f"Training accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token_punct.test, verbose=False)
print(f"Test Accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB without smoothing using 10-fold cross-evaluation ---")
# reuses the NB object (and its settings) created in the previous cell
NB.crossValidate(corpus_txt_token_punct)
# keep the predictions of this run so they can be compared with other systems later
nb_txt_token_punct_non_smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

### Exclude stopwords

In [None]:
# Fetch the NLTK stop-word list used by the filtering cells below.
# (nltk is imported together with the other dependencies in the import cell at the top.)
nltk.download('stopwords')

In [None]:
# Keep only the vocabulary rows whose word is not an English stop word.
nb_word_freq_stop = nb_word_freq[
    ~nb_word_freq['word'].isin(stopwords.words('english'))
]

In [None]:
nb_word_freq_stop.sort_values('POS', ascending=False).head(5)

In [None]:
nb_word_freq_stop.sort_values('NEG', ascending=False).head(5)

In [None]:
# Side-by-side word clouds (one per class) after removing English stop words.
fig, ax = plt.subplots(1, 2, figsize=(20,10))

for i, sent in enumerate(['POS', 'NEG']):
    # highest-weighted words of this class first
    nb_word_freq_top = nb_word_freq_stop.sort_values(sent, ascending=False)
    # word -> weight mapping consumed by WordCloud
    nb_word_freq_top_dict = nb_word_freq_top[['word', sent]].set_index('word').to_dict()[sent]
    wordcloud = WordCloud(background_color='white', collocations=False).generate_from_frequencies(nb_word_freq_top_dict)
    ax[i].imshow(wordcloud)
    # Pass the class label unchanged: sent.title() turned 'POS' into 'Pos', which
    # used to fail the == "POS" comparison and titled BOTH panels "Negative".
    ax[i].set_title(f'Class: {cloud_title(sent)} Reviews')
    ax[i].set_xticks([0])
    ax[i].set_xticklabels([])
    ax[i].set_yticks([])
fig.savefig(os.path.join(plot_dir, 'nb_frequency_stop_wordcloud.jpeg'), pad_inches=0.2, bbox_inches='tight')

In [None]:
# Tokenised text corpus restricted (via allowed_vocab) to the words that survived
# the stop-word filter above.
corpus_txt_token_stop=MovieReviewCorpus(stemming=False,use_txt=True,tokenise=True,allowed_vocab=set(nb_word_freq_stop['word'].unique()))

In [None]:
# Unsmoothed unigram Naive Bayes on the stop-word-filtered corpus: fit on the
# train split, then report accuracy on the train and held-out test splits.
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt_token_stop.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token_stop.train, verbose=False)
print(f"Training accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token_stop.test, verbose=False)
print(f"Test Accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB without smoothing using 10-fold cross-evaluation ---")
# reuses the NB object (and its settings) created in the previous cell
NB.crossValidate(corpus_txt_token_stop)
# keep the predictions of this run so they can be compared with other systems later
nb_txt_token_stop_non_smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

### Exclude stopwords and punctuation

In [None]:
# Keep only the vocabulary rows whose word is neither an English stop word nor punctuation.
nb_word_freq_stop_punct = nb_word_freq[
    ~nb_word_freq['word'].isin(stopwords.words('english'))
    & ~nb_word_freq['word'].isin(PUNCTUATION)
]

In [None]:
nb_word_freq_stop_punct.sort_values('POS', ascending=False).head(5)

In [None]:
nb_word_freq_stop_punct.sort_values('NEG', ascending=False).head(5)

In [None]:
# Side-by-side word clouds (one per class) after removing stop words and punctuation.
fig, ax = plt.subplots(1, 2, figsize=(20,10))

for i, sent in enumerate(['POS', 'NEG']):
    # highest-weighted words of this class first
    nb_word_freq_top = nb_word_freq_stop_punct.sort_values(sent, ascending=False)
    # word -> weight mapping consumed by WordCloud
    nb_word_freq_top_dict = nb_word_freq_top[['word', sent]].set_index('word').to_dict()[sent]
    wordcloud = WordCloud(background_color='white', collocations=False).generate_from_frequencies(nb_word_freq_top_dict)
    ax[i].imshow(wordcloud)
    # Pass the class label unchanged: sent.title() turned 'POS' into 'Pos', which
    # used to fail the == "POS" comparison and titled BOTH panels "Negative".
    ax[i].set_title(f'Class: {cloud_title(sent)} Reviews')
    ax[i].set_xticks([0])
    ax[i].set_xticklabels([])
    ax[i].set_yticks([])
fig.savefig(os.path.join(plot_dir, 'nb_frequency_stop_punct_wordcloud.jpeg'), pad_inches=0.2, bbox_inches='tight')

In [None]:
# Tokenised text corpus restricted (via allowed_vocab) to the words that survived
# both the stop-word and the punctuation filter above.
corpus_txt_token_stop_punct=MovieReviewCorpus(stemming=False,use_txt=True,tokenise=True,allowed_vocab=set(nb_word_freq_stop_punct['word'].unique()))

In [None]:
# Unsmoothed unigram Naive Bayes on the stop-word- and punctuation-filtered corpus:
# fit on the train split, then report accuracy on the train and held-out test splits.
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt_token_stop_punct.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token_stop_punct.train, verbose=False)
print(f"Training accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token_stop_punct.test, verbose=False)
print(f"Test Accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB without smoothing using 10-fold cross-evaluation ---")
# reuses the NB object (and its settings) created in the previous cell
NB.crossValidate(corpus_txt_token_stop_punct)
# keep the predictions of this run so they can be compared with other systems later
nb_txt_token_stop_punct_non_smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

### Exclude words not in Lexicon

In [None]:
# Attach the Naive Bayes per-word values to every lexicon entry. The join is a
# left join from the lexicon, so lexicon words that never occur in nb_word_freq
# are kept with NaN in the columns coming from nb_word_freq.
nb_word_freq_lexicon = lexicon_df.set_index('word').join(nb_word_freq.set_index('word'), how='left').reset_index()
nb_word_freq_lexicon

In [None]:
nb_word_freq_lexicon.loc[nb_word_freq_lexicon['polarity'] == 'positive'].sort_values('POS', ascending=False).head(5)

In [None]:
nb_word_freq_lexicon.loc[nb_word_freq_lexicon['polarity'] == 'negative'].sort_values('NEG', ascending=False).head(5)

In [None]:
# Side-by-side word clouds restricted to lexicon words: positive-polarity entries
# weighted by their POS value, negative-polarity entries by their NEG value.
fig, ax = plt.subplots(1, 2, figsize=(20,10))

# each item is (Naive Bayes class column, lexicon polarity label)
for i, sent in enumerate([('POS', 'positive'), ('NEG', 'negative')]):
    nb_word_freq_top = nb_word_freq_lexicon.loc[nb_word_freq_lexicon['polarity'] == sent[1]].sort_values(sent[0], ascending=False)
    # The left join leaves NaN for lexicon words absent from the corpus vocabulary;
    # they carry no weight to draw, so drop them before handing the mapping to WordCloud.
    nb_word_freq_top = nb_word_freq_top.dropna(subset=[sent[0]])
    nb_word_freq_top_dict = nb_word_freq_top[['word', sent[0]]].set_index('word').to_dict()[sent[0]]
    wordcloud = WordCloud(background_color='white', collocations=False).generate_from_frequencies(nb_word_freq_top_dict)
    ax[i].imshow(wordcloud)
    # Pass the class label, not the whole (label, polarity) tuple: the tuple never
    # compared equal to "POS", so BOTH panels used to be titled "Negative".
    ax[i].set_title(f'Class: {cloud_title(sent[0])} Reviews')
    ax[i].set_xticks([0])
    ax[i].set_xticklabels([])
    ax[i].set_yticks([])
fig.savefig(os.path.join(plot_dir, 'nb_frequency_lexicon_wordcloud.jpeg'), pad_inches=0.2, bbox_inches='tight')

In [None]:
# Tokenised text corpus restricted (via allowed_vocab) to words that appear in the
# sentiment lexicon, regardless of their polarity.
corpus_txt_token_lexicon=MovieReviewCorpus(stemming=False,use_txt=True,tokenise=True,allowed_vocab=set(lexicon_df['word'].unique()))

In [None]:
# Unsmoothed unigram Naive Bayes on the lexicon-restricted corpus: fit on the
# train split, then report accuracy on the train and held-out test splits.
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt_token_lexicon.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token_lexicon.train, verbose=False)
print(f"Training accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token_lexicon.test, verbose=False)
print(f"Test Accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB without smoothing using 10-fold cross-evaluation ---")
# reuses the NB object (and its settings) created in the previous cell
NB.crossValidate(corpus_txt_token_lexicon)
# predictions compared against the unrestricted tokenised run in the sign test below
nb_txt_token_lexicon_non_smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

In [None]:
# see if limiting the vocabulary to the lexicon significantly improves results
p_value=signTest.getSignificance(nb_txt_token_non_smoothed_preds,nb_txt_token_lexicon_non_smoothed_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results limiting the vocabulary to the lexicon are {significance} with respect to not doing so")

### Only allow words that are in the lexicon with positive or negative polarity

In [None]:
# Tokenised text corpus restricted (via allowed_vocab) to lexicon words whose
# polarity is 'negative' or 'positive'; entries with any other polarity value are excluded.
corpus_txt_token_lexicon_pol=MovieReviewCorpus(stemming=False,use_txt=True,tokenise=True,allowed_vocab=set(lexicon_df.loc[lexicon_df['polarity'].isin(['negative', 'positive'])]['word'].unique()))

In [None]:
# Unsmoothed unigram Naive Bayes on the positive/negative-lexicon corpus: fit on the
# train split, then report accuracy on the train and held-out test splits.
print("--- classifying reviews using Naive Bayes on held-out test set ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt_token_lexicon_pol.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token_lexicon_pol.train, verbose=False)
print(f"Training accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token_lexicon_pol.test, verbose=False)
print(f"Test Accuracy without smoothing: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB without smoothing using 10-fold cross-evaluation ---")
# reuses the NB object (and its settings) created in the previous cell
NB.crossValidate(corpus_txt_token_lexicon_pol)
# keep the predictions of this run so they can be compared with other systems later
nb_txt_token_lexicon_pol_non_smoothed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

## Question 2.0

#### Using the tagged reviews

In [None]:
# Smoothed unigram Naive Bayes on the tagged corpus.
# laplacian_k is forwarded to the classifier as its smoothing constant and is
# also echoed in the printed messages.
laplacian_k = 1
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
NB.train(corpus_tag.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_tag.train, verbose=False)
print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_tag.test, verbose=False)
print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB and smoothing using 10-fold cross-evaluation ---")
# reuses the smoothed NB object and the laplacian_k value from the previous cell
NB.crossValidate(corpus_tag)
print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

#### Using uppercase text reviews

In [None]:
# Smoothed unigram Naive Bayes on the case-preserving text corpus.
# laplacian_k is forwarded to the classifier as its smoothing constant and is
# also echoed in the printed messages.
laplacian_k = 1
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
NB.train(corpus_txt_upper.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_upper.train, verbose=False)
print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_upper.test, verbose=False)
print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB and smoothing using 10-fold cross-evaluation ---")
# reuses the smoothed NB object and the laplacian_k value from the previous cell
NB.crossValidate(corpus_txt_upper)
print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

#### Using the text reviews

In [None]:
# Smoothed unigram Naive Bayes on the plain-text corpus.
# laplacian_k is forwarded to the classifier as its smoothing constant and is
# also echoed in the printed messages.
laplacian_k = 1
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
NB.train(corpus_txt.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt.train, verbose=False)
print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt.test, verbose=False)
print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB and smoothing using 10-fold cross-evaluation ---")
# reuses the smoothed NB object and the laplacian_k value from the previous cell
NB.crossValidate(corpus_txt)
print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

#### Using the tokenised text reviews

In [None]:
# Smoothed unigram Naive Bayes on the tokenised text corpus.
# laplacian_k is forwarded to the classifier as its smoothing constant and is
# also echoed in the printed messages.
laplacian_k = 1
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
NB.train(corpus_txt_token.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token.train, verbose=False)
print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token.test, verbose=False)
print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB and smoothing using 10-fold cross-evaluation ---")
# reuses the smoothed NB object and the laplacian_k value from the previous cell
NB.crossValidate(corpus_txt_token)
# the only smoothed run whose cross-validation predictions are kept for later comparison
nb_smoothed_txt_token_preds = NB.predictions
print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

#### Using the tokenised stop word text reviews

In [None]:
# Smoothed unigram Naive Bayes on the stop-word-filtered tokenised corpus.
# laplacian_k is forwarded to the classifier as its smoothing constant and is
# also echoed in the printed messages.
laplacian_k = 1
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
NB.train(corpus_txt_token_stop.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token_stop.train, verbose=False)
print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token_stop.test, verbose=False)
print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB and smoothing using 10-fold cross-evaluation ---")
# reuses the smoothed NB object and the laplacian_k value from the previous cell
NB.crossValidate(corpus_txt_token_stop)
print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

#### Using the tokenised stop punct text reviews

In [None]:
# Smoothed unigram Naive Bayes on the stop-word- and punctuation-filtered tokenised corpus.
# laplacian_k is forwarded to the classifier as its smoothing constant and is
# also echoed in the printed messages.
laplacian_k = 1
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
NB.train(corpus_txt_token_stop_punct.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token_stop_punct.train, verbose=False)
print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token_stop_punct.test, verbose=False)
print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB and smoothing using 10-fold cross-evaluation ---")
# reuses the smoothed NB object and the laplacian_k value from the previous cell
NB.crossValidate(corpus_txt_token_stop_punct)
print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

#### Using the tokenised text lexicon reviews

In [None]:
# Smoothed unigram Naive Bayes on the lexicon-restricted tokenised corpus.
# laplacian_k is forwarded to the classifier as its smoothing constant and is
# also echoed in the printed messages.
laplacian_k = 1
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
NB.train(corpus_txt_token_lexicon.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token_lexicon.train, verbose=False)
print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
# Evaluate on the test split of the SAME lexicon-restricted corpus the model was
# trained on. This used to test on corpus_txt_token.test (the unrestricted corpus),
# unlike the unsmoothed counterpart of this experiment, so the train and test
# numbers were not measured on comparable data.
NB.test(corpus_txt_token_lexicon.test, verbose=False)
print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB and smoothing using 10-fold cross-evaluation ---")
# reuses the smoothed NB object and the laplacian_k value from the previous cell
NB.crossValidate(corpus_txt_token_lexicon)
print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

#### Using the tokenised pos/neg text lexicon reviews

In [None]:
# Smoothed unigram Naive Bayes on the positive/negative-lexicon tokenised corpus.
# laplacian_k is forwarded to the classifier as its smoothing constant and is
# also echoed in the printed messages.
laplacian_k = 1
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
NB.train(corpus_txt_token_lexicon_pol.train)
# getAccuracy() and ties are read right after each test() call, before the next one runs
NB.test(corpus_txt_token_lexicon_pol.train, verbose=False)
print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
# Evaluate on the test split of the SAME polarity-restricted corpus the model was
# trained on. This used to test on corpus_txt_token.test (the unrestricted corpus),
# unlike the unsmoothed counterpart of this experiment, so the train and test
# numbers were not measured on comparable data.
NB.test(corpus_txt_token_lexicon_pol.test, verbose=False)
print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB and smoothing using 10-fold cross-evaluation ---")
# reuses the smoothed NB object and the laplacian_k value from the previous cell
NB.crossValidate(corpus_txt_token_lexicon_pol)
print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

## Question 3.0
The cross-validation runs for this question were moved up into Question 2.0 so that their predictions could be reused for Q2.1.

In [None]:
# print("--- classifying reviews with NB and smoothing using 10-fold cross-evaluation ---")
# # using previous instantiated object
# NB.crossValidate(corpus)
# print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

### Smoothing Grid Search

In [None]:
def nb_extract_frequencies(cond_prod):
    """Flatten a conditional-probability table into three aligned arrays.

    ``cond_prod`` is handed straight to ``pd.DataFrame``. The resulting frame
    must provide ``POS`` and ``NEG`` columns, and ``reset_index`` must yield a
    ``level_0`` column, which is renamed to ``word``.
    NOTE(review): presumably the index is built from the n-gram keys of
    ``NaiveBayesText.condProb`` -- confirm against the classifier.

    Returns:
        A ``(words, pos, neg)`` tuple of NumPy arrays, all ordered by ``word``.
    """
    table = pd.DataFrame(cond_prod).reset_index()
    table = table.rename(columns={'level_0': 'word'})
    table = table.sort_values('word')
    return tuple(np.array(table[column]) for column in ('word', 'POS', 'NEG'))

In [None]:
# Sweep the Laplace smoothing constant and record the accuracy on the held-out split for each value.
# NOTE(review): the sweep is scored on corpus_txt_token.test, so any value picked from it is tuned
# on the same split that is later reported as "test" accuracy.
nb_laplace_smoothing_vals_large = np.arange(0.1,10,0.1)
nb_laplace_smoothing_test_accuracies = np.zeros_like(nb_laplace_smoothing_vals_large)
# Wrap the values themselves (not the enumerate object) so tqdm knows the total and can show a real progress bar.
for i, laplace_smoothing in enumerate(tqdm(nb_laplace_smoothing_vals_large)):
    NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False,laplacian_k=laplace_smoothing)
    NB.train(corpus_txt_token.train)
    NB.test(corpus_txt_token.test, verbose=False)
    # Only the accuracy is needed here; the per-word frequencies are extracted in a later cell.
    nb_laplace_smoothing_test_accuracies[i] = NB.getAccuracy()
nb_laplace_smoothing_test_accuracies

In [None]:
nb_laplace_smoothing_vals_large[np.argmax(nb_laplace_smoothing_test_accuracies)]

In [None]:
# Collect the per-word POS/NEG probabilities for a handful of smoothing constants
# (0 means smoothing is switched off, i.e. the maximum-likelihood estimate).
nb_laplace_smoothing_vals = [0, 0.1, 1, 10]
nb_laplace_smoothing_pos_rows = []
nb_laplace_smoothing_neg_rows = []
for laplace_smoothing in tqdm(nb_laplace_smoothing_vals):
    NB=NaiveBayesText(smoothing=laplace_smoothing > 0,bigrams=False,trigrams=False,discard_closed_class=False,laplacian_k=laplace_smoothing)
    NB.train(corpus_txt_token.train)
    NB.test(corpus_txt_token.test, verbose=False)
    words, pos_freq, neg_freq = nb_extract_frequencies(NB.condProb)

    nb_laplace_smoothing_pos_rows.append(pos_freq)
    nb_laplace_smoothing_neg_rows.append(neg_freq)

# Stack into (n_smoothing_values, vocabulary_size) arrays. The vocabulary size is taken from the
# data instead of being hard-coded, so the cell keeps working when the corpus or tokenisation changes.
# As before, this raises if the runs do not all yield the same number of words.
nb_laplace_smoothing_pos_frequencies = np.array(nb_laplace_smoothing_pos_rows, dtype=float)
nb_laplace_smoothing_neg_frequencies = np.array(nb_laplace_smoothing_neg_rows, dtype=float)

In [None]:
fig, ax = plt.subplots(1, 3, figsize=(30,10))

# Order the words by the first row of POS probabilities (the unsmoothed run), highest first.
nb_laplace_smoothing_pos_frequencies_argsort = np.argsort(-nb_laplace_smoothing_pos_frequencies[0,:])

# One legend entry per row of the frequency matrix.
smoothing_legend = ['Maximum Likelihood', r'$\kappa=0.1$', r'$\kappa=1$', r'$\kappa=10$']

# The first two panels share the same layout and differ only in which 100 words they show.
word_panels = (
    (ax[0], "Top 100 Words", nb_laplace_smoothing_pos_frequencies_argsort[:100]),
    (ax[1], "Bottom 100 Words", nb_laplace_smoothing_pos_frequencies_argsort[-100:]),
)
for word_axis, panel_title, word_indices in word_panels:
    word_axis.plot(nb_laplace_smoothing_pos_frequencies[:, word_indices].T)
    word_axis.set_title(panel_title)
    word_axis.set_xlabel("Word")
    word_axis.set_ylabel("Smoothed Probability")
    word_axis.set_xticks(np.arange(0,100,10))
    word_axis.set_xlim(-1,100)
    # Label every tenth word so the tick labels stay readable.
    word_axis.set_xticklabels(words[word_indices[::10]], rotation=45)
    word_axis.legend(smoothing_legend, loc='center right')

# Third panel: accuracy from the fine-grained smoothing sweep.
accuracy_axis = ax[2]
accuracy_axis.plot(nb_laplace_smoothing_vals_large, nb_laplace_smoothing_test_accuracies)
accuracy_axis.set_title("Test Accuracy")
accuracy_axis.set_xlabel(r'Laplace Smoothing Factor, $\kappa$')
accuracy_axis.set_ylabel("Accuracy")
accuracy_axis.set_xlim(0,10)

figure_path = os.path.join(plot_dir, 'nb_smoothed_word_probabilities.jpeg')
fig.savefig(figure_path, pad_inches=0.2, bbox_inches='tight')

In [None]:
# use smoothing
laplacian_k = 3.6
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
NB.train(corpus_txt_token.train)
NB.test(corpus_txt_token.train, verbose=False)
print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token.test, verbose=False)
print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB and smoothing using 10-fold cross-evaluation ---")
# using previous instantiated object
NB.crossValidate(corpus_txt_token)
# saving this for use later
num_non_stemmed_features=len(NB.vocabulary)
# using cross-eval for smoothed predictions from now on
nb_smoothed_txt_token_optimised_preds=NB.predictions
print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

In [None]:
# see if hyperparameter tuning significantly improves results
p_value=signTest.getSignificance(nb_smoothed_txt_token_preds,nb_smoothed_txt_token_optimised_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using tuned hyperparameters are {significance} with respect to defaults")

### Look at other corpora

In [None]:
# laplacian_k_vals = [0.1, 1, 2, 3, 4, 5]

#### Remove punctuation

In [None]:
# nb_smoothed_test_accuracies = np.zeros(len(laplacian_k_vals))
# for i, laplacian_k in enumerate(laplacian_k_vals):
#     NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
#     NB.train(corpus_punct.train)
#     NB.test(corpus_punct.test, verbose=False)
#     nb_smoothed_test_accuracies[i] = NB.getAccuracy()

# laplacian_k = laplacian_k_vals[np.argmax(nb_smoothed_test_accuracies)]

# NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
# NB.train(corpus_punct.train)
# NB.test(corpus_punct.train, verbose=False)
# print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Number of ties: {NB.ties}")
# NB.test(corpus_punct.test, verbose=False)
# print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Number of ties: {NB.ties}")
# NB.crossValidate(corpus_punct)
# print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

#### Remove stop words

In [None]:
# nb_smoothed_test_accuracies = np.zeros(len(laplacian_k_vals))
# for i, laplacian_k in enumerate(laplacian_k_vals):
#     NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
#     NB.train(corpus_stop.train)
#     NB.test(corpus_stop.test, verbose=False)
#     nb_smoothed_test_accuracies[i] = NB.getAccuracy()

# laplacian_k = laplacian_k_vals[np.argmax(nb_smoothed_test_accuracies)]

# NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
# NB.train(corpus_stop.train)
# NB.test(corpus_stop.train, verbose=False)
# print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Number of ties: {NB.ties}")
# NB.test(corpus_stop.test, verbose=False)
# print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Number of ties: {NB.ties}")
# NB.crossValidate(corpus_stop)
# print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

#### Remove stop words and punctuation

In [None]:
# nb_smoothed_test_accuracies = np.zeros(len(laplacian_k_vals))
# for i, laplacian_k in enumerate(laplacian_k_vals):
#     NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
#     NB.train(corpus_stop_punct.train)
#     NB.test(corpus_stop_punct.test, verbose=False)
#     nb_smoothed_test_accuracies[i] = NB.getAccuracy()

# laplacian_k = laplacian_k_vals[np.argmax(nb_smoothed_test_accuracies)]

# NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
# NB.train(corpus_stop_punct.train)
# NB.test(corpus_stop_punct.train, verbose=False)
# print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Number of ties: {NB.ties}")
# NB.test(corpus_stop_punct.test, verbose=False)
# print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Number of ties: {NB.ties}")
# NB.crossValidate(corpus_stop_punct)
# print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

#### Lexicon

In [None]:
# nb_smoothed_test_accuracies = np.zeros(len(laplacian_k_vals))
# for i, laplacian_k in enumerate(laplacian_k_vals):
#     NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
#     NB.train(corpus_lexicon.train)
#     NB.test(corpus_lexicon.test, verbose=False)
#     nb_smoothed_test_accuracies[i] = NB.getAccuracy()

# laplacian_k = laplacian_k_vals[np.argmax(nb_smoothed_test_accuracies)]

# NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False, laplacian_k=laplacian_k)
# NB.train(corpus_lexicon.train)
# NB.test(corpus_lexicon.train, verbose=False)
# print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Number of ties: {NB.ties}")
# NB.test(corpus_lexicon.test, verbose=False)
# print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Number of ties: {NB.ties}")
# NB.crossValidate(corpus_lexicon)
# print(f"Accuracy with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
# print(f"Std. Dev with laplacian {laplacian_k}: {NB.getStdDeviation():.5f}")

## Question 2.1

In [None]:
# see if smoothing significantly improves results
p_value=signTest.getSignificance(nb_txt_token_non_smoothed_preds,nb_smoothed_txt_token_optimised_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing are {significance} with respect to no smoothing")

## Question 4.0

In [None]:
print("--- stemming corpus ---")
# Tokenised, Porter-stemmed corpus. The cached pickle is reused only when pickles are
# enabled and the file exists; otherwise the corpus is rebuilt and the cache rewritten.
stemmed_corpus_pickle = "corpus_stem.pkl"
stemmed_cache_usable = use_pickles and os.path.isfile(stemmed_corpus_pickle)
if not stemmed_cache_usable:
    corpus_txt_token_stemmed=MovieReviewCorpus(stemming=True,use_txt=True,tokenise=True)
    with open(stemmed_corpus_pickle, 'wb') as cache_file:
        pickle.dump(corpus_txt_token_stemmed, cache_file)
else:
    # Only load pickles this notebook wrote itself: unpickling executes arbitrary code.
    with open(stemmed_corpus_pickle, 'rb') as cache_file:
        corpus_txt_token_stemmed = pickle.load(cache_file)

In [None]:
corpus_tag_stemmed = MovieReviewCorpus(stemming=True)

In [None]:
laplacian_k = 3.6
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False,laplacian_k=laplacian_k)
NB.train(corpus_txt_token_stemmed.train)
NB.test(corpus_txt_token_stemmed.train, verbose=False)
print(f"Training Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")
NB.test(corpus_txt_token_stemmed.test, verbose=False)
print(f"Test Accuracy using smoothing with laplacian {laplacian_k}: {NB.getAccuracy():.5f}")
print(f"Number of ties: {NB.ties}")

In [None]:
print("--- classifying reviews with NB, smoothing and stemming using 10-fold cross-evaluation ---")
NB.crossValidate(corpus_txt_token_stemmed)
# store predictions from classifier
nb_smoothed_txt_token_stemmed_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

## Question 4.1

In [None]:
# see if stemming significantly improves results on smoothed NB
p_value=signTest.getSignificance(nb_smoothed_txt_token_optimised_preds,nb_smoothed_txt_token_stemmed_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using stemming are {significance} with respect to no stemming")

## Question 4.2

In [None]:
# Report vocabulary sizes (number of features) before and after stemming, for the
# complete corpus and for each split, on both the txt and the tag corpora.
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)

vocabulary_corpora = (
    ("before", "txt", corpus_txt_token),
    ("after", "txt", corpus_txt_token_stemmed),
    ("before", "tag", corpus_tag),
    ("after", "tag", corpus_tag_stemmed),
)
vocabulary_splits = (("complete", "reviews"), ("train", "train"), ("test", "test"))

for stemming_stage, corpus_kind, vocabulary_corpus in vocabulary_corpora:
    for split_label, split_attribute in vocabulary_splits:
        NB.extractVocabulary(getattr(vocabulary_corpus, split_attribute))
        print(f"features {stemming_stage} stemming ({corpus_kind} - {split_label}): {len(NB.vocabulary)}")
        if (stemming_stage, corpus_kind, split_label) == ("before", "txt", "train"):
            # The n-gram feature counts reported in Question 5.1 are taken from models trained on
            # corpus_txt_token.train, so the unigram baseline is recorded for the same split
            # (it was previously captured after extracting the test split).
            num_unigrams_features = len(NB.vocabulary)

## Question 5.0

### Unigrams and bigrams

In [None]:
# use smoothing and unigrams and bigrams
print("--- classifying reviews using Naive Bayes using smoothing with unigrams and bigrams on held-out test set ---")
NB=NaiveBayesText(smoothing=True,bigrams=True,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt_token.train)
NB.test(corpus_txt_token.train, verbose=False)
print(f"Training accuracy using smoothing and bigrams: {NB.getAccuracy():.5f}")
NB.test(corpus_txt_token.test, verbose=False)
num_bigrams_features=len(NB.vocabulary)
print(f"Test accuracy using smoothing and bigrams: {NB.getAccuracy():.5f}")

In [None]:
# cross-validate model using smoothing and unigrams and bigrams
print("--- cross-validating naive bayes using smoothing and unigrams and bigrams ---")
NB=NaiveBayesText(smoothing=True,bigrams=True,trigrams=False,discard_closed_class=False)
NB.crossValidate(corpus_txt_token)
nb_smoothed_txt_token_bigram_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}") 
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

In [None]:
# see if unigrams and bigrams significantly improves results on smoothed NB only
p_value=signTest.getSignificance(nb_smoothed_txt_token_preds,nb_smoothed_txt_token_bigram_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing and unigrams and bigrams are {significance} with respect to smoothing only")

### Unigrams, bigrams and trigrams

In [None]:
# use smoothing and unigrams, bigrams and trigrams
print("--- classifying reviews using Naive Bayes using smoothing with unigrams, bigrams and trigrams on held-out test set ---")
NB=NaiveBayesText(smoothing=True,bigrams=True,trigrams=True,discard_closed_class=False)
NB.train(corpus_txt_token.train)
NB.test(corpus_txt_token.train, verbose=False)
num_bigrams_and_trigrams_features=len(NB.vocabulary)
print(f"Training accuracy using smoothing and unigrams, bigrams and trigrams: {NB.getAccuracy():.5f}")
NB.test(corpus_txt_token.test, verbose=False)
print(f"Testing accuracy using smoothing and unigrams, bigrams and trigrams: {NB.getAccuracy():.5f}")

In [None]:
# cross-validate model using smoothing and bigrams and trigrams
print("--- cross-validating naive bayes using smoothing and bigrams and trigrams ---")
NB=NaiveBayesText(smoothing=True,bigrams=True,trigrams=True,discard_closed_class=False)
NB.crossValidate(corpus_txt_token)
nb_smoothed_txt_token_bigram_and_trigram_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}") 
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

In [None]:
# see if unigrams, bigrams and trigrams significantly improves results on smoothed NB only
p_value=signTest.getSignificance(nb_smoothed_txt_token_preds,nb_smoothed_txt_token_bigram_and_trigram_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing and bigrams and trigrams are {significance} with respect to smoothing only")

In [None]:
# see if unigrams, bigrams and trigrams significantly improves results on unigrams and bigrams NB only
# (the baseline here is the unigram+bigram model; the comparison against plain smoothed
# unigrams is already made in the previous cell)
p_value=signTest.getSignificance(nb_smoothed_txt_token_bigram_preds,nb_smoothed_txt_token_bigram_and_trigram_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing and bigrams and trigrams are {significance} with respect to bigrams only")

### Bigrams only

In [None]:
# use smoothing and bigrams only
print("--- classifying reviews using Naive Bayes using smoothing with bigrams on held-out test set ---")
NB=NaiveBayesText(smoothing=True,unigrams=False,bigrams=True,trigrams=False,discard_closed_class=False)
NB.train(corpus_txt_token.train)
NB.test(corpus_txt_token.train, verbose=False)
num_bigrams_only_features=len(NB.vocabulary)
print(f"Training accuracy using smoothing and bigrams only: {NB.getAccuracy():.5f}")
NB.test(corpus_txt_token.test, verbose=False)
print(f"Test accuracy using smoothing and bigrams only: {NB.getAccuracy():.5f}")

In [None]:
# cross-validate model using smoothing and bigrams
print("--- cross-validating naive bayes using smoothing and bigrams ---")
NB=NaiveBayesText(smoothing=True,unigrams=False,bigrams=True,trigrams=False,discard_closed_class=False)
NB.crossValidate(corpus_txt_token)
# num_bigrams_only_features is intentionally NOT reassigned here: it is recorded in the held-out
# cell right after training on corpus_txt_token.train, like the other feature counts reported in
# Question 5.1. NOTE(review): the vocabulary left on NB after crossValidate presumably reflects
# the last fold's training data, which would not be comparable -- confirm in Classifiers.
# store predictions from classifier
nb_smoothed_txt_token_bigram_only_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}")
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

In [None]:
# see if bigrams significantly improves results on smoothed NB only
p_value=signTest.getSignificance(nb_smoothed_txt_token_preds,nb_smoothed_txt_token_bigram_only_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing and bigrams are {significance} with respect to smoothing only")

### Trigrams only

In [None]:
# use smoothing and trigrams only
print("--- classifying reviews using Naive Bayes using smoothing with trigrams on held-out test set ---")
NB=NaiveBayesText(smoothing=True,unigrams=False,bigrams=False,trigrams=True,discard_closed_class=False)
NB.train(corpus_txt_token.train)
NB.test(corpus_txt_token.train, verbose=False)
num_trigrams_only_features=len(NB.vocabulary)
print(f"Training accuracy using smoothing and trigrams only: {NB.getAccuracy():.5f}")
NB.test(corpus_txt_token.test, verbose=False)
print(f"Test accuracy using smoothing and trigrams only: {NB.getAccuracy():.5f}")

In [None]:
# cross-validate model using smoothing and trigrams
print("--- cross-validating naive bayes using smoothing and trigrams ---")
NB=NaiveBayesText(smoothing=True,unigrams=False,bigrams=False,trigrams=True,discard_closed_class=False)
NB.crossValidate(corpus_txt_token)
# store predictions from classifier
nb_smoothed_txt_token_trigram_only_preds=NB.predictions
print(f"Accuracy: {NB.getAccuracy():.5f}") 
print(f"Std. Dev: {NB.getStdDeviation():.5f}")

In [None]:
# see if trigrams significantly improves results on smoothed NB only
# The trigram-only model is built with the default smoothing constant, so it is compared with the
# default smoothed unigram predictions (as in the bigram-only cell) rather than the tuned-k ones;
# otherwise the test would mix the effect of the features with the effect of tuning k.
p_value=signTest.getSignificance(nb_smoothed_txt_token_preds,nb_smoothed_txt_token_trigram_only_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using smoothing and trigrams are {significance} with respect to smoothing only")

## Question 5.1

In [None]:
print(f"features with unigrams: {num_unigrams_features}")
print(f"features with unigrams and bigrams: {num_bigrams_features}")
print(f"features with unigrams, bigrams and trigrams: {num_bigrams_and_trigrams_features}")
print(f"features with bigrams: {num_bigrams_only_features}")
print(f"features with trigrams: {num_trigrams_only_features}")

In [None]:
# Exponents n such that num_unigrams_features ** n equals the observed bigram-only /
# trigram-only feature count, i.e. a power law in the unigram vocabulary size.
n_bi, n_tri = np.log(num_bigrams_only_features)/np.log(num_unigrams_features), np.log(num_trigrams_only_features)/np.log(num_unigrams_features)

# Every curve gets its own label so the legend can tell observed counts from fits
# (previously two curves were unlabelled and the other two were both called 'Fit').
plt.plot([num_unigrams_features, num_bigrams_only_features, num_trigrams_only_features], label='Single n-gram order (observed)')
plt.plot([num_unigrams_features, num_unigrams_features**n_bi, num_unigrams_features**n_tri], label='Single n-gram order (fit)')
plt.plot([num_unigrams_features, num_bigrams_features, num_bigrams_and_trigrams_features], label='Cumulative n-gram orders (observed)')
plt.plot([num_unigrams_features, num_unigrams_features+num_unigrams_features**n_bi, num_unigrams_features+num_unigrams_features**n_bi+num_unigrams_features**n_tri], label='Cumulative n-gram orders (fit)')
# x positions 0, 1, 2 correspond to unigrams, bigrams and trigrams.
plt.xticks([0, 1, 2], ['1', '2', '3'])
plt.xlabel("n-gram order")
plt.ylabel("Number of features")
plt.legend()
n_bi, n_tri

# SVM

## Question 6 and 6.1

### Word Counts

In [None]:
print("--- classifying reviews using SVM on held-out test set ---")
SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=False,tf=False,idf=False)
SVM.train(corpus_txt_token.train)
SVM.test(corpus_txt_token.train)
print(f"Training accuracy with SVM using unigrams: {SVM.getAccuracy():.5f}")
SVM.test(corpus_txt_token.test)
print(f"Test accuracy with SVM using unigrams: {SVM.getAccuracy():.5f}")

In [None]:
print("--- classifying reviews using SVM with unigrams and 10-fold cross-eval ---")
SVM.crossValidate(corpus_txt_token,verbose=False)
svm_og_preds=SVM.predictions
print(f"Accuracy: {SVM.getAccuracy():.5f}") 
print(f"Std. Dev: {SVM.getStdDeviation():.5f}")

### Term Frequency

In [None]:
print("--- classifying reviews using SVM on held-out test set ---")
SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=False,tf=True,idf=False)
SVM.train(corpus_txt_token.train)
SVM.test(corpus_txt_token.train)
print(f"Training accuracy with SVM using unigrams: {SVM.getAccuracy():.5f}")
SVM.test(corpus_txt_token.test)
print(f"Test accuracy with SVM using unigrams: {SVM.getAccuracy():.5f}")

In [None]:
print("--- classifying reviews using SVM with unigrams and 10-fold cross-eval ---")
SVM.crossValidate(corpus_txt_token,verbose=False)
svm_tf_preds=SVM.predictions
print(f"Accuracy: {SVM.getAccuracy():.5f}") 
print(f"Std. Dev: {SVM.getStdDeviation():.5f}")

In [None]:
# see if using term frequency significantly improves results on word counts
p_value=signTest.getSignificance(svm_og_preds,svm_tf_preds)
significance = "significant" if p_value < 0.05 else "not significant"
# message now reads as a sentence, matching the wording of the NB significance cells
print(f"results using tf are {significance} with respect to word counts")

### tf-idf

In [9]:
print("--- classifying reviews using SVM on held-out test set ---")
SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=False,tf=True,idf=True)
SVM.train(corpus_txt_token.train)
SVM.test(corpus_txt_token.train)
print(f"Training accuracy with SVM using unigrams: {SVM.getAccuracy():.5f}")
SVM.test(corpus_txt_token.test)
print(f"Test accuracy with SVM using unigrams: {SVM.getAccuracy():.5f}")

--- classifying reviews using SVM on held-out test set ---
Training accuracy with SVM using unigrams: 0.99944
Test accuracy with SVM using unigrams: 0.84500


In [10]:
print("--- classifying reviews using SVM with unigrams and 10-fold cross-eval ---")
SVM.crossValidate(corpus_txt_token,verbose=False)
# store predictions from classifier
svm_tf_idf_preds=SVM.predictions
print(f"Accuracy: {SVM.getAccuracy():.5f}") 
print(f"Std. Dev: {SVM.getStdDeviation():.5f}")

--- classifying reviews using SVM with unigrams and 10-fold cross-eval ---
Accuracy: 0.83250
Std. Dev: 0.01365


In [None]:
# see if using tf-idf significantly improves results on word counts
p_value=signTest.getSignificance(svm_og_preds,svm_tf_idf_preds)
significance = "significant" if p_value < 0.05 else "not significant"
# message now reads as a sentence, matching the wording of the NB significance cells
print(f"results using tf-idf are {significance} with respect to word counts")

In [None]:
# see if using tf-idf significantly improves results on tf
p_value=signTest.getSignificance(svm_tf_preds,svm_tf_idf_preds)
significance = "significant" if p_value < 0.05 else "not significant"
# message now reads as a sentence, matching the wording of the NB significance cells
print(f"results using tf-idf are {significance} with respect to tf")

### Grid Search

In [None]:
svm_gs_params = {
    "C": np.arange(0.2, 10.2, 0.4),
    "kernel": ["linear", "poly", "rbf", "sigmoid"]
}
SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=False,tf=True,idf=True)
svc_gs = SVM.train(corpus_txt_token.train, grid_search_params=svm_gs_params)

In [None]:
svc_gs.best_params_, svc_gs.best_score_

In [None]:
svc_gs.cv_results_

In [11]:
print("--- classifying reviews using SVM on held-out test set ---")
SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=False,tf=True,idf=True,C=1.8,kernel='linear')
SVM.train(corpus_txt_token.train)
SVM.test(corpus_txt_token.train)
print(f"Training accuracy with SVM using unigrams: {SVM.getAccuracy():.5f}")
SVM.test(corpus_txt_token.test)
print(f"Test accuracy with SVM using unigrams: {SVM.getAccuracy():.5f}")

--- classifying reviews using SVM on held-out test set ---
Training accuracy with SVM using unigrams: 0.99944
Test accuracy with SVM using unigrams: 0.89000


In [12]:
print("--- classifying reviews using SVM with unigrams and 10-fold cross-eval ---")
SVM.crossValidate(corpus_txt_token,verbose=False)
# store predictions from classifier
svm_tf_idf_opt_preds=SVM.predictions
print(f"Accuracy: {SVM.getAccuracy():.5f}") 
print(f"Std. Dev: {SVM.getStdDeviation():.5f}")

--- classifying reviews using SVM with unigrams and 10-fold cross-eval ---
Accuracy: 0.86100
Std. Dev: 0.01997


In [16]:
# see if using optimised hyperparameters significantly improves default results
p_value=signTest.getSignificance(svm_tf_idf_preds, svm_tf_idf_opt_preds)
significance = "significant" if p_value < 0.05 else "not significant"
print(f"results using optimised SVC hyperparameters {significance} with respect to defaults")

results using optimised SVC hyperparameters not significant with respect to defaults


In [None]:
# see if SVM significantly improves results on smoothed NB
p_value=signTest.getSignificance(nb_smoothed_txt_token_preds,svm_tf_idf_opt_preds)
significance = "significant" if p_value < 0.05 else "not significant"
# message now reads as a sentence, matching the wording of the NB significance cells
print(f"results using SVM are {significance} with respect to smoothed NB")

### Bigrams

In [None]:
# SVM=SVMText(bigrams=True,trigrams=False,discard_closed_class=False,tf=True,idf=True,C=1.8,kernel='linear')
SVM=SVMText(bigrams=True,trigrams=False,discard_closed_class=False,tf=True,idf=True)
SVM.train(corpus_txt_token.train)
SVM.test(corpus_txt_token.train)
print(f"Training accuracy with SVM using additional bigrams: {SVM.getAccuracy():.5f}")
SVM.test(corpus_txt_token.test)
print(f"Test accuracy with SVM using additional bigrams: {SVM.getAccuracy():.5f}")

In [None]:
# The SVM built in the preceding cell uses bigrams=True, so the banner names both feature types.
print("--- classifying reviews using SVM with unigrams and bigrams and 10-fold cross-eval ---")
SVM.crossValidate(corpus_txt_token,verbose=False)
print(f"Accuracy: {SVM.getAccuracy():.5f}")
print(f"Std. Dev: {SVM.getStdDeviation():.5f}")

### Trigrams

In [None]:
# Held-out evaluation of the SVM with bigram and trigram features enabled, using tf-idf weighting.
# SVM=SVMText(bigrams=True,trigrams=True,discard_closed_class=False,tf=True,idf=True,C=1.8,kernel='linear')
SVM=SVMText(bigrams=True,trigrams=True,discard_closed_class=False,tf=True,idf=True)
SVM.train(corpus_txt_token.train)
SVM.test(corpus_txt_token.train)
print(f"Training accuracy with SVM using additional bigrams and trigrams: {SVM.getAccuracy():.5f}")
SVM.test(corpus_txt_token.test)
# fixed the "Teat accuracy" typo in the printed label
print(f"Test accuracy with SVM using additional bigrams and trigrams: {SVM.getAccuracy():.5f}")

In [None]:
# The SVM built in the preceding cell uses bigrams=True and trigrams=True, so the banner names all three feature types.
print("--- classifying reviews using SVM with unigrams, bigrams and trigrams and 10-fold cross-eval ---")
SVM.crossValidate(corpus_txt_token,verbose=False)
print(f"Accuracy: {SVM.getAccuracy():.5f}")
print(f"Std. Dev: {SVM.getStdDeviation():.5f}")

# POS

## Question 7

In [None]:
print("--- adding in POS information to corpus ---")

In [None]:
# print("--- pos corpus ---")
# # retrieve corpus with tokenized text and pos
# pos_corpus_pickle = "corpus_pos.pkl"
# if use_pickles and os.path.isfile(pos_corpus_pickle):
#     with open(pos_corpus_pickle, 'rb') as f:
#         pos_corpus = pickle.load(f)
# else:
#     pos_corpus=MovieReviewCorpus(pos=True)
#     with open(pos_corpus_pickle, 'wb') as f:
#         pickle.dump(pos_corpus, f)

In [None]:
# pos_corpus=MovieReviewCorpus(pos=True)
corpus_tag_pos=MovieReviewCorpus(pos=True,stemming=False)

In [None]:
print("--- training nb on word+pos features ----")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_tag_pos.train)
NB.test(corpus_tag_pos.test, verbose=False)
print(f"Accuracy using NB on unigrams without smoothing and with POS: {NB.getAccuracy():.5f}")

In [None]:
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=False)
NB.train(corpus_tag_pos.train)
NB.test(corpus_tag_pos.test, verbose=False)
print(f"Accuracy using NB on unigrams with smoothing and POS: {NB.getAccuracy():.5f}")

In [None]:
print("--- training svm on word+pos features ----")
# SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=False,tf=True,idf=True,C=1.8,kernel='linear')
SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=False,tf=True,idf=True)
SVM.train(corpus_tag_pos.train)
SVM.test(corpus_tag_pos.train)
print(f"Training accuracy with SVM with POS: {SVM.getAccuracy():.5f}")
SVM.test(corpus_tag_pos.test)
print(f"Test accuracy with SVM with POS: {SVM.getAccuracy():.5f}")

In [None]:
print("--- classifying reviews using svm on word+pos and 10-fold cross-eval ---")
SVM.crossValidate(corpus_tag_pos)
# store predictions from classifier
svm_pos_preds=SVM.predictions
print(f"Accuracy: {SVM.getAccuracy():.5f}") 
print(f"Std. Dev: {SVM.getStdDeviation():.5f}")

In [None]:
# see if POS significantly improves results on SVM only
p_value=signTest.getSignificance(svm_tf_idf_preds,svm_pos_preds)
significance = "significant" if p_value < 0.05 else "not significant"
# message now reads as a sentence, matching the wording of the NB significance cells
print(f"results using POS tags are {significance} with respect to SVM")

# Discard Closed Class

In [None]:
print("--- training nb discarding closed-class words ---")
NB=NaiveBayesText(smoothing=False,bigrams=False,trigrams=False,discard_closed_class=True)
NB.train(corpus_tag_pos.train)
NB.test(corpus_tag_pos.train, verbose=False)
print(f"Training accuracy using NB without smoothing and discarding closed-class words: {NB.getAccuracy():.5f}")
NB.test(corpus_tag_pos.test, verbose=False)
print(f"Test accuracy using NB without smoothing and discarding closed-class words: {NB.getAccuracy():.5f}")

In [None]:
NB=NaiveBayesText(smoothing=True,bigrams=False,trigrams=False,discard_closed_class=True)
NB.train(corpus_tag_pos.train)
NB.test(corpus_tag_pos.train, verbose=False)
print(f"Training accuracy using NB with smoothing and discarding closed-class words: {NB.getAccuracy():.5f}")
NB.test(corpus_tag_pos.test, verbose=False)
print(f"Test accuracy using NB with smoothing and discarding closed-class words: {NB.getAccuracy():.5f}")

In [None]:
print("--- training svm discarding closed-class words ---")
# SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=True,C=1.8,kernel='linear')
SVM=SVMText(bigrams=False,trigrams=False,discard_closed_class=True)
SVM.train(corpus_tag_pos.train)
SVM.test(corpus_tag_pos.train)
print(f"Training accuracy with SVM discarding closed-class word: {SVM.getAccuracy():.5f}")
SVM.test(corpus_tag_pos.test)
print(f"Test accuracy with SVM discarding closed-class word: {SVM.getAccuracy():.5f}")

In [None]:
print("--- classifying reviews using svm discarding closed-class words and 10-fold cross-eval ---")
SVM.crossValidate(corpus_tag_pos)
# store predictions from classifier
svm_closed_class=SVM.predictions
print(f"Accuracy: {SVM.getAccuracy():.5f}") 
print(f"Std. Dev: {SVM.getStdDeviation():.5f}")

In [None]:
# see if discarding closed-class words significantly improves results on SVM only
p_value=signTest.getSignificance(svm_tf_idf_preds,svm_closed_class)
significance = "significant" if p_value < 0.05 else "not significant"
# fixed the "discardig" typo and the missing verb in the printed message
print(f"results discarding closed-class words are {significance} with respect to SVM")

## Question 8.0

In [None]:
# print("--- using document embeddings ---")