In [None]:
import pandas as pd
import numpy as np
import nltk
from matplotlib import pyplot as plt
from matplotlib import pyplot
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import defaultdict
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_recall_curve,f1_score,auc
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, learning_curve

In [None]:
# Seed NumPy's global RNG so that randomized steps further down are reproducible
np.random.seed(500)
# Load the CSV under our own column names. Because `names` is supplied, the file's
# own header line is read as an ordinary first row, so it is sliced off right away.
reviews = pd.read_csv(
    r"rt_reviews.csv",
    names=['target', 'review'],
    encoding='latin-1',
).iloc[1:]

In [None]:
# Number of leading rows to keep. Lower it for quick experiments, raise it for a
# full run; every later cell works on whatever subset is selected here.
N_SAMPLES = 2000
reviews = reviews.iloc[:N_SAMPLES]

In [None]:
# Standardize target data types to numeric.
# The labels may arrive as the strings '0'/'1'; convert those to integers and
# leave every other value untouched so the sanity check below can report it.
def _to_int_label(value):
    """Return 0 or 1 for the strings '0'/'1'; pass any other value through unchanged."""
    if isinstance(value, str) and value in ('0', '1'):
        return int(value)
    return value

# Build a new column instead of writing into `reviews['target'].values[i]`:
# that only works while `.values` happens to be a writable view of the frame.
reviews = reviews.assign(target=reviews['target'].map(_to_int_label))

# Sanity check: report labels that are still strings or are not 0/1
for value in reviews['target']:
    if isinstance(value, str):
        print("Caught: ", value)
    if value != 0 and value != 1:
        print("Error")

In [None]:
# Remove any row where either field is blank. Reassigning (rather than
# inplace=True) avoids mutating a frame that was itself produced by slicing.
# Note from the original full-dataset run: no rows were blank there, so the
# shape stayed at 480000 x 2; with a smaller sample the row count differs.
reviews = reviews.dropna()
reviews.shape

In [None]:
# Lowercase every review so that later word comparisons are case-insensitive
reviews['review'] = reviews['review'].apply(lambda text: text.lower())

In [None]:
# Tokenize: replace each review string with the list of tokens NLTK extracts from it
reviews['review'] = reviews['review'].apply(word_tokenize)

##### Stemming Words: We improved upon this with Lemmatizing 

# Build the 'stemmed_words' column: for every tokenized review keep the tokens that
# are not stop words, are purely alphabetic and are known to WordNet, then stem them.
# The stop-word set and the stemmer are created once instead of once per word/review.
english_stopwords = set(stopwords.words('english'))
word_Stemmer = PorterStemmer()

stemmed_column = []
for entry in reviews['review']:
    Final_words = [
        word_Stemmer.stem(word)
        for word in entry
        if word not in english_stopwords and word.isalpha() and wordnet.synsets(word)
    ]
    # Stored as the string form of the list, which is what the vectorizers consume
    stemmed_column.append(str(Final_words))

# Assign the whole column positionally. This does not depend on the index labels
# being 1..N, and a review with no tokens gets '[]' instead of being left as NaN.
reviews['stemmed_words'] = stemmed_column
        
        

#### Lemmatizing: We improved upon this with Unaltered Words
###### Map part-of-speech tags to WordNet categories so the lemmatizer can tell nouns, verbs, adjectives and adverbs apart
# WordNet part-of-speech lookup keyed by the first letter of the tag returned by
# pos_tag; anything that is not J (adjective), V (verb) or R (adverb) falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Created once rather than per word/review
english_stopwords = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()

lemmatized_column = []
for entry in reviews['review']:
    # Keep tokens that are not stop words, are alphabetic and are known to WordNet,
    # and lemmatize each one using the POS category derived from its tag.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word not in english_stopwords and word.isalpha() and wordnet.synsets(word)
    ]
    lemmatized_column.append(str(Final_words))

# Assign the whole column positionally. This does not depend on the index labels
# being 1..N, and a review with no tokens gets '[]' instead of being left as NaN.
reviews['lemmatized_words'] = lemmatized_column
        
        

In [None]:
################ Unaltered Words ############

# Build the 'unaltered_words' column: same filtering as the stemmed/lemmatized
# variants (drop stop words, non-alphabetic tokens and words unknown to WordNet),
# but the surviving words are kept exactly as they are.
english_stopwords = set(stopwords.words('english'))  # built once, O(1) lookups

unaltered_column = []
for entry in reviews['review']:
    # No POS tagging here: the tags were never used for this variant
    Final_words = [
        word
        for word in entry
        if word not in english_stopwords and word.isalpha() and wordnet.synsets(word)
    ]
    unaltered_column.append(str(Final_words))

# Assign the whole column positionally. This does not depend on the index labels
# being 1..N, and a review with no tokens gets '[]' instead of being left as NaN.
reviews['unaltered_words'] = unaltered_column

In [None]:
# Display the frame to inspect the word columns derived in the cells above
reviews

In [None]:
# Split into train and test sets; `test_s` is the fraction of samples held out for testing
test_s = .15
train_x, test_x, train_y, test_y = train_test_split(
    reviews['unaltered_words'],
    reviews['target'],
    test_size=test_s,
)

In [None]:
# Encode target labels as integers. The encoder is fitted on the training labels
# only and then reused for the test labels, so both splits are guaranteed to share
# one label -> integer mapping (re-fitting on the test labels could produce a
# different mapping if the test split does not contain the same set of classes).
Encoder = LabelEncoder()
train_y = Encoder.fit_transform(train_y)
test_y = Encoder.transform(test_y)

#### Term Frequency-Inverse Document Frequency: Improved upon this with bag of words
max_f = 100000
Tfidf_vect = TfidfVectorizer(stop_words='english', max_features=max_f)
# Learn the vocabulary and IDF weights from the training split only; fitting on the
# full corpus would let information from the held-out reviews leak into the features.
train_x_Tfidf = Tfidf_vect.fit_transform(train_x)
test_x_Tfidf = Tfidf_vect.transform(test_x)
# Report the vocabulary size instead of dumping the entire term -> index mapping
print("Vocabulary size: ", len(Tfidf_vect.vocabulary_))


# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix
nb = MultinomialNB()
nb.fit(train_x_Tfidf, train_y)

# Hard class predictions for the held-out split
pred_nb = nb.predict(test_x_Tfidf)

# Accuracy is symmetric in its two arguments; they are written as (y_true, y_pred)
# to match the classification_report call below.
print("Accuracy: ", accuracy_score(test_y, pred_nb))
# Keep only the second probability column (the second entry of nb.classes_)
nb_probs = nb.predict_proba(test_x_Tfidf)[:, 1]
print(classification_report(test_y, pred_nb, labels=[0,1]))

In [None]:
##### Bag of Words #######
max_f = 100000
# ngram_range default is (1,1)
#   (1,1) -> uni-grams only
#   (2,2) -> bi-grams only
#   (3,3) -> tri-grams only
#   (1,2) -> uni- and bi-grams -> BEST SO FAR
Count_vect = CountVectorizer(stop_words='english', max_features=max_f, ngram_range=(1,2))
# Learn the vocabulary from the training split only; fitting on the full corpus
# would let information from the held-out reviews leak into the feature space.
train_x_Count = Count_vect.fit_transform(train_x)
test_x_Count = Count_vect.transform(test_x)
# Number of distinct n-grams kept
len(Count_vect.vocabulary_)

In [None]:
###### Bag of Words Frequency #######
# Multinomial Naive Bayes trained on the n-gram count matrix
nb = MultinomialNB()
nb.fit(train_x_Count, train_y)

# Show the sparse count vector of the first training sample
print(train_x_Count[0])

# Predict the held-out split
pred_nb = nb.predict(test_x_Count)

# Report accuracy and the per-class metrics; accuracy is symmetric in its
# arguments, written here as (y_true, y_pred) like classification_report.
print("Accuracy: ", accuracy_score(test_y, pred_nb))
# Keep only the second probability column (the second entry of nb.classes_)
nb_probs = nb.predict_proba(test_x_Count)[:, 1]
print(classification_report(test_y, pred_nb, labels=[0,1]))

# Optional SVM comparison (disabled):
# svm = SVC(probability=True)
# svm.fit(train_x_Count, train_y)
# pred_svm = svm.predict(test_x_Count)
# print("Accuracy: ", accuracy_score(test_y, pred_svm))

In [None]:

def _plot_performance(data, tick_labels, ylim, title, xlabel, width=.3):
    """Draw a grouped bar chart of the 'f1-score' and 'accuracy' columns of `data`.

    data        -- DataFrame with 'f1-score' and 'accuracy' columns, one row per bar group
    tick_labels -- x tick labels, one per row of `data`
    ylim        -- [low, high] limits for the y axis
    title       -- figure title
    xlabel      -- x axis label
    width       -- bar width, also used to pad the x limits
    """
    fig, ax = plt.subplots()
    data[['f1-score', 'accuracy']].plot(kind = 'bar', width = width, ax = ax)

    ax.set_xlim([-width, len(data['accuracy'])-width])
    ax.set_ylim(ylim)
    ax.set_xticklabels(tick_labels)
    # Let matplotlib take the legend entries from the plotted columns. Passing only
    # `labels=` pairs labels with artists purely by order, which on a bar chart can
    # attach both labels to rectangles of the first series.
    ax.legend()
    ax.set_title(title)
    ax.set_ylabel('Performance')
    ax.set_xlabel(xlabel)

    plt.show()


############### Lemma vs Stem vs Unaltered 480k ###################

# performed on 480k samples with non english words removed
width = .3
pltData = pd.DataFrame({
'accuracy': [.7833, .7792, .7945],
'f1-score' : [.78, .78,.795]})

_plot_performance(pltData, ('Lemmatized', 'Stemmed', 'Unaltered'), [.77, .8],
                  "Performance by Word Alteration type", 'Word Alteration type', width)

################ Graphs of the Different grams for Bag of Words ############

#performed on all 100,000 examples w/ unaltered words

#looking at all grams
pltData = pd.DataFrame({
'accuracy': [.7871, .7294, .5991, .5746, .8055, .8047, .8049],
'f1-score' : [.79, .73, .54, .48, .81, .80, .80]})

_plot_performance(pltData,
                  ('Uni', 'Bi', 'Tri', 'Quad', 'Uni & Bi', 'Uni, Bi & Tri', 'Uni, Bi, Tri, &Quad'),
                  [.45, .85], "Performance by Gram Type", 'Gram type', width)

#looking at bag of words vs. term frequency-inverse document frequency
pltData = pd.DataFrame({
'accuracy': [.8055, .7945],
'f1-score' : [.81, .795]})

_plot_performance(pltData, ('Bag of Words','TFIDF'), [.75, .85],
                  "Performance by Vectorization Technique", 'Vectorization Technique', width)

In [None]:
############### Alternative Plot for minimum document frequency ###################

# Recorded results, one entry per minimum-document-frequency setting (see tick labels)
width = .25
pltData = pd.DataFrame({
'accuracy': [.7845, .7833, .7811, .7748, .7673],
'f1-score' : [.785, .78, .78, .775, .77],
'avg' : [.78475, .7817, .7805, .7749, .7686]})

# Explicit axes so that the bars and the 'avg' line are certain to share one plot
fig, ax = plt.subplots()
pltData[['f1-score', 'accuracy']].plot(kind = 'bar', width = width, ax = ax)
pltData['avg'].plot(color = "tab:cyan", ax = ax)

ax.set_xlim([-width, len(pltData['accuracy'])-width])

ax.set_ylim([.76, .79])
ax.set_xticklabels(('1', '5', '10', '50', '100'))
# Let matplotlib read the labels from the artists ('avg', 'f1-score', 'accuracy').
# Passing only `labels=` pairs labels with artists by order and can mislabel the bars.
ax.legend()
ax.set_title("Performance by Minimum document frequency")
ax.set_ylabel('Performance')
ax.set_xlabel('Minimum Document Frequency')

plt.show()

In [None]:
############### Plot for Max features ###################

# Recorded results, one entry per percentage of max features (see tick labels)
width = .25
pltData = pd.DataFrame({
'accuracy': [.7748, .7810, .7841, .7845, .7845, .7845],
'f1-score' : [.775, .78, .78, .785, .785, .785],
'avg' : [.7749, .7805, .7820, .78475, .78475, .78475]})

# Explicit axes so that the bars and the 'avg' line are certain to share one plot
fig, ax = plt.subplots()
pltData[['f1-score', 'accuracy']].plot(kind = 'bar', width = width, ax = ax)
pltData['avg'].plot(color = "tab:cyan", ax = ax)

ax.set_xlim([-width, len(pltData['accuracy'])-width])

ax.set_ylim([.77, .79])
ax.set_xticklabels(('25%', '50%', '75%', '85%', '95%', '100%'))
# Let matplotlib read the labels from the artists ('avg', 'f1-score', 'accuracy').
# Passing only `labels=` pairs labels with artists by order and can mislabel the bars.
ax.legend()
ax.set_title("Performance by Percentage of Max Features")
ax.set_ylabel('Performance')
ax.set_xlabel('Percentage of Max Features')

plt.show()

In [None]:
# Classifier comparison on the bag-of-words features.
# Only Bernoulli Naive Bayes is active; the multinomial NB and SVM variants are
# left commented out so they can be switched back on for comparison.
#nbm = naive_bayes.MultinomialNB()
#nbm.fit(train_x_Count, train_y)

# Bernoulli Naive Bayes (fit returns the estimator itself)
nbb = naive_bayes.BernoulliNB().fit(train_x_Count, train_y)

# Predict the held-out split
#pred_nbm = nbm.predict(test_x_Count)
pred_nbb = nbb.predict(test_x_Count)

# Report accuracy; the metric is symmetric, arguments written as (y_true, y_pred)
#print("NB Multinomial: Accuracy: ", accuracy_score(test_y, pred_nbm))
print("NB Bernoulli: Accuracy: ", accuracy_score(test_y, pred_nbb))

#svm = SVC(probability=True)
#svm.fit(train_x_Count, train_y)
#pred_svm = svm.predict(test_x_Count)
#print("SVM: Accuracy: ", accuracy_score(test_y, pred_svm))

In [None]:
# Analysis of the Bernoulli Naive Bayes model: classification report, F1, accuracy
# and a precision-recall curve. The multinomial NB / SVM lines are kept commented
# out so those models can be re-enabled together with their cells above.

# Predicted probability of the second class (second column of predict_proba)
#nbm_probs = nbm.predict_proba(test_x_Count)[:,1]
nbb_probs = nbb.predict_proba(test_x_Count)[:,1]
#svm_probs = svm.predict_proba(test_x_Count)[:,1]

# Classification report
#print(classification_report(test_y, pred_nbm, labels=[0,1]))
print(classification_report(test_y, pred_nbb, labels=[0,1]))
#print(classification_report(test_y, pred_svm, labels=[0,1]))

# Precision-recall pairs over all probability thresholds
#precision_nbm, recall_nbm, thresholds_nbm = precision_recall_curve(test_y, nbm_probs)
precision_nbb, recall_nbb, thresholds_nbb = precision_recall_curve(test_y, nbb_probs)
#precision_svm, recall_svm, thresholds_svm = precision_recall_curve(test_y, svm_probs)

# F1 of the hard predictions
#f1_nbm = f1_score(test_y, pred_nbm)
f1_nbb = f1_score(test_y, pred_nbb)
#f1_svm = f1_score(test_y, pred_svm)

# Area under the precision-recall curve (disabled)
# auc_nbb = auc(recall_nbb, precision_nbb)
# auc_svm = auc(recall_svm, precision_svm)

# Summarize scores
print("Test Split: ", test_s)
print("Max Features: ", max_f)
#print('Naive-Bayes-Multinomial: f1=%.3f' % (f1_nbm))
#print("Accuracy: ", accuracy_score(test_y, pred_nbm))
print('Naive-Bayes-Bernoulli: f1=%.3f' % (f1_nbb))
print("Accuracy: ", accuracy_score(test_y, pred_nbb))
#print('Support Vector Machine: f1=%.3f' % (f1_svm))
#print("Accuracy: ", accuracy_score(test_y, pred_svm))


# Plot the precision-recall curves.
# `no_skill` is the share of class-1 samples in the test split, i.e. the precision
# of a classifier that labels everything as class 1; it is drawn as a horizontal
# reference line (it was previously computed but never shown).
no_skill = len(test_y[test_y==1]) / len(test_y)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
#pyplot.plot(recall_nbm, precision_nbm, marker='.', label='Naive-Bayes Multinomial')
pyplot.plot(recall_nbb, precision_nbb, marker='.', label='Naive-Bayes Bernoulli')
#pyplot.plot(recall_svm, precision_svm, marker='.', label='SVM')

# Axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')

# Show the legend
pyplot.legend()

# Show the plot
pyplot.show()

In [None]:
# Learning curves for Bernoulli Naive Bayes with 5-fold cross-validation.
# (Multinomial NB / SVM variants can be produced the same way by swapping the estimator:
#  learning_curve(naive_bayes.MultinomialNB(), train_x_Count, train_y, cv=5)
#  learning_curve(SVC(probability=True), train_x_Count, train_y, cv=5))
nbb_train_sizes, nbb_train_scores, nbb_valid_scores = learning_curve(
    naive_bayes.BernoulliNB(), train_x_Count, train_y, cv=5
)

# Collapse the per-fold scores into one mean score per training-set size
nbb_valid_scores = nbb_valid_scores.mean(axis=1)
nbb_train_scores = nbb_train_scores.mean(axis=1)

# One figure per curve: first the training score, then the cross-validation score
for curve, legend_text in (
    (nbb_train_scores, 'Naive-Bayes Bernoulli - Training Score'),
    (nbb_valid_scores, 'Naive-Bayes Bernoulli - Cross Validation Score'),
):
    pyplot.plot(nbb_train_sizes, curve, marker='.', label=legend_text)
    pyplot.xlabel('Training Examples')
    pyplot.ylabel('Score')
    pyplot.legend()
    pyplot.show()

In [None]:
# classifier must be fit on training data before method is run
def classifyPhrase(phrase, classifier, Count_vect):
    """Classify a free-text phrase as a positive or negative review.

    The phrase goes through the same preprocessing as the 'unaltered_words'
    column: lowercase, tokenize, then drop stop words, non-alphabetic tokens
    and words unknown to WordNet. The string form of the resulting word list
    is vectorized with `Count_vect` and passed to `classifier`.

    phrase     -- raw review text
    classifier -- fitted estimator exposing predict(); must have been trained
                  on features produced by `Count_vect`
    Count_vect -- fitted vectorizer exposing transform()

    Prints the preprocessed phrase and the raw prediction, and returns
    "Positive Review" if the predicted label is 1, otherwise "Negative Review".
    """
    # Lowercase and tokenize the phrase
    tokens = word_tokenize(phrase.lower())

    # Built once per call instead of once per word
    english_stopwords = set(stopwords.words('english'))
    Final_words = [
        word
        for word in tokens
        if word not in english_stopwords and word.isalpha() and wordnet.synsets(word)
    ]
    # Always convert to the string form, even when no token survives (or the
    # phrase is empty), so the vectorizer never receives a raw list.
    phrase = str(Final_words)

    print(phrase)

    features = Count_vect.transform([phrase])

    outcome = classifier.predict(features)
    print(outcome)

    # predict() returns an array with one label for the single input sample
    if outcome[0] == 1:
        return "Positive Review"
    else:
        return "Negative Review"

    
# Try the classifier on one clearly positive and one clearly negative phrase
for sample_text in (
    "The Movie Was Incredible and I Highly Recommend It to a Friend.  I loved it.",
    "The Movie Was Terrible and I hated it.  Don't watch it.",
):
    review = classifyPhrase(sample_text, nb, Count_vect)
    print(review)