In [1]:
# let start with a bunch of imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
import scipy

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

from spellchecker import SpellChecker
pd.set_option('max_colwidth', 275)
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\willa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\willa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#importing the Train Data using pandas
column_names = ["sentiment", "review"]


df = pd.read_csv('Data/Train_Data2.csv')
df = df.reindex(columns=column_names)
# modify value in column b where a is 3
df = df.replace(to_replace ="negative", 
                 value =-1) 
df = df.replace(to_replace ="positive", 
                 value =1) 
df

Unnamed: 0,sentiment,review
0,1,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set i..."
1,1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not o..."
2,1,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected seria..."
3,-1,"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first o..."
4,1,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter...."
...,...,...
49995,1,"I thought this movie did a down right good job. It wasn't as creative or original as the first, but who was expecting it to be. It was a whole lotta fun. the more i think about it the more i like it, and when it comes out on DVD I'm going to pay the money for it very pro..."
49996,-1,"Bad plot, bad dialogue, bad acting, idiotic directing, the annoying porn groove soundtrack that ran continually over the overacted script, and a crappy copy of the VHS cannot be redeemed by consuming liquor. Trust me, because I stuck this turkey out to the end. It was so..."
49997,-1,"I am a Catholic taught in parochial elementary schools by nuns, taught by Jesuit priests in high school & college. I am still a practicing Catholic but would not be considered a ""good Catholic"" in the church's eyes because I don't believe certain things or act certain wa..."
49998,-1,"I'm going to have to disagree with the previous comment and side with Maltin on this one. This is a second rate, excessively vicious Western that creaks and groans trying to put across its central theme of the Wild West being tamed and kicked aside by the steady march of..."


In [3]:
# I want to check out all the reviews from the data frame
#all_reviews = df['review']
#all_reviews.head()

# cleaning the data of noise
### Here I will be cleaning the data from HTML tags, puntuation, numbers, stop words, and stem words.
### I will also be splititng words into sentences 
### this takes a while to process, give it like 40 seconds

In [4]:
ps = PorterStemmer()
wnl = WordNetLemmatizer()
clean = re.compile('<.*?>')
clean_reviews = df['review'].str.replace(clean,'',regex=True)
clean_reviews = clean_reviews.str.lower()
clean_reviews = clean_reviews.apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
clean_reviews = clean_reviews.apply(lambda elem: re.sub(r"\d+", "", elem))
stop = stopwords.words('english')
clean_reviews = clean_reviews.apply(lambda elem: ' '.join([word for word in elem.split() if word not in (stop)]))
clean_reviews = clean_reviews.apply(lambda elem: word_tokenize(elem))
#this does some heavy lifting, probably not necessary for data cleaning but oh well..
spell = SpellChecker()
clean_reviews = clean_reviews.apply(lambda elem: [spell.correction(word) if spell.unknown(word) else word for word in elem])
clean_reviews = clean_reviews.apply(lambda elem: [wnl.lemmatize(word) if wnl.lemmatize(word).endswith('e') else ps.stem(word) for word in elem])
clean_reviews

0        [one, review, mention, watch, oz, episode, youll, hook, right, exactli, happen, methe, first, thing, struck, oz, brutal, unflinch, scene, violence, set, right, word, go, trust, show, faint, heart, timid, show, pull, punch, regard, drug, sex, violence, hardcore, classic, ...
1        [wonder, little, product, film, technique, unassum, oldtimebbc, fashion, give, comfort, sometim, discomfort, sense, realism, entire, piece, actor, extrem, well, chosen, michael, sheen, got, polari, voice, pat, truli, see, seamless, edit, guid, reference, william, diari, ...
2        [thought, wonder, way, spend, time, hot, summer, weekend, sit, air, condit, theater, watch, lightheart, comedi, plot, simplist, dialogue, witti, charact, likable, even, well, bread, suspect, serial, killer, may, disappoint, realize, match, point, risk, addict, thought, p...
3        [basic, there, famili, little, boy, jake, think, there, zombie, closet, parent, fight, timethi, movie, slower, soap, opera, suddenli, jake,

# Here I am comparing the cleaned and tokenized bag of words to regular reviews for precission

In [5]:
df['clean_reviews'] = clean_reviews
df

Unnamed: 0,sentiment,review,clean_reviews
0,1,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set i...","[one, review, mention, watch, oz, episode, youll, hook, right, exactli, happen, methe, first, thing, struck, oz, brutal, unflinch, scene, violence, set, right, word, go, trust, show, faint, heart, timid, show, pull, punch, regard, drug, sex, violence, hardcore, classic, ..."
1,1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not o...","[wonder, little, product, film, technique, unassum, oldtimebbc, fashion, give, comfort, sometim, discomfort, sense, realism, entire, piece, actor, extrem, well, chosen, michael, sheen, got, polari, voice, pat, truli, see, seamless, edit, guid, reference, william, diari, ..."
2,1,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected seria...","[thought, wonder, way, spend, time, hot, summer, weekend, sit, air, condit, theater, watch, lightheart, comedi, plot, simplist, dialogue, witti, charact, likable, even, well, bread, suspect, serial, killer, may, disappoint, realize, match, point, risk, addict, thought, p..."
3,-1,"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first o...","[basic, there, famili, little, boy, jake, think, there, zombie, closet, parent, fight, timethi, movie, slower, soap, opera, suddenli, jake, decid, become, rambo, kill, zombieok, first, youre, go, make, film, must, decide, thriller, drama, drama, movie, watchable, parent,..."
4,1,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter....","[petter, mattei, love, time, money, visual, stun, film, watch, mr, mattei, offer, us, vivid, portrait, human, relat, movie, seem, tell, us, money, power, success, people, differ, situat, encount, variat, arthur, schnitzler, play, theme, director, transfer, action, presen..."
...,...,...,...
49995,1,"I thought this movie did a down right good job. It wasn't as creative or original as the first, but who was expecting it to be. It was a whole lotta fun. the more i think about it the more i like it, and when it comes out on DVD I'm going to pay the money for it very pro...","[thought, movie, right, good, job, wasnt, creative, origin, first, expect, whole, lotta, fun, think, like, come, dvd, im, go, pay, money, proudli, everi, last, cent, sharon, stone, great, alway, even, movie, horriblecatwoman, movie, isnt, one, movie, underr, lifetime, pr..."
49996,-1,"Bad plot, bad dialogue, bad acting, idiotic directing, the annoying porn groove soundtrack that ran continually over the overacted script, and a crappy copy of the VHS cannot be redeemed by consuming liquor. Trust me, because I stuck this turkey out to the end. It was so...","[bad, plot, bad, dialogue, bad, act, idiot, direct, annoy, porn, groove, soundtrack, ran, continu, overact, script, crappi, copi, vh, can, not, redeem, consum, liquor, trust, stuck, turkey, end, pathet, bad, figure, fourthrate, spoof, springtime, hitlerthe, girl, play, j..."
49997,-1,"I am a Catholic taught in parochial elementary schools by nuns, taught by Jesuit priests in high school & college. I am still a practicing Catholic but would not be considered a ""good Catholic"" in the church's eyes because I don't believe certain things or act certain wa...","[cathol, taught, parochi, elementari, school, nun, taught, jesuit, priest, high, school, college, still, practic, cathol, would, consid, good, cathol, church, eye, dont, believe, certain, thing, act, certain, way, church, tell, toso, back, movieit, bad, two, people, kill..."
49998,-1,"I'm going to have to disagree with the previous comment and side with Maltin on this one. This is a second rate, excessively vicious Western that creaks and groans trying to put across its central theme of the Wild West being tamed and kicked aside by the steady march of...","[im, go, disagree, previou, comment, side, maltin, one, second, rate, excess, viciou, western, creak, groan, tri, put, across, central, theme, wild, west, tame, kick, aside, steadi, march, time, would, like, tradit, butch, cassidi, sundance, kid, lack, film, poignanc, ch..."


# split my training data into sets for training and testing for creating my model, folding, and to help vectorization

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_reviews'], df['sentiment'], test_size = 0.2, random_state = 97 )
print('X_train:',len(X_train))
print('X_test:',len(X_test))
print('y_train:',len(y_train))
print('y_test:',len(y_test))

X_train: 40000
X_test: 10000
y_train: 40000
y_test: 10000


# Here I will be creating my model 
### I fix a tokenization issue for TF-IDF
### Use K Nearest Neighbor and setting the weights to the distance to of vectorized values using the euclidian metric
### p is the sentiment value of 1 or -1 thus 2 possiblities

In [7]:
# to fix a tokenization issue for TF-IDF
def identity_tokenizer(text):
    return text
# Vectorize using TFI-DF
tfid_vec = TfidfVectorizer(analyzer='word', tokenizer=identity_tokenizer,lowercase=False)
# Using K Nearest Neighbor and setting the weights to the distance to of vectorized values using the euclidian metric
# with the use of the 211 nearest neighbors given the vast and sparse data for training minimizes over fitting my model
# p is the sentiment value options of 1 or -1 thus 2 possiblities
knn = KNeighborsClassifier(n_neighbors=171, p = 2, weights='distance',metric='euclidean')

In [8]:
# creates a re-usale cached model to run my vectorizer and my classifier
model = Pipeline([('vectorizer',tfid_vec ),('classifier', knn)])
# asserts the fit function of the KNN algorithm t tun my model for training using the data provided
model.fit(X_train,y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(lowercase=False,
                                 tokenizer=<function identity_tokenizer at 0x000001C0ADF30438>)),
                ('classifier',
                 KNeighborsClassifier(metric='euclidean', n_neighbors=171,
                                      weights='distance'))])

In [9]:
# prediction model 
predictions = model.predict(X_test)
# to test the performance of my classification model 
confusion_matrix(predictions, y_test)

array([[4079,  867],
       [ 955, 4099]], dtype=int64)

In [10]:
# the Model prediction results for accuracy , precission, and recall
print('Accuracy:',accuracy_score(predictions,y_test))
print('Precission:',precision_score(predictions,y_test, average = 'weighted'))
print('Recall:',recall_score(predictions,y_test,average = 'weighted'))


Accuracy: 0.8178
Precission: 0.8179330804576405
Recall: 0.8178


# Some toy examples:

In [11]:
## testing some example text for bad sentiment :( 
example = ["I Hated every moment of this movie, Never pay for this "]
result = model.predict(example)
print(result)

[-1]


In [12]:
## testing some example text for good sentiment :)
example2 = ["really enjoyable movie would recomend everyone check it out"]
result2 = model.predict(example2)
print(result2)

[1]


# Now to perform predictions of the test data set using my model ...

In [13]:
#test_df = pd.read_table('Data/Test_Data.txt', header = None)
#test_df
with open('Data/Test_Data.txt', encoding='utf-8') as f:
    content = f.readlines()
content = [x.strip() for x in content] 

test_df = pd.DataFrame([*zip(content)])
test_df

Unnamed: 0,0
0,"""This film should have never been made. Honestly, I must admit that before I saw it I had some serious doubts. The director is not a great actress, though she did a lot of movies in Holland, and the young woman who took the main part is a TV-personality with a constant s..."
1,This movie was bad from the start. The only purpose of the movie was that Angela wanted to get a high body count. The acting was horrible. The killings were acted out very badly. Like when Ally got stuffed down that toilet I guess it was in the abandoned cabin. But when ...
2,"""God, I never felt so insulted in my whole life than with this crap. There are so many ways to describe this piece of crap, that I think that if I said everything that came to mind, I would get banned by this site.<br /><br />How do I begin? Well, for one, it doesn't tak..."
3,"""Not being a fan of the Coen Brothers or George Clooney, anyone can see the skepticism I took into the theater. Once again, someone in Hollywood dares to create something different. This time it was those zanie (for a temporary lack of a better word) Coens doing """"their ..."
4,"""The movie Andaz Apna Apna in my books is the top 5 intelligent comedy movies ever made in Bollywood perhaps even Hollywood. <br /><br />When the movie released i was a 8 year old and I heard it was a flop but I never understood till now why was it a flop...but let me te..."
...,...
14995,"""Family Guy has to be my all time favorite cartoon.It is definitely the funniest TV show ever made and is better than The Simpsons.I have never laughed so hard at a TV show in my life The things that make the show so funny is the plot,characters and themes that are dealt..."
14996,"""This was a marvelously funny comedy with a great cast. John Ritter and Katey Sagal were perfectly cast as the parents, and the kids were great too. Kaley Cuoco was a good choice to play Bridget, who was sort of a toned-down version of Kelly Bundy from Married with Child..."
14997,"""There is no plot. There are no central characters. There are no moving cameras or close-ups. In fact, this film does not follow any of the conventional storytelling techniques used by mainstream film. However, Roy Andersson's Du Levande is a remarkable piece of cinemati..."
14998,"""This show is awesome! I love all the actors! It has great story lines and characters. It is the perfect drama. James Caan and Josh Duhamel have great dialogue. They both can be really funny.I miss Vanessa Marcil on General Hospital, but she's great on here. James Lesure..."


In [14]:
ps = PorterStemmer()
wnl = WordNetLemmatizer()
clean = re.compile('<.*?>')
clean_reviews = test_df[0].str.replace(clean,'',regex=True)
clean_reviews = clean_reviews.str.lower()
clean_reviews = clean_reviews.apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))
clean_reviews = clean_reviews.apply(lambda elem: re.sub(r"\d+", "", elem))
stop = stopwords.words('english')
clean_reviews = clean_reviews.apply(lambda elem: ' '.join([word for word in elem.split() if word not in (stop)]))
clean_reviews = clean_reviews.apply(lambda elem: word_tokenize(elem))
#this does some heavy lifting, probably not necessary for data cleaning but oh well..
spell = SpellChecker()
clean_reviews = clean_reviews.apply(lambda elem: [spell.correction(word) if spell.unknown(word) else word for word in elem])
clean_reviews = clean_reviews.apply(lambda elem: [wnl.lemmatize(word) if wnl.lemmatize(word).endswith('e') else ps.stem(word) for word in elem])
clean_reviews

0        [film, never, made, honestli, must, admit, saw, seriou, doubt, director, great, actress, though, lot, movie, holland, young, woman, took, main, part, tvperson, constant, smile, face, much, selfcritic, actor, play, main, part, recent, saw, bride, flight, although, film, b...
1        [movie, bad, start, purpose, movie, angela, want, get, high, bodi, count, act, horrible, kill, act, badli, like, alli, got, stuf, toilet, guess, abandon, cabin, end, movie, come, molli, guy, cabin, see, alli, angela, must, gone, get, part, realli, got, black, girl, angel...
2        [god, never, felt, insult, whole, life, crap, mani, way, describe, piece, crap, think, said, everyth, came, mind, would, get, ban, sitehow, begin, well, one, doesnt, take, knowledge, origin, seri, know, movie, slap, face, people, whove, seen, biggest, butcher, theme, son...
3        [fan, coen, brother, george, clooney, anyone, see, skeptic, took, theater, someone, hollywood, dare, create, someth, differ, time, zanie, t

In [15]:
test_df[1] = clean_reviews
test_df[2] = model.predict(test_df[1])

test_df

Unnamed: 0,0,1,2
0,"""This film should have never been made. Honestly, I must admit that before I saw it I had some serious doubts. The director is not a great actress, though she did a lot of movies in Holland, and the young woman who took the main part is a TV-personality with a constant s...","[film, never, made, honestli, must, admit, saw, seriou, doubt, director, great, actress, though, lot, movie, holland, young, woman, took, main, part, tvperson, constant, smile, face, much, selfcritic, actor, play, main, part, recent, saw, bride, flight, although, film, b...",-1
1,This movie was bad from the start. The only purpose of the movie was that Angela wanted to get a high body count. The acting was horrible. The killings were acted out very badly. Like when Ally got stuffed down that toilet I guess it was in the abandoned cabin. But when ...,"[movie, bad, start, purpose, movie, angela, want, get, high, bodi, count, act, horrible, kill, act, badli, like, alli, got, stuf, toilet, guess, abandon, cabin, end, movie, come, molli, guy, cabin, see, alli, angela, must, gone, get, part, realli, got, black, girl, angel...",-1
2,"""God, I never felt so insulted in my whole life than with this crap. There are so many ways to describe this piece of crap, that I think that if I said everything that came to mind, I would get banned by this site.<br /><br />How do I begin? Well, for one, it doesn't tak...","[god, never, felt, insult, whole, life, crap, mani, way, describe, piece, crap, think, said, everyth, came, mind, would, get, ban, sitehow, begin, well, one, doesnt, take, knowledge, origin, seri, know, movie, slap, face, people, whove, seen, biggest, butcher, theme, son...",-1
3,"""Not being a fan of the Coen Brothers or George Clooney, anyone can see the skepticism I took into the theater. Once again, someone in Hollywood dares to create something different. This time it was those zanie (for a temporary lack of a better word) Coens doing """"their ...","[fan, coen, brother, george, clooney, anyone, see, skeptic, took, theater, someone, hollywood, dare, create, someth, differ, time, zanie, temporari, lack, better, word, coen, thing, one, great, work, literari, histori, wouldve, ever, thought, homer, mind, dont, know, fil...",1
4,"""The movie Andaz Apna Apna in my books is the top 5 intelligent comedy movies ever made in Bollywood perhaps even Hollywood. <br /><br />When the movie released i was a 8 year old and I heard it was a flop but I never understood till now why was it a flop...but let me te...","[movie, andaz, apna, apna, book, top, intellig, comedi, movie, ever, made, bollywood, perhap, even, hollywood, movie, releas, year, old, heard, flop, never, understood, till, flopbut, let, tell, one, thingthi, movie, would, money, sell, home, cassette, dvd, show, tv, mov...",1
...,...,...,...
14995,"""Family Guy has to be my all time favorite cartoon.It is definitely the funniest TV show ever made and is better than The Simpsons.I have never laughed so hard at a TV show in my life The things that make the show so funny is the plot,characters and themes that are dealt...","[famili, guy, time, favorite, cartoonit, definit, funniest, tv, show, ever, made, better, simpsonsi, never, laugh, hard, tv, show, life, thing, make, show, funni, plotcharact, theme, dealt, showthere, theme, dealt, famili, guyi, seen, everi, episode, famili, guy, ever, m...",1
14996,"""This was a marvelously funny comedy with a great cast. John Ritter and Katey Sagal were perfectly cast as the parents, and the kids were great too. Kaley Cuoco was a good choice to play Bridget, who was sort of a toned-down version of Kelly Bundy from Married with Child...","[marvel, funni, comedi, great, cast, john, ritter, katey, sagal, perfectli, cast, parent, kid, great, kaley, cuoco, good, choice, play, bridget, sort, toneddown, version, kelli, bundi, marri, children, write, performance, firstratesadli, john, ritter, die, seri, put, dam...",1
14997,"""There is no plot. There are no central characters. There are no moving cameras or close-ups. In fact, this film does not follow any of the conventional storytelling techniques used by mainstream film. However, Roy Andersson's Du Levande is a remarkable piece of cinemati...","[plot, central, charact, move, camera, closeup, fact, film, follow, convent, storytel, technique, use, mainstream, film, howev, roy, andersson, du, levande, remarkable, piece, cinemat, storytel, touch, look, human, psychecompris, seri, vignette, roy, andersson, give, us,...",1
14998,"""This show is awesome! I love all the actors! It has great story lines and characters. It is the perfect drama. James Caan and Josh Duhamel have great dialogue. They both can be really funny.I miss Vanessa Marcil on General Hospital, but she's great on here. James Lesure...","[show, awesome, love, actor, great, stori, line, charact, perfect, drama, jame, caan, josh, duhamel, great, dialogue, realli, funnyi, miss, vanessa, marcil, gener, hospit, she, great, jame, lesure, great, hilari, molli, sim, play, dimwit, well, write, awesomethey, keep, ...",1


In [16]:
#small sanity check
#test_df.iloc[25:45] 

In [17]:
#np.savetxt(r'Data/Soutput.txt',test_df[2],fmt='%d')
test_df[2]

0       -1
1       -1
2       -1
3        1
4        1
        ..
14995    1
14996    1
14997    1
14998    1
14999   -1
Name: 2, Length: 15000, dtype: int64

In [18]:
np.savetxt(r'Data/output.txt',test_df[2],fmt='%d')