In [83]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

np.random.seed(500)

In [104]:
def getData(file_path):
    """
    @param: file_path, str, path to the data file
    @return: an np array with each element as a line in the file
    """
    with open(file_path) as f:
        lines=f.readlines()          
        dd = np.array(lines)
    return dd

def getCorpus(neg, pos):
    corpus = {"text": [],
              "label": []}
    corpus["text"] = np.concatenate((neg, pos))
    corpus["label"] = np.array(["-1" for i in range(len(neg))]+["1" for i in range(len(pos))], dtype='str')
    return corpus

In [118]:
#load data
neg = getData("rt-polaritydata/rt-polarity.neg")
poo = getData("rt-polaritydata/rt-polarity.pos")

corpus = getCorpus(neg,pos)
print(corpus['text'].shape)
print(corpus['label'].shape)

(10662,)
(10662,)


In [119]:
#preprocessing
corpus['text'] = [entry.lower() for entry in corpus['text']]
corpus['text'] = [word_tokenize(entry) for entry in corpus['text']]


def preprocess_entry(entry):
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    Final_words = [word_Lemmatized.lemmatize(word,tag_map[tag[0]])\
        for word, tag in pos_tag(entry) \
        if word not in stopwords.words('english') and word.isalpha()]
    # The final processed set of words for each iteration will be stored in 'text_final'
    return str(Final_words)

corpus['final_text'] = [preprocess_entry(entry) for entry in corpus['text']]

In [120]:
for a in zip(corpus['final_text'], corpus['label']):
    print (a)

("['simplistic', 'silly', 'tedious']", '-1')
("['laddish', 'juvenile', 'teenage', 'boy', 'could', 'possibly', 'find', 'funny']", '-1')
("['exploitative', 'largely', 'devoid', 'depth', 'sophistication', 'would', 'make', 'watching', 'graphic', 'treatment', 'crime', 'bearable']", '-1')
("['garbus', 'discard', 'potential', 'pathological', 'study', 'exhume', 'instead', 'skewed', 'melodrama', 'circumstantial', 'situation']", '-1')
("['visually', 'flashy', 'narratively', 'opaque', 'emotionally', 'vapid', 'exercise', 'style', 'mystification']", '-1')
("['story', 'also', 'unoriginal', 'come', 'already', 'recycle', 'time', 'care', 'count']", '-1')
("['thing', 'give', 'movie', 'point', 'bravado', 'take', 'entirely', 'stale', 'concept', 'push', 'audience', 'meat', 'grinder', 'one', 'time']", '-1')
("['much', 'farcical', 'sour']", '-1')
("['unfortunately', 'story', 'actor', 'serve', 'hack', 'script']", '-1')
("['disquieting', 'relatively', 'allusion', 'serial', 'murder', 'fall', 'attempt', 'humaniz

("['clichéd', 'shallow', 'cautionary', 'tale', 'life', 'gay', 'men']", '-1')
("['fetid', 'underbelly', 'fame', 'never', 'look', 'uglier']", '-1')
("['little', 'weak', 'funny']", '-1')
("['welcome', 'see', 'chinese', 'film', 'depict', 'homosexual', 'relationship', 'mature', 'frank', 'fashion', 'lan', 'yu', 'never', 'catch', 'dramatic', 'fire']", '-1')
("['script', 'boast', 'tart', 'humor', 'film', 'trace', 'humanity', 'empathy']", '-1')
("['despite', 'pyrotechnic', 'narc', 'strictly', 'book']", '-1')
("['writing', 'cutting', 'achieve', 'kind', 'dramatic', 'unity', 'transport', 'end', 'simply', 'admire', 'bit', 'performance']", '-1')
("['cacoyannis', 'perhaps', 'effective', 'create', 'atmosphere', 'stagnation', 'labored', 'gentility']", '-1')
("['worth', 'see', 'charm', 'quickly', 'fade']", '-1')
("['original', 'good', 'movie', 'remake', 'make', 'look', 'like', 'masterpiece']", '-1')
("['one', 'suspect', 'craven', 'endorse', 'simply', 'movie', 'make', 'look', 'much', 'well', 'comparison'

("['cheat', 'retreat', 'comfortable', 'territory', 'bad']", '-1')
("['frenetic', 'really', 'funny']", '-1')
("['take', 'individually', 'collectively', 'story', 'never', 'add', 'much', 'promise']", '-1')
("['prepubescent', 'girl', 'laugh', 'britney', 'spear', 'debut', 'whenever', 'impatiently', 'squint', 'watch']", '-1')
("['didactic', 'dull', 'documentary', 'glorifying', 'software', 'anarchy']", '-1')
("['awful', 'snooze']", '-1')
("['sluggishly', 'direct', 'episodic', 'tv', 'veteran', 'joe', 'zwick', 'sitcom', 'without']", '-1')
("['could', 'nap', 'hour', 'miss', 'thing']", '-1')
("['director', 'clare', 'kilner', 'debut', 'never', 'daft']", '-1')
("['new', 'best', 'friend', 'go', 'straight', 'video', 'go', 'straight', 'mystery', 'science', 'theater', 'video']", '-1')
("['wallace', 'seem', 'less', 'like', 'burn', 'tell', 'war', 'story', 'itch', 'somehow', 'tack', 'one', 'together']", '-1')
("['thrill', 'long', 'go']", '-1')
("['plain', 'silly']", '-1')
("['begin', 'life', 'computer', '

("['thin', 'line', 'likably', 'count', 'monte', 'cristo', 'never', 'quite', 'settle', 'either', 'side']", '-1')
("['emotional', 'overload', 'female', 'angst', 'irreparably', 'drag', 'film']", '-1')
("['schaefer', 'determination', 'inject', 'farcical', 'raunch', 'drown', 'promise', 'romantic', 'angle']", '-1')
("['like', 'showgirl', 'glitter', 'entertaining', 'moment', 'unintentional']", '-1')
("['camera', 'work', 'interesting', 'film', 'budget', 'betray', 'surprisingly', 'shoddy', 'makeup', 'work']", '-1')
("['origin', 'story', 'well', 'tell', 'character', 'disappoint', 'anyone', 'value', 'original', 'comic', 'book', 'action', 'scenes', 'thing', 'fall', 'apart']", '-1')
("['impostor', 'step', 'director', 'gary', 'fleder']", '-1')
("['seagal', 'look', 'like', 'danny', 'aiello', 'day', 'mumble', 'way', 'movie']", '-1')
("['movie', 'negligible', 'work', 'manipulation', 'exploitation', 'piece', 'usual', 'worst', 'parent']", '-1')
("['lack', 'dramatic', 'punch', 'depth']", '-1')
("['moment'

("['even', 'predictable', 'endeavor', 'predecessor']", '-1')
("['whole', 'thing', 'play', 'like', 'tired', 'tyco', 'ad']", '-1')
("['film', 'show', 'enough', 'creative', 'process', 'even', 'create', 'figure', 'make', 'wilco', 'big', 'deal']", '-1')
("['soupy', 'end', 'result', 'odd', 'distinction', 'playful', 'without', 'fun']", '-1')
("['know', 'steven', 'seagal', 'consider', 'star', 'keep', 'cast', 'action', 'film', 'none', 'ever', 'good', 'make', 'money']", '-1')
("['even', 'intentionally', 'low', 'standard', 'humor', 'sorority', 'boy', 'bowser']", '-1')
("['one', 'explosion', 'movie', 'knockout', 'hundred', 'numb', 'proof', 'ballistic', 'eck', 'vs', 'sever']", '-1')
("['halfway', 'however', 'suck', 'dry', 'undead', 'action', 'flick', 'formula', 'blade', 'ii', 'mutates', 'monster', 'movie', 'effect', 'silly', 'scary']", '-1')
("['weight', 'slow', 'uninvolving', 'storytelling', 'flat', 'acting']", '-1')
("['ca', 'accuse', 'kung', 'pow', 'misfiring', 'since', 'exactly', 'want', 'atroc

("['rule', 'attraction', 'get', 'u', 'drink', 'party', 'favor', 'sober', 'u', 'transparent', 'attempt', 'moralizing']", '-1')
("['though', 'many', 'tense', 'scene', 'trap', 'prove', 'distressing', 'suspenseful']", '-1')
("['film', 'least', 'see', 'study', 'contrast', 'wide', 'range', 'one', 'actor', 'limited', 'range', 'comedian']", '-1')
("['feel', 'strangely', 'hollow', 'emotional', 'core']", '-1')
("['surprise']", '-1')
("['enter', 'bizarre', 'realm', 'director', 'adrian', 'lyne', 'hold', 'sway', 'relationship', 'simultaneously', 'broadly', 'metaphorical', 'oddly', 'abstract', 'excruciatingly', 'literal']", '-1')
("['scenario', 'soon', 'prove', 'preposterous', 'acting', 'robotically', 'italicize', 'hound', 'take', 'note', 'little', 'hustle', 'view']", '-1')
("['director', 'cut', 'add', 'minute', 'take', 'great', 'film', 'turn', 'mundane', 'soap', 'opera']", '-1')
("['characterisation', 'sacrifice', 'sake', 'spectacle']", '-1')
("['venezuelan', 'say', 'thing', 'like', 'si', 'pretty',

("['special', 'effect', 'many', 'scene', 'weightlessness', 'look', 'good', 'good', 'original', 'sound', 'jam', 'horner', 'rouse', 'score', 'make', 'good', 'use', 'hefty', 'audio', 'system']", '1')
("['heel', 'ring', 'come', 'similarly', 'morose', 'humorless', 'horror', 'movie', 'although', 'flaw', 'commend', 'approach', 'creepiness']", '1')
("['fence', 'noyce', 'tailor', 'epic', 'tale', 'lean', 'economical', 'movie']", '1')
("['n', 'utterly', 'charming', 'hilarious', 'film', 'remind', 'best', 'disney', 'comedy']", '1')
("['preaches', 'two', 'completely', 'different', 'choir', 'time', 'pretty', 'amazing', 'accomplishment']", '1')
("['thanks', 'haynes', 'absolute', 'control', 'film', 'mood', 'buoy', 'three', 'terrific', 'performance', 'far', 'heaven', 'actually', 'pull', 'stylistic', 'juggling', 'act']", '1')
("['birthday', 'girl', 'amuse', 'joy', 'ride', 'surprisingly', 'violent', 'moment']", '1')
("['romantic', 'emotional', 'ultimately', 'satisfying', 'original']", '1')
("['appealingly

("['white', 'oleander', 'movie', 'akin', 'reader', 'digest', 'condense', 'version', 'source', 'material']", '1')
("['like', 'go', 'house', 'party', 'watch', 'host', 'defend', 'frothing', 'want', 'call', 'cop', 'want', 'call', 'domino']", '1')
("['refresh', 'real', 'woman', 'curve', 'unforced', 'relaxed', 'actor']", '1')
("['direction', 'pleasingly', 'emphatic', 'properly', 'intense', 'claustrophobic', 'tale', 'obsessive', 'love']", '1')
("['secretary', 'original', 'ignore']", '1')
("['rare', 'film', 'whose', 'basis', 'fact', 'interesting', 'embellishment', 'need']", '1')
("['smart', 'fun', 'far', 'witty', 'wise']", '1')
("['stand', 'cheer', 'flick', 'sit', 'ponder', 'affair', 'thanks', 'kline', 'superbly', 'nuanced', 'performance', 'ponder', 'highly', 'pleasurable']", '1')
("['originality', 'ai', 'menu', 'never', 'dull', 'moment', 'giant', 'spider', 'invasion', 'comic', 'chiller']", '1')
("['walter', 'hill', 'undisputed', 'like', 'warner', 'bros', 'b', 'picture', 'mean', 'compliment']"

("['richly', 'entertaining', 'suggestive', 'number', 'metaphorical', 'reading']", '1')
("['compelling', 'allegory', 'last', 'day', 'germany', 'democratic', 'weimar', 'republic']", '1')
("['offer', 'trip', 'territory']", '1')
("['sit', 'enjoy', 'certain', 'level', 'forget']", '1')
("['devos', 'deliver', 'perfect', 'performance', 'capture', 'innocence', 'budding', 'demon', 'within', 'wallflower']", '1')
("['disappointingly', 'character', 'strange', 'dysfunctional', 'tom', 'include', 'ever', 'get', 'skin', 'compensate', 'large', 'part', 'dialogue', 'visual', 'playfulness', 'outlandishness', 'idea']", '1')
("['director', 'todd', 'solondz', 'make', 'movie', 'critical', 'reaction', 'two', 'previous', 'movie', 'responsibility', 'character', 'create']", '1')
("['word', 'come', 'mind', 'watch', 'eric', 'rohmer', 'tribute', 'courageous', 'scottish', 'lady', 'painterly']", '1')
("['fascinating', 'case', 'study', 'liberation', 'price', 'pay']", '1')
("['bluer', 'atlantic', 'biologically', 'detail'

("['arliss', 'howard', 'ambitious', 'move', 'adventurous', 'directorial', 'debut', 'big', 'bad', 'love', 'meet', 'many', 'challenge', 'pose', 'one', 'forgive', 'film', 'flaw']", '1')
("['critic', 'need', 'good', 'laugh', 'rendition', 'notorious', 'mtv', 'show', 'deliver', 'outrageous', 'sicken', 'sidesplitting', 'good', 'steaming', 'visceral', 'heap']", '1')
("['dumb', 'fun', 'curiously', 'adolescent', 'movie']", '1')
("['many', 'insightful', 'moment']", '1')
("['charm', 'lead', 'performance', 'allow', 'u', 'forget', 'film', 'problem']", '1')
("['vivid', 'sometimes', 'surreal', 'glimpse', 'mystery', 'human', 'behavior']", '1')
("['tour', 'de', 'force', 'modern', 'cinema']", '1')
("['peralta', 'capture', 'luminous', 'interview', 'amazingly', 'evocative', 'film', 'three', 'decade', 'ago', 'essence', 'dogtown', 'experience']", '1')
("['lively', 'appeal', 'last', 'kiss', 'lie', 'ease', 'integrate', 'thoughtfulness', 'comedy']", '1')
("['without', 'resort', 'camp', 'parody', 'haynes', 'like

In [121]:
#split test and training set
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(corpus['final_text'],corpus['label'],test_size=0.3)

In [122]:
sum([1 for i in Test_Y if i == '-1'])


1580

In [123]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [128]:
print (Test_Y)
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(corpus['final_text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
print(Tfidf_vect.vocabulary_)

[0 0 1 ... 1 0 0]


In [129]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  76.05501719287278


In [130]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)


SVM Accuracy Score ->  74.3044701469209
