In [1]:
#Sentiment Analysis on IMDb Large Movie Review Dataset using Natural Language Processing (NLP). 

#Importing the required libraries 
import numpy as np
import pandas as pd
import os
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

Important Note: 
The Large Reviews Dataset consists of train and test datasets, each with 25k reviews including positive and negative reviews that are present as multiple text files. 
These text files are merged into full_train.txt and full_test.txt (which we have used in this code) using shell commands mentioned in 'combine.docx' word document. 

In [2]:
#Loading the datasets
reviews_train = []
for line in open('full_train.txt', 'r', encoding = "utf-8"):
    
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open('full_test.txt', 'r', encoding = "utf-8"):
    
    reviews_test.append(line.strip())
    
target = [1 if i < 12500 else 0 for i in range(25000)]


In [3]:
#Data cleaning: Removing punctuation and HTML tags and making everything to lower-case for easy processing 
#We are using regular expressions/pattern matching approach

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "
reviews = []
def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)
reviews_train_clean[:1]


["bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my  years in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled  at  high a classic line inspector i'm here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn't"]

In [4]:
#Data Preparation
#Text Preprocessing

#1.Removing Stop words
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

english_stop_words = stopwords.words('english')
def remove_stop_words(corpus):
    removed_stop_words = ['in','of','at','a','the']
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

no_stop_words = remove_stop_words(reviews_train_clean)



[nltk_data] Downloading package stopwords to C:\Users\Sushmitha
[nltk_data]     Kenkare\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
#2.Normalization 

#Stemming 
def get_stemmed_text(corpus):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews = get_stemmed_text(reviews_train_clean)
stemmed_reviews[:1]

["bromwel high is a cartoon comedi it ran at the same time as some other program about school life such as teacher my year in the teach profess lead me to believ that bromwel high' satir is much closer to realiti than is teacher the scrambl to surviv financi the insight student who can see right through their pathet teachers' pomp the petti of the whole situat all remind me of the school i knew and their student when i saw the episod in which a student repeatedli tri to burn down the school i immedi recal at high a classic line inspector i'm here to sack one of your teacher student welcom to bromwel high i expect that mani adult of my age think that bromwel high is far fetch what a piti that it isn't"]

In [6]:
# Lemmatization 

nltk.download('wordnet')
def get_lemmatized_text(corpus):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews = get_lemmatized_text(reviews_train_clean)
lemmatized_reviews[:1]

[nltk_data] Downloading package wordnet to C:\Users\Sushmitha
[nltk_data]     Kenkare\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


["bromwell high is a cartoon comedy it ran at the same time a some other program about school life such a teacher my year in the teaching profession lead me to believe that bromwell high's satire is much closer to reality than is teacher the scramble to survive financially the insightful student who can see right through their pathetic teachers' pomp the pettiness of the whole situation all remind me of the school i knew and their student when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i'm here to sack one of your teacher student welcome to bromwell high i expect that many adult of my age think that bromwell high is far fetched what a pity that it isn't"]

# Bag of Words

In [7]:
# Bag of words 
# Initialize a bag of words  
vectorizer = CountVectorizer(analyzer = "word",   
                             tokenizer = None,    
                             preprocessor = None, 
                             stop_words = None,   
                             max_features = 500) 

# Fit transform the data
X = vectorizer.fit_transform(reviews_train_clean).toarray()
X_val = vectorizer.transform(reviews_test_clean).toarray()

In [8]:
# Take a look at the vocabulary
vocab = vectorizer.get_feature_names()
vocab

['able',
 'about',
 'absolutely',
 'act',
 'acting',
 'action',
 'actor',
 'actors',
 'actress',
 'actually',
 'after',
 'again',
 'against',
 'all',
 'almost',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'amazing',
 'american',
 'an',
 'and',
 'another',
 'any',
 'anyone',
 'anything',
 'are',
 'around',
 'art',
 'as',
 'at',
 'audience',
 'away',
 'awful',
 'back',
 'bad',
 'based',
 'be',
 'beautiful',
 'because',
 'become',
 'becomes',
 'been',
 'before',
 'beginning',
 'behind',
 'being',
 'believe',
 'best',
 'better',
 'between',
 'big',
 'bit',
 'black',
 'book',
 'boring',
 'both',
 'boy',
 'budget',
 'but',
 'by',
 'called',
 'came',
 'camera',
 'can',
 'car',
 'care',
 'case',
 'cast',
 'certainly',
 'character',
 'characters',
 'child',
 'children',
 'cinema',
 'classic',
 'close',
 'come',
 'comedy',
 'comes',
 'completely',
 'could',
 'couldn',
 'couple',
 'course',
 'dark',
 'day',
 'days',
 'dead',
 'death',
 'definitely',
 'despite',
 'dialogue',
 'd

In [9]:
# Split data to test preprocessing and modeling techniques
X_train, X_test, y_train, y_val = train_test_split(X, target,train_size=0.75)
    
final_rf = RandomForestClassifier(n_estimators = 10)
final_rf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_rf.predict(X_val)))
print ('F1 score:', f1_score(target, final_rf.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_rf.predict(X_val)))


Final Accuracy: 0.74828
F1 score: 0.7475471155429844

 Confusion Matrix:
 [[10027  2473]
 [ 3820  8680]]


In [10]:
#Logistic Regression
    
final_log = LogisticRegression(solver='lbfgs', max_iter=1000)
final_log.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_log.predict(X_val)))
print ('F1 score:', f1_score(target, final_log.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_log.predict(X_val)))



Final Accuracy: 0.84216
F1 score: 0.8421413135727094

 Confusion Matrix:
 [[10391  2109]
 [ 1837 10663]]


In [11]:
#Naive Bayes Classifier
    
final_nb = MultinomialNB()
final_nb.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_nb.predict(X_val)))
print ('F1 score:', f1_score(target, final_nb.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_nb.predict(X_val)))



Final Accuracy: 0.78552
F1 score: 0.7854882810959772

 Confusion Matrix:
 [[9971 2529]
 [2833 9667]]


In [12]:
#SVM Classifier
    
final_svm = LinearSVC(max_iter=1500)
final_svm.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_svm.predict(X_val)))
print ('F1 score:', f1_score(target, final_svm.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_svm.predict(X_val)))


Final Accuracy: 0.83728
F1 score: 0.8372291206878945

 Confusion Matrix:
 [[10687  1813]
 [ 2255 10245]]




In [13]:
#Decision Tree classifier
    
final_dec = DecisionTreeClassifier(criterion = "entropy")
final_dec.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_dec.predict(X_val)))
print ('F1 score:', f1_score(target, final_dec.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_dec.predict(X_val)))



Final Accuracy: 0.69312
F1 score: 0.6931154748035454

 Confusion Matrix:
 [[8712 3788]
 [3884 8616]]


# N-Gram 

In [14]:
#n-gram feature extraction

#Logistic Regression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

stop_words = ['in', 'of', 'at', 'a', 'the']
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 3), stop_words = stop_words)
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_val = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_test, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)
    
final_ngram = LogisticRegression(solver='lbfgs', max_iter=1000)
final_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_ngram.predict(X_val)))
print ('F1 score:', f1_score(target, final_ngram.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_ngram.predict(X_val)))



Final Accuracy: 0.89944
F1 score: 0.8994371964699379

 Confusion Matrix:
 [[11177  1323]
 [ 1191 11309]]


In [15]:
#Random Forest Classifier

    
final_ngram = RandomForestClassifier(n_estimators = 10)
final_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_ngram.predict(X_val)))
print ('F1 score:', f1_score(target, final_ngram.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_ngram.predict(X_val)))



Final Accuracy: 0.76076
F1 score: 0.7599151745348601

 Confusion Matrix:
 [[10251  2249]
 [ 3732  8768]]


In [16]:
#Naive Bayes Classifier
    
final_ngram = MultinomialNB()
final_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_ngram.predict(X_val)))
print ('F1 score:', f1_score(target, final_ngram.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_ngram.predict(X_val)))



Final Accuracy: 0.88164
F1 score: 0.8814567246992848

 Confusion Matrix:
 [[11512   988]
 [ 1971 10529]]


In [17]:
#Linear SVC
    
final = LinearSVC(max_iter=1500)
final.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final.predict(X_val)))
print ('F1 score:', f1_score(target, final_ngram.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_ngram.predict(X_val)))






Final Accuracy: 0.89904
F1 score: 0.8814567246992848

 Confusion Matrix:
 [[11512   988]
 [ 1971 10529]]


In [18]:
#Decision Tree Classifier
    
final_ngram = DecisionTreeClassifier()
final_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_ngram.predict(X_val)))
print ('F1 score:', f1_score(target, final_ngram.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_ngram.predict(X_val)))



Final Accuracy: 0.724
F1 score: 0.7239999841023991

 Confusion Matrix:
 [[9053 3447]
 [3453 9047]]


# Word Counts

In [19]:
#Logistic Regression

wc_vectorizer = CountVectorizer(binary=False)
wc_vectorizer.fit(reviews_train_clean)
X = wc_vectorizer.transform(reviews_train_clean)
X_val = wc_vectorizer.transform(reviews_test_clean)

X_train, X_test, y_train, y_val = train_test_split(
    X, target, train_size = 0.75, 
)
    
final_wc = LogisticRegression(solver='lbfgs', max_iter=1000)
final_wc.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_wc.predict(X_val)))
print ('F1 score:', f1_score(target, final_wc.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_wc.predict(X_val)))


Final Accuracy: 0.86748
F1 score: 0.867471088756008

 Confusion Matrix:
 [[10946  1554]
 [ 1759 10741]]


In [20]:
#Random Forest Classifier
    
final_wc = RandomForestClassifier(n_estimators = 10)
final_wc.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_wc.predict(X_val)))
print ('F1 score:', f1_score(target, final_wc.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_wc.predict(X_val)))


Final Accuracy: 0.7462
F1 score: 0.7449488205258901

 Confusion Matrix:
 [[10203  2297]
 [ 4048  8452]]


In [21]:
#Naive Bayes Classifier
    
final_wc = MultinomialNB()
final_wc.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_wc.predict(X_val)))
print ('F1 score:', f1_score(target, final_wc.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_wc.predict(X_val)))


Final Accuracy: 0.81512
F1 score: 0.8143538797494231

 Confusion Matrix:
 [[10992  1508]
 [ 3114  9386]]


In [22]:
#Linear SVC
    
final_wc = LinearSVC(max_iter=1500)
final_wc.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_wc.predict(X_val)))
print ('F1 score:', f1_score(target, final_wc.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_wc.predict(X_val)))


Final Accuracy: 0.84588
F1 score: 0.8458613492232561

 Confusion Matrix:
 [[10711  1789]
 [ 2064 10436]]




In [23]:
#Decision Tree Classifier
   
final_wc = DecisionTreeClassifier()
final_wc.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_wc.predict(X_val)))
print ('F1 score:', f1_score(target, final_wc.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_wc.predict(X_val)))


Final Accuracy: 0.71008
F1 score: 0.7100792578029002

 Confusion Matrix:
 [[8896 3604]
 [3644 8856]]


# TF-IDF

In [24]:
#Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_val = tfidf_vectorizer.transform(reviews_test_clean)

X_train, X_test, y_train, y_val = train_test_split(
    X, target, train_size = 0.75
)

    
final_tfidf = LogisticRegression()
final_tfidf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_tfidf.predict(X_val)))
print ('F1 score:', f1_score(target, final_tfidf.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_tfidf.predict(X_val)))


Final Accuracy: 0.88204
F1 score: 0.8820396510261037

 Confusion Matrix:
 [[11047  1453]
 [ 1496 11004]]


In [25]:
#Random Forest Classifier
    
final_tfidf = RandomForestClassifier(n_estimators = 10)
final_tfidf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_tfidf.predict(X_val)))
print ('F1 score:', f1_score(target, final_tfidf.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_tfidf.predict(X_val)))


Final Accuracy: 0.73056
F1 score: 0.728656323969551

 Confusion Matrix:
 [[10179  2321]
 [ 4415  8085]]


In [26]:
#Naive Bayes Classifier
    
final_tfidf = MultinomialNB()
final_tfidf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_tfidf.predict(X_val)))
print ('F1 score:', f1_score(target, final_tfidf.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_tfidf.predict(X_val)))


Final Accuracy: 0.83024
F1 score: 0.8295866203788163

 Confusion Matrix:
 [[11152  1348]
 [ 2896  9604]]


In [27]:
#Linear SVC
    
final_tfidf = LinearSVC()
final_tfidf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_tfidf.predict(X_val)))
print ('F1 score:', f1_score(target, final_tfidf.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_tfidf.predict(X_val)))


Final Accuracy: 0.87704
F1 score: 0.8770245739625577

 Confusion Matrix:
 [[11103  1397]
 [ 1677 10823]]


In [28]:
#Decision Tree Classifier
    
final_tfidf = DecisionTreeClassifier(criterion = "entropy")
final_tfidf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_tfidf.predict(X_val)))
print ('F1 score:', f1_score(target, final_tfidf.predict(X_val),
                            average='weighted'))
print ('\n Confusion Matrix:\n',confusion_matrix(target,final_tfidf.predict(X_val)))


Final Accuracy: 0.70488
F1 score: 0.704879771458895

 Confusion Matrix:
 [[8800 3700]
 [3678 8822]]


# Toxic Comment Classification

In [29]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import sys

In [30]:
#defining regular expression for preprocessing
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")


In [31]:
#function to load the train and test data sets
def load():
	reviews_train = []
	for line in open('full_train.txt', 'r', encoding = "utf8"):
		reviews_train.append(line.strip())

	reviews_test = []
	for line in open('full_test.txt', 'r', encoding = "utf8"):
	    reviews_test.append(line.strip())

	return(reviews_train,reviews_test)

In [32]:
#function to clean up the cluttered review set
def preprocess_reviews(reviews):
	reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
	reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
	return reviews

In [33]:
#function to test human generated review for cross validation

def test_new_review(final_model,cv):
    
    while True:
        pred = final_model.predict(cv.transform([input("Check Your Own Review :")]))[0]
        if(pred == 0):
            print("Negative Review!")
            a = input("Do you want to continue?[Y/N]:").lower()
            if a =="y":
                test_new_review(final_model,cv)
            elif a=="n":
                break
        else:
            print("Positive Review!")
            a = input("Do you want to continue?[Y/N]:").lower()
            if a =="y":
                test_new_review(final_model,cv)
            elif a=="n":
                break
    
    print("GoodBye!")
    quit()    

In [34]:
#function to give example of some positive and negative tokens
def token_example(feature_to_coef):
	print("Example of some positive words and their weightage:")
	for best_positive in sorted(
	    feature_to_coef.items(), 
	    key=lambda x: x[1], 
	    reverse=True)[:5]:
	    print (best_positive)
    
	print("Example of some negative words and their weightage")
	for best_negative in sorted(
	    feature_to_coef.items(), 
	    key=lambda x: x[1])[:5]:
	    print (best_negative)

def Regularisation_parameter(X_train, y_train, y_val, X_val):
	z = 0
	for c in [0.01, 0.05, 0.25, 0.5, 1]:
	    lr = LogisticRegression(C=c, solver='lbfgs', max_iter=1000)
	    lr.fit(X_train, y_train)
	    print ("Accuracy for C=%s: %s" 
	           % (c, accuracy_score(y_val, lr.predict(X_val))))
	    if(z < accuracy_score(y_val, lr.predict(X_val))):
	    	z = accuracy_score(y_val, lr.predict(X_val))
	
	return z


In [None]:
def main():
	#loading the train and test data sets
	reviews_train,reviews_test = load()

	#preprocessing the given data
	reviews_train_clean = preprocess_reviews(reviews_train)
	reviews_test_clean = preprocess_reviews(reviews_test)


	#vectorization of the reviews
	cv = CountVectorizer(binary=True)
	cv.fit(reviews_train_clean) 
	X = cv.transform(reviews_train_clean) # will give a sparse matrix find a way to make this efficient
	X_test = cv.transform(reviews_test_clean)
	target = [1 if i < 12500 else 0 for i in range(25000)]


	#splitting the train and test data
	X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.75)

	#choosing the regularisaton parameter (logistic regression) for the greatest accuracy value
	best_c_value = Regularisation_parameter(X_train, y_train, y_val, X_val)

	#training the final logistic model for the best accuracy
	final_model = LogisticRegression(C=best_c_value, solver='lbfgs', max_iter=1000)
	final_model.fit(X, target)
	print ("Final Accuracy: %s" % accuracy_score(target, final_model.predict(X_test)))

	feature_to_coef = {
    word: coef for word, coef in zip(
        cv.get_feature_names(), final_model.coef_[0]
    )
	}

    #example of pos and neg sentiment
	token_example(feature_to_coef) 
	
	while(True):
		test_new_review(final_model,cv) #checking the model with human data

if __name__ == '__main__':
	main()

Accuracy for C=0.01: 0.86704
Accuracy for C=0.05: 0.87664
Accuracy for C=0.25: 0.87472
Accuracy for C=0.5: 0.87328
Accuracy for C=1: 0.8712
Final Accuracy: 0.87008
Example of some positive words and their weightage:
('excellent', 1.5291790338605942)
('refreshing', 1.4764307717726397)
('perfect', 1.3507169574739248)
('appreciated', 1.3389001046838678)
('superb', 1.3027981975619096)
Example of some negative words and their weightage
('worst', -2.2955474361954353)
('waste', -2.1342313303990426)
('disappointment', -1.938409888254895)
('poorly', -1.9148824673258413)
('awful', -1.7183549900412898)
Check Your Own Review :Horrible
Negative Review!
Do you want to continue?[Y/N]:Y
Check Your Own Review :Very good movie
Positive Review!


Conclusion from comparing the results: Logistic Regression performs better acrosss all feature extraction methods. 
For more details: refer to the report document.