In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


In [71]:
# Read the IMDB review/sentiment CSV (path is relative to the notebook's
# working directory) and preview the first five rows.
data_sentiment = pd.read_csv(filepath_or_buffer='movies/IMDB-Dataset-sentiment.csv')
data_sentiment.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [70]:
# Read the IMDB movie metadata CSV (path is relative to the notebook's
# working directory) and preview the first five rows.
data_table = pd.read_csv(filepath_or_buffer='movies/IMDB-Movie-Data-1000.csv')
data_table.head(5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [121]:
# Text-cleaning helpers come from the project-local `text_prepare` module.
from text_prepare import html_docode, clean_text, stemm_text, lemmatize_text

# Build the cleaned review frame. The raw reviews go through html_docode
# first; the resulting column is then passed through the remaining stages in
# order, each stage reading and overwriting the 'review' column.
reviews_clean = pd.DataFrame()
reviews_clean['review'] = html_docode(data_sentiment['review'])
for text_stage in (clean_text, lemmatize_text):
    reviews_clean['review'] = text_stage(reviews_clean['review'])

# Encode the label: 1 for 'positive', 0 for every other value.
reviews_clean['sentiment'] = [
    int(label == 'positive') for label in data_sentiment['sentiment']
]

# Positional split: the first 25000 rows are used for training, the rest
# for testing.
train_rows = 25000
reviews_train_clean = reviews_clean.iloc[:train_rows]
reviews_test_clean = reviews_clean.iloc[train_rows:]



  return [BeautifulSoup(article, "html").text for article in text]# HTML decoding. BeautifulSoup's text attribute will return a string stripped of any HTML tags and metadata.


In [124]:
# Fit the bag-of-words vocabulary on the training reviews only, then encode
# both splits with that same vocabulary. binary=True records whether a term
# occurs in a review rather than how many times.
train_reviews = reviews_train_clean['review']
cv = CountVectorizer(binary=True).fit(train_reviews)
X = cv.transform(train_reviews)
X_test = cv.transform(reviews_test_clean['review'])

In [125]:
# Carve a validation set (25%) out of the training matrix to compare
# regularisation strengths. The split is seeded so that the accuracies printed
# below -- and therefore the C value chosen from them -- are reproducible
# across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, reviews_train_clean['sentiment'], train_size=0.75, random_state=42
)

# Sweep the inverse regularisation strength C and report validation accuracy
# for each candidate.
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print("Accuracy for C=%s: %s"
          % (c, accuracy_score(y_val, lr.predict(X_val))))




Accuracy for C=0.01: 0.87264
Accuracy for C=0.05: 0.8808
Accuracy for C=0.25: 0.87936
Accuracy for C=0.5: 0.8776
Accuracy for C=1: 0.8744


In [126]:
# Refit logistic regression with C=0.05 on the whole training matrix, then
# score it once on the test reviews.
final_model = LogisticRegression(C=0.05)
final_model.fit(X, reviews_train_clean['sentiment'])

test_predictions = final_model.predict(X_test)
final_accuracy = accuracy_score(reviews_test_clean['sentiment'], test_predictions)
print("Final Accuracy: %s" % final_accuracy)


Final Accuracy: 0.87952


In [127]:
# Show the repr of the vectorised test matrix as the cell output.
X_test

<25000x126056 sparse matrix of type '<class 'numpy.int64'>'
	with 2355039 stored elements in Compressed Sparse Row format>

In [128]:
# Map every vocabulary term to its learned logistic-regression coefficient.
# `get_feature_names()` was removed from recent scikit-learn releases in
# favour of `get_feature_names_out()`; prefer the new accessor and fall back
# to the old one so the cell also runs on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# `.tolist()` yields plain Python floats, so the printed tuples look the same
# regardless of how the installed NumPy formats its scalar types.
feature_to_coef = dict(zip(feature_names, final_model.coef_[0].tolist()))

# Five terms with the largest (most positive) coefficients.
for best_positive in sorted(
        feature_to_coef.items(),
        key=lambda item: item[1],
        reverse=True)[:5]:
    print(best_positive)

# Five terms with the smallest (most negative) coefficients.
for best_negative in sorted(
        feature_to_coef.items(),
        key=lambda item: item[1])[:5]:
    print(best_negative)

('excellent', 0.8997158240087245)
('loved', 0.722123670297358)
('great', 0.699054023024935)
('perfect', 0.6657403807806586)
('favorite', 0.6617173413398936)
('worst', -1.377327557964645)
('waste', -1.248105066486168)
('awful', -1.1418926325530925)
('boring', -0.9073051925981613)
('horrible', -0.7854712697473282)
