# MOVIE REVIEW

This notebook builds a basic movie-review classifier using logistic regression.

# IMPORTS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# READ THE DATA

In [12]:
# Load the training reviews, one per line, with surrounding whitespace stripped.
# The context manager closes the file handle as soon as the list is built
# instead of leaving it open until garbage collection.
with open('movie_data/full_train.txt', 'r') as train_file:
    reviews_train = [line.strip() for line in train_file]

In [15]:
# Display the first raw training review as a quick sanity check of the load.
reviews_train[0]

'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'

In [13]:
# Load the test reviews, one per line, with surrounding whitespace stripped.
# The context manager closes the file handle as soon as the list is built
# instead of leaving it open until garbage collection.
with open('movie_data/full_test.txt', 'r') as test_file:
    reviews_test = [line.strip() for line in test_file]

# CLEAN AND PROCESS

In [14]:
import re

# Characters deleted outright: punctuation, quotes, parentheses and brackets.
# Raw strings are used so that regex escapes such as \[ and \s reach the regex
# engine untouched instead of being parsed as (invalid) Python string escapes.
REPLACE_NO_SPACE = re.compile(r"[.;:!'?,\"()\[\]]")

# Spans replaced by one space: a doubled HTML line break, a hyphen, or a slash.
# NOTE(review): a lone "<br />" is not matched as a unit; only its "/" is
# replaced, which leaves "<br" and ">" in the cleaned text.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")


def preprocess_reviews(reviews):
    """Lower-case each review and normalise its punctuation.

    Every review is lower-cased, then characters matched by
    ``REPLACE_NO_SPACE`` are removed and spans matched by
    ``REPLACE_WITH_SPACE`` are each replaced with a single space. The
    removal runs first, so the space substitution sees text that has
    already lost its punctuation.

    Args:
        reviews: Iterable of review strings.

    Returns:
        A new list of cleaned strings, in the same order as the input.
    """
    return [
        REPLACE_WITH_SPACE.sub(" ", REPLACE_NO_SPACE.sub("", line.lower()))
        for line in reviews
    ]

# Run the identical cleaning step over both splits so that the text handed to
# the vectorizer is normalised the same way for training and test data.
reviews_train_clean, reviews_test_clean = map(
    preprocess_reviews, (reviews_train, reviews_test)
)

# VECTORIZATION

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# binary=True records only whether a word occurs in a review, not how often.
cv = CountVectorizer(binary=True)

# Learn the vocabulary from the training reviews and encode them in one step.
x = cv.fit_transform(reviews_train_clean)

# Encode the test reviews with the vocabulary learned from the training set.
x_test = cv.transform(reviews_test_clean)

# CLASSIFIER

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Positional labels: rows 0..12499 are class 1, rows 12500..24999 are class 0.
# NOTE(review): presumes the review files list all positive reviews first,
# followed by all negative ones — confirm against the data source.
target = [1] * 12500 + [0] * 12500

In [19]:
# Hold out 25% of the training rows for validation. Fixing random_state makes
# the shuffle, and therefore every score computed from this split, repeatable
# across kernel restarts.
x_train, x_val, y_train, y_val = train_test_split(
    x, target, train_size=0.75, random_state=42
)

In [20]:
# Sweep the regularisation parameter C and report validation accuracy for each
# candidate; a fresh model is fitted on the training split at every step.
for c in (0.01, 0.05, 0.25, 0.5, 1):
    lr = LogisticRegression(C=c).fit(x_train, y_train)
    print(
        "Accuracy for c=%s : %s"
        % (c, accuracy_score(y_val, lr.predict(x_val)))
    )



Accuracy for c=0.01 : 0.87344
Accuracy for c=0.05 : 0.87808
Accuracy for c=0.25 : 0.87728
Accuracy for c=0.5 : 0.87632
Accuracy for c=1 : 0.87216


In [23]:
# Refit on the full training matrix with C=0.05 (the best-scoring value in the
# recorded validation output) and score the held-out test reviews.
# NOTE(review): `target` is reused as the test labels, which presumes the test
# file has the same size and label layout as the training file — confirm.
final_model = LogisticRegression(C=0.05).fit(x, target)
print("Final Accuracy: %s" % accuracy_score(target, final_model.predict(x_test)))



Final Accuracy: 0.88152


In [25]:
# Map every vocabulary word to its learned logistic-regression coefficient.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2, so use
# get_feature_names_out() when it exists and fall back only on old versions.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
# coef_[0] is the single coefficient row of the fitted binary model.
feature_to_coef = dict(zip(feature_names, final_model.coef_[0]))
feature_to_coef

{'00': 0.0014948418008428562,
 '000': -3.528957368113216e-06,
 '0000000000001': -0.0037195150377489186,
 '000001': -0.00970672327946288,
 '0001': 0.00019633292265635937,
 '00015': -8.048873802269473e-05,
 '001': -0.004159573544627439,
 '002': -0.0013281055496048603,
 '003830': 0.018744505679835058,
 '006': 0.003170696661843455,
 '007': 0.014625346339754772,
 '0079': 0.005369416164161192,
 '0080': 0.020903803355872445,
 '0083': 0.013397678869536765,
 '00s': 0.012487280390480736,
 '01': 0.03392859658519787,
 '010': 0.005813712173209905,
 '0130': -0.007981806993444791,
 '02': 0.020152206981135613,
 '020410html': 3.68027132948347e-06,
 '029': -0.008657677259189535,
 '03': 0.011482132368010172,
 '04': -0.008103279568666653,
 '041': -8.250374747923012e-05,
 '048': 0.014099730704636123,
 '05': -0.015429731821803438,
 '050': -0.00018093867060236515,
 '06': -0.06980312599747356,
 '06th': 0.0025247475311859823,
 '07': 0.04240661533175121,
 '079': 0.01990237135273817,
 '08': -0.03992935913074067,

In [27]:
# Five words with the largest coefficients, strongest first.
for best_positive in sorted(
    feature_to_coef.items(), key=lambda pair: pair[1], reverse=True
)[:5]:
    print(best_positive)

# Five words with the smallest (most negative) coefficients, most negative first.
for best_negative in sorted(
    feature_to_coef.items(), key=lambda pair: pair[1]
)[:5]:
    print(best_negative)
    

('excellent', 0.9292548936291094)
('perfect', 0.7907005707986388)
('great', 0.6745323603248643)
('amazing', 0.6127039745377998)
('superb', 0.6019367892558157)
('worst', -1.3645958397668139)
('waste', -1.1664241646706806)
('awful', -1.032418880046021)
('poorly', -0.8752018604972962)
('boring', -0.8563543288133768)
