# Modeling 

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('trainDataFeatures.tsv', sep='\t', index_col=0)

In [3]:
df

Unnamed: 0,id,sentiment,review,review_length,awesome,good,amazing,interesting,terrible,bad,awful,boring
0,5814_8,1,With all this stuff going down at the moment w...,2302,0,0,0,0,0,3,0,1
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi...",946,0,0,0,0,0,0,0,0
2,7759_3,0,The film starts with a manager (Nicholas Bell)...,2455,0,0,0,0,0,0,0,1
3,3630_4,0,It must be assumed that those who praised this...,2245,0,1,0,0,0,0,0,0
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...,2233,0,0,0,0,0,0,0,0
5,8196_8,1,I dont know why people think this is such a ba...,441,0,3,0,0,0,1,0,0
6,7166_2,0,"This movie could have been very good, but come...",605,0,1,0,0,0,1,0,0
7,10633_1,0,I watched this video at a friend's house. I'm ...,728,0,0,0,0,0,1,0,0
8,319_1,0,"A friend of mine bought this film for £1, and ...",964,0,0,0,0,0,0,0,0
9,8713_10,1,<br /><br />This movie is full of references. ...,245,0,0,0,0,0,0,0,0


In [4]:
columns = df.columns[3:]

In [5]:
columns

Index([u'review_length', u'good', u'bad', u'interesting', u'awesome',
       u'terrible', u'amazing', u'boring', u'awful'],
      dtype='object')

In [6]:
X = np.asarray(df[columns])

In [7]:
X

array([[2302,    0,    0, ...,    3,    0,    1],
       [ 946,    0,    0, ...,    0,    0,    0],
       [2455,    0,    0, ...,    0,    0,    1],
       ..., 
       [ 641,    0,    0, ...,    0,    0,    0],
       [1145,    0,    0, ...,    0,    0,    0],
       [ 942,    0,    0, ...,    0,    0,    0]])

In [8]:
y = np.asarray(df.sentiment.transpose())

In [9]:
y

array([1, 1, 0, ..., 0, 0, 1])

In [1]:
from sklearn.naive_bayes import MultinomialNB

In [2]:
MultinomialNB?

In [17]:
nb = MultinomialNB()

In [18]:
nb.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
nb.predict(np.asarray(df[columns]))

array([0, 1, 0, ..., 1, 1, 1])

In [20]:
np.mean(nb.predict(np.asarray(df[columns])) == df.sentiment)

0.67023999999999995

## Make Kaggle submission with test data

In [21]:
test_data = pd.read_csv('testData.tsv', sep='\t')

In [22]:
test_data

Unnamed: 0,id,review
0,12311_10,Naturally in a film who's main themes are of m...
1,8348_2,This movie is a disaster within a disaster fil...
2,5828_4,"All in all, this is a movie for kids. We saw i..."
3,7186_2,Afraid of the Dark left me with the impression...
4,12128_7,A very accurate depiction of small time mob li...
5,2913_8,"...as valuable as King Tut's tomb! (OK, maybe ..."
6,4396_1,This has to be one of the biggest misfires eve...
7,395_2,"This is one of those movies I watched, and won..."
8,10616_1,The worst movie i've seen in years (and i've s...
9,9074_9,"Five medical students (Kevin Bacon, David Labr..."


In [23]:
test_data['review_length'] = test_data.review.apply(len)

In [24]:
# Do the same feature extraction with the test data

from sklearn.feature_extraction.text import CountVectorizer
vocab_expand = ['awesome', 'good', 'amazing', 'interesting', 'terrible', 'bad', 'awful','boring']
expand_vectorizer = CountVectorizer(vocabulary=vocab_expand)
bow_expand = expand_vectorizer.fit_transform(test_data.review).todense()
words_expand = list(expand_vectorizer.vocabulary_.keys())
bow_expand_df = pd.DataFrame(bow_expand, index=test_data.index, columns = words_expand)
test_data_expand = test_data.join(bow_expand_df)
test_data_expand.to_csv('testDataFeatures.tsv', sep='\t')

In [29]:
predictions = nb.predict(np.asarray(test_data_expand[columns]))

In [30]:
submission = pd.DataFrame({"id": test_data.id, "sentiment": predictions})

In [34]:
submission

Unnamed: 0,id,sentiment
0,12311_10,1
1,8348_2,0
2,5828_4,0
3,7186_2,1
4,12128_7,0
5,2913_8,1
6,4396_1,1
7,395_2,1
8,10616_1,0
9,9074_9,1


In [38]:
submission.to_csv('submission_kaggle.tsv', index=False, sep='\t')

In [36]:
!ls

0-Exploration.ipynb	    environment.yml	      testDataFeatures.tsv
1-Feature_extraction.ipynb  labeledTrainData.tsv      testData.tsv
2-Modeling.ipynb	    labeledTrainData.tsv.zip  testData.tsv.zip
3-Validation.ipynb	    README.md		      trainDataFeatures.tsv
4-Ensemble.ipynb	    submission_kaggle.tsv
