# IMDB Review - Deep Model ~ 93.51% Accuracy

[https://www.kaggle.com/nilanml/imdb-review-deep-model-94-89-accuracy](https://www.kaggle.com/nilanml/imdb-review-deep-model-94-89-accuracy)

<br>

In [1]:
from importlib import reload
import sys
from imp import reload
import warnings
warnings.filterwarnings('ignore')

if sys.version[0] == '2' :
    reload(sys)
    sys.setdefaultencoding('utf-8')

In [2]:
import pandas as pd

df1 = pd.read_csv('../../input/labeledTrainData.tsv',
                  delimiter='\t')
df1 = df1.drop(['id'], axis=1)
df1.head()

Unnamed: 0,sentiment,review
0,1,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...


In [3]:
df2 = pd.read_csv('../../input/imdb_master.csv', encoding='latin-1')
df2.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [4]:
df2 = df2.drop(['Unnamed: 0', 'type', 'file'], axis=1)
df2.columns = ['review', 'sentiment']
df2.head()

Unnamed: 0,review,sentiment
0,Once again Mr. Costner has dragged out a movie...,neg
1,This is an example of why the majority of acti...,neg
2,"First of all I hate those moronic rappers, who...",neg
3,Not even the Beatles could write songs everyon...,neg
4,Brass pictures (movies is not a fitting word f...,neg


In [5]:
df2 = df2[df2.sentiment != 'unsup']
df2['sentiment'] = df2['sentiment'].map({'pos': 1, 'neg': 0})
df2.head()

Unnamed: 0,review,sentiment
0,Once again Mr. Costner has dragged out a movie...,0
1,This is an example of why the majority of acti...,0
2,"First of all I hate those moronic rappers, who...",0
3,Not even the Beatles could write songs everyon...,0
4,Brass pictures (movies is not a fitting word f...,0


In [6]:
df = pd.concat([df1, df2]).reset_index(drop=True)
df.head()

Unnamed: 0,review,sentiment
0,With all this stuff going down at the moment w...,1
1,"\The Classic War of the Worlds\"" by Timothy Hi...",1
2,The film starts with a manager (Nicholas Bell)...,0
3,It must be assumed that those who praised this...,0
4,Superbly trashy and wondrously unpretentious 8...,1


In [7]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text) :
    text = re.sub(r'[^\w\s]', '', text, re.UNICODE)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    
    return text

df["Processed_Reviews"] = df.review.apply(lambda x: clean_text(x))

In [8]:
df.head()

Unnamed: 0,review,sentiment,Processed_Reviews
0,With all this stuff going down at the moment w...,1,stuff going moment mj ive started listening mu...
1,"\The Classic War of the Worlds\"" by Timothy Hi...",1,classic war world timothy hines entertaining f...
2,The film starts with a manager (Nicholas Bell)...,0,film start manager nicholas bell giving welcom...
3,It must be assumed that those who praised this...,0,must assumed praised film greatest filmed oper...
4,Superbly trashy and wondrously unpretentious 8...,1,superbly trashy wondrously unpretentious 80 ex...


In [9]:
df.Processed_Reviews.apply(lambda x: len(x.split(" "))).mean()

128.63182666666665

In [10]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

max_features = 6000

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['Processed_Reviews'])
list_tokenized_train = tokenizer.texts_to_sequences(df['Processed_Reviews'])

maxlen = 130
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = df['sentiment']

embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences=True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation='relu'))
model.add(Dropout(0.05))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 100
epochs = 3
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 60000 samples, validate on 15000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x19ea7badd30>

In [11]:
df_test = pd.read_csv("../../input/testData.tsv", header=0, delimiter="\t",
                      quoting=3)
df_test.head()

Unnamed: 0,id,review
0,"""12311_10""","""Naturally in a film who's main themes are of ..."
1,"""8348_2""","""This movie is a disaster within a disaster fi..."
2,"""5828_4""","""All in all, this is a movie for kids. We saw ..."
3,"""7186_2""","""Afraid of the Dark left me with the impressio..."
4,"""12128_7""","""A very accurate depiction of small time mob l..."


In [12]:
df_test['review'] = df_test.review.apply(lambda x: clean_text(x))
df_test['sentiment'] = df_test['id'].map(lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0)

df_test.head()

Unnamed: 0,id,review,sentiment
0,"""12311_10""",naturally film main theme mortality nostalgia ...,1
1,"""8348_2""",movie disaster within disaster film full great...,0
2,"""5828_4""",movie kid saw tonight child loved one point ki...,0
3,"""7186_2""",afraid dark left impression several different ...,0
4,"""12128_7""",accurate depiction small time mob life filmed ...,1


In [13]:
y_test = df_test['sentiment']
list_sentences_test = df_test['review']
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)
prediction = model.predict(X_te)
y_pred = (prediction > 0.5)

In [14]:
from sklearn.metrics import f1_score, confusion_matrix
print('F1-score: {0}'.format(f1_score(y_pred, y_test)))
print('Confusion matrix:')
confusion_matrix(y_pred, y_test)

F1-score: 0.9608039396244547
Confusion matrix:


array([[12022,   501],
       [  478, 11999]], dtype=int64)

In [51]:
sub_sentiment = pd.Series([v.tolist()[0] for v in y_pred]).apply(lambda x: 1 if x == True else 0)
sub_id = df_test.id
sub_id = sub_id.apply(lambda x: x.strip('"'))

df_submission = pd.DataFrame({"id": sub_id, "sentiment": sub_sentiment})

df_submission.to_csv('NB02_submission.csv', index=False)

<img src="../../img/NB02_submission.jpg" />