In [57]:
import pandas as pd
df = pd.read_csv('imdb_master.csv', encoding = "ISO-8859-1")
print(df.shape)
df.head()

(100000, 5)


Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [58]:
df = df.drop(columns='Unnamed: 0') # don't need
df = df.drop(columns='file') # don't need
print(df.isna().sum(), end = '\n\n') # check if any NA values
print(df['label'].value_counts(), end = '\n\n')
print(df['type'].value_counts(), end = '\n\n')
print('Average review length:', int(df['review'].apply(lambda x: len(x)).mean()), 'words')

type      0
review    0
label     0
dtype: int64

unsup    50000
pos      25000
neg      25000
Name: label, dtype: int64

train    75000
test     25000
Name: type, dtype: int64

Average review length: 1319 words


In [59]:
df = df[df['label'] != 'unsup'] # not using any ratings that don't have pos/neg labels

In [60]:
# clean up punctuation so a review is turned into words only 

import re

REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\*)|(\$)|(\&)|(\#)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

def preprocess_reviews(reviews):
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

df['review'] = preprocess_reviews(df['review'])
print(df['review'].head())

0    once again mr costner has dragged out a movie ...
1    this is an example of why the majority of acti...
2    first of all i hate those moronic rappers who ...
3    not even the beatles could write songs everyon...
4    brass pictures movies is not a fitting word fo...
Name: review, dtype: object


In [61]:
# get rid of stop words (words like 'if', 'but', 'we', 'he') which won't help predicing sentiment

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in stop_words])
        )
    return removed_stop_words

df['review'] = remove_stop_words(df['review'])
print(df['review'].head())

0    mr costner dragged movie far longer necessary ...
1    example majority action films generic boring t...
2    first hate moronic rappers couldnt act gun pre...
3    even beatles could write songs everyone liked ...
4    brass pictures movies fitting word really some...
Name: review, dtype: object


In [62]:
# normalize words by grouping different forms of the same word into one, eg: car, cars, car's, cars' -> car. The below uses lemmatization (other option is stemming)

def get_lemmatized_text(corpus):
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

df['review'] = get_lemmatized_text(df['review'])
print(df['review'].head())

0    mr costner dragged movie far longer necessary ...
1    example majority action film generic boring th...
2    first hate moronic rapper couldnt act gun pres...
3    even beatles could write song everyone liked a...
4    brass picture movie fitting word really somewh...
Name: review, dtype: object


In [63]:
# Train model method #1 (simplest)
# First try one hot encoding where each review is turned into a matrix of 0's and 1's, with each 0 or 1 representing whether a certain word is present in the review. Every unique word across all reviews will have an index somewhere in the matrix.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# shuffle
df = df.sample(frac=1)

# create train and test datasets
df_train = df[df['type'] == 'train']
df_test = df[df['type'] == 'test']

cv = CountVectorizer(binary=True)
cv.fit(df_train['review'])
X_train = cv.transform(df_train['review'])
X_test = cv.transform(df_test['review'])

target_train = df_train['label']
target_test = df_test['label']

model = LogisticRegression()
model.fit(X_train, target_train)

print("Accuracy of %s for method #1" % accuracy_score(target_test, model.predict(X_test)))

Accuracy of 0.86572 for method #1


In [64]:
# Can also try logistic regression with different values of c
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    model = LogisticRegression(C=c)
    model.fit(X_train, target_train)
    print ("Accuracy on method #1 (plain logistic regression) for C=%s: %s" % (c, accuracy_score(target_test, model.predict(X_test))))

Accuracy on method #1 (plain logistic regression) for C=0.01: 0.87468
Accuracy on method #1 (plain logistic regression) for C=0.05: 0.8774
Accuracy on method #1 (plain logistic regression) for C=0.25: 0.87308
Accuracy on method #1 (plain logistic regression) for C=0.5: 0.87008
Accuracy on method #1 (plain logistic regression) for C=1: 0.86572


In [65]:
# Train model method #2 - use ngrams to consider more than one word at once 

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(df_train['review'])
X_train = ngram_vectorizer.transform(df_train['review'])
X_test = ngram_vectorizer.transform(df_test['review'])

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    model = LogisticRegression(C=c)
    model.fit(X_train, target_train)
    print ("Accuracy on method #2 (logistic regression with ngrams) for C=%s: %s" % (c, accuracy_score(target_test, model.predict(X_test))))

Accuracy on method #2 (logistic regression with ngrams) for C=0.01: 0.8806
Accuracy on method #2 (logistic regression with ngrams) for C=0.05: 0.88652
Accuracy on method #2 (logistic regression with ngrams) for C=0.25: 0.8872
Accuracy on method #2 (logistic regression with ngrams) for C=0.5: 0.88768
Accuracy on method #2 (logistic regression with ngrams) for C=1: 0.88728


In [66]:
# Train model method #3 - random forest
from sklearn.model_selection import train_test_split

vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 6000) 
train_data_features = vectorizer.fit_transform(df_train['review'])
train_data_features = train_data_features.toarray()

from sklearn.ensemble import RandomForestClassifier

x_train, x_test, y_train, y_test = train_test_split(train_data_features, df_train['label'], test_size=0.2, random_state=0)

forest = RandomForestClassifier(n_estimators = 100) 
forest = forest.fit(x_train, y_train)

result = forest.predict(x_test)
print('Accuracy on method #3 (random forest): %s' % (accuracy_score(y_test, result)))

Accuracy on method #3 (random forest): 0.838


In [67]:
# Train model with method #4

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers

max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df_train['review'])
list_tokenized_train = tokenizer.texts_to_sequences(df_train['review'])

maxlen = 130

X_train = pad_sequences(list_tokenized_train, maxlen=maxlen)

y_train = df_train['label']
y_train = y_train.replace('neg', 0)
y_train = y_train.replace('pos', 1)

embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 100
epochs = 3 # validation loss increases (starts to overfit) 

model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x19120a36668>

In [68]:
# Predict with method #4

from sklearn.metrics import f1_score

list_tokenized_test = tokenizer.texts_to_sequences(df_test['review'])

X_test = pad_sequences(list_tokenized_test, maxlen=maxlen)

y_test = df_test['label']
y_test = y_test.replace('neg', 0)
y_test = y_test.replace('pos', 1)

prediction = model.predict(X_test)

y_pred = (prediction > 0.5)

print("Accuracy: %s" % accuracy_score(y_test, y_pred))
print('F1-score: {0}'.format(f1_score(y_pred, y_test)))

Accuracy: 0.85816
F1-score: 0.8525816911948116
