# Natural language processing for IMDb reviews

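Summary: the random forest seems to perform better than Naive Bayes on these reviews. The cells shown here do not compute an evaluation metric, so this comparison is not reproduced below.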

## Setup

In [40]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import re
import nltk
# download text data sets
# nltk.download()
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

## Data

In [41]:
# Load the labelled training reviews from a tab-separated file.
#   header=0  -> the first line of the file holds the column names
#   quoting=3 -> csv.QUOTE_NONE: quote characters are kept as ordinary text
train = pd.read_csv(
    'labeledTrainData.tsv',
    sep='\t',
    header=0,
    quoting=3,
)

In [42]:
# Sanity-check the load: print the (rows, columns) shape, then the first five rows.
print(train.shape, train.head(), sep="\n")

(25000, 3)
         id  sentiment                                             review
0  "5814_8"          1  "With all this stuff going down at the moment ...
1  "2381_9"          1  "\"The Classic War of the Worlds\" by Timothy ...
2  "7759_3"          0  "The film starts with a manager (Nicholas Bell...
3  "3630_4"          0  "It must be assumed that those who praised thi...
4  "9495_8"          1  "Superbly trashy and wondrously unpretentious ...


## Tokenization

In [43]:
# Stop-word sets keyed by language, filled lazily so the NLTK corpus is read
# once instead of once per review.
_STOPWORDS_CACHE = {}


def _get_stopwords(language="english"):
    """Return the NLTK stop words for ``language`` as a frozenset, loading them only once."""
    if language not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_CACHE[language]


def review_to_words(raw_review):
    """
    Convert one raw review into a single string of cleaned words.

    Steps: strip HTML markup, replace every character that is not an ASCII
    letter with a space, lower-case, split on whitespace, drop English stop
    words, and join the remaining words with single spaces.

    Parameters
    ----------
    raw_review : str
        Text of a single review; may contain HTML markup.

    Returns
    -------
    str
        Space-separated lower-case words with stop words removed
        (an empty string if nothing remains).
    """
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed, which emits a warning and can give
    # different text on different machines.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    letters_only = re.sub("[^A-Za-z]", " ", review_text)
    words = letters_only.lower().split()

    stops = _get_stopwords("english")
    meaningful_words = [w for w in words if w not in stops]

    return " ".join(meaningful_words)

In [44]:
# Clean every training review, reporting progress after each 5000 reviews.
num_reviews = train['review'].size
clean_train_reviews = []
for position, raw_text in enumerate(train["review"], start=1):
    if position % 5000 == 0:
        print("Review %d of %d\n" % (position, num_reviews))
    clean_train_reviews.append(review_to_words(raw_text))

Review 5000 of 25000

Review 10000 of 25000

Review 15000 of 25000

Review 20000 of 25000

Review 25000 of 25000



In [45]:
# Bag-of-words model limited to the 5000 most frequent terms. Every other
# CountVectorizer option stays at its default (word analyzer, built-in
# tokenizer and preprocessor, no stop-word list).
vectorizer = CountVectorizer(max_features=5000)

# Learn the vocabulary and count term occurrences in one pass, then convert
# the sparse result to a dense NumPy array.
train_data_features = vectorizer.fit_transform(clean_train_reviews).toarray()

In [46]:
# Expect (number of reviews, vocabulary size).
print(train_data_features.shape)

(25000, 5000)


In [47]:
# Vocabulary terms, in the same order as the columns of the feature matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()  # ndarray -> plain list, as before
else:
    vocab = vectorizer.get_feature_names()
print(vocab[:10])

['abandoned', 'abc', 'abilities', 'ability', 'able', 'abraham', 'absence', 'absent', 'absolute', 'absolutely']


In [48]:
# Total number of occurrences of each vocabulary word across all training
# reviews (column-wise sum of the feature matrix); show the first ten.
dist = train_data_features.sum(axis=0)
for word, occurrences in zip(vocab[:10], dist[:10]):
    print(occurrences, word)

187 abandoned
125 abc
108 abilities
454 ability
1259 able
85 abraham
116 absence
83 absent
352 absolute
1485 absolutely


## Model

In [49]:
# random forest
# Random forest with 100 trees, trained on the bag-of-words counts.
# random_state pins the forest's internal randomness so that re-running the
# notebook from a fresh kernel reproduces the same model and predictions.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
forest = forest.fit(train_data_features, train['sentiment'])

In [50]:
# naive bayes
# Multinomial Naive Bayes on the same count features. fit() returns the
# estimator itself, so construction and training are chained.
nb_classifier = MultinomialNB().fit(train_data_features, train['sentiment'])

## Prediction

In [51]:
# Load the test reviews (tab-separated, header row, quote characters kept as text).
test = pd.read_csv('testData.tsv', sep="\t", header=0, quoting=3)

num_reviews = len(test['review'])

# Clean every test review with review_to_words.
clean_test_reviews = [review_to_words(test['review'][i]) for i in range(num_reviews)]

# Reuse the already-fitted vectorizer (transform, not fit_transform), then
# convert the sparse result to a dense array.
test_data_features = vectorizer.transform(clean_test_reviews).toarray()

In [52]:
# Predict sentiment for the test reviews with both fitted models.
result_forest, result_nbc = (
    model.predict(test_data_features) for model in (forest, nb_classifier)
)

# One output frame per model: review id alongside the predicted label.
output_forest = pd.DataFrame({"id": test["id"], "sentiment": result_forest})
output_nbc = pd.DataFrame({"id": test["id"], "sentiment": result_nbc})

In [53]:
# Preview the first rows of the Naive Bayes predictions.
output_nbc.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",1
4,"""12128_7""",1


In [39]:
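# Optional: write the prediction frames to CSV files. Left commented out; the paths below
# are specific to the author's machine, so adjust the output directory before enabling.
# output_forest.to_csv("C:\\Users\\yukic\\Documents\\kaggle\\popcorn\\submission_190913_forest.csv", index = False, quoting = 3)
# output_nbc.to_csv("C:\\Users\\yukic\\Documents\\kaggle\\popcorn\\submission_190913_nbc.csv", index = False, quoting = 3)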