In [1]:
import pandas as pd
import numpy as np

## 1. Import the data

In [2]:
data = pd.read_csv("data/imdb.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## 2. Preprocessing

In [3]:
data["sentiment"] = data["sentiment"].map({"negative":0,"positive":1})
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [4]:
import re
import nltk

# nltk.download('stopwords')

In [5]:
from nltk.corpus import stopwords # Stop word like: the a an
from nltk.stem.porter import PorterStemmer # Root word like: loved->love

In [6]:
sample = 5000

In [7]:
def text_preprocessing(data):
    corpus = []
    for i in range(0, sample):
        review = re.sub('[^a-zA-Z]',' ',data["review"][i]) # Replace puntuation
        review = review.lower() # Make it lowercase
        review = review.split() # Split the word
        ps = PorterStemmer()
        all_stopwords = stopwords.words('english')
        all_stopwords.remove('not')
        review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
        review = " ".join(review)
        corpus.append(review)
    return corpus

In [8]:
corpus = text_preprocessing(data)

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000)
X = cv.fit_transform(corpus).toarray()
y = data["sentiment"][:sample]

## 3. Modelling

In [10]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [23]:
from sklearn.ensemble import RandomForestClassifier

rd_classifier = RandomForestClassifier(n_estimators=100) # Defualt: n_estimators=100
rd_classifier.fit(X_train,y_train)
rd_classifier.score(X_test,y_test)

0.823

In [12]:
from sklearn.naive_bayes import GaussianNB

gs_classifier = GaussianNB()
gs_classifier.fit(X_train,y_train)
y_pred = gs_classifier.predict(X_test)
gs_classifier.score(X_test,y_test)

0.673

In [13]:
from sklearn.linear_model import LogisticRegression

log = LogisticRegression(max_iter=1000)
log.fit(X_train,y_train)
log.score(X_test,y_test)

0.834

In [14]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=8) # Defualt: n_neighbors=5
knn.fit(X_train,y_train)
knn.score(X_test,y_test)

0.602

In [19]:
from sklearn.svm import SVC

svc = SVC(kernel='rbf') # Defualt kernel='rbf'
svc.fit(X_train,y_train)
svc.score(X_test,y_test)

[LibSVM]

0.818

In [16]:
import tensorflow as tf

model = tf.keras.models.Sequential()

model.add(tf.keras.layers.Dense(units=6,activation="relu"))
model.add(tf.keras.layers.Dense(units=1,activation="sigmoid"))

model.compile(optimizer="adam",loss="binary_crossentropy",metrics=["accuracy"]) # Compile the model

early_stopping = tf.keras.callbacks.EarlyStopping(monitor="accuracy",patience=3)

model.fit(x=X_train,y=y_train,batch_size=50,epochs=100,callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100


<tensorflow.python.keras.callbacks.History at 0x7ff8b7b2e580>

## 4. Evaluation

In [17]:
y_pred = model.predict(X_test)>0.5

In [18]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)

0.834