In [118]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier

## Load the Data

In [54]:
def load_data(data_dir, train=True):
    """Load one split of the review corpus into a DataFrame.

    Reads every ``<id>_<score>.txt`` file found in the ``pos`` and ``neg``
    sub-folders of ``<data_dir>/train`` (or ``<data_dir>/test``).

    Parameters
    ----------
    data_dir : str
        Root folder that contains the ``train`` and ``test`` directories.
    train : bool, default True
        Truthy selects the ``train`` split, falsy selects ``test``.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``id``, ``score``, ``review`` and
        ``sentiment``. ``id`` and ``score`` are the strings parsed from the
        file name; ``sentiment`` is ``"pos"`` or ``"neg"`` (the folder name).
        Rows are ordered ``pos`` first, then ``neg``, each sorted by file name.

    Raises
    ------
    FileNotFoundError
        If the expected split/sentiment directory does not exist.
    """
    split_dir = os.path.join(data_dir, "train" if train else "test")

    records = []
    for sentiment in ("pos", "neg"):
        sentiment_dir = os.path.join(split_dir, sentiment)

        # os.listdir returns entries in arbitrary order; sorting makes the
        # resulting row order reproducible across machines and runs.
        for file_name in sorted(os.listdir(sentiment_dir)):
            stem, extension = os.path.splitext(file_name)
            name_parts = stem.split("_")
            # Skip stray entries (.DS_Store, Thumbs.db, ...) that would either
            # raise IndexError or add a junk row to the frame.
            if extension != ".txt" or len(name_parts) < 2:
                continue

            with open(os.path.join(sentiment_dir, file_name), encoding="utf-8") as f:
                review = f.read()

            records.append((name_parts[0], name_parts[1], review, sentiment))

    # Build the frame once from the collected records; passing the column
    # names explicitly keeps the schema stable even when no file was found.
    return pd.DataFrame(records, columns=["id", "score", "review", "sentiment"])

In [55]:
# Read both corpus splits from the local dataset folder and report their shapes.
data_dir = "aclImdb/"
train_df, test_df = (load_data(data_dir, train=is_train) for is_train in (True, False))
for split_df in (train_df, test_df):
    print(split_df.shape)

(25000, 4)
(25000, 4)


## EDA

In [56]:
# Preview the first three rows of the training split.
train_df.head(n=3)

Unnamed: 0,id,score,review,sentiment
0,0,9,Bromwell High is a cartoon comedy. It ran at t...,pos
1,10000,8,Homelessness (or Houselessness as George Carli...,pos
2,10001,10,Brilliant over-acting by Lesley Ann Warren. Be...,pos


In [57]:
# Preview the first three rows of the test split.
test_df.head(n=3)

Unnamed: 0,id,score,review,sentiment
0,0,10,I went and saw this movie last night after bei...,pos
1,10000,7,Actor turned director Bill Paxton follows up h...,pos
2,10001,9,As a recreational golfer with some knowledge o...,pos


In [59]:
# Encode the sentiment strings as integer class labels (pos -> 1, neg -> 0).
# The identity entries for 1 and 0 make this cell safe to re-run: without them
# a second execution would map the already-encoded integers to NaN.
SENTIMENT_LABELS = {"pos": 1, "neg": 0, 1: 1, 0: 0}

for split_df in (train_df, test_df):
    encoded = split_df["sentiment"].map(SENTIMENT_LABELS)
    # Fail loudly on an unexpected label instead of training on NaN targets.
    if encoded.isna().any():
        raise ValueError("sentiment column contains values other than 'pos'/'neg'")
    split_df["sentiment"] = encoded.astype("int64")

In [61]:
# Class balance of the training labels.
train_df["sentiment"].value_counts()

0    12500
1    12500
Name: sentiment, dtype: int64

In [91]:
# Class balance of the test labels.
test_df["sentiment"].value_counts()

0    12500
1    12500
Name: sentiment, dtype: int64

## Vectorize the Data

In [114]:
# Bag-of-words features with English stop words removed. The vectorizer is
# fitted on the training reviews only and then applied unchanged to the test
# reviews, so both matrices share the same columns.
corpus_train, corpus_test = train_df["review"], test_df["review"]

vectorizer = CountVectorizer(stop_words="english")
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)

## Model Building

### Baseline
We will use a Gaussian Naive Bayes classifier as our baseline approach.

In [115]:
def _as_dense_array(features):
    """Return ``features`` as a dense ``numpy.ndarray``.

    Sparse matrices are converted with ``toarray()``, which yields an
    ``ndarray``; ``todense()`` would yield a ``numpy.matrix``, which recent
    scikit-learn releases refuse as estimator input. Anything that is already
    dense is passed through ``np.asarray`` so the cell can be re-run safely.
    """
    if hasattr(features, "toarray"):
        return features.toarray()
    return np.asarray(features)


# NOTE: densifying the full bag-of-words matrix is memory hungry; it is done
# here because the baseline below is fitted on dense input.
X_train = _as_dense_array(X_train)
X_test = _as_dense_array(X_test)
y_train = train_df["sentiment"]
y_test = test_df["sentiment"]

In [116]:
# Baseline: fit Gaussian Naive Bayes on the training features, then predict
# a label for every test review.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [117]:
# Baseline accuracy: share of test reviews whose predicted label matches the true one.
is_correct = y_test == y_pred
print(is_correct.sum() / len(is_correct))

0.56944


### Gradient Boosting

In [120]:
# Gradient boosting over 100 depth-1 trees (decision stumps); the fixed
# random_state keeps the fit reproducible. The last line shows test accuracy.
clf = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
).fit(X_train, y_train)
clf.score(X_test, y_test)

0.82544