## Importing Data

In [1]:
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth', 100)

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_excel('IMDB_dataset.xlsx')

## Preprocessing Text Data(Removing punctuation, Performing Tokenization, Removing stopwords and Lemmatize)

In [2]:
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stopwords]
    return text

In [5]:
nltk.download('wordnet')
wn = nltk.WordNetLemmatizer()
def lemmatizing(tokenized_text): 
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

data['body_text_lemmatized'] = data['review'].apply(lambda x: lemmatizing(x))

data.head(10)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


Unnamed: 0,review,sentiment,body_text_lemmatized
0,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive,"[I, , t, h, o, u, g, h, t, , t, h, i, s, , w, a, s, , a, , w, o, n, d, e, r, f, u, l, , w,..."
1,"Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive,"[P, r, o, b, a, b, l, y, , m, y, , a, l, l, -, t, i, m, e, , f, a, v, o, r, i, t, e, , m, o,..."
2,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,positive,"[I, , s, u, r, e, , w, o, u, l, d, , l, i, k, e, , t, o, , s, e, e, , a, , r, e, s, u, r,..."
3,"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 o...",negative,"[T, h, i, s, , s, h, o, w, , w, a, s, , a, n, , a, m, a, z, i, n, g, ,, , f, r, e, s, h, ,..."
4,Encouraged by the positive comments about this film on here I was looking forward to watching th...,negative,"[E, n, c, o, u, r, a, g, e, d, , b, y, , t, h, e, , p, o, s, i, t, i, v, e, , c, o, m, m, e,..."
5,Phil the Alien is one of those quirky films where the humour is based around the oddness of ever...,negative,"[P, h, i, l, , t, h, e, , A, l, i, e, n, , i, s, , o, n, e, , o, f, , t, h, o, s, e, , q,..."
6,I saw this movie when I was about 12 when it came out. I recall the scariest scene was the big b...,negative,"[I, , s, a, w, , t, h, i, s, , m, o, v, i, e, , w, h, e, n, , I, , w, a, s, , a, b, o, u,..."
7,So im not a big fan of Boll's work but then again not many are. I enjoyed his movie Postal (mayb...,negative,"[S, o, , i, m, , n, o, t, , a, , b, i, g, , f, a, n, , o, f, , B, o, l, l, ', s, , w, o,..."
8,This a fantastic movie of three prisoners who become famous. One of the actors is george clooney...,positive,"[T, h, i, s, , a, , f, a, n, t, a, s, t, i, c, , m, o, v, i, e, , o, f, , t, h, r, e, e, ,..."
9,This movie made it into one of my top 10 most awful movies. Horrible. <br /><br />There wasn't a...,negative,"[T, h, i, s, , m, o, v, i, e, , m, a, d, e, , i, t, , i, n, t, o, , o, n, e, , o, f, , m,..."


## TFIDF Vectorization

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['body_text_lemmatized'])
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names())

(25000, 94469)


## Exploring parameter settings using GridSearchCV on Random Forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
from sklearn.model_selection import KFold, cross_val_score

In [9]:
rf = RandomForestClassifier(n_jobs=-1)
k_fold = KFold(n_splits=5)
cross_val_score(rf, X_tfidf, data['sentiment'], cv=k_fold, scoring='accuracy', n_jobs=-1)

array([0.844 , 0.8516, 0.8392, 0.8468, 0.8454])

In [10]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, data['sentiment'], test_size=0.2)

In [12]:
from sklearn.ensemble import RandomForestClassifier

In [13]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='positive', average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3),
        round((y_pred==y_test).sum() / len(y_pred), 3)))

In [14]:
for n_est in [10, 50, 100]:
    for depth in [10, 20, 30, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 0.687 / Recall: 0.759 / Accuracy: 0.708
Est: 10 / Depth: 20 ---- Precision: 0.722 / Recall: 0.785 / Accuracy: 0.743
Est: 10 / Depth: 30 ---- Precision: 0.737 / Recall: 0.764 / Accuracy: 0.746
Est: 10 / Depth: None ---- Precision: 0.79 / Recall: 0.669 / Accuracy: 0.746
Est: 50 / Depth: 10 ---- Precision: 0.794 / Recall: 0.856 / Accuracy: 0.817
Est: 50 / Depth: 20 ---- Precision: 0.81 / Recall: 0.844 / Accuracy: 0.824
Est: 50 / Depth: 30 ---- Precision: 0.815 / Recall: 0.854 / Accuracy: 0.831
Est: 50 / Depth: None ---- Precision: 0.84 / Recall: 0.831 / Accuracy: 0.837
Est: 100 / Depth: 10 ---- Precision: 0.802 / Recall: 0.869 / Accuracy: 0.828
Est: 100 / Depth: 20 ---- Precision: 0.827 / Recall: 0.867 / Accuracy: 0.843
Est: 100 / Depth: 30 ---- Precision: 0.828 / Recall: 0.864 / Accuracy: 0.843
Est: 100 / Depth: None ---- Precision: 0.847 / Recall: 0.841 / Accuracy: 0.845


Here we can see that, for higher number of estimators and higher depth, we are getting higher precision and accuracy scores.

## Exploring parameter settings using GridSearchCV on Gradient Boosting

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

In [16]:
def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='positive', average='binary')
    print('Est: {} / Depth: {} / LR: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        est, max_depth, lr, round(precision, 3), round(recall, 3), 
        round((y_pred==y_test).sum()/len(y_pred), 3)))

In [19]:
for n_est in [50, 100, 150]:
    for max_depth in [3, 7, 11, 15]:
        for lr in [0.01, 0.1, 1]:
            train_GB(n_est, max_depth, lr)

Est: 50 / Depth: 3 / LR: 0.01 ---- Precision: 0.646 / Recall: 0.887 / Accuracy: 0.702
Est: 50 / Depth: 3 / LR: 0.1 ---- Precision: 0.745 / Recall: 0.871 / Accuracy: 0.787
Est: 50 / Depth: 3 / LR: 1 ---- Precision: 0.826 / Recall: 0.842 / Accuracy: 0.833
Est: 50 / Depth: 7 / LR: 0.01 ---- Precision: 0.693 / Recall: 0.864 / Accuracy: 0.741
Est: 50 / Depth: 7 / LR: 0.1 ---- Precision: 0.787 / Recall: 0.865 / Accuracy: 0.816
Est: 50 / Depth: 7 / LR: 1 ---- Precision: 0.815 / Recall: 0.848 / Accuracy: 0.828
Est: 50 / Depth: 11 / LR: 0.01 ---- Precision: 0.707 / Recall: 0.852 / Accuracy: 0.75
Est: 50 / Depth: 11 / LR: 0.1 ---- Precision: 0.802 / Recall: 0.853 / Accuracy: 0.822
Est: 50 / Depth: 11 / LR: 1 ---- Precision: 0.798 / Recall: 0.829 / Accuracy: 0.811
Est: 50 / Depth: 15 / LR: 0.01 ---- Precision: 0.722 / Recall: 0.831 / Accuracy: 0.756
Est: 50 / Depth: 15 / LR: 0.1 ---- Precision: 0.8 / Recall: 0.837 / Accuracy: 0.815
Est: 50 / Depth: 15 / LR: 1 ---- Precision: 0.788 / Recall: 0.822

KeyboardInterrupt: 

Here we can see that, for learning rate 0.1 and depth greater than 3, we are getting higher recall,precision and accuracy scores.

## Final evaluation of Random Forest model

In [17]:
from sklearn.model_selection import GridSearchCV
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf, data['sentiment'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,253.966734,4.322762,19.728348,3.038647,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.859,0.8662,0.858,0.8576,0.8548,0.85912,0.003804,1
11,208.418265,2.674954,0.684553,0.37075,,300,"{'max_depth': None, 'n_estimators': 300}",0.8576,0.8712,0.852,0.854,0.858,0.85856,0.006706,2
5,181.733651,0.800259,19.444474,0.672855,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.8512,0.8638,0.8538,0.8528,0.8592,0.85616,0.004671,3
7,134.376523,0.218176,13.104564,0.837054,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.8508,0.868,0.8566,0.8472,0.8528,0.85508,0.007139,4
2,34.528992,16.273212,4.603938,7.147741,30.0,300,"{'max_depth': 30, 'n_estimators': 300}",0.8546,0.8576,0.8502,0.8506,0.848,0.8522,0.003439,5


## Final evaluation of Gradient boosting model 

In [18]:
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [50, 100, 150], 
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

clf = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_tfidf, data['sentiment'])
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,1137.659728,8.579032,0.110225,0.007835,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_estimators': 150}",0.8438,0.8544,0.8456,0.8456,0.8428,0.84644,0.004123,1
2,761.924079,1.016734,0.113025,0.030253,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 150}",0.8448,0.8548,0.8452,0.8418,0.8426,0.84584,0.00466,2
8,1176.430757,18.934669,0.08782,0.004709,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 150}",0.844,0.8506,0.8398,0.8366,0.8402,0.84224,0.004794,3
4,813.661742,3.986107,0.110425,0.021551,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_estimators': 100}",0.8334,0.8474,0.8404,0.8392,0.84,0.84008,0.004452,4
1,515.549526,3.699668,0.076817,0.01013,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}",0.836,0.8422,0.8356,0.8328,0.8368,0.83668,0.003071,5
