# Sentiment Analysis on IMDB Movie Reviews with TF-IDF Text Embedding

## Loading Libraries

In [None]:
!pip install git+https://github.com/troyhunterz/preprocess_tr.git

In [2]:
import pandas as pd
import numpy as np

# scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# custom library
import preprocess_tr as ps

# pickle
import pickle as pkl

# warnings
import warnings
warnings.filterwarnings('ignore')

## Loading Data

In [3]:
df = pd.read_csv('dataset/imdb_reviews.txt', sep='\t', header=None)

## Text Preprocessing

In [4]:
df.columns = ['reviews', 'sentiment']
df.head()

Unnamed: 0,reviews,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [5]:
df['reviews'] = df['reviews'].apply(lambda x: ps.contraction_to_expansion(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.rm_accented_chars(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.rm_html(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.remove_emails(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.remove_urls(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.rm_special_chars(x))
df['reviews'] = df['reviews'].apply(lambda x: ps.lemmatize(x))
df['reviews'] = df['reviews'].apply(lambda x: str(x).lower())

In [6]:
df.head(3)

Unnamed: 0,reviews,sentiment
0,a very very very slowmove aimless movie about ...,0
1,not sure who be more lose the flat character...,0
2,attempt artiness with black white and clever...,0


## Data Preparation for Model Training

In [7]:
X = df['reviews']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

X_train.shape, X_test.shape

((598,), (150,))

## ML Model Building

In [13]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='liblinear'))
])

In [17]:
hyperparameters = {
    'tfidf__max_df': (.5, 1.),
    'tfidf__ngram_range': ((1,1),(1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'char', 'char_wb'),
    
    'clf__penalty': ('l2', 'l1'),
    'clf__C': (1,2)
}

In [18]:
clf = GridSearchCV(pipe, hyperparameters, n_jobs=-1, cv=None)

In [21]:
%%time
clf.fit(X_train, y_train)

CPU times: total: 547 ms
Wall time: 5.75 s


In [22]:
print(f'{clf.best_score_}\n\n{clf.best_estimator_}')

0.7792296918767507

Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5)),
                ('clf', LogisticRegression(C=2, solver='liblinear'))])


In [23]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.71      0.73        73
           1       0.74      0.77      0.75        77

    accuracy                           0.74       150
   macro avg       0.74      0.74      0.74       150
weighted avg       0.74      0.74      0.74       150



## SVM Model

In [26]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ( 'clf', LinearSVC())
])

In [27]:
hyperparameters = {
    'tfidf__max_df': (.5, 1.),
    'tfidf__ngram_range': ((1,1), (1,2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'chars', 'char_wb'),

    'clf__C': (1,2, 2.5,3)
}

In [28]:
clf = GridSearchCV(pipe, hyperparameters, n_jobs=-1, cv=None)

In [29]:
%%time
clf.fit(X_train, y_train)

CPU times: total: 359 ms
Wall time: 1.27 s


In [30]:
clf.best_estimator_, clf.best_params_

(Pipeline(steps=[('tfidf', TfidfVectorizer(max_df=0.5)),
                 ('clf', LinearSVC(C=1))]),
 {'clf__C': 1,
  'tfidf__analyzer': 'word',
  'tfidf__max_df': 0.5,
  'tfidf__ngram_range': (1, 1),
  'tfidf__use_idf': True})

In [31]:
clf.best_score_

0.7842296918767507

In [32]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.75      0.74        73
           1       0.76      0.73      0.74        77

    accuracy                           0.74       150
   macro avg       0.74      0.74      0.74       150
weighted avg       0.74      0.74      0.74       150



## Model Testing and Saving

In [33]:
x = ['This is a great movie. I loved it', 'i have watched this movie. plot is straight. return my money']
clf.predict(x)

array([1, 0], dtype=int64)

In [34]:
pkl.dump(clf, open('model/model.pkl', 'wb'))