# Sentiment Analysis on IMDB Movie Reviews with TF-IDF Text Embedding

## Loading Libraries

In [None]:
# Install the custom text-preprocessing helper package (imported below as
# `preprocess_tr`) directly from its GitHub repository.
!pip install git+https://github.com/troyhunterz/preprocess_tr.git

In [None]:
import pandas as pd
import numpy as np

# scikit-learn
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# custom library
import preprocess_tr as ps

# pickle
import pickle as pkl

# warnings
import warnings
# Suppress every warning category for the rest of the session
# (presumably to keep the notebook output uncluttered).
warnings.filterwarnings('ignore')

## Loading Data

In [None]:
# Read the tab-separated reviews file. It is loaded without a header row, so
# the columns start out with integer labels.
df = pd.read_csv(
    'dataset/imdb_reviews.txt',
    sep='\t',
    header=None,
)

## Text Preprocessing

In [None]:
# Give the two columns descriptive names, then preview the first rows.
df.columns = [
    'reviews',    # review text
    'sentiment',  # target label
]
df.head()

In [None]:
# Ordered text-cleaning steps. Each one takes a single review and returns the
# transformed review; they are applied to the whole column one after another,
# with lowercasing last.
cleaning_steps = (
    ps.contraction_to_expansion,
    ps.rm_accented_chars,
    ps.rm_html,
    ps.remove_emails,
    ps.remove_urls,
    ps.rm_special_chars,
    ps.lemmatize,
    lambda text: str(text).lower(),
)

for step in cleaning_steps:
    df['reviews'] = df['reviews'].apply(step)

In [None]:
# Inspect the first three rows after preprocessing.
df.head(n=3)

## Data Preparation for Model Training

In [None]:
# Features are the review texts; targets are the sentiment labels.
X, y = df['reviews'], df['sentiment']

# Hold out 20% of the rows for testing. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
    stratify=y,
)

X_train.shape, X_test.shape

## ML Model Building

In [None]:
# TF-IDF vectorization followed by logistic regression.
# Each pipeline step must be its own (name, estimator) tuple. The step names
# 'tfidf' and 'clf' are the prefixes used by the grid-search parameter keys
# ('tfidf__...', 'clf__...').
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(solver='liblinear')),
])

In [None]:
# Search space for the grid search. Keys follow the
# '<step name>__<parameter>' convention, so every prefix must match a pipeline
# step name exactly ('tfidf' or 'clf'); a misspelled prefix is rejected as an
# invalid parameter when the search is fitted.
hyperparameters = {
    # TF-IDF vectorizer options
    'tfidf__max_df': (.5, 1.),
    'tfidf__ngram_range': ((1, 1), (1, 2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'char', 'char_wb'),

    # classifier options
    'clf__penalty': ('l2', 'l1'),
    'clf__C': (1, 2),
}

In [None]:
# Exhaustive search over the parameter grid using all available CPU cores.
# cv=None leaves the cross-validation strategy at scikit-learn's default.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=hyperparameters,
    n_jobs=-1,
    cv=None,
)

In [None]:
%%time
clf.fit(X_train, X_test)

In [None]:
# Show the grid search's best score followed by its best estimator.
best_score, best_model = clf.best_score_, clf.best_estimator_
print(f'{best_score}\n\n{best_model}')

In [None]:
# Predict on the held-out reviews and print the classification report.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

## SVM Model

In [None]:
# TF-IDF vectorization followed by a linear support vector classifier.
# Each pipeline step must be its own (name, estimator) tuple. The step names
# 'tfidf' and 'clf' are the prefixes used by the grid-search parameter keys
# ('tfidf__...', 'clf__...').
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

In [None]:
# Search space for the grid search. Keys follow the
# '<step name>__<parameter>' convention, so every prefix must match a pipeline
# step name exactly ('tfidf' or 'clf').
hyperparameters = {
    # TF-IDF vectorizer options. The string analyzers TfidfVectorizer accepts
    # are 'word', 'char' and 'char_wb'.
    'tfidf__max_df': (.5, 1.),
    'tfidf__ngram_range': ((1, 1), (1, 2)),
    'tfidf__use_idf': (True, False),
    'tfidf__analyzer': ('word', 'char', 'char_wb'),

    # classifier options
    'clf__C': (1, 2, 2.5, 3),
}

In [None]:
# Exhaustive search over the parameter grid using all available CPU cores.
# cv=None leaves the cross-validation strategy at scikit-learn's default.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=hyperparameters,
    n_jobs=-1,
    cv=None,
)

In [None]:
%%time
clf.fit(X_train, y_train)

In [None]:
# Show the best estimator together with the parameter combination selected for it.
(clf.best_estimator_, clf.best_params_)

In [None]:
# Best score found by the grid search.
clf.best_score_

In [None]:
# Predict on the held-out reviews and print the classification report.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

## Model Testing and Saving

In [None]:
# Two hand-written reviews used as a quick sanity check of the fitted model.
x = [
    'This is a great movie. I loved it',
    'i have watched this movie. plot is straight. return my money',
]
clf.predict(x)

In [None]:
# Persist the fitted grid-search object to disk. The context manager ensures
# the file handle is flushed and closed even if pickling raises.
# NOTE: the 'model/' directory must already exist.
with open('model/model.pkl', 'wb') as model_file:
    pkl.dump(clf, model_file)