In [5]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
file_path = "./IMDB Dataset.csv"

In [7]:
# eda

import pandas as pd
df = pd.read_csv(file_path)
print("columns: ", df.columns)
print(df.head())
print(df.sentiment.value_counts)
print(len(df))

columns:  Index(['review', 'sentiment'], dtype='object')
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
<bound method IndexOpsMixin.value_counts of 0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object>
50000


In [8]:
# shuffle and create train and test df
df = df.sample(frac=1).reset_index(drop=True)
df_train = df[:45000]
df_test = df[45000:]

In [10]:
print(len(df_test))

5000


In [14]:
# Processing and tokenizing the dataset

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import re
from functools import partial

stopwords_list = stopwords.words("english")
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def tokenizer(s, type="l"):
    # remove the <br >
    s = re.sub("<[^>]*>", " ", s)
    # remove the https
    s = re.sub("https\S+\s", " ", s)
    words_list = word_tokenize(s)
    # words_list = [w for w in words_list if w not in stopwords_list]
    if type == "l":
        res_words = [lemmatizer.lemmatize(w) for w in words_list]
    else:
        res_words = [stemmer.stem(w) for w in words_list]

    return res_words


from functools import partial
lemmatize_tokenizer = partial(tokenizer, type="l")
stem_tokenizer = partial(tokenizer, type="s")

# testing
stem_tokenizer("Hi, how is it going? https:www.hello.com <br> hello <\br>")

['hi', ',', 'how', 'is', 'it', 'go', '?', 'hello']

In [17]:
# pipeline creation

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, recall_score, make_scorer

pipe = Pipeline([
    ("vect", TfidfVectorizer(tokenizer=tokenizer)),
    ("clf", LogisticRegression(solver='liblinear')) # as the dataset is comparatively small
])


param_search = [
    {
        "vect__tokenizer": [lemmatize_tokenizer,stem_tokenizer],
        "clf__C": [1.0, 10.0]
    }
]

gs = GridSearchCV(pipe, param_search, cv=5,
                  scoring = make_scorer(f1_score, average="macro"))

def find_optimal_params(x_train, y_train):
    gs.fit(x_train, y_train)
    print(gs.best_params_)
    best_classifier = gs.best_estimator_
    return best_classifier


In [24]:
def predict(classifier, x):
    # res = preprocess(input_str)
    if isinstance(x, str):
        x = [x]
    probs = classifier.predict_proba(x)
    pred_sentiment = classifier.predict(x)
    print(probs, pred_sentiment)
    return pred_sentiment

In [19]:
x_train, y_train = df_train.review.values, df_train.sentiment.values

best_classifier = find_optimal_params(x_train, y_train)
classifier = best_classifier



{'clf__C': 10.0, 'vect__tokenizer': functools.partial(<function tokenizer at 0x7ada9fae3f40>, type='l')}


In [26]:
res = predict(classifier, ["hi how are you?", "I am delighted to meet you"])
print(res)

[[0.4890404  0.5109596 ]
 [0.34317481 0.65682519]] ['positive' 'positive']
['positive' 'positive']


In [39]:
from sklearn.metrics import classification_report
pred_sentiment = classifier.predict(df_test.review.values)
print(classification_report(df_test.sentiment.values, pred_sentiment, digits=3))

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      2466
    positive       0.89      0.91      0.90      2534

    accuracy                           0.90      5000
   macro avg       0.90      0.90      0.90      5000
weighted avg       0.90      0.90      0.90      5000



In [22]:
print(classifier)

Pipeline(steps=[('vect',
                 TfidfVectorizer(tokenizer=functools.partial(<function tokenizer at 0x7ada9fae3f40>, type='l'))),
                ('clf', LogisticRegression(C=10.0, solver='liblinear'))])


In [40]:
print(classification_report(df_test.sentiment.values, pred_sentiment, digits=3))

              precision    recall  f1-score   support

    negative      0.909     0.886     0.897      2466
    positive      0.892     0.914     0.903      2534

    accuracy                          0.900      5000
   macro avg      0.900     0.900     0.900      5000
weighted avg      0.900     0.900     0.900      5000

