In [1]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

In [None]:
nltk.download()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##  Read data and Data analysis

In [3]:
imdb_data = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/ml_tasks/data/imbd_dataset.csv')
print(imdb_data.shape)
imdb_data.head(5)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
imdb_data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

## Text preprocessing

In [5]:
# Removing brackets and html strips 

def html_strip(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def denoise_text(text):
    text = html_strip(text)
    text = brackets(text)
    return text

imdb_data['review'] = imdb_data['review'].apply(denoise_text)

In [6]:
# Stemming the text

def stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

imdb_data['review'] = imdb_data['review'].apply(stemmer)

In [7]:
imdb_data.head(5)

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl production. the film techniqu i...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there' a famili where a littl boy (jake)...,negative
4,"petter mattei' ""love in the time of money"" is ...",positive


In [10]:
# Tokenization the text

tokenizer = ToktokTokenizer()

# Setting english stopwords

stopword_list = nltk.corpus.stopwords.words('english')
stop = set(stopwords.words('english'))

# Removing the stopwords

def remove_stopwords(text):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]

    cleaned_tokens = [token for token in tokens if token.lower() not in stopword_list]
    cleaned_text = ' '.join(cleaned_tokens)

    return cleaned_text

imdb_data['review'] = imdb_data['review'].apply(remove_stopwords)


## Split by train and test

In [56]:

train_reviews = imdb_data.review[:4000]
train_sentiments = imdb_data.sentiment[:4000]

test_reviews = imdb_data.review[4000:5000]
test_sentiments = imdb_data.sentiment[4000:5000]

print(train_reviews.shape, train_sentiments.shape)
print(test_reviews.shape, test_sentiments.shape)


norm_train_reviews = imdb_data.review[:4000]
norm_test_reviews = imdb_data.review[4000:5000]

(4000,) (4000,)
(1000,) (1000,)


## Bag-of-words model

In [57]:

cv = CountVectorizer(min_df=0, max_df=1, ngram_range=(1, 3))

cv_train_reviews = cv.fit_transform(norm_train_reviews)
cv_test_reviews = cv.transform(norm_test_reviews)

print('Bag_of_words_cv_train:', cv_train_reviews.shape)
print('Bag_of_words_cv_test:', cv_test_reviews.shape)



Bag_of_words_cv_train: (4000, 814250)
Bag_of_words_cv_test: (1000, 814250)


In [58]:
lb = LabelBinarizer()
sentiment_data = lb.fit_transform(imdb_data['sentiment'])

train_sentiments = sentiment_data[:4000]
test_sentiments = sentiment_data[4000:5000]
print(sentiment_data.shape)

(50000, 1)


## Logistic regression model

In [59]:
lr_model = LogisticRegression()

grid_search = GridSearchCV(estimator = lr_model,  
                           param_grid = [{'C' : [0.1, 0.5, 1, 2, 3]}],
                           scoring = 'f1_macro',
                           cv = 3,
                           verbose=0)


grid_search.fit(cv_train_reviews, train_sentiments.ravel())

print(grid_search.best_params_)

{'C': 0.1}


In [60]:
lr_model = LogisticRegression(penalty='l2', C=0.1)
lr = lr_model.fit(cv_train_reviews, train_sentiments.ravel())
lr_predict = lr_model.predict(cv_test_reviews)

In [61]:
lr_score = accuracy_score(test_sentiments,lr_predict)
print("lr_score :", lr_score)

lr_score : 0.706


In [62]:
lr_report = classification_report(test_sentiments,lr_predict,target_names=['Positive','Negative'])
print(lr_report)

              precision    recall  f1-score   support

    Positive       0.67      0.84      0.74       505
    Negative       0.77      0.57      0.66       495

    accuracy                           0.71      1000
   macro avg       0.72      0.70      0.70      1000
weighted avg       0.72      0.71      0.70      1000



In [63]:
cm = confusion_matrix(test_sentiments,lr_predict,labels=[1,0])
print(cm)


[[284 211]
 [ 83 422]]


## SVM

In [64]:
svm_model = SVC()

grid_search = GridSearchCV(estimator = svm_model,  
                           param_grid = [{'kernel': ['rbf', 'poly', 'linear'],
                                          'C': [1, 2, 3, 4]}],
                           scoring = 'f1_macro',
                           cv = 3,
                           verbose=0)

grid_search.fit(cv_train_reviews, train_sentiments.ravel())

print(grid_search.best_params_)


{'C': 2, 'kernel': 'rbf'}


In [65]:
svm_model = SVC(kernel='rbf', C=2)
svm = svm_model.fit(cv_train_reviews, train_sentiments.ravel())
svm_predict = svm_model.predict(cv_test_reviews)

In [66]:
svm_score = accuracy_score(test_sentiments, svm_predict)
print("svm_score :", svm_score)

svm_score : 0.629


In [67]:
svm_report=classification_report(test_sentiments,svm_predict,target_names=['Positive','Negative'])
print(svm_report)

              precision    recall  f1-score   support

    Positive       0.83      0.33      0.48       505
    Negative       0.58      0.93      0.71       495

    accuracy                           0.63      1000
   macro avg       0.70      0.63      0.59      1000
weighted avg       0.71      0.63      0.59      1000



In [68]:
cm = confusion_matrix(test_sentiments, svm_predict, labels=[1,0])
print(cm)

[[461  34]
 [337 168]]
