# Sentiment analysis based on BOW and TF-IDF

In [78]:
import pandas as pd

import re

# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [12]:
# Seed passed as random_state to train_test_split so the train/test partition
# is the same on every run.
RANDOM_STATE = 42

## 1. About the data

The data is taken from https://www.kaggle.com/datasets/yasserh/imdb-movie-ratings-sentiment-analysis.
It has two columns:
1. "text" - film reviews from IMDB, string;
2. "label" - int: 0 if the review was negative, 1 if the review was positive.

In [13]:
# Load the reviews; the relative path means 'movie.csv' has to be in the
# current working directory.
reviews = pd.read_csv('movie.csv')

In [14]:
reviews.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## 2. Data preprocessing

In [15]:
def prepare_stopwords(language='english'):
    """
    Builds the stop word list for a given language.
    For English the negations 'not' and 'no' are taken out of the list, so
    they are not treated as stop words.
    param language: name of the language passed to the NLTK stopwords corpus.
    return: a list of stop words.
    """
    words = stopwords.words(language)
    if language != 'english':
        return words
    # Negations are dropped from the stop word list only for English.
    for negation in ('not', 'no'):
        words.remove(negation)
    return words

In [16]:
def preprocess_string(text):
    """
    Preprocesses the text string in English for further BOW or TF-IDF analysis.

    The steps are applied in this order:
    1. every run of characters other than ASCII letters and digits is replaced
       with a single space (punctuation is dropped, numbers are kept);
    2. the text is lower-cased;
    3. the text is tokenized with word_tokenize;
    4. stop words are removed (the list comes from prepare_stopwords);
    5. every remaining token is lemmatized with WordNetLemmatizer;
    6. the tokens are joined back together with single spaces.

    param text: string to preprocess.
    return: a preprocessed string.
    """

    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
    text = text.lower()

    tokens = word_tokenize(text)

    # A set gives constant-time membership checks; testing against the list
    # returned by prepare_stopwords would scan it once per token.
    stop_words = set(prepare_stopwords())
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

In [17]:
# Clean every review once; the vectorizers further down are fitted on this
# column rather than on the raw text.
reviews['preprocessed_text'] = reviews['text'].apply(preprocess_string)

In [18]:
reviews.head()

Unnamed: 0,text,label,preprocessed_text
0,I grew up (b. 1965) watching and loving the Th...,0,grew b 1965 watching loving thunderbird mate s...
1,"When I put this movie in my DVD player, and sa...",0,put movie dvd player sat coke chip expectation...
2,Why do people who do not know what a particula...,0,people not know particular time past like feel...
3,Even though I have great interest in Biblical ...,0,even though great interest biblical movie bore...
4,Im a die hard Dads Army fan and nothing will e...,1,im die hard dad army fan nothing ever change g...


## 3. Splitting the data into train and test parts

In [19]:
# Target: the sentiment label. Features: every other column of the frame.
y = reviews['label']
X = reviews.drop(columns=['label'])

In [20]:
# Hold out 33% of the rows for testing; the split is seeded and stratified
# on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=RANDOM_STATE,
    stratify=y,
)

## 4. Text vectorizing

### 4.1. BOW vectorizing

In [21]:
bow_vect = CountVectorizer()

In [22]:
# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that already-fitted vectorizer.
X_train_bow = bow_vect.fit_transform(X_train['preprocessed_text'])
X_test_bow = bow_vect.transform(X_test['preprocessed_text'])

### 4.2. TF-IDF vectorizing

In [23]:
tfidf_vect = TfidfVectorizer()

In [24]:
# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that already-fitted vectorizer.
X_train_tfidf = tfidf_vect.fit_transform(X_train['preprocessed_text'])
X_test_tfidf = tfidf_vect.transform(X_test['preprocessed_text'])

## 5. ML models

### 5.1. Naive Bayes classifier

In [25]:
nb_clf = MultinomialNB()

#### 5.1.1. BOW Naive Bayes

In [26]:
nb_clf.fit(X_train_bow, y_train)

MultinomialNB()

In [27]:
y_pred_bow_nb = nb_clf.predict(X_test_bow)

In [28]:
accuracy_score(y_test, y_pred_bow_nb) # 0.8556818181818182

0.8556818181818182

#### 5.1.2. TF-IDF Naive Bayes

In [29]:
nb_clf.fit(X_train_tfidf, y_train)

MultinomialNB()

In [30]:
y_pred_tfidf_nb = nb_clf.predict(X_test_tfidf)

In [31]:
accuracy_score(y_test, y_pred_tfidf_nb) # 0.8642424242424243

0.8642424242424243

### 5.2. Logistic regression

#### 5.2.1. TF-IDF logistic regression

In [32]:
log_clf = LogisticRegression()

In [33]:
log_clf.fit(X_train_tfidf, y_train)

LogisticRegression()

In [34]:
y_pred_tfidf_log = log_clf.predict(X_test_tfidf)

In [35]:
accuracy_score(y_test, y_pred_tfidf_log) # 0.89 - best result

0.89

### 5.3. Random forest

In [36]:
# Seed the forest with the same constant as the train/test split; an unseeded
# forest is built differently on every run, so its score cannot be reproduced.
# NOTE: the accuracies recorded in this notebook came from an unseeded run and
# may differ slightly from what a seeded run reports.
rf_clf = RandomForestClassifier(random_state=RANDOM_STATE)

#### 5.3.1. BOW random forest

In [37]:
rf_clf.fit(X_train_bow, y_train)

RandomForestClassifier()

In [38]:
y_pred_bow_rf = rf_clf.predict(X_test_bow)

In [40]:
accuracy_score(y_test, y_pred_bow_rf) # 0.8567424242424242

0.8567424242424242

#### 5.3.2. TF-IDF random forest

In [41]:
rf_clf.fit(X_train_tfidf, y_train)

RandomForestClassifier()

In [42]:
y_pred_tfidf_rf = rf_clf.predict(X_test_tfidf)

In [44]:
accuracy_score(y_test, y_pred_tfidf_rf) # 0.8528787878787879

0.8528787878787879