In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found beneath the Kaggle input directory
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv


# Sentiment analysis of the IMDB dataset

I've decided to use some of the basic methods and compare them, namely:
* Several Logistic regression solvers
* K-nearest neighbors
* Naive Bayes
* Extreme gradient boosting

# Importing and preparing the dataset

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
# Read the IMDB reviews CSV into a DataFrame
DATASET_CSV = '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
df = pd.read_csv(DATASET_CSV)

In [4]:
# Positional hold-out split: the first TRAIN_ROWS rows are used for training,
# every remaining row is kept for testing (no shuffling is applied)
TRAIN_ROWS = 24499
train_df = df.iloc[:TRAIN_ROWS]
test_df = df.iloc[TRAIN_ROWS:]

In [5]:
# Pull the review text (inputs) and sentiment labels (targets) out of each split
X_train, y_train = (train_df[col].values for col in ('review', 'sentiment'))
X_test, y_test = (test_df[col].values for col in ('review', 'sentiment'))

# Logistic regression

![](https://miro.medium.com/max/1400/1*dm6ZaX5fuSmuVvM4Ds-vcg.jpeg)

In [6]:
# Convert the reviews to a bag-of-words representation.
# The vocabulary is learned from the training reviews only (fit_transform);
# the test reviews are then mapped onto that same fitted vectorizer (transform).
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

## Different solvers

In [None]:
# Fit one logistic regression model per solver on the bag-of-words features.
# Every model gets the same iteration budget (max_iter=300) so that only the
# solver differs between them.
ncg_model = LogisticRegression(max_iter=300, solver='newton-cg')
# NOTE: despite its name, this model explicitly requests the 'liblinear' solver.
default_model = LogisticRegression(max_iter=300, solver='liblinear')
sag_model = LogisticRegression(max_iter=300, solver='sag')
# Use the SAGA solver here: requesting 'newton-cg' again would make this model a
# duplicate of ncg_model and the reported "SAGA" accuracy meaningless.
saga_model = LogisticRegression(max_iter=300, solver='saga')

ncg_model.fit(X_train_vectors, y_train)
default_model.fit(X_train_vectors, y_train)
sag_model.fit(X_train_vectors, y_train)
saga_model.fit(X_train_vectors, y_train)

In [None]:
# Score every logistic regression variant on the held-out bag-of-words features,
# in the same order the models were fitted
ncg_accuracy, def_accuracy, sag_accuracy, saga_accuracy = (
    model.score(X_test_vectors, y_test)
    for model in (ncg_model, default_model, sag_model, saga_model)
)

# Report each solver's test accuracy to three decimals
print(f'Newton-CG accuracy: {ncg_accuracy:.3f}')
print(f'LibLinear accuracy: {def_accuracy:.3f}')
print(f'SAG accuracy: {sag_accuracy:.3f}')
print(f'SAGA accuracy: {saga_accuracy:.3f}')

# KNN

![](https://miro.medium.com/max/1400/0*34SajbTO2C5Lvigs.png)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Convert the reviews to a tf-idf representation.
# The dedicated tf-idf vectorizer must be the one that is fitted and applied here:
# going through the bag-of-words `vectorizer` instead would hand KNN raw counts
# and would also refit the vectorizer shared with the other models.
knn_vectorizer = TfidfVectorizer()
knn_X_train_vectors = knn_vectorizer.fit_transform(X_train)
knn_X_test_vectors = knn_vectorizer.transform(X_test)


In [None]:
# Train a k-nearest-neighbours classifier (library default settings) on the KNN features
knn_model = KNeighborsClassifier().fit(knn_X_train_vectors, y_train)
# fit() hands back the estimator itself, so echo it as the cell output
knn_model

In [None]:
# Evaluate the fitted KNN model on the held-out test features and
# report its score rounded to two decimals
knn_accuracy = knn_model.score(knn_X_test_vectors, y_test)
print(f'Test accuracy: {knn_accuracy:.2f}')

# Naive Bayes

![](https://miro.medium.com/max/1200/1*39U1Ln3tSdFqsfQy6ndxOA.png)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Naive Bayes reuses the bag-of-words count matrix built for logistic regression.
# Train a multinomial naive Bayes classifier on those counts.
nb_model = MultinomialNB().fit(X_train_vectors, y_train)
# fit() hands back the estimator itself, so echo it as the cell output
nb_model

In [None]:
# Evaluate the fitted naive Bayes model on the held-out bag-of-words features and
# report its score rounded to two decimals
nb_accuracy = nb_model.score(X_test_vectors, y_test)
print(f'Test accuracy: {nb_accuracy:.2f}')

# XGBoost

![](https://www.researchgate.net/publication/345327934/figure/fig3/AS:1022810793209856@1620868504478/Flow-chart-of-XGBoost.png)

In [None]:
import xgboost as xgb

In [None]:
# Display the raw training labels as loaded from the 'sentiment' column
y_train

In [None]:
# Encode the sentiment labels as integers for the XGBoost cells:
# 'positive' becomes 1, every other label becomes 0
xgb_y_train = (y_train == 'positive').astype(int)
xgb_y_test = (y_test == 'positive').astype(int)
# Only the last expression of a cell is displayed, so show the encoded test labels
xgb_y_test

In [None]:
# Build an XGBoost classifier with library defaults and train it on the
# bag-of-words features against the integer-encoded labels
classifier = xgb.XGBClassifier().fit(X_train_vectors, xgb_y_train)
# fit() hands back the estimator itself, so echo it as the cell output
classifier

In [None]:
# Score the fitted XGBoost classifier on the held-out bag-of-words features,
# using the integer-encoded test labels
xgb_accuracy = classifier.score(X_test_vectors, xgb_y_test)

In [None]:
# Report the XGBoost test score rounded to two decimals
print(f'Test accuracy: {xgb_accuracy:.2f}')

# Visualizing the results

In [None]:
import seaborn as sns, pandas as pd

In [None]:
# Pair each model's display label with its test accuracy, in the order the models were trained
model_labels = (
    'NCG LR',
    'Default LR',
    'SAG LR',
    'SAGA LR',
    'KNN Model',
    'Naive Bayes',
    'XGBoost',
)
model_scores = (
    ncg_accuracy,
    def_accuracy,
    sag_accuracy,
    saga_accuracy,
    knn_accuracy,
    nb_accuracy,
    xgb_accuracy,
)
results = dict(zip(model_labels, model_scores))
indexes = results.keys()

In [None]:
# One row per model (labels as the index) with a single 'score' column
results_plot = pd.Series(results, name='score').to_frame()

In [None]:
# Show the results table ordered from highest to lowest score
# (this returns a sorted copy; results_plot itself is left unsorted)
results_plot.sort_values(by='score',ascending=False)

In [None]:
# Build a "Greens_d" palette with one colour per row of the results table
pal = sns.color_palette("Greens_d", n_colors=len(results_plot.index))

In [None]:
# Sort once and take BOTH axes from the same sorted frame. Combining sorted scores
# with the unsorted `results_plot.index` would pair labels and scores purely by
# position and attach the wrong model name to each bar.
results_sorted = results_plot.sort_values(by='score', ascending=False)

plot = sns.barplot(data=results_sorted,
                   x='score',
                   y=results_sorted.index,
                   palette=pal)

# Label the axes so the figure can be read on its own
plot.set(xlabel='Test accuracy', ylabel='Model')

# Write each bar's accuracy inside the bar; the trailing semicolon hides the
# return value of bar_label from the cell output
plot.bar_label(plot.containers[0],
               label_type='center',
               color='snow',
               fmt='%.3f');

# Conclusion

It seems that logistic regression works best for this classification task, although the differences in accuracy scores are slight.