- Kaggle : https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/overview
- Maker notes : https://www.notion.so/maker-NLP-00d265601ad146e490bea30cda512756

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before each
# cell executes, so edits to local packages are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('..')

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from maker_nlp.preprocessing import \
    convert_to_lowercase, remove_accents, remove_punctuation_and_digits, normalize_text,\
    remove_stop_words, clean_text

# Folder holding the CSV datasets (relative path).
DATA_FOLDER = Path('../data')
# Human-readable sentiment labels, used as tick labels of the confusion matrix.
# A three-label list used to be assigned here and was immediately overwritten;
# only the five-label list was ever in effect, so it is the single definition kept.
# NOTE(review): assumes the Sentiment column encodes five classes in this order — confirm against the dataset.
CLASS_NAMES = ['negative', 'somewhat negative', 'neutral', 'somewhat positive', 'positive']

## Dataset

### Load dataset

In [None]:
# Read the labelled dataset from disk, report its dimensions and preview the first rows.
dataset_path = DATA_FOLDER / 'final_dataset.csv'
df = pd.read_csv(dataset_path)
print(df.shape)
df.head()

### Separate dataset into train and test sets

In [None]:
# Inputs are the phrases; targets are their sentiment labels.
X = df['Phrase']
y = df['Sentiment']
print(f'Shape of X = {X.shape}, Shape of y = {y.shape}')

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the split reproducible.
X_train_master, X_test_master, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train_master.shape, X_test_master.shape, y_train.shape, y_test.shape

## Pre-processing & Feature engineering

### Normalize text

In [None]:
# Sample French sentence containing digits, accents, upper-case letters and punctuation,
# used below to demonstrate each normalization step.
text = """24 sept. 2020 14:52 - Le groupe Total a confirmé ce jeudi la fermeture de sa raffinerie de Grandpuits (Seine-et-Marne) pour la transformer en "plateforme zéro pétrole"."""
text

- Convert text to lowercase

In [None]:
# Demo: lowercase the sample text with the project helper and display the result.
convert_to_lowercase(text)

- Remove accents

In [None]:
# Demo: strip accents from the sample text with the project helper and display the result.
remove_accents(text)

- Remove punctuation and digits

In [None]:
# Demo: remove punctuation and digits from the sample text and display the result.
remove_punctuation_and_digits(text)

In [None]:
# Run the project's normalize_text helper on the sample and display the result.
# NOTE(review): presumably this chains the three steps demonstrated above — confirm in maker_nlp.preprocessing.
normalized_text = normalize_text(text)
normalized_text

### Remove Stop Words

In [None]:
# Show the normalized sample again; it is the input of the stop-word removal below.
normalized_text

In [None]:
# Filter the normalized sample with the project's remove_stop_words helper and show what is left.
useful_words = remove_stop_words(normalized_text)
useful_words

In [None]:
# The removed (stop) words are the tokens present in the normalized text but absent
# from the filtered text. A one-sided set difference states exactly that; a symmetric
# difference would also report tokens found only in the filtered text (for example
# the '' token produced by splitting an empty result on ' ').
useless_words = set(normalized_text.split()) - set(useful_words.split())
useless_words

### Application of text cleaning

In [None]:
# Work on a copy so X_train_master keeps the raw training phrases.
X_train = X_train_master.copy()
X_train.head()

In [None]:
# Run the project's clean_text helper on every training phrase and preview the result.
# NOTE(review): presumably clean_text combines the normalization and stop-word steps shown above — confirm in maker_nlp.preprocessing.
X_train = X_train.apply(clean_text)
X_train.head()

### Transform dataset to a bag of words

In [None]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
count_vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the cleaned training phrases only (the test set is not used here).
count_vectorizer.fit(X_train)

In [None]:
# Encode the training phrases as a document-term count matrix using the fitted vocabulary.
X_train_count_features = count_vectorizer.transform(X_train)
X_train_count_features

## Modeling

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with a fixed seed for reproducibility.
# max_iter=1000 raises the solver's iteration cap above the library default
# (presumably to let the solver converge on this feature matrix — confirm).
clf = LogisticRegression(random_state=42, max_iter=1000)

In [None]:
# Fit the classifier on the training count features and their sentiment labels.
clf.fit(X_train_count_features, y_train)

## Evaluation

### Preprocess test dataset

In [None]:
# Clean the held-out phrases with the same helper used for the training set,
# starting from a copy so X_test_master stays untouched.
X_test = X_test_master.copy().apply(clean_text)

In [None]:
# Reuse the vectorizer fitted on the training set (transform only, no refit) so the
# test features are expressed in the training vocabulary.
X_test_count_features = count_vectorizer.transform(X_test)
X_test_count_features

### Compute predictions on test dataset

In [None]:
# Predict a sentiment label for every test phrase and peek at the first ten predictions.
y_pred = clf.predict(X_test_count_features)
y_pred[:10]

### Evaluate model performance

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# `ConfusionMatrixDisplay.from_estimator` is its replacement and accepts the same
# arguments (requires scikit-learn >= 1.0).
from sklearn.metrics import ConfusionMatrixDisplay

np.set_printoptions(precision=2)
fig, ax = plt.subplots(figsize=(10, 10))
# normalize='true' normalizes over the true labels (rows), so each row sums to 1.
disp = ConfusionMatrixDisplay.from_estimator(
    clf, X_test_count_features, y_test,
    display_labels=CLASS_NAMES, cmap=plt.cm.Blues,
    normalize='true', ax=ax,
)
disp.ax_.set_title('Normalized confusion matrix')
plt.show()