In [None]:
import sys
# Add the parent directory to the module search path; presumably this is what
# makes the local `maker_nlp` package importable below — confirm against the repo layout.
sys.path.append('..')

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Project-local helpers (plotting, text cleaning, class configuration).
from maker_nlp.vizualisation import plot_top_k_words_per_sentiment_tfidf, plot_top_k_explicative_words_per_sentiment
from maker_nlp.preprocessing import remove_stop_words, convert_to_lowercase, remove_accents, \
    remove_punctuation_and_digits, normalize_text, lemmatize, clean_text

from maker_nlp.config import CLASS_DICT, CLASS_NAMES

# Relative path of the folder holding the CSV read by this notebook.
DATA_FOLDER = Path('../data')

## Load Dataset

In [None]:
# Read the full dataset, then show its dimensions and first rows.
dataset_path = DATA_FOLDER / 'final_dataset.csv'
df = pd.read_csv(dataset_path)
print(df.shape)
df.head()

In [None]:
# Text column and its label column, kept as two separate Series.
phrase = df.Phrase
sentiment = df.Sentiment
print(f'Shape of Phrase = {phrase.shape}, Shape of Sentiment = {sentiment.shape}')

In [None]:
# Clean every phrase, then drop the rows for which cleaning produced a missing value.
cleaned_phrase = phrase.apply(clean_text).dropna()

In [None]:
# Show the cleaned phrase stored under index label 12 among the phrases labelled 0.
# The mask is built on the rows that survived `dropna` so both Series share one index.
label_is_zero = sentiment.loc[cleaned_phrase.index] == 0
cleaned_phrase[label_is_zero].loc[12]

### Separate train and test sets

In [None]:
# Keep only the labels whose phrase survived cleaning, then hold out 20% for testing.
kept_sentiment = sentiment[cleaned_phrase.index]
split_parts = train_test_split(cleaned_phrase, kept_sentiment, test_size=0.2, random_state=42)
X_train_master, X_test_master, y_train, y_test = split_parts

# Shapes of the four pieces, in the same order as above.
tuple(part.shape for part in split_parts)

## Pre-processing & Feature engineering

### Vectorize Text Data with Tf-Idf

Tf-Idf stands for Term Frequency - Inverse Document Frequency: two measures that are combined in order to vectorize texts.  

This idea was introduced to address the inability of bag-of-words to take the importance of words into account (for example, *the* should always be ignored whereas *car* might be valuable information). Hence, Tf-Idf balances the importance of a word ***locally*** (is this word repeated many times in this document?) and ***globally*** (how often does this word appear across all documents?).  
If a word is very present in a document but also in all the other documents (the case of *the* for instance), then its tf-idf score will be low. On the contrary, if a word is present in only some documents and seems to be important in them, then its score will be high. All the math in tf-idf serves to quantify *seems to be important* and *very present*. Let's take a quick look at it:  

$$TfIdf(t,d) = tf(t,d)  \times  idf(t)$$

with t as the term, d the document and:

$$tf(t, d) = \frac{n_{t,d}}{\sum \limits _{k} n_{k, d}} $$

$$idf(t) = \log \frac{|D|}{|\{d \in D : t \in d\}|}$$

So, what we understand from these formulas is that the tf-idf score is a combination of these two factors:
&emsp;- The ***term frequency (tf)*** score, which is the number of occurrences of the word in the document, normalized by the total number of words in that document
&emsp;- The ***inverse document frequency (idf)*** score, which is the logarithm of the number of documents in the corpus divided by the number of documents in which the word appears. The logarithm dampens this ratio: the higher the number of documents where the term appears, the lower the idf score.

**In conclusion, to have a high tf-idf score, a word has to have a high idf score, which means that it shouldn't appear in too many documents. And for the documents where it appears, it should be repeated a lot!**

In [None]:
# Tf-idf vectorizer with default settings; it is fitted on the training phrases
# only, and the same fitted object is reused later to transform the test phrases.
vectorizer = TfidfVectorizer()

# Sparse tf-idf matrix of the training phrases (one row per phrase).
vectorized_phrase = vectorizer.fit_transform(X_train_master)

In [None]:
# Tf-idf matrix restricted to the training phrases labelled 0 (one row per phrase),
# computed with the vectorizer already fitted on the whole training set.
# The former per-row `sorted(...)` loop was dropped: its result was never used and it
# only iterated over every sparse row without computing anything.
negative_df = vectorizer.transform(X_train_master[y_train == 0])

In [None]:
#vectorizer = TfidfVectorizer().fit(cleaned_phrase)

#negative_df = vectorizer.transform(cleaned_phrase[sentiment == 0])

In [None]:
# Reverse lookup: column index of the tf-idf matrix -> word.
inverse_dict = {val: key for key, val in vectorizer.vocabulary_.items()}

# Dense tf-idf vector of the phrase at position 1 of `negative_df`.
row = negative_df.getrow(1).toarray()[0].ravel()

# Keep at most the 10 best-scoring words (ascending score order, best word last).
# Columns with a zero score are words that do not occur in the phrase; they are
# filtered out so that short phrases are not padded with arbitrary vocabulary words.
n_top = 10
ranked_indices = row.argsort()
ranked_indices = ranked_indices[row[ranked_indices] > 0]

top_10_indices = ranked_indices[-n_top:]
top_10_values = row[top_10_indices]
top_10_words = [inverse_dict[k] for k in top_10_indices]
top_10_words

In [None]:
# Tf-idf scores at `top_10_indices`, i.e. in the same order as `top_10_words`.
top_10_values

In [None]:
# Distribution of the non-zero tf-idf scores of the inspected phrase.
# Explicit axes + labels so the figure can be read on its own; `plt.show()` also
# keeps the raw return value of `hist` out of the cell output.
fig, ax = plt.subplots()
ax.hist(row[row > 0])
ax.set(title='Non-zero tf-idf scores of the inspected phrase',
       xlabel='tf-idf score',
       ylabel='number of words')
plt.show()

In [None]:
# Cleaned text of the phrase at position 1 among the training phrases labelled 0
# (the same position as the row inspected in `negative_df`).
X_train_master[y_train == 0].iloc[1]

In [None]:
# Project helper from `maker_nlp.vizualisation`; judging by its name it plots the
# top-k (here k=20) tf-idf words per sentiment class — see the helper for the exact definition.
plot_top_k_words_per_sentiment_tfidf(X_train_master, y_train, 20)

In [None]:
# Learned idf weight of every vocabulary entry (array aligned with the vocabulary indices).
vectorizer.idf_

In [None]:
## Negative

from sklearn.feature_extraction.text import CountVectorizer

# Training phrases of the negative class only.
negative_phrases = X_train_master[y_train == CLASS_DICT['negative']]

# One tf-idf vectorizer and one raw-count vectorizer, both fitted on the negative phrases.
vect_neg = TfidfVectorizer()
vect_neg.fit(negative_phrases)

count_neg = CountVectorizer()
# Total number of occurrences of each word over all negative phrases, as a column (n_words, 1).
counts = count_neg.fit_transform(negative_phrases).sum(axis=0).reshape((-1, 1))

negative_dict = vect_neg.vocabulary_

# Column index -> word for the vectorizer fitted on the whole training set.
inverse_dict = {position: word for word, position in vectorizer.vocabulary_.items()}

# One line per vocabulary word with its idf weight (global vectorizer).
idf_df = pd.DataFrame({
    'word': [inverse_dict[position] for position in range(len(vectorizer.idf_))],
    'idf_score': vectorizer.idf_,
})

In [None]:
# Highest idf first, with a fresh 0..n-1 index.
idf_df = idf_df.sort_values('idf_score', ascending=False).reset_index(drop=True)

In [None]:
# Column index -> word, taken from `count_neg` because it is the vectorizer that
# produced `counts` (using another vectorizer's vocabulary is only right by coincidence).
neg_inverse_dict = {val: key for key, val in count_neg.vocabulary_.items()}
counts_df = pd.DataFrame([[neg_inverse_dict[i], x[0, 0]] for i, x in enumerate(counts)], columns=['word', 'count'])
# Ignore words seen 4 times or fewer in the negative phrases.
counts_df = counts_df[counts_df['count'] > 4]

# Set for O(1) membership tests instead of scanning an array for each candidate word.
frequent_neg_words = set(counts_df['word'])

# Walk `idf_df` in its current order (sorted by decreasing idf in the previous cell)
# and keep the first 20 words that are frequent enough in the negative class.
top_neg_words = []
for word, score in zip(idf_df['word'], idf_df['idf_score']):
    if word in frequent_neg_words:
        top_neg_words.append([word, score])
    if len(top_neg_words) == 20:
        break
top_neg_words

In [None]:
# Project helper from `maker_nlp.vizualisation`; presumably the per-class counterpart of the
# manual "top negative words" computation above, with k=20 — check the helper to confirm.
plot_top_k_explicative_words_per_sentiment(X_train_master, y_train, 20)

## Modelling

In [None]:
from sklearn.linear_model import LogisticRegression

# Fixed `random_state` for reproducible fits; `max_iter` is set to 1000 —
# presumably to let the solver converge on the tf-idf features (TODO confirm).
clf = LogisticRegression(random_state=42, max_iter=1000)

In [None]:
# Train on the tf-idf matrix of the training phrases and their sentiment labels.
clf.fit(vectorized_phrase, y_train)

## Evaluation

### Preprocess test dataset

In [None]:
# Vectorize the held-out phrases with the vectorizer fitted on the training set.
# `transform` leaves its input untouched, so no defensive copy is needed, and
# `X_test` now only ever refers to the tf-idf matrix.
X_test = vectorizer.transform(X_test_master)

### Compute predictions on test dataset

In [None]:
# Predicted class for every test phrase; display only the first ten.
y_pred = clf.predict(X_test)
y_pred[:10]

### Evaluate model performance

In [None]:
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2.
# `confusion_matrix` + `ConfusionMatrixDisplay` give the same figure and are
# available in both older and current releases.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

np.set_printoptions(precision=2)

# normalize='true' divides each row by the number of true samples of that class.
normalized_cm = confusion_matrix(y_test, clf.predict(X_test), normalize='true')

fig, ax = plt.subplots(figsize=(10, 10))
disp = ConfusionMatrixDisplay(confusion_matrix=normalized_cm, display_labels=CLASS_NAMES)
disp.plot(cmap=plt.cm.Blues, ax=ax)
disp.ax_.set_title('Normalized confusion matrix')
plt.show()