In [None]:
import sys
sys.path.append('..')

from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from maker_nlp.vizualisation import plot_top_k_words_per_sentiment_tfidf
from maker_nlp.preprocessing import remove_stop_words, convert_to_lowercase, remove_accents, \
    remove_punctuation_and_digits, normalize_text, lemmatize, clean_text

# Directory holding the input data, expressed relative to this notebook's folder.
DATA_FOLDER = Path('..') / 'data'

## Load Dataset

In [None]:
# Read the CSV from the data folder, then report its dimensions and preview
# the first rows as the cell output.
dataset_path = DATA_FOLDER / 'final_dataset.csv'
df = pd.read_csv(dataset_path)
print(df.shape)
df.head()

In [None]:
# Pull the text column and the label column out of the frame as two Series.
phrase = df['Phrase']
sentiment = df['Sentiment']
print(f'Shape of Phrase = {phrase.shape}, Shape of Sentiment = {sentiment.shape}')

In [None]:
# Run every phrase through the project's `clean_text` helper and discard the
# entries that come back missing. The surviving rows keep their original index
# labels, so `cleaned_phrase` can be shorter than `sentiment`.
cleaned_phrase = phrase.apply(clean_text).dropna()

In [None]:
# Peek at one cleaned phrase whose sentiment label is 0.
# `.iloc` picks the 13th remaining phrase by position. A plain `[12]` on this
# integer-indexed Series is a *label* lookup and raises KeyError whenever row
# label 12 is not labelled 0 or was removed by the earlier `dropna`.
cleaned_phrase[sentiment == 0].iloc[12]

## Pre-processing & Feature engineering

### Vectorize Text Data with Tf-Idf

In [None]:
# Tf-idf vectorizer with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

# Learn the vocabulary / idf weights from the cleaned phrases and encode them
# in one pass; the result has one row per cleaned phrase.
vectorized_phrase = vectorizer.fit_transform(cleaned_phrase)

In [None]:
# Encode only the phrases labelled 0, reusing the `vectorizer` already fitted
# on `cleaned_phrase` earlier in this notebook (refitting an identical
# TfidfVectorizer on the same data yields the same model, just slower).
negative_df = vectorizer.transform(cleaned_phrase[sentiment == 0])

# Iterating the sparse matrix yields one 1 x vocabulary row per phrase.
# `sentence.data` holds that row's stored tf-idf weights; sort them from
# highest to lowest. (Calling the builtin `sorted` on the sparse row itself
# only wraps the row in a list and orders nothing.)
for sentence in negative_df:
    sorted_sentence = np.sort(sentence.data)[::-1]

In [None]:
# Tf-idf matrix restricted to the phrases labelled 0. The `vectorizer` fitted
# on `cleaned_phrase` earlier in this notebook is reused instead of fitting an
# identical one again.
negative_df = vectorizer.transform(cleaned_phrase[sentiment == 0])

In [None]:
# `vocabulary_` maps word -> column index; invert it to look words up by column.
inverse_dict = {index: word for word, index in vectorizer.vocabulary_.items()}

# Dense tf-idf weights of the phrase stored at positional row 1 of `negative_df`.
row = negative_df.getrow(1).toarray().ravel()

# `argsort` is ascending, so reverse it and keep the 10 heaviest columns
# (the previous slice kept 12 entries, lowest weight first, despite the names).
top_10_indices = row.argsort()[::-1][:10]
# A short phrase can contain fewer than 10 distinct words; drop zero-weight
# columns so arbitrary vocabulary words that are absent from it are not listed.
top_10_indices = top_10_indices[row[top_10_indices] > 0]
top_10_values = row[top_10_indices]
top_10_words = [inverse_dict[k] for k in top_10_indices]
top_10_words

In [None]:
# Display the tf-idf weights that go with `top_10_words` (same order).
top_10_values

In [None]:
# Histogram of the strictly positive tf-idf weights in `row`; zero entries
# are left out of the plot.
nonzero_weights = row[row > 0]
plt.hist(nonzero_weights)

In [None]:
# Show the cleaned text at position 1 among the phrases labelled 0, i.e. the
# phrase encoded as row 1 of `negative_df`.
cleaned_phrase[sentiment == 0].iloc[1]

In [None]:
# Call the project's plotting helper on the cleaned phrases and their labels.
# NOTE(review): the last argument (20) is presumably k, the number of top
# words shown per sentiment — confirm against maker_nlp.vizualisation.
plot_top_k_words_per_sentiment_tfidf(cleaned_phrase, sentiment, 20)

In [None]:
# Lower-cased copy of the raw phrases. `.str.lower()` passes missing values
# through as NaN, whereas calling `x.lower()` per element raises on them.
t = phrase.str.lower()

# Number of cleaned phrases containing 'film'. For string phrases this is a
# substring match, so it also counts e.g. 'films'.
i = sum('film' in sentence for sentence in cleaned_phrase)

In [None]:
# Display the count of cleaned phrases that contain 'film'.
i