In [None]:
# %% [markdown]
# # Movie Reviews Classification

# %% [markdown]
# <img src="https://frenzy86.s3.eu-west-2.amazonaws.com/python/nlp/rev.png" width="1200">
# 
# 

# %%
import numpy as np
import pandas as pd
import re
from sklearn.datasets import load_files

import warnings
warnings.filterwarnings('ignore')

# %%
!wget https://frenzy86.s3.eu-west-2.amazonaws.com/python/nlp/txt_sentoken.zip

# %%
!unzip txt_sentoken.zip

# %%
# Load the extracted folder: .data holds the raw documents (bytes, decoded later
# during cleaning) and .target the integer class index of each document.
movie_data = load_files(r"txt_sentoken")
X_, y = movie_data.data, movie_data.target

# %%
# Class balance: one row per class as [label, number of documents]
unique, counts = np.unique(y, return_counts=True)

print(np.asarray((unique, counts)).T)

# %%
# Preview only the first documents instead of dumping the whole corpus
X_[:2]

# %%
type(X_)

# %%
len(X_)

# %%
len(y)

# %%
def pulizia_preliminare(lista_stringhe, lista_regex):
    """Decode each document and delete every match of the given patterns.

    Args:
        lista_stringhe: iterable of documents, either ``bytes`` (decoded with
            the ``bytes.decode`` default, UTF-8) or already-decoded ``str``.
        lista_regex: iterable of regular expressions (pattern strings or
            compiled patterns). Matches are replaced with the empty string,
            applying the patterns in the given order.

    Returns:
        list[str]: one cleaned string per input document, in input order.

    Raises:
        UnicodeDecodeError: if a ``bytes`` document is not valid UTF-8.
    """
    # Materialise and compile the patterns once: this hoists the work out of
    # the per-document loop and keeps a one-shot iterable (e.g. a generator)
    # from being exhausted after the first document.
    patterns = [re.compile(regex) for regex in lista_regex]

    stringhe_pulite = []
    for stringa in lista_stringhe:
        # Only raw bytes need decoding; str input is used as-is.
        if isinstance(stringa, (bytes, bytearray)):
            stringa = stringa.decode()
        for pattern in patterns:
            stringa = pattern.sub('', stringa)
        stringhe_pulite.append(stringa)
    return stringhe_pulite

# %%
# Single pattern: delete every character that is neither an ASCII letter nor
# whitespace (covers punctuation, digits and other symbols in one pass).
regex = [
    r"[^a-zA-Z\s]"
]

# Apply the preliminary cleaning to the raw documents
stringhe_pulite = pulizia_preliminare(X_, regex)

# %%
# Preview only the first cleaned documents instead of dumping the whole corpus
stringhe_pulite[:2]

# %%
def rimuovi_spazi_in_eccesso(lista_stringhe):
    """Normalise whitespace in every string of the input.

    Each string is split on whitespace (``str.split()`` with no arguments) and
    re-joined with single spaces, so runs of whitespace collapse to one space
    and leading/trailing whitespace disappears.

    Args:
        lista_stringhe: iterable of ``str``.

    Returns:
        list[str]: the normalised strings, in input order.
    """
    return [' '.join(testo.split()) for testo in lista_stringhe]

# %%
# Collapse whitespace runs left behind by the character removal
stringhe_pulite = rimuovi_spazi_in_eccesso(stringhe_pulite)

# %%
# Preview only the first documents instead of dumping the whole corpus
stringhe_pulite[:2]

# %%
print(type(stringhe_pulite))

# %%
len(stringhe_pulite)

# %%
len(y)

# %%
# Whole corpus as one space-separated string (input for the word clouds)
res = " ".join(stringhe_pulite)

# %%
print(type(res))

# %%
type(stringhe_pulite[0])

# %%
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

# Split the cleaned texts BEFORE fitting any transformer, so the vocabulary and
# the IDF weights are learned from training documents only (no test leakage).
# The split depends only on the number of samples and random_state, so the same
# documents land in train/test as when splitting an already-vectorised matrix.
testi_train, testi_test, y_train, y_test = train_test_split(stringhe_pulite, y,
                                                            test_size=0.2,
                                                            random_state=667
                                                            )

# %%
vectorizer = CountVectorizer()
tfidfconverter = TfidfTransformer()

# Fit on the training texts only; the test texts are transformed with the
# already-fitted vocabulary and IDF weights. The matrices stay sparse: a dense
# documents x vocabulary array is very large, and the estimators used here
# accept sparse input.
X_train = tfidfconverter.fit_transform(vectorizer.fit_transform(testi_train))
X_test = tfidfconverter.transform(vectorizer.transform(testi_test))

# Full-corpus matrices (rows aligned with `y`), kept under their original names
# for any cell that still refers to them. They are produced with transform()
# only, so nothing is re-fitted on test documents.
X_vector = vectorizer.transform(stringhe_pulite)
X_tf_idf = tfidfconverter.transform(X_vector)

X_tf_idf

# %%
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with smoothing alpha=0.01; fit() returns the estimator
classifier = MultinomialNB(alpha=.01).fit(X_train, y_train)

# Hard predictions, plus column 1 of predict_proba (the probability assigned to
# the second entry of classifier.classes_)
y_pred = classifier.predict(X_test)
y_prob = classifier.predict_proba(X_test)[:, 1]

# Per-class precision/recall/F1, followed by the macro-averaged F1 score
print(classification_report(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

# %%
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Rows are the actual classes, columns the predicted ones
cm = confusion_matrix(y_test, y_pred)

# Draw on an explicit Axes instead of relying on pyplot's current figure
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='',
    cmap="Blues",
    linewidth=1,
    linecolor='black',
    ax=ax,
)

ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")

# %%
from wordcloud import WordCloud, STOPWORDS

# First cloud: no stopword filtering. This needs an explicit empty set, because
# leaving `stopwords` unset makes WordCloud fall back to its built-in STOPWORDS
# list, which would filter this cloud exactly like the one built further down.
wordcloud = WordCloud(width = 3000, height = 2000,background_color='white',stopwords=set())

wordcloud.generate(res)


# %%
plt.figure(figsize=(18,10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# %%
# Save the cloud currently bound to `wordcloud` (the unfiltered one) as a PNG
wordcloud.to_file('wordcloud.png')

# %%
# Second cloud: same text, with the library's English stopword list removed
stopwords = set(STOPWORDS)
wordcloud = WordCloud(width = 3000, height = 2000,background_color='white',stopwords=stopwords)
wordcloud.generate(res)

# %%
plt.figure(figsize=(20, 16))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

# %%



