In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
plt.style.use('default')
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, PCA
from sklearn.cluster import KMeans
from sklearn.manifold import Isomap
import requests
# Download the stop-word list and split it into one entry per line.
STOPWORDS_URL = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"
# A timeout keeps the notebook from hanging forever if the host is unreachable.
_stopwords_response = requests.get(STOPWORDS_URL, timeout=30)
# Fail loudly on HTTP errors so an error page is never parsed as stop words.
_stopwords_response.raise_for_status()
stopwords_list = _stopwords_response.content
stopwords = set(stopwords_list.decode().splitlines())
import os


In [None]:
# Load the reviews and build a binary bag-of-words matrix (one column per kept term).
df = pd.read_csv('./datasets/IMDB Dataset.csv')
# Recent scikit-learn releases validate `stop_words` and reject a `set`, so a
# list is passed instead; sorting only makes the argument deterministic.
vectorizer = CountVectorizer(
    binary=True,
    stop_words=sorted(stopwords),
    max_features=1000,
    max_df=0.4,
)
X = vectorizer.fit_transform(list(df['review'])).toarray()
print(X.shape)

## One-Hot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Column vector of sentiment strings, one row per review.
labels = np.asarray(df['sentiment'].tolist()).reshape(-1, 1)

# Fit the encoder and densify its output so it can be used as a target matrix.
ohe = OneHotEncoder()
y_sparse = ohe.fit_transform(labels)
y = y_sparse.toarray()

# Sanity checks: container type, shape, and the first five rows before/after encoding.
print(type(y))
print(y.shape)
for matriz in (labels, y):
  print(matriz[:5, :])


# Definindo uma rede neural artificial com Keras

In [None]:
from keras.models import Model
from keras.layers import Input, Dense

In [None]:
def rede_neural_simples(input_dims, n_dims_out):
  """Build a single-layer classifier: input -> Dense(sigmoid).

  Args:
    input_dims: number of input features (length of each input vector).
    n_dims_out: number of units in the output layer.

  Returns:
    An uncompiled Keras ``Model`` whose only trainable layer is named
    ``'classificador'``.
  """
  entrada = Input(shape=(input_dims,))
  camada_saida = Dense(n_dims_out, activation='sigmoid', name='classificador')
  return Model(entrada, camada_saida(entrada))

In [None]:
from tensorflow.keras.utils import plot_model

# Size the network from the data: one input per vocabulary term, one output per class column.
n_entradas, n_saidas = X.shape[1], y.shape[1]
rede_neural = rede_neural_simples(n_entradas, n_saidas)
rede_neural.compile(loss='mse', optimizer='adam')
# Last expression of the cell, so the architecture diagram is displayed.
plot_model(rede_neural, show_layer_activations=True, show_shapes=True)

## Fit da rede neural

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for the final evaluation. The fixed seed makes the
# split (and therefore every metric reported below) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from keras.callbacks import EarlyStopping
# Stop once the validation loss has not improved for 10 epochs and roll back to
# the best weights seen so far.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, restore_best_weights=True)
# `callbacks` is documented as a list of callbacks, so wrap the single callback.
history = rede_neural.fit(X_train, y_train, epochs=500, validation_split=0.2, callbacks=[es])

In [None]:
# Learning curves: training vs. validation loss recorded by `fit`, per epoch.
fig, ax = plt.subplots(figsize=(7,2))
for curva in ('loss', 'val_loss'):
  ax.plot(history.history[curva], label=curva)
ax.set_xlabel('Épocas')
ax.set_ylabel('MSE')
ax.legend()
plt.show()

## Avaliando a rede neural artificial

In [None]:
# Predict on the held-out split, then print the first five predictions
# followed by the first five targets for a quick visual comparison.
y_est = rede_neural.predict(X_test)
for matriz in (y_est, y_test):
  print(matriz[:5, :])

In [None]:
from sklearn.metrics import classification_report
# Map both one-hot matrices back to label strings before scoring.
# NOTE(review): y_est holds raw network outputs rather than hard 0/1 values —
# confirm the encoder's inverse_transform resolves them to the intended class.
rotulos_reais = ohe.inverse_transform(y_test)
rotulos_estimados = ohe.inverse_transform(y_est)
print(classification_report(rotulos_reais, rotulos_estimados))

## Quais palavras têm mais peso?

In [None]:
# Grab the weights of the output layer; the cell below reads `w[0]` as a
# 2-D array indexed as [term, output unit].
camada_classificador = rede_neural.get_layer('classificador')
w = camada_classificador.get_weights()

In [None]:
# Visualization 1: which words carry the most weight?
# A word's weight is the absolute gap between its weights towards the two
# output units (this indexing assumes exactly two output columns).
pesos = np.abs(w[0][:,0] - w[0][:,1])

# `get_feature_names` was removed from recent scikit-learn releases in favour of
# `get_feature_names_out`; fall back to the old name only on older versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Sort (weight, word) pairs from heaviest to lightest.
pares = sorted(zip(pesos, feature_names), reverse=True)
pesos_ = [peso for peso, _ in pares]
palavras_ = [palavra for _, palavra in pares]

# Never ask for more bars than there are words in the vocabulary.
n_palavras = min(50, len(pares))
plt.figure(figsize=(14,3))
plt.bar(np.arange(n_palavras), pesos_[0:n_palavras])
plt.xticks(np.arange(n_palavras), palavras_[0:n_palavras], rotation=80)
plt.xlabel('Palavras')
plt.ylabel('Peso no classificador')
plt.show()

# Projeções intermediárias

In [None]:
def rede_neural_proj(input_dims, n_dims_out):
  """Build a classifier with a 2-unit bottleneck before the output layer.

  Args:
    input_dims: number of input features (length of each input vector).
    n_dims_out: number of units in the output layer.

  Returns:
    An uncompiled Keras ``Model`` with a 2-unit ``'projecao'`` layer (no
    activation argument is given for it) followed by the sigmoid
    ``'classificador'`` layer.
  """
  entrada = Input(shape=(input_dims,))
  projetado = Dense(2, name='projecao')(entrada)
  saida = Dense(n_dims_out, activation='sigmoid', name='classificador')(projetado)
  return Model(entrada, saida)

# Rebuild `rede_neural` as the bottleneck variant, sized from the data as before.
n_entradas, n_saidas = X.shape[1], y.shape[1]
rede_neural = rede_neural_proj(n_entradas, n_saidas)
rede_neural.compile(loss='mse', optimizer='adam')
# Last expression of the cell, so the architecture diagram is displayed.
plot_model(rede_neural, show_layer_activations=True, show_shapes=True)

In [None]:
# Fresh early-stopping callback for the new model: stop after 10 epochs without
# validation-loss improvement and restore the best weights.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, restore_best_weights=True)
# `callbacks` is documented as a list of callbacks, so wrap the single callback.
history = rede_neural.fit(X_train, y_train, epochs=500, validation_split=0.2, callbacks=[es])

In [None]:
# Learning curves of the bottleneck model: training vs. validation loss per epoch.
fig, ax = plt.subplots(figsize=(7,2))
for curva in ('loss', 'val_loss'):
  ax.plot(history.history[curva], label=curva)
ax.set_xlabel('Épocas')
ax.set_ylabel('MSE')
ax.legend()
plt.show()

In [None]:
# Score the bottleneck model on the held-out split.
y_est = rede_neural.predict(X_test)
rotulos_reais = ohe.inverse_transform(y_test)
rotulos_estimados = ohe.inverse_transform(y_est)
print(classification_report(rotulos_reais, rotulos_estimados))

In [None]:
# Visualization 2: where did each word end up?
# Each row of the projection layer's first weight array is plotted as one
# vocabulary term's 2-D coordinates.
v = rede_neural.get_layer('projecao').get_weights()[0]
plt.figure(figsize=(4,4))
plt.scatter(v[:,0], v[:,1], s=1, alpha=0.3, c='b')

palavras_destaque = ["director", "actor", "bad", "good", "excellent", "plot",
                     "worst", "terrible", "waste", "awful", "fantastic"]
for s in palavras_destaque:
    # A highlighted word may be missing from the fitted vocabulary (the
    # vectorizer caps and filters its terms); skip it instead of raising KeyError.
    _n = vectorizer.vocabulary_.get(s)
    if _n is None:
        print(f"'{s}' is not in the vectorizer vocabulary; label skipped")
        continue
    plt.text(v[_n,0], v[_n,1], s, ha='center')

plt.title('Projeção das palavras no espaço latente')
plt.ylabel('Componente 2')
plt.xlabel('Componente 1')
# Uncomment to fix the axis limits:
#plt.xlim([-20,20])
#plt.ylim([-20,20])
plt.show()