## Ciência de Dados - Trabalho Prático

> **Nomes:** Bruno Santos Fernandes, João Paulo Moura Clevelares, Thamya Vieira Hashimoto Donadia <br>
> **Matrículas:** 2021100784, 2021100149, 2021100146 <br>
> **E-mails:** {bruno.s.fernandes, joao.clevelares, thamya.donadia}@edu.ufes.br <br>
> **Curso:** Engenharia de Computação <br>


### Metodologia

#### Importação de bibliotecas

In [None]:
# importação de bibliotecas
import numpy as np
import pandas as pd
import string
import unidecode

import nltk
nltk.download('punkt_tab')
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD

from sklearn.cluster import KMeans


#### Pré-processamento dos dados textuais

In [None]:
# Load the dataset from the CSV file located next to the notebook.
df = pd.read_csv("./filmes.csv")
# Preview the first rows.
df.head()

In [None]:
# Print the general information about the dataset reported by DataFrame.info().
df.info()

In [None]:
# List the column labels (features) of the dataset.
df.columns

In [None]:
# Inspect the first 10 values of the feature to be processed (the synopsis).
df['sinopse'].head(10)

In [None]:
# Split every synopsis into sentences and into word-level tokens with NLTK.
synopses = df['sinopse']
df['sentences'] = synopses.map(sent_tokenize)
df['tokens'] = synopses.map(word_tokenize)

df.head()

In [None]:
# Convert every token to lower case.
def _to_lower(tokens):
    """Return the tokens of one synopsis converted to lower case."""
    return [token.lower() for token in tokens]


df['tokens'] = df['tokens'].apply(_to_lower)
df['tokens'].head(10)

In [None]:
# Remove punctuation characters from every token.
# Mapping a character to None in a translation table deletes it.
table = str.maketrans(dict.fromkeys(string.punctuation))
df['tokens'] = df['tokens'].apply(
    lambda tokens: [token.translate(table) for token in tokens]
)
df['tokens'].head(10)

In [None]:
# Convert special characters in every token with unidecode.
df['tokens'] = df['tokens'].apply(
    lambda tokens: list(map(unidecode.unidecode, tokens))
)
df['tokens'].head(10)


In [None]:
# Keep only tokens for which str.isalpha() is true; this drops empty strings
# and any token containing a non-letter character.
df['tokens'] = df['tokens'].apply(lambda tokens: list(filter(str.isalpha, tokens)))
df['tokens'].head(10)

# TODO: It may be necessary to keep some numeric tokens

In [None]:
# Remove stop words, using NLTK's English list held in a set.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(tokens):
    """Return the tokens of one synopsis that are not in stop_words."""
    return [token for token in tokens if token not in stop_words]


df['tokens'] = df['tokens'].apply(_drop_stop_words)
df['tokens'].head(10)


In [None]:
# Stemming: replace every token by its Porter stem.
# The stemmer is created once and reused for all tokens instead of
# instantiating a new PorterStemmer per token.
stemmer = PorterStemmer()
df['tokens'] = df['tokens'].apply(lambda x: [stemmer.stem(token) for token in x])
df['tokens'].head(10)

### Amostragem

In [None]:
# Draw a random 30% sample (fixed seed 42) of the synopsis, token and genre columns.
sample = df[["sinopse", "tokens", "genres"]].sample(frac=0.3, random_state=42)
sample

#### Construção da matriz de TF-IDF

In [None]:
# Build the term-count matrix from the preprocessed tokens, i.e. the same text
# the TF-IDF vectorizer is fitted on, so that the document-frequency analysis
# describes the vocabulary that is actually vectorized (not the raw synopses).
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(sample["tokens"].apply(" ".join))
vocab = vectorizer.get_feature_names_out()
vocab

In [None]:
# Document frequency: for each term, the number of documents with a non-zero count.
presence = X_counts > 0
doc_freq = np.asarray(presence.sum(axis=0)).ravel()
df_vocab = pd.DataFrame({'termo': vocab, 'doc_freq': doc_freq})
# Show the first 10 terms that occur in exactly one document.
df_vocab.loc[df_vocab['doc_freq'].eq(1)].head(10)

In [None]:
# Descriptive statistics of the document frequencies.
percentiles = np.percentile(doc_freq, [25, 50, 75])
median = np.median(doc_freq)
mean = np.mean(doc_freq)

# Emit the report as one print call; the text is unchanged.
report_lines = [
    "Estatísticas da frequência dos termos:",
    f"Média: {mean:.2f}",
    f"Mediana: {median}",
    f"Percentis 25, 50 e 75: {percentiles}",
]
print("\n".join(report_lines))

In [None]:
# Histogram (with KDE) of the document frequencies; only the x axis is log-scaled.
sns.displot(data=df_vocab, x='doc_freq', bins=50, kde=True, log_scale=(True, False))
plt.title('Distribuição da Frequência dos Termos no Corpus')
plt.xlabel('Número de Documentos em que o termo aparece')
plt.ylabel('Número de termos')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# Box plot of the document frequencies, labelled through the returned Axes.
boxplot_ax = sns.boxplot(x=df_vocab['doc_freq'])
boxplot_ax.set_xlabel('Número de Documentos em que o termo aparece')
boxplot_ax.set_title('Boxplot da Frequência dos Termos')
plt.show()

In [None]:
# TF-IDF over the preprocessed tokens, re-joined into one string per document.
# min_df=2 ignores terms that appear in fewer than 2 documents.
documents = sample["tokens"].map(" ".join)
vectorizer = TfidfVectorizer(min_df=2)
X = vectorizer.fit_transform(documents)
X.shape

In [None]:
# Dense DataFrame view of the TF-IDF matrix, one column per vocabulary term.
terms = vectorizer.get_feature_names_out()
tfidf_matrix = pd.DataFrame(X.toarray(), columns=terms)
tfidf_matrix

#### Redução de dimensionalidade, via Truncated SVD

In [None]:
# Fit a TruncatedSVD with as many components as there are features, to inspect
# how the explained variance accumulates. random_state seeds the randomized
# solver so the resulting curve is reproducible across runs.
n_components_full = X.shape[1]
svd_full = TruncatedSVD(n_components=n_components_full, random_state=42)
svd_full.fit(X)

In [None]:
# Plot the cumulative explained-variance ratio.
cumulative_variance = np.cumsum(svd_full.explained_variance_ratio_)
# Take the x axis from the curve itself so both arrays always have the same
# length, whatever number of components the fitted model exposes.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(7, 5))
plt.plot(component_counts, cumulative_variance)
plt.xlabel(r'$k$ - Número de componentes principais')
plt.ylabel(r'$f(k)$ - Fração cumulativa da variância explicada')
plt.title('Variância Explicada Cumulativa com TruncatedSVD')
plt.grid(True)
plt.show()

In [None]:
# Project the TF-IDF matrix onto at most 3000 components. The request is capped
# at the number of features because TruncatedSVD rejects larger values, and
# random_state seeds the randomized solver for reproducibility.
new_n_components = min(3000, X.shape[1])
svd = TruncatedSVD(n_components=new_n_components, random_state=42)
X2 = svd.fit_transform(X)
X2

In [None]:
# Shape of the matrix returned by the SVD projection.
X2.shape

### Normalizando

In [None]:
from sklearn.preprocessing import Normalizer

# Compact NumPy display: 4 decimals, no scientific notation.
np.set_printoptions(precision=4, suppress=True)

# Rescale every row to unit L2 norm. Normalizer copies its input by default,
# so X2 is left untouched without an explicit copy.
normalizer = Normalizer(norm='l2')
X_normalized = normalizer.fit_transform(X2)
X_normalized

### Inércia

In [None]:
# Inertia (the k-means objective) for k = 1..29.
k_values = range(1, 30)
inertia = []
for k in k_values:
    # Fixed seed (the same one the silhouette analysis uses) so the curve is
    # reproducible from run to run.
    km = KMeans(n_clusters=k, random_state=10)
    km.fit(X_normalized)
    inertia.append(km.inertia_)

# Scatter plot of the objective against k; both use the same k_values.
plt.scatter(k_values, inertia)
_ = plt.ylabel("Função Objetivo")
_ = plt.xlabel(r"$k$")

### Silhueta V2

In [None]:
Xs = X_normalized.copy()

from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm

range_n_clusters = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]

for n_clusters in range_n_clusters:
    # One silhouette plot per candidate number of clusters. Only a single
    # axes is created: nothing is drawn on a second panel.
    fig, ax1 = plt.subplots(figsize=(9, 7))

    # Silhouette coefficients can range over [-1, 1]; the view is restricted
    # to [-0.1, 0.01].
    # NOTE(review): bars beyond 0.01 are cut off by this limit -- confirm the
    # window matches the scores actually observed.
    ax1.set_xlim([-0.1, 0.01])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(Xs) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(Xs)

    # The silhouette_score gives the average value for all the samples.
    silhouette_avg = silhouette_score(Xs, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(Xs, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.01])

### Silhueta V1

In [None]:
import sklearn.metrics as metrics

def sc_evaluate_clusters(X, max_clusters, n_init, seed):
    """Compute the silhouette score of k-means for k = 2..max_clusters.

    Args:
        X: Data passed to KMeans.fit_predict and to silhouette_score.
        max_clusters: Largest number of clusters to evaluate (inclusive).
        n_init: Forwarded to KMeans as n_init.
        seed: Forwarded to KMeans as random_state.

    Returns:
        A NumPy array of length max_clusters + 1 in which entry k holds the
        silhouette score (Euclidean metric) obtained with k clusters. Entries
        0 and 1 stay at 0 because no score is computed for them.
    """
    # np.zeros already initialises entries 0 and 1, so no explicit assignment
    # is needed (and small max_clusters values no longer index out of range).
    s = np.zeros(max_clusters + 1)
    for k in range(2, max_clusters + 1):
        kmeans = KMeans(init='k-means++', n_clusters=k, n_init=n_init, random_state=seed)
        # fit_predict returns the labels of the fitted data.
        labels = kmeans.fit_predict(X)
        s[k] = metrics.silhouette_score(X, labels, metric='euclidean')
    return s

# Silhouette score for k = 2..20 (n_init=10, seed=1), plotted against k.
s = sc_evaluate_clusters(X_normalized.copy(), 20, 10, 1)
plt.plot(range(2, len(s)), s[2:], 'o-')
plt.xlabel('Number of Clusters')
# The clustered data are the sampled movie synopses, not a synthetic data set.
plt.title('$k$-means clustering performance on movie synopses')
plt.ylabel('Silhouette Score');

### KMeans

In [None]:
K = 7

# Clustering. The seed is the one used by the silhouette analysis, so the
# final partition is reproducible from run to run.
kmeans = KMeans(n_clusters=K, random_state=10)

# fit_predict fits the model and returns the labels of the training data, so
# no separate predict() pass over the same matrix is needed.
y_kmeans = kmeans.fit_predict(X_normalized)

# Cluster index assigned to each synopsis.
y_kmeans

In [None]:
# Reduzindo dimensionalidade para o plot
pca = PCA(n_components=3)
X2_reduced = pca.fit_transform(X_normalized.copy())
X2_reduced

In [None]:
# Plotando  clusters em 3D
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

ax.scatter(X2_reduced[:, 0], X2_reduced[:, 1], X2_reduced[:, 2], c=y_kmeans, cmap=plt.cm.tab20, s=5)

# Rótulos dos eixos
ax.set_xlabel('Componente 1')
ax.set_ylabel('Componente 2')
ax.set_zlabel('Componente 3')

# plt.xlabel('Componente 1')
# plt.ylabel('Componente 2')
# plt.title('Visualização dos Clusters com KMeans')
# plt.colorbar(label='Cluster', ticks=range(20))
# plt.grid(True)
plt.show()

### Percentual de Gêneros em cada Cluster

In [None]:
# Criando nova coluna com os clusters
sample_kmeans = sample.copy()
sample_kmeans["cluster"] = y_kmeans
sample_kmeans

In [None]:
# Separando Generos agrupados nas linhas
sample_kmeans["genres"] = sample_kmeans["genres"].str.split(',')
sample_kmeans = sample_kmeans.explode("genres")
sample_kmeans

In [202]:
# Count rows per (genre, cluster) pair.
genre_frequency_kmeans = (
    sample_kmeans
    .groupby(["genres", "cluster"])
    .size()
    .reset_index(name="freq")
)

# Reshape into a genre x cluster matrix of integer counts (0 where a pair is absent).
df_pivot_kmeans = genre_frequency_kmeans.pivot_table(
    index='genres',
    columns='cluster',
    values='freq',
    fill_value=0,
).astype(int)

In [None]:
# Heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df_pivot_kmeans, cmap="Reds", linewidths=1)
plt.title('Heatmap de Frequência de Gêneros por Cluster')
plt.xlabel('Cluster')
plt.ylabel('Gênero')
plt.show()

### Eigengap

In [None]:
from sklearn.neighbors import kneighbors_graph
from scipy.sparse import csgraph
from numpy import linalg as LA

# 1. Adjacency matrix of the symmetrised nearest-neighbour graph.
G = kneighbors_graph(X_normalized, n_neighbors=300, include_self=True)
A = 0.5 * (G + G.T)

# 2. Normalised Laplacian, as a dense matrix.
L = csgraph.laplacian(A, normed=True).todense()

# 3. Eigenvalues of the normalised Laplacian, in ascending order. Only the
# eigenvalues are used, so eigvalsh is called instead of eigh, which would
# also compute the eigenvectors.
values = LA.eigvalsh(L)

# 4. Plot the smallest eigenvalues (at most 20); the gaps between consecutive
# values are used to choose a suitable k.
n_shown = min(20, len(values))
plt.scatter(range(1, n_shown + 1), values[:n_shown])
plt.xlabel('Índice do autovalor')
plt.ylabel('Autovalor')

### Agrupamento Espectral

In [None]:
from sklearn.cluster import SpectralClustering

# Cluster on the affinity that the eigengap analysis inspects: the symmetrised
# 300-nearest-neighbour graph (affinity='nearest_neighbors') rather than the
# default RBF kernel, so the choice of k and the clustering refer to the same graph.
spectral = SpectralClustering(
    n_clusters=K,
    affinity='nearest_neighbors',
    n_neighbors=300,
    assign_labels='discretize',
    random_state=0,
).fit(X_normalized)
spectral.labels_

In [None]:
# New frame: the sample plus a column holding each row's spectral cluster.
sample_spectral = sample.assign(cluster=spectral.labels_)
sample_spectral

In [None]:
# Separando Generos agrupados nas linhas
sample_spectral["genres"] = sample_spectral["genres"].str.split(',')
sample_spectral = sample_spectral.explode("genres")
sample_spectral

In [None]:
# Count rows per (genre, cluster) pair for the spectral clustering.
genre_frequency_spectral = (
    sample_spectral
    .groupby(["genres", "cluster"])
    .size()
    .reset_index(name="freq")
)
genre_frequency_spectral

In [152]:
# Reshape into a genre x cluster matrix of integer counts (0 where a pair is absent).
df_pivot_spectral = genre_frequency_spectral.pivot_table(
    index='genres',
    columns='cluster',
    values='freq',
    fill_value=0,
).astype(int)

In [None]:
# Heatmap of the genre x cluster counts for spectral clustering, drawn on an explicit Axes.
spectral_fig, spectral_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(df_pivot_spectral, cmap="Reds", linewidths=1, ax=spectral_ax)
spectral_ax.set_title('Heatmap de Frequência de Gêneros por Cluster')
spectral_ax.set_xlabel('Cluster')
spectral_ax.set_ylabel('Gênero')
plt.show()