In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
plt.style.use('default')
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF, PCA
from sklearn.cluster import KMeans
from sklearn.manifold import Isomap
import requests
# Download a plain-text stop-word list; every line of the file becomes one
# entry of the `stopwords` set used by the vectorizers below.
STOPWORDS_URL = "https://gist.githubusercontent.com/rg089/35e00abf8941d72d419224cfd5b5925d/raw/12d899b70156fd0041fa9778d657330b024b959c/stopwords.txt"
# A timeout keeps a stalled connection from hanging the kernel forever.
resposta = requests.get(STOPWORDS_URL, timeout=30)
# Fail loudly on HTTP errors instead of silently using an error page as the
# stop-word list.
resposta.raise_for_status()
stopwords_list = resposta.content
stopwords = set(stopwords_list.decode().splitlines())



In [None]:
from sklearn.datasets import make_blobs

# Synthetic 2-D data: 300 points drawn around the centres (-2, -2) and (2, 2).
# random_state pins the draw so the figure and the clustering below are the
# same after every "Restart & Run All".
x, y = make_blobs(
    n_samples=300,
    n_features=2,
    centers=[[-2, -2], [2, 2]],
    random_state=42,
)
print(x.shape)
plt.figure(figsize=(5, 5))
plt.scatter(x[:, 0], x[:, 1], s=2)
#plt.savefig('clusters.png')
plt.show()

In [None]:
# Fit k-means with two clusters on the synthetic points.
# random_state fixes the centroid initialisation so cluster ids (0/1) and the
# colours derived from them are stable between runs; n_init is spelled out
# because its default changed between scikit-learn releases.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
# fit_transform fits the model and returns x mapped to cluster-distance space
# (one column per cluster centre).
x_ = kmeans.fit_transform(x)
means_ = kmeans.cluster_centers_
print(means_)


In [None]:
# Cluster index that the fitted k-means model assigned to each training point.
labels_ = kmeans.labels_
print(labels_)

In [None]:
# Split the points by assigned cluster with boolean masks. Besides being the
# idiomatic NumPy form, masking keeps the result two-dimensional even when a
# cluster has no points, so the column indexing below cannot fail.
xp = x[labels_ == 0]
xn = x[labels_ == 1]
plt.figure(figsize=(4, 4))
# Small translucent dots for the points, large opaque dots for the centres.
plt.scatter(xp[:, 0], xp[:, 1], s=1, c='b', alpha=0.5, label='Cluster 0')
plt.scatter(xn[:, 0], xn[:, 1], s=1, c='r', alpha=0.5, label='Cluster 1')
plt.scatter(means_[0, 0], means_[0, 1], s=40, c='b', label='Centro 0')
plt.scatter(means_[1, 0], means_[1, 1], s=40, c='r', label='Centro 1')
plt.title('Pontos e clusterização')
plt.legend()
plt.show()

# Aplicação: IMDB Dataset

In [None]:
# Work on a random subset of 2000 reviews; random_state makes the subset (and
# every result derived from it) reproducible.
df = pd.read_csv('./datasets/IMDB Dataset.csv').sample(2000, random_state=42)
adhoc_stopwords = set("good time 10 lost great bad".split())
# CountVectorizer accepts 'english', a list or None for stop_words; recent
# scikit-learn versions reject a set during parameter validation, so the
# union is converted to a (sorted, hence deterministic) list.
vectorizer = CountVectorizer(
    binary=True,
    stop_words=sorted(stopwords | adhoc_stopwords),
    max_features=1000,
    max_df=0.4,
    ngram_range=(1, 2),
)
X = vectorizer.fit_transform(list(df['review']))
print(X.shape)
# Project the binary document-term matrix onto two principal components for
# plotting; PCA needs a dense array, hence toarray().
projecao = PCA(n_components=2)
y = projecao.fit_transform(X.toarray())
plt.figure(figsize=(4, 4))
plt.scatter(y[:, 0], y[:, 1], s=1, c='b', alpha=0.5)
plt.title('Projeção da distribuição de documentos')
plt.ylabel('Componente 2')
plt.xlabel('Componente 1')
# No legend here: the single scatter series carries no label, so calling
# plt.legend() would only emit a "no artists with labels" warning.
plt.show()

In [None]:
n_clusters = 6
colors = list(mcolors.TABLEAU_COLORS)
# Cluster the documents in the original term space (X), not in the 2-D
# projection. random_state keeps cluster ids stable between runs and n_init is
# explicit because its default changed between scikit-learn releases.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(X)
c = kmeans.labels_
plt.figure(figsize=(5, 5))
for k in range(n_clusters):
    # Boolean mask over the projected points; stays 2-D even if a cluster
    # were empty, unlike building the array from a list comprehension.
    yc = y[c == k, :2]
    print(yc.shape)
    # Cycle through the palette so more clusters than colours cannot raise.
    plt.scatter(yc[:, 0], yc[:, 1], s=5, c=colors[k % len(colors)], label=str(k))
plt.title('Clusterização da distribuição de documentos')
plt.ylabel('Componente 2')
plt.xlabel('Componente 1')
# The per-cluster labels set above are only displayed if a legend is drawn.
plt.legend()
plt.show()


In [None]:

# For each cluster, plot the terms with the highest mean value across the
# cluster's documents (one small bar chart per cluster).
n_palavras = 15
for k in range(n_clusters):
    # Rows of X assigned to cluster k; sliced once and reused for both the
    # mean and the document count in the title.
    docs_k = X[c == k, :]
    # Column-wise mean over the cluster's documents, shape (1, n_terms).
    x_ = np.mean(docs_k, axis=0)

    # (mean value, term) pairs so that sorting orders by value first.
    tuplas = [(x_[0, indice], termo) for termo, indice in vectorizer.vocabulary_.items()]
    tuplas_ordenadas = sorted(tuplas, reverse=True)  # descending order
    palavras = [t[1] for t in tuplas_ordenadas]
    contagens = [t[0] for t in tuplas_ordenadas]

    # Never ask for more bars than there are terms; otherwise the x positions
    # and the heights would have different lengths and plt.bar would fail.
    n_barras = min(n_palavras, len(palavras))
    eixo_x = np.arange(n_barras)
    plt.figure(figsize=(10, 1))
    # Cycle through the palette so more clusters than colours cannot raise.
    plt.bar(eixo_x, contagens[:n_barras], color=colors[k % len(colors)])
    plt.title('Cluster ' + str(k) + ' - Documentos:' + str(docs_k.shape[0]))
    plt.xticks(eixo_x, palavras[:n_barras], rotation=70)
    plt.show()

# Aplicação: clusterizando clientes

In [None]:
# Work on a random subset of 2000 reviews; random_state makes the subset (and
# every result derived from it) reproducible across kernel restarts.
df = pd.read_csv('./datasets/amazon_reviews.csv').sample(2000, random_state=42)
df.head()

In [None]:
adhoc_stopwords = set("good time 10 lost great bad amazon echo alexa love tablet".split())
# CountVectorizer accepts 'english', a list or None for stop_words; recent
# scikit-learn versions reject a set during parameter validation, so the
# union is converted to a (sorted, hence deterministic) list.
vectorizer = CountVectorizer(
    binary=True,
    stop_words=sorted(stopwords | adhoc_stopwords),
    max_features=1000,
    max_df=0.4,
    ngram_range=(1, 2),
)
# Missing titles are dropped before vectorizing, so X may have fewer rows
# than df; everything downstream is indexed by rows of X, not of df.
X = vectorizer.fit_transform(list(df['reviews.title'].dropna()))
print(X.shape)
# Project the binary document-term matrix onto two principal components for
# plotting; PCA needs a dense array, hence toarray().
projecao = PCA(n_components=2)
y = projecao.fit_transform(X.toarray())
plt.figure(figsize=(4, 4))
plt.scatter(y[:, 0], y[:, 1], s=1, c='b', alpha=0.5)
plt.title('Projeção da distribuição de documentos')
plt.ylabel('Componente 2')
plt.xlabel('Componente 1')
# No legend here: the single scatter series carries no label, so calling
# plt.legend() would only emit a "no artists with labels" warning.
plt.show()

In [None]:
n_clusters = 9
colors = list(mcolors.TABLEAU_COLORS)
# Cluster the documents in the original term space (X), not in the 2-D
# projection. random_state keeps cluster ids stable between runs and n_init is
# explicit because its default changed between scikit-learn releases.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(X)
c = kmeans.labels_
plt.figure(figsize=(5, 5))
for k in range(n_clusters):
    # Boolean mask over the projected points; stays 2-D even if a cluster
    # were empty, unlike building the array from a list comprehension.
    yc = y[c == k, :2]
    print(yc.shape)
    # Cycle through the palette so more clusters than colours cannot raise.
    plt.scatter(yc[:, 0], yc[:, 1], s=5, c=colors[k % len(colors)], label=str(k))
plt.title('Clusterização da distribuição de documentos')
plt.ylabel('Componente 2')
plt.xlabel('Componente 1')
# The per-cluster labels set above are only displayed if a legend is drawn.
plt.legend()
plt.show()


In [None]:

# For each cluster, plot the terms with the highest mean value across the
# cluster's documents (one small bar chart per cluster).
n_palavras = 15
for k in range(n_clusters):
    # Rows of X assigned to cluster k; sliced once and reused for both the
    # mean and the document count in the title.
    docs_k = X[c == k, :]
    # Column-wise mean over the cluster's documents, shape (1, n_terms).
    x_ = np.mean(docs_k, axis=0)

    # (mean value, term) pairs so that sorting orders by value first.
    tuplas = [(x_[0, indice], termo) for termo, indice in vectorizer.vocabulary_.items()]
    tuplas_ordenadas = sorted(tuplas, reverse=True)  # descending order
    palavras = [t[1] for t in tuplas_ordenadas]
    contagens = [t[0] for t in tuplas_ordenadas]

    # Never ask for more bars than there are terms; otherwise the x positions
    # and the heights would have different lengths and plt.bar would fail.
    n_barras = min(n_palavras, len(palavras))
    eixo_x = np.arange(n_barras)
    plt.figure(figsize=(10, 1))
    # Cycle through the palette so more clusters than colours cannot raise.
    plt.bar(eixo_x, contagens[:n_barras], color=colors[k % len(colors)])
    plt.title('Cluster ' + str(k) + ' - Documentos:' + str(docs_k.shape[0]))
    plt.xticks(eixo_x, palavras[:n_barras], rotation=70)
    plt.show()