# Clusterização - Classificação Não Supervisionada

In [None]:
import pandas as pd, numpy as np
import sklearn.cluster, sklearn.preprocessing
import matplotlib, matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# Render floats with thousands separators and no decimal places.
pd.options.display.float_format = '{:,.0f}'.format

In [None]:
# Feature columns used further down for filtering and clustering.
columns = ['Registered Users', 'Alexa Rank']

# Load the MOSN ("Most Online Social Network") dataset. Column names are
# supplied explicitly and ',' is parsed as a thousands separator.
df = pd.read_csv(
    'mosn.csv',
    names=('Name', 'Description', 'Date', 'Registered Users',
           'Registration', 'Alexa Rank'),
    thousands=',',
)
df.head()

In [None]:
# Number of rows in the raw dataset.
len(df)

In [None]:
# Last rows of the raw dataset.
df.tail()

# Análise Descritiva

In [None]:
import seaborn as sns
import numpy as np

# Work on a separate frame that excludes rows without a user count.
df2 = df.dropna(subset=['Registered Users'])
users = df2['Registered Users']

# Distribution plot followed by the summary statistics.
sns.distplot(users)
print(users.describe())

In [None]:
# Same distribution on a natural-log scale.
log_users = np.log(df2['Registered Users'])
sns.distplot(log_users)

In [None]:
# Additionally drop rows without an Alexa rank, then plot and summarise it.
df2 = df2.dropna(subset=['Alexa Rank'])
rank = df2['Alexa Rank']
sns.distplot(rank)
print(rank.describe())

# Algumas redes sociais...

In [None]:
# Look up specific social networks: the row(s) with the largest number of
# registered users.
df.loc[df['Registered Users'].eq(df['Registered Users'].max())]

In [None]:
# Facebook's entry in the raw dataset.
df.loc[df["Name"] == "Facebook"]

In [None]:
# Look up several specific social networks at once.
wanted = ["Twitter", "Facebook", "Instagram"]
df.loc[df['Name'].isin(wanted)]

In [None]:
# Row count of the raw dataset before preprocessing.
len(df)

# Pré-processamento

In [None]:
# Drop rows with missing, zero or negative values in any clustering feature.
# The features are log-transformed before clustering, and log(0) is -inf,
# which pandas does not count as null. Testing `notnull()` on the log would
# therefore keep zero rows. Comparing with 0 directly removes NaN (NaN > 0 is
# False), zeros and negative values in one step.
df2 = df[(df[columns] > 0).all(axis=1)].copy()
df2.head()

In [None]:
# Number of rows kept after preprocessing.
len(df2)

# Clusterização

In [None]:
# Names of the feature columns used for clustering.
columns

In [None]:
# Preview only the feature columns of the cleaned data.
df2[columns].head()

In [None]:
# Apply the k-means clustering algorithm to the log-transformed features.
from sklearn.cluster import KMeans

# n_clusters=8 is scikit-learn's default, written out so the model size is
# visible. random_state is fixed so the cluster labels (and everything derived
# from them: `Clusters`, Facebook's cluster, predictions) are reproducible
# between runs instead of changing with every random initialisation.
kmeans = KMeans(n_clusters=8, random_state=0)
kmeans.fit(np.log(df2[columns]))

# Store each site's cluster label next to its data.
df2["Clusters"] = kmeans.labels_
kmeans

In [None]:
# First rows, now including the "Clusters" column.
df2.head()

In [None]:
# Cluster label assigned to Facebook. Later cells compare against `fb` to find
# the sites that share Facebook's cluster.
fb = df2.set_index('Name').loc['Facebook']['Clusters']

# Show the row that contains Facebook. `fb` is a cluster label, not a row
# position, so `df2.iloc[fb, :]` would return an unrelated row. Select by
# name instead.
df2[df2['Name'] == 'Facebook']

In [None]:
# Facebook's entry in the clustered data.
df2.loc[df2["Name"] == "Facebook"]

In [None]:
# First rows of the clustered data.
df2.head()

In [None]:
# Number of sites assigned to each cluster.
df2["Clusters"].value_counts()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D

# Hex colour palette and constant defined in this cell; the plot below does
# not use them.
colors = [
    '#12efff', '#eee111', '#eee00f', '#e00fff', '#123456', '#abc222',
    '#000000', '#123fff', '#1eff1f', '#2edf4f', '#2eaf9f', '#22222f',
    '#eeeff1', '#eee112', '#00ef00', '#aa0000', '#0000aa', '#000999',
    '#32efff', '#23ef68', '#2e3f56', '#7eef1f', '#eeef11',
]
C = 1

# Data to plot: one point per site, coloured by its k-means label.
x = df2['Registered Users']
y = df2['Alexa Rank']
Cluster = df2["Clusters"]

dim_plot = (14, 9)
fig, ax = plt.subplots(figsize=dim_plot)
scatter = ax.scatter(x, y, c=Cluster, s=100)
plt.colorbar(scatter)

# Titles, labels and log-log axes.
ax.set_title("Sites de Redes Sociais")
ax.set_xlabel('Número de Usuários Registrados')
ax.set_ylabel('Alexa Rank')
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_aspect('auto')

plt.show()

In [None]:
# Select a good-looking style
plt.style.use("ggplot")

dim_plot = (18, 9.5)
fig, ax = plt.subplots(figsize=dim_plot)

# One point per site, coloured by cluster label with the Accent colormap.
x = df2['Registered Users']
y = df2['Alexa Rank']
Cluster = df2["Clusters"]
scatter = ax.scatter(x, y, c=Cluster, cmap=plt.cm.Accent, s=100)
plt.colorbar(scatter)

ax.set_title("Massive online social networking sites")
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_aspect('auto')


def add_abbr(site):
    """Write the site's name next to its point if it is in Facebook's cluster.

    `site` is one row of `df2`. Rows whose "Clusters" value differs from `fb`
    are left unlabelled.
    """
    if site['Clusters'] != fb:
        return
    ax.annotate(site["Name"], site[columns],
                xytext=(1, 5), textcoords="offset points",
                size=8, color="darkslategrey")


# Annotate the most prominent sites (row by row).
df2.apply(add_abbr, axis=1)
plt.show()

In [None]:
# First rows of the clustered data.
df2.head()

In [None]:
# Facebook's cluster
df2.loc[df2["Name"] == "Facebook"]

In [None]:
# Twitter's cluster
df2.loc[df2["Name"] == "Twitter"]

In [None]:
# Academia.edu's cluster
df2.loc[df2["Name"] == "Academia.edu"]

In [None]:
# Instagram's cluster
df2.loc[df2["Name"] == "Instagram"]

# Predição de novos dados

In [None]:
# First rows of the clustered data.
df2.head()

In [None]:
# Last rows of the clustered data.
df2.tail()

In [None]:
import numpy as np

columns = ['Registered Users', 'Alexa Rank']

# New observations to classify, one [registered users, Alexa rank] pair each.
dados_x = [[1800000, 842], [5000000, 1200], [60000, 35000]]

# Log-transform the new points before predicting, as was done with the
# features passed to `kmeans.fit`.
filmes = np.log(dados_x)
kmeans.predict(filmes)

In [None]:
# Select a good-looking style
plt.style.use("ggplot")

dim_plot = (18, 9.5)
fig, ax = plt.subplots(figsize=dim_plot)

# One point per site, coloured by cluster label with the Accent colormap.
x = df2['Registered Users']
y = df2['Alexa Rank']
Cluster = df2["Clusters"]
scatter = ax.scatter(x, y, c=Cluster, cmap=plt.cm.Accent, s=100)
plt.colorbar(scatter)

ax.set_title("Massive online social networking sites")
ax.set_xscale("log")
ax.set_yscale("log")
ax.set_aspect('auto')


def add_abbr(site):
    """Write the site's name next to its point if it is in Facebook's cluster.

    `site` is one row of `df2`. Rows whose "Clusters" value differs from `fb`
    are left unlabelled.
    """
    if site['Clusters'] != fb:
        return
    ax.annotate(site["Name"], site[columns],
                xytext=(1, 5), textcoords="offset points",
                size=8, color="darkslategrey")


# Annotate the most prominent sites (row by row).
df2.apply(add_abbr, axis=1)

# Overlay the new observations as red crosses.
dados_x = [[1800000, 842], [5000000, 1200], [60000, 35000]]
x_new = [point[0] for point in dados_x]
y_new = [point[1] for point in dados_x]
ax.scatter(x_new, y_new, c='red', marker='x', s=100)

plt.show()

#  Determinando o melhor número de Clusters (Elbow Method)

In [None]:
import numpy as np
np.set_printoptions(suppress=True, precision=4)

# Evaluate candidate cluster counts in the same log-transformed feature space
# that the clustering model is fitted on. On the raw values the elbow would
# describe a different feature space from the one actually clustered.
Xy = np.log(df2.loc[:, ['Registered Users', 'Alexa Rank']])

# One model per candidate k, seeded so the curve is reproducible.
Ks = range(1, 10)
km = [KMeans(n_clusters=i, random_state=0) for i in Ks]

# KMeans.score returns the negative of the k-means objective, so values
# closer to zero indicate a tighter fit.
score = [model.fit(Xy).score(Xy) for model in km]
formatted_score = ['{:,.0f}'.format(s) for s in score]
print(formatted_score)

In [None]:
# Plot each score against the number of clusters that produced it. Using
# range(len(score)) would label the points 0..8 although the models were
# built with 1..9 clusters (`Ks`).
plt.scatter(list(Ks), score, c="b")
plt.title("Curva de Elbow")
plt.xlabel("Clusters")
plt.ylabel("Distorção")
plt.show()