# Importações

In [423]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go

import os
import shutil

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


# Dados

In [424]:
data = pd.read_csv('features.csv')
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,509,510,511,name,breed_id,species_id,breed_id_on_species,breed,breed_index,image_path
0,0.461086,0.82149,1.434763,0.888579,0.222046,0.657443,0.117693,0.429433,3.015186,0.437083,...,1.406313,0.136907,1.032156,Abyssinian_100,0,0,0,abyssinian,100,dataset/oxford-iiit-pet/images/Abyssinian_100.jpg
1,0.21973,0.338444,1.065189,2.873387,0.575815,0.193819,0.899878,0.169958,1.455703,0.356942,...,0.620578,0.067177,0.366279,Abyssinian_101,0,0,0,abyssinian,101,dataset/oxford-iiit-pet/images/Abyssinian_101.jpg
2,0.008081,0.478684,1.201589,1.337107,0.54823,0.022551,0.168075,0.360136,1.322711,0.659819,...,1.013998,0.971341,0.912967,Abyssinian_102,0,0,0,abyssinian,102,dataset/oxford-iiit-pet/images/Abyssinian_102.jpg
3,0.02865,0.051734,0.730143,0.351149,0.047293,0.093462,0.789751,0.098805,1.914757,0.609086,...,0.75603,0.40856,0.565956,Abyssinian_103,0,0,0,abyssinian,103,dataset/oxford-iiit-pet/images/Abyssinian_103.jpg
4,0.122697,0.230548,0.387585,0.439176,0.010927,0.085835,1.785948,0.812164,0.770544,1.130807,...,0.593158,0.119751,0.112835,Abyssinian_104,0,0,0,abyssinian,104,dataset/oxford-iiit-pet/images/Abyssinian_104.jpg


In [425]:
features = data.iloc[:,:512]
labels = data.iloc[:,512:] 

# Métricas

In [426]:
scores = pd.read_csv('clustering_scores.csv')

In [427]:
bestsc = scores['silhouette'].idxmax() + 2
bestdb = scores['davies_bouldin'].idxmin() + 2

In [428]:
# Criando o gráfico principal para a 'silhueta'
fig = go.Figure()

# Adicionando a linha da silhueta
fig.add_trace(go.Scatter(x=list(range(2, 38)),
                         y=scores['silhouette'],
                         mode='lines',
                         name='Silhueta',
                         line=dict(color='darkblue')))

# Adicionando a linha vertical para o valor de 'bestsc'
fig.add_vline(x=bestsc, line_width=2, line_dash="dash", line_color="blue")

# Adicionando o segundo eixo y para 'davies_bouldin'
fig.add_trace(go.Scatter(x=list(range(2, 38)),
                         y=scores['davies_bouldin'],
                         mode='lines',
                         name='Davies Bouldin',
                         line=dict(color='red'),
                         yaxis='y2'))

# Adicionando a linha vertical para o valor de 'bestdb'
fig.add_vline(x=bestdb, line_width=2, line_dash="dash", line_color="salmon")

# Atualizando o layout para suportar dois eixos y
fig.update_layout(
    title='Silhueta e Davies Bouldin vs Número de Clusters',
    xaxis=dict(title='Número de Clusters', tickvals=list(range(2, 38, 2))),
    yaxis=dict(title='Silhueta'),
    yaxis2=dict(title='Davies Bouldin', overlaying='y', side='right'),
    width=800,
    height=600
)

# Exibir o gráfico
fig.show()


# Análise dos dados

In [429]:
pca = PCA(n_components=2)

In [430]:
plotable_feat = pca.fit_transform(features.values)

In [431]:
plotable_data = pd.DataFrame(data= plotable_feat, columns=['x', 'y'], index= range(7349))
plotable_data = plotable_data.join(labels)
plotable_data['species_id'] = plotable_data['species_id'].map({0: 'cat', 1:'dog'})


## Silhouette

In [432]:
silhouette_model = KMeans(bestsc, init= 'k-means++', n_init= 'auto', random_state= 257)

In [433]:
plotable_data['clusters_sc'] = silhouette_model.fit_predict(features)
features['clusters_sc'] = silhouette_model.fit_predict(features) 

In [434]:
plotable_data

Unnamed: 0,x,y,name,breed_id,species_id,breed_id_on_species,breed,breed_index,image_path,clusters_sc
0,11.446927,-4.524805,Abyssinian_100,0,cat,0,abyssinian,100,dataset/oxford-iiit-pet/images/Abyssinian_100.jpg,2
1,11.187494,-3.335890,Abyssinian_101,0,cat,0,abyssinian,101,dataset/oxford-iiit-pet/images/Abyssinian_101.jpg,2
2,10.474515,-4.767078,Abyssinian_102,0,cat,0,abyssinian,102,dataset/oxford-iiit-pet/images/Abyssinian_102.jpg,2
3,6.587549,-3.961922,Abyssinian_103,0,cat,0,abyssinian,103,dataset/oxford-iiit-pet/images/Abyssinian_103.jpg,2
4,3.163762,-1.059690,Abyssinian_104,0,cat,0,abyssinian,104,dataset/oxford-iiit-pet/images/Abyssinian_104.jpg,2
...,...,...,...,...,...,...,...,...,...,...
7344,-5.314499,1.906685,yorkshire_terrier_96,36,dog,24,yorkshire_terrier,96,dataset/oxford-iiit-pet/images/yorkshire_terri...,1
7345,-1.074354,1.838093,yorkshire_terrier_97,36,dog,24,yorkshire_terrier,97,dataset/oxford-iiit-pet/images/yorkshire_terri...,1
7346,-0.460642,0.595738,yorkshire_terrier_98,36,dog,24,yorkshire_terrier,98,dataset/oxford-iiit-pet/images/yorkshire_terri...,1
7347,-0.831955,1.538507,yorkshire_terrier_99,36,dog,24,yorkshire_terrier,99,dataset/oxford-iiit-pet/images/yorkshire_terri...,1


In [435]:
clusters_sc = plotable_data.groupby('clusters_sc').apply(lambda x: x.groupby('species_id').apply(lambda x: x.groupby('breed').apply(lambda x: x, include_groups=False), include_groups=False), include_groups=False)

In [436]:
clusters_sc['breed'] = clusters_sc['name'].str.rsplit('_', expand=True, n= 1)[0]

In [437]:
total = clusters_sc['breed'].value_counts()

In [438]:
def middle(x: pd.Series):
    return  x.apply(lambda x : min(100 - x, x))

In [439]:
with open('logs/silhouete.log', 'w') as sil:
    for i in range(bestsc):
        print_data = pd.DataFrame(dict(percentage=((clusters_sc.loc[(i)]['breed'].value_counts() / total).dropna()*100),
                                       count= clusters_sc.loc[(i)]['breed'].value_counts())).sort_values(by='percentage', ascending= False)
        
        print(f'Cluster: {i}', file= sil, flush=True)
        print(print_data, file= sil, flush=True)
        print('', file= sil, flush=True)
        print('', file= sil, flush=True)

### Plot das e classes identificadas como as piores agrupadas

In [441]:
centros = pd.DataFrame({'x': centros[:, 0], 'y': centros[:, 1]})
centros

Unnamed: 0,x,y
0,-4.032776,-5.438284
1,-3.599232,4.845648
2,8.773989,-0.724227


In [442]:
plot = clusters_sc.loc[([0,1,2], ['cat', 'dog'],['sphynx', 'shiba_inu', 'saint_bernard'])].drop(columns= 'breed').reset_index()

In [444]:
def intersec(retas: list):
    m1, b1 = retas[0]['m'], retas[0]['b']
    m2, b2 = retas[1]['m'], retas[1]['b']
    m3, b3 = retas[2]['m'], retas[2]['b']
    

    A = np.array([
        [m1, -1],
        [m2, -1],
        [m3, -1]
    ])
    B = np.array([-b1, -b2, -b3])
    
    return np.linalg.lstsq(A, B, rcond=None)[0]


In [445]:


# Cores e simbolos
breed_colors = {
    'sphynx': 'blue',
    'shiba_inu': 'green',
    'saint_bernard': 'red',

}

clusters_symbols = {
    0: 'circle',
    1: 'square',
    2: 'diamond',

}
plot['color'] = plot['breed'].map(breed_colors)
plot['symbol'] = plot['clusters_sc'].map(clusters_symbols)

# Plot
fig = go.Figure()

# Todos os dados
fig.add_trace(go.Scatter(
    x=plotable_data['x'],
    y=plotable_data['y'],
    mode='markers',
    marker=dict(
        color='gray',
        symbol='circle',
        size=3
    ),
    text= None,
    name='Dados'
))

# Dados das piores raças
fig.add_trace(go.Scatter(
    x=plot['x'],
    y=plot['y'],
    mode='markers',
    marker=dict(
        color=plot['color'],
        symbol=plot['symbol'],
        size=7
    ),
    text=plot['name'],
    name='Piores raças'
))

# centro dos clusters
fig.add_trace(go.Scatter(
    x=centros['x'],
    y=centros['y'],
    mode='markers',
    marker=dict(color='black', size=10, symbol='x'),
    name='Centros dos Clusters',
    text= centros.index
))

# Retas de fronteira dos clusters
def calc_y(m, b, x):
    return m * x + b

p_intersec = intersec(lines)

for line in lines:
    m = line["m"]
    b = line["b"]

    # Limita as retas até o ponto de intersecção e os extremos dos dados
    x_min = p_intersec[0] if line['p'][0] > p_intersec[0] else plotable_data['x'].min() - 1
    x_max = p_intersec[0] if line['p'][0] < p_intersec[0] else plotable_data['x'].max() + 1

    y_min = p_intersec[1] if line['p'][1] > p_intersec[1] else plotable_data['y'].min() - 1
    y_max = p_intersec[1] if line['p'][1] < p_intersec[1] else plotable_data['y'].max() + 1

    
    x_values = np.array([x_min, x_max])
    y_values = calc_y(m, b, x_values)

    # Reajusta os valores para dentro dos limites
    if y_values[0] < y_min or y_values[0] > y_max:
        y_values[0] = np.clip(y_values[0], y_min, y_max)
        x_values[0] = (y_values[0] - b) / m

    if y_values[1] < y_min or y_values[1] > y_max:
        y_values[1] = np.clip(y_values[1], y_min, y_max)
        x_values[1] = (y_values[1] - b) / m

    fig.add_trace(go.Scatter(x=x_values, y=y_values, mode='lines', line=dict(dash='dash', width=3, color='black')))

fig.update_layout(
    title='Gráfico de Dispersão com Centros dos Clusters',
    xaxis_title='X',
    yaxis_title='Y',
    width=1920,
    height=1080
)

fig.show()


## Davies Bloudin

In [455]:
davies_model = KMeans(bestdb, init= 'k-means++', n_init= 'auto', random_state= 257)

In [456]:
plotable_data['clusters_db'] = davies_model.fit_predict(features[[str(i) for i in range(512)]])
features['clusters_db'] = davies_model.fit_predict(features[[str(i) for i in range(512)]]) 

In [457]:
plotable_data

Unnamed: 0,x,y,name,breed_id,species_id,breed_id_on_species,breed,breed_index,image_path,clusters_sc,clusters_db
0,11.446927,-4.524805,Abyssinian_100,0,cat,0,abyssinian,100,dataset/oxford-iiit-pet/images/Abyssinian_100.jpg,2,13
1,11.187494,-3.335890,Abyssinian_101,0,cat,0,abyssinian,101,dataset/oxford-iiit-pet/images/Abyssinian_101.jpg,2,13
2,10.474515,-4.767078,Abyssinian_102,0,cat,0,abyssinian,102,dataset/oxford-iiit-pet/images/Abyssinian_102.jpg,2,13
3,6.587549,-3.961922,Abyssinian_103,0,cat,0,abyssinian,103,dataset/oxford-iiit-pet/images/Abyssinian_103.jpg,2,13
4,3.163762,-1.059690,Abyssinian_104,0,cat,0,abyssinian,104,dataset/oxford-iiit-pet/images/Abyssinian_104.jpg,2,4
...,...,...,...,...,...,...,...,...,...,...,...
7344,-5.314499,1.906685,yorkshire_terrier_96,36,dog,24,yorkshire_terrier,96,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,17
7345,-1.074354,1.838093,yorkshire_terrier_97,36,dog,24,yorkshire_terrier,97,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,17
7346,-0.460642,0.595738,yorkshire_terrier_98,36,dog,24,yorkshire_terrier,98,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,17
7347,-0.831955,1.538507,yorkshire_terrier_99,36,dog,24,yorkshire_terrier,99,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,17


In [458]:
clusters_db = plotable_data.groupby('clusters_db').apply(lambda x: x.groupby('species_id').apply(lambda x: x.groupby('breed').apply(lambda x: x, include_groups=False), include_groups=False), include_groups=False)

In [459]:
clusters_db['breed'] = clusters_db['name'].str.rsplit('_', expand=True, n= 1)[0]

In [460]:
total = clusters_db['breed'].value_counts()

In [461]:
def middle(x: pd.Series):
    return  x.apply(lambda x : min(100 - x, x))

In [462]:
with open('logs/davies.log', 'w') as sil:
    for i in range(bestdb):
        print_data = pd.DataFrame(dict(percentage=((clusters_db.loc[(i)]['breed'].value_counts() / total).dropna()*100),
                                       count= clusters_db.loc[(i)]['breed'].value_counts())).sort_values(by='percentage', ascending= False)
        
        print(f'Cluster: {i}', file= sil, flush=True)
        print(print_data, file= sil, flush=True)
        print('', file= sil, flush=True)
        print('', file= sil, flush=True)

### Plot das e classes identificadas como as piores agrupadas

In [463]:
centros = pca.transform(davies_model.cluster_centers_)

In [464]:
centros = pd.DataFrame({'x': centros[:, 0], 'y': centros[:, 1]})
centros

Unnamed: 0,x,y
0,-1.230919,0.069635
1,-0.645452,4.638885
2,9.783877,3.301709
3,-6.059024,-6.433382
4,0.907094,-6.144674
5,-7.214483,2.162891
6,-3.249258,7.909684
7,8.6903,1.061757
8,-4.52032,-6.707469
9,-6.68803,-4.800915


In [465]:
plot = clusters_sc.loc[([0,1,2], ['cat', 'dog'],['sphynx', 'shiba_inu', 'saint_bernard'])].drop(columns= 'breed').reset_index()

In [466]:
from itertools import combinations

lines = []
for i, j in combinations(range(bestdb), 2):
    
    m = -((centros.loc[j, 'x'] - centros.loc[i, 'x']) / (centros.loc[j, 'y'] - centros.loc[i, 'y']))
    p = ((centros.loc[j, 'x'] + centros.loc[i, 'x'])/2, (centros.loc[j, 'y'] + centros.loc[i, 'y'])/2)

    b = p[1] - m*p[0]

    lines.append({'m': m, 'b': b, 'p': p})

    

In [467]:
def intersec(retas: list):
    m1, b1 = retas[0]['m'], retas[0]['b']
    m2, b2 = retas[1]['m'], retas[1]['b']
    m3, b3 = retas[2]['m'], retas[2]['b']
    

    A = np.array([
        [m1, -1],
        [m2, -1],
        [m3, -1]
    ])
    B = np.array([-b1, -b2, -b3])
    
    return np.linalg.lstsq(A, B, rcond=None)[0]


In [469]:


# Cores e simbolos
# breed_colors = {
#     'sphynx': 'blue',
#     'shiba_inu': 'green',
#     'saint_bernard': 'red',

# }

# clusters_symbols = {
#     0: 'circle',
#     1: 'square',
#     2: 'diamond',

# }
# plot['color'] = plot['breed'].map(breed_colors)
# plot['symbol'] = plot['clusters_sc'].map(clusters_symbols)

# Plot
fig = go.Figure()

# Todos os dados
fig.add_trace(go.Scatter(
    x=plotable_data['x'],
    y=plotable_data['y'],
    mode='markers',
    marker=dict(
        color='gray',
        symbol='circle',
        size=3
    ),
    text= None,
    name='Dados'
))

# # Dados das piores raças
# fig.add_trace(go.Scatter(
#     x=plot['x'],
#     y=plot['y'],
#     mode='markers',
#     marker=dict(
#         color=plot['color'],
#         symbol=plot['symbol'],
#         size=7
#     ),
#     text=plot['name'],
#     name='Piores raças'
# ))

# centro dos clusters
fig.add_trace(go.Scatter(
    x=centros['x'],
    y=centros['y'],
    mode='markers',
    marker=dict(color='black', size=10, symbol='x'),
    name='Centros dos Clusters',
    text= centros.index
))

# Retas de fronteira dos clusters
def calc_y(m, b, x):
    return m * x + b

p_intersec = intersec(lines)

# for line in lines:
#     m = line["m"]
#     b = line["b"]

#     # Limita as retas até o ponto de intersecção e os extremos dos dados
#     x_min = plotable_data['x'].min() - 1
#     x_max = plotable_data['x'].max() + 1
#     y_min = plotable_data['y'].min() - 1
#     y_max = plotable_data['y'].max() + 1

    
#     x_values = np.array([x_min, x_max])
#     y_values = calc_y(m, b, x_values)

#     # Reajusta os valores para dentro dos limites
#     if y_values[0] < y_min or y_values[0] > y_max:
#         y_values[0] = np.clip(y_values[0], y_min, y_max)
#         x_values[0] = (y_values[0] - b) / m

#     if y_values[1] < y_min or y_values[1] > y_max:
#         y_values[1] = np.clip(y_values[1], y_min, y_max)
#         x_values[1] = (y_values[1] - b) / m

#     fig.add_trace(go.Scatter(x=x_values, y=y_values, mode='lines', line=dict(dash='dash', width=3, color='black')))

fig.update_layout(
    title='Gráfico de Dispersão com Centros dos Clusters',
    xaxis_title='X',
    yaxis_title='Y',
    width=1920,
    height=1080
)

fig.show()


## Species Label

In [None]:
species_model = KMeans(2, init= 'k-means++', n_init= 'auto', random_state= 257)

In [None]:
features.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '504', '505', '506', '507', '508', '509', '510', '511', 'clusters_sc',
       'clusters_db'],
      dtype='object', length=514)

In [None]:
plotable_data['clusters_sl'] = species_model.fit_predict(features[[str(i) for i in range(512)]].values)
features['clusters_sl'] = species_model.fit_predict(features[[str(i) for i in range(512)]].values) 

In [None]:
plotable_data

Unnamed: 0,x,y,name,breed_id,species_id,breed_id_on_species,breed,breed_index,image_path,clusters_sc,clusters_db,clusters_sl
0,11.446927,-4.524805,Abyssinian_100,0,cat,0,abyssinian,100,dataset/oxford-iiit-pet/images/Abyssinian_100.jpg,2,1,1
1,11.187494,-3.335890,Abyssinian_101,0,cat,0,abyssinian,101,dataset/oxford-iiit-pet/images/Abyssinian_101.jpg,2,1,1
2,10.474515,-4.767078,Abyssinian_102,0,cat,0,abyssinian,102,dataset/oxford-iiit-pet/images/Abyssinian_102.jpg,2,1,1
3,6.587549,-3.961922,Abyssinian_103,0,cat,0,abyssinian,103,dataset/oxford-iiit-pet/images/Abyssinian_103.jpg,2,1,1
4,3.163762,-1.059690,Abyssinian_104,0,cat,0,abyssinian,104,dataset/oxford-iiit-pet/images/Abyssinian_104.jpg,2,11,1
...,...,...,...,...,...,...,...,...,...,...,...,...
7344,-5.314499,1.906685,yorkshire_terrier_96,36,dog,24,yorkshire_terrier,96,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,3,0
7345,-1.074354,1.838093,yorkshire_terrier_97,36,dog,24,yorkshire_terrier,97,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,14,0
7346,-0.460642,0.595738,yorkshire_terrier_98,36,dog,24,yorkshire_terrier,98,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,14,0
7347,-0.831955,1.538507,yorkshire_terrier_99,36,dog,24,yorkshire_terrier,99,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,14,0


In [None]:
clusters_sl = plotable_data.groupby('clusters_sl').apply(lambda x: x.groupby('species_id').apply(lambda x: x.groupby('breed').apply(lambda x: x, include_groups=False), include_groups=False), include_groups=False)

In [None]:
clusters_sl['breed'] = clusters_sl['name'].str.rsplit('_', expand=True, n= 1)[0]

In [None]:
total = clusters_sl['breed'].value_counts()

In [None]:
def middle(x: pd.Series):
    return  x.apply(lambda x : min(100 - x, x))

In [None]:
with open('logs/species.log', 'w') as sil:
    for i in range(2):
        print_data = pd.DataFrame(dict(percentage=((clusters_sl.loc[(i)]['breed'].value_counts() / total).dropna()*100),
                                       count= clusters_sl.loc[(i)]['breed'].value_counts())).sort_values(by='percentage', ascending= False)
        
        print(f'Cluster: {i}', file= sil, flush=True)
        print(print_data, file= sil, flush=True)
        print('', file= sil, flush=True)
        print('', file= sil, flush=True)

### Plot das e classes identificadas como as piores agrupadas

In [None]:
centros = pca.transform(species_model.cluster_centers_)

In [None]:
centros = pd.DataFrame({'x': centros[:, 0], 'y': centros[:, 1]})
centros

Unnamed: 0,x,y
0,-3.90925,0.358445
1,8.625251,-0.790863


In [None]:
plot = clusters_sl.loc[([0,1], ['cat', 'dog'], 'sphynx')].drop(columns= 'breed').reset_index()

In [None]:
plot

Unnamed: 0,clusters_sl,species_id,breed,level_3,x,y,name,breed_id,breed_id_on_species,breed_index,image_path,clusters_sc,clusters_db
0,0,cat,sphynx,1646,1.184919,-4.708940,Sphynx_102,33,11,102,dataset/oxford-iiit-pet/images/Sphynx_102.jpg,0,11
1,0,cat,sphynx,1649,1.999276,-6.925433,Sphynx_105,33,11,105,dataset/oxford-iiit-pet/images/Sphynx_105.jpg,0,11
2,0,cat,sphynx,1650,1.069605,-6.034640,Sphynx_107,33,11,107,dataset/oxford-iiit-pet/images/Sphynx_107.jpg,0,11
3,0,cat,sphynx,1652,0.540465,-3.984201,Sphynx_110,33,11,110,dataset/oxford-iiit-pet/images/Sphynx_110.jpg,0,11
4,0,cat,sphynx,1653,1.098269,-9.510850,Sphynx_111,33,11,111,dataset/oxford-iiit-pet/images/Sphynx_111.jpg,0,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1,cat,sphynx,7052,3.260584,-8.675917,Sphynx_92,33,11,92,dataset/oxford-iiit-pet/images/Sphynx_92.jpg,0,11
196,1,cat,sphynx,7053,7.224623,-3.325194,Sphynx_93,33,11,93,dataset/oxford-iiit-pet/images/Sphynx_93.jpg,2,11
197,1,cat,sphynx,7054,6.230920,-6.779366,Sphynx_94,33,11,94,dataset/oxford-iiit-pet/images/Sphynx_94.jpg,2,11
198,1,cat,sphynx,7056,6.716340,-5.769696,Sphynx_96,33,11,96,dataset/oxford-iiit-pet/images/Sphynx_96.jpg,2,1


In [None]:
from itertools import combinations

lines = []
for i, j in combinations([0,1], 2):
    
    m = -((centros.loc[j, 'x'] - centros.loc[i, 'x']) / (centros.loc[j, 'y'] - centros.loc[i, 'y']))
    p = ((centros.loc[j, 'x'] + centros.loc[i, 'x'])/2, (centros.loc[j, 'y'] + centros.loc[i, 'y'])/2)

    b = p[1] - m*p[0]

    lines.append({'m': m, 'b': b, 'p': p})

    

In [None]:
def intersec(retas: list):
    m1, b1 = retas[0]['m'], retas[0]['b']
    m2, b2 = retas[1]['m'], retas[1]['b']
    m3, b3 = retas[2]['m'], retas[2]['b']
    

    A = np.array([
        [m1, -1],
        [m2, -1],
        [m3, -1]
    ])
    B = np.array([-b1, -b2, -b3])
    
    return np.linalg.lstsq(A, B, rcond=None)[0]


In [None]:


# Cores e simbolos
breed_colors = {
    'sphynx': 'blue',
    

}

clusters_symbols = {
    0: 'circle',
    1: 'square',

}
plot['color'] = plot['breed'].map(breed_colors)
plot['symbol'] = plot['clusters_sl'].map(clusters_symbols)

# Plot
fig = go.Figure()

# Todos os dados
fig.add_trace(go.Scatter(
    x=plotable_data['x'],
    y=plotable_data['y'],
    mode='markers',
    marker=dict(
        color='gray',
        symbol='circle',
        size=3
    ),
    text= None,
    name='Dados'
))

# Dados das piores raças
fig.add_trace(go.Scatter(
    x=plot['x'],
    y=plot['y'],
    mode='markers',
    marker=dict(
        color=plot['color'],
        symbol=plot['symbol'],
        size=7
    ),
    text=plot['name'],
    name='Piores raças'
))

# centro dos clusters
fig.add_trace(go.Scatter(
    x=centros['x'],
    y=centros['y'],
    mode='markers',
    marker=dict(color='black', size=10, symbol='x'),
    name='Centros dos Clusters',
    text= centros.index
))

# Retas de fronteira dos clusters
def calc_y(m, b, x):
    return m * x + b



for line in lines:
    m = line["m"]
    b = line["b"]

    # Limita as retas até o ponto de intersecção e os extremos dos dados
    x_min = plotable_data['x'].min() - 1
    x_max = plotable_data['x'].max() + 1

    y_min = plotable_data['y'].min() - 1
    y_max = plotable_data['y'].max() + 1

    
    x_values = np.array([x_min, x_max])
    y_values = calc_y(m, b, x_values)

    # Reajusta os valores para dentro dos limites
    if y_values[0] < y_min or y_values[0] > y_max:
        y_values[0] = np.clip(y_values[0], y_min, y_max)
        x_values[0] = (y_values[0] - b) / m

    if y_values[1] < y_min or y_values[1] > y_max:
        y_values[1] = np.clip(y_values[1], y_min, y_max)
        x_values[1] = (y_values[1] - b) / m

    fig.add_trace(go.Scatter(x=x_values, y=y_values, mode='lines', line=dict(dash='dash', width=3, color='black')))

fig.update_layout(
    title='Gráfico de Dispersão com Centros dos Clusters',
    xaxis_title='X',
    yaxis_title='Y',
    width=1920,
    height=1080
)

fig.show()


## Breed Label

In [None]:
breed_model = KMeans(37, init= 'k-means++', n_init= 'auto', random_state= 257)

In [None]:
plotable_data['clusters_bl'] = breed_model.fit_predict(features)
features['clusters_bl'] = breed_model.fit_predict(features) 

In [None]:
plotable_data

Unnamed: 0,x,y,name,breed_id,species_id,breed_id_on_species,breed,breed_index,image_path,clusters_sc,clusters_db,clusters_sl,clusters_bl
0,11.446927,-4.524805,Abyssinian_100,0,cat,0,abyssinian,100,dataset/oxford-iiit-pet/images/Abyssinian_100.jpg,2,1,1,33
1,11.187494,-3.335890,Abyssinian_101,0,cat,0,abyssinian,101,dataset/oxford-iiit-pet/images/Abyssinian_101.jpg,2,1,1,33
2,10.474515,-4.767078,Abyssinian_102,0,cat,0,abyssinian,102,dataset/oxford-iiit-pet/images/Abyssinian_102.jpg,2,1,1,33
3,6.587549,-3.961922,Abyssinian_103,0,cat,0,abyssinian,103,dataset/oxford-iiit-pet/images/Abyssinian_103.jpg,2,1,1,19
4,3.163762,-1.059690,Abyssinian_104,0,cat,0,abyssinian,104,dataset/oxford-iiit-pet/images/Abyssinian_104.jpg,2,11,1,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7344,-5.314499,1.906685,yorkshire_terrier_96,36,dog,24,yorkshire_terrier,96,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,3,0,21
7345,-1.074354,1.838093,yorkshire_terrier_97,36,dog,24,yorkshire_terrier,97,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,14,0,2
7346,-0.460642,0.595738,yorkshire_terrier_98,36,dog,24,yorkshire_terrier,98,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,14,0,2
7347,-0.831955,1.538507,yorkshire_terrier_99,36,dog,24,yorkshire_terrier,99,dataset/oxford-iiit-pet/images/yorkshire_terri...,1,14,0,2


In [None]:
clusters_bl = plotable_data.groupby('clusters_bl').apply(lambda x: x.groupby('species_id').apply(lambda x: x.groupby('breed').apply(lambda x: x, include_groups=False), include_groups=False), include_groups=False)

In [None]:
clusters_bl['breed'] = clusters_bl['name'].str.rsplit('_', expand=True, n= 1)[0]

In [None]:
total = clusters_bl['breed'].value_counts()

In [None]:
def middle(x: pd.Series):
    return  x.apply(lambda x : min(100 - x, x))

In [None]:
with open('logs/breed.log', 'w') as sil:
    for i in range(37):
        print_data = pd.DataFrame(dict(percentage=((clusters_bl.loc[(i)]['breed'].value_counts() / total).dropna()*100),
                                       count= clusters_bl.loc[(i)]['breed'].value_counts())).sort_values(by='percentage', ascending= False)
        
        print(f'Cluster: {i}', file= sil, flush=True)
        print(print_data, file= sil, flush=True)
        print('', file= sil, flush=True)
        print('', file= sil, flush=True)