In [None]:
import pandas as pd
import json
from warnings import filterwarnings as _fw_
import matplotlib.pyplot as plt
from random import randint
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.cluster import KMeans

_fw_('ignore')

In [None]:
# Load the dataset and discard the 'Unnamed: 0' column
# (presumably an index column left over from an earlier to_csv — confirm).
df = (
    pd.read_csv('df.csv', encoding='utf-8')
    .drop(columns='Unnamed: 0')
)

In [None]:
# Read the ordered list of column labels that the rest of the notebook slices
# into question groups and uses to select feature columns from df.
with open('binary_columns.json', mode='r', encoding='utf-8') as file:
    binary_columns = json.loads(file.read())

In [None]:
# Split the column list into five question groups by position.
# NOTE(review): slice stops are exclusive, so positions 13, 30, 46 and 56 sit
# between two groups and belong to none of them, and nothing from position 66
# onward is assigned either. Confirm these columns are meant to be left out
# of the per-cluster profiles and that this is not an off-by-one.
target, school_head5, school_tail5, course, profession = (
    binary_columns[start:stop]
    for start, stop in [(0, 13), (14, 30), (31, 46), (47, 56), (57, 66)]
)

# ИЕРАРХИЧЕСКИЙ КЛАСТЕРНЫЙ АНАЛИЗ

In [None]:
# Feature frame for hierarchical clustering: every listed column except the last two.
# .copy() makes df_H an independent frame, so the 'cluster' column assigned to it
# later is written to df_H itself and not to a temporary selection of df
# (which pandas reports as SettingWithCopyWarning — silenced in this notebook).
df_H = df[binary_columns[:-2]].copy()

In [None]:
# Build the agglomerative linkage matrix: Ward's method on Euclidean distances
# between the rows of df_H.
link = linkage(df_H, method='ward', metric='euclidean')

In [None]:
# Draw the dendrogram with the root on the right-hand side; the returned
# plot description is kept in `dn`.
dn = dendrogram(Z=link, orientation='right')

In [None]:
# Cut the tree at a linkage distance of 40 and store each row's flat-cluster id
# in a new 'cluster' column.
df_H.loc[:, 'cluster'] = fcluster(Z=link, t=40, criterion='distance')

In [None]:
# Profile every hierarchical cluster: within each question group, find the
# column with the highest per-cluster mean (i.e. the most frequently chosen
# option, assuming the columns are 0/1 indicators — confirm).
category_names = ['target', 'school_head5', 'school_tail5', 'course', 'profession']
categories = [target, school_head5, school_tail5, course, profession]

# Aggregate once up front rather than once per cluster and category.
cluster_means_H = df_H.groupby('cluster').mean()
# Take the cluster ids that actually occur instead of assuming they run 1..n without gaps.
clusters = cluster_means_H.index.tolist()


def FX(x):
    """Return, for every cluster, the label of the column in `x` with the largest mean."""
    return cluster_means_H.loc[:, x].idxmax(axis=1)


# One Series per question group, indexed by cluster id.
dominant_H = {name: FX(columns) for name, columns in zip(category_names, categories)}
clusters_dictionary = {
    i_cluster: {name: dominant_H[name][i_cluster] for name in category_names}
    for i_cluster in clusters
}
clusters_dictionary

In [None]:
# Number of rows that fell into each hierarchical cluster.
df_H.groupby(by='cluster').size()

# КЛАСТЕРНЫЙ АНАЛИЗ МЕТОДОМ K-MEANS

In [None]:
# Feature frame for K-means: every listed column except the last two.
# .copy() makes df_K an independent frame, so the 'cluster' column assigned to it
# later is written to df_K itself and not to a temporary selection of df
# (which pandas reports as SettingWithCopyWarning — silenced in this notebook).
df_K = df[binary_columns[:-2]].copy()

In [None]:
# Fit a 4-cluster K-means model (fixed seed, 50 restarts, tight tolerance).
# df_K gains a 'cluster' column further down the notebook; dropping it here
# (a no-op while it does not exist yet) keeps a re-run of this cell from
# clustering on the previous run's labels.
model = KMeans(n_clusters=4, random_state=42, n_init=50, verbose=1, tol=1e-05)
model.fit(df_K.drop(columns='cluster', errors='ignore'))

In [None]:
# Cluster index the fitted model assigned to each training row.
model.labels_

In [None]:
# Coordinates of the fitted cluster centres (one row per cluster, one column per feature).
model.cluster_centers_

In [None]:
# Try the fitted model on 10 synthetic respondents with random 0/1 answers.
# The answer-vector length is read from the fitted centres instead of being
# hard-coded, so it always matches the number of features the model was trained on.
n_features = model.cluster_centers_.shape[1]
new_respondents = [[randint(0, 1) for _ in range(n_features)] for __ in range(10)]
model.predict(new_respondents)

In [None]:
# Elbow ("scree") plot: fit K-means for k = 1..10 and compare the inertia.
K = range(1, 11)
# Cluster on the feature columns only: once df_K carries a 'cluster' label
# column it must not leak into the fit (dropping is a no-op before that).
elbow_features = df_K.drop(columns='cluster', errors='ignore')
# A fixed seed keeps the curve identical between runs.
models = [KMeans(n_clusters=k, random_state=42).fit(elbow_features) for k in K]
# inertia_ is the within-cluster sum of squared distances to the nearest centre.
dist = [fitted.inertia_ for fitted in models]

# Plot the elbow
plt.plot(K, dist, marker='o')
plt.xlabel('число кластеров <<k>>')
# The label now describes what inertia_ actually measures (it is not a
# distance between cluster centres).
plt.ylabel('сумма квадратов расстояний до центров кластеров')
plt.title('"Каменистая осыпь" для определения оптимального <<k>>')
plt.show()

In [None]:
# Refit the 4-cluster model, attach the labels to df_K and show per-cluster means.
# The 'cluster' column written below is dropped before fitting, which makes the
# cell idempotent: running it a second time no longer clusters on its own labels.
features_K = df_K.drop(columns='cluster', errors='ignore')
model = KMeans(n_clusters=4, random_state=42, n_init=50, verbose=1, tol=1e-05)
model.fit(features_K)
df_K['cluster'] = model.labels_
df_K.groupby('cluster').mean()

In [None]:
# Profile every K-means cluster: within each question group, find the column
# with the highest per-cluster mean (i.e. the most frequently chosen option,
# assuming the columns are 0/1 indicators — confirm).
category_names = ['target', 'school_head5', 'school_tail5', 'course', 'profession']
categories = [target, school_head5, school_tail5, course, profession]

# Aggregate once up front rather than once per cluster and category.
cluster_means_K = df_K.groupby('cluster').mean()
# Take the cluster ids that actually occur instead of assuming they run 0..n-1 without gaps.
clusters = cluster_means_K.index.tolist()


def FX(x):
    """Return, for every cluster, the label of the column in `x` with the largest mean."""
    return cluster_means_K.loc[:, x].idxmax(axis=1)


# One Series per question group, indexed by cluster id.
dominant_K = {name: FX(columns) for name, columns in zip(category_names, categories)}
clusters_dictionary = {
    i_cluster: {name: dominant_K[name][i_cluster] for name in category_names}
    for i_cluster in clusters
}
clusters_dictionary

In [None]:
# Number of rows that fell into each K-means cluster.
df_K.groupby(by='cluster').size()