In [None]:
# Notebook setup: Python data stack plus the rpy2 bridge used by later cells
# to run R code (NbClust, kohonen) from Python.
# NOTE(review): `datasets` is not used in any cell visible here.
from sklearn import datasets
import pandas as pd
import glob

import rpy2.interactive as r
import rpy2.interactive.packages

from rpy2.robjects import pandas2ri
# Switch on rpy2's pandas conversion layer.
pandas2ri.activate()
# Load the rpy2 IPython extension; the %R / %Rpush magics below come from it.
%load_ext rpy2.ipython

# Handle to rpy2's collection of imported R packages; R's "utils" is imported
# so that the install_packages calls at the bottom of this cell can be used.
rlib = r.packages.packages
r.packages.importr("utils")

import seaborn as sns
# Render matplotlib / seaborn figures inline in the notebook.
%matplotlib inline

# One-time installation of the R packages used below; uncomment and run once
# if NbClust / kohonen are missing from the local R library.
#rlib.utils.install_packages("NbClust");
#rlib.utils.install_packages("kohonen")

## Определение числа кластеров (NbClust)

In [None]:
# Collect the joined, processed data files. glob.glob returns paths in an
# arbitrary, filesystem-dependent order, so sort them: the next cell loads
# df_fns[0], and that pick must be the same on every machine and every run.
df_fns = sorted(glob.glob('data/processed/joined/*'))
df_fns

In [None]:
# Load the first joined dataset and keep only the columns whose names contain
# "size_" or "price".
# NOTE(review): read_pickle can execute arbitrary code while loading; only use
# it on files this project produced itself.
if not df_fns:
    # Fail with an explicit message instead of an opaque IndexError on df_fns[0].
    raise FileNotFoundError("no files matched 'data/processed/joined/*'")
df = pd.read_pickle(df_fns[0])

features = df.filter(regex='(size_)|(price)').columns
# Take an explicit copy so that adding `target` below writes to an independent
# frame rather than to a selection of the loaded one.
df = df[features].copy()
# `target` duplicates `price`; a later cell uses it as the pairplot hue variable.
df['target'] = df.price
print(df.head())
print(df.shape)

# Pairwise scatter plots of the selected feature columns (without `target`).
sns.pairplot(df[features]);

In [None]:
# преобразуем его в R-пригодный вид
r_df = pandas2ri.py2ri(df[features])
# и отправим в R
%Rpush r_df 

In [None]:
# Load NbClust and set R's RNG seed before running it.
%R library(NbClust);
%R set.seed(1234);
# Run NbClust on the pushed data frame: Euclidean distance, complete-linkage
# clustering, candidate cluster counts from 2 to 10, index set "all".
# The result is stored on the R side as `Nnc`.
%R Nnc <- NbClust(r_df, distance = "euclidean", min.nc = 2, max.nc = 10, method = "complete", index = "all")

In [None]:
%R table(Nnc$Best.n[1,])
%R barplot(table(Nnc$Best.n[1,]), xlab="Number of Clusters", ylab="Number of Criteria", main="Number of Clusters Chosen by 30 Criteria");

In [None]:
# Pairwise plots of every column in `df`, coloured by the `target` column.
# NOTE(review): `target` is a copy of the numeric `price` column, so the hue
# may have as many levels as there are distinct prices - confirm this is intended.
sns.pairplot(data=df, hue='target');

## Карта Кохонена (SOM)

In [None]:
# Display the column names selected as features.
features

In [None]:
r_df = pandas2ri.py2ri(df)
sns.pairplot(df);

In [None]:
# Load the kohonen package (self-organising maps) and set R's RNG seed.
%R library(kohonen)
%R set.seed(1234)

In [None]:
# The data has to be handed over as a matrix, so convert the R-side data frame.
# NOTE(review): the variable is called `matrix`, the same name as R's base
# matrix() function - a more specific name would be clearer.
%R matrix <- as.matrix(r_df)
# 15 x 10 hexagonal grid for the map.
%R som_grid <- somgrid(xdim=15, ydim=10, topo="hexagonal")

In [None]:
# и организовать карту
%R bos.som <- som(matrix, som_grid, rlen=50, alpha=c(0.05,0.01));

In [None]:
# Now determine the number of clusters: take the codebook vectors of the
# trained map and run NbClust on them with k-means, trying 2 to 20 clusters.
%R codes <- getCodes(bos.som)
# NOTE(review): `b_Nnc` is not read by any later cell visible here.
%R b_Nnc <- NbClust(codes, min.nc=2, max.nc=20, method="kmeans");

In [None]:
# Look at a couple of visualisations of the trained map.
# Neighbour-distance plot (type "dist.neighbours").
%R plot(bos.som, type="dist.neighbours")
# Training-progress plot (type "changes").
%R plot(bos.som, type="changes",main="Training Progress")

In [None]:
# Finally, this is how the whole dataset looks in two-dimensional space.
# Hierarchically cluster the codebook vectors and cut the tree into 3 clusters.
# NOTE(review): the 3 is hard-coded rather than taken from `b_Nnc` - confirm.
%R som_cluster <- cutree(hclust(dist(codes)), 3)
# Colour palette; entries are picked by cluster number when plotting.
%R pretty_palette <- c('#7DBD00', '#62C2CC', '#FF5B00', '#f1bc41')
# Mapping plot with each map unit's background coloured by its cluster.
%R plot(bos.som, type='mapping', bgcol = pretty_palette[som_cluster], main = "Clusters") 