In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import DBSCAN, KMeans
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.linear_model import LassoCV
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import silhouette_score, davies_bouldin_score,v_measure_score
from sklearn.mixture import GaussianMixture
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the diamonds dataset from the project's data directory (path is relative to the notebook).
df = pd.read_csv('../data/diamonds.csv')
# Show the raw frame as the cell output.
df

In [None]:
# Drop the 'id' column, then one-hot encode the three categorical columns;
# every other column passes through unchanged.
categorical_columns = ['cut', 'color', 'clarity']
df2 = pd.get_dummies(df.drop(columns='id'), columns=categorical_columns)
df2

In [None]:
# Scale every feature column (everything except the 'price' target) by its own
# column maximum, so each feature tops out at 1 while price keeps its units.
#
# The explicit float cast matters: pandas refuses to divide a bool column by a
# bool scalar, and the one-hot columns are bool whenever get_dummies returns
# its default dtype (bool since pandas 2.0).
# Series.max() is used instead of the builtin max(), which walks the column
# element by element in Python.
for col in df2.columns[df2.columns != 'price']:
    col_values = df2[col].astype(float)
    df2[col] = col_values / col_values.max()
df2

In [None]:
# Scatter each selected column of the raw frame against price on a 3x3 grid.
# Column positions 1-6 and 8 onward are plotted; positions 0 and 7 are skipped
# (presumably the id and price columns — verify against the CSV header).
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15, 10))
plot_features = df.columns[1:7].append(df.columns[8:])
for idx, feature in enumerate(plot_features):
    grid_row, grid_col = divmod(idx, 3)
    df.plot(feature, "price", subplots=True, kind="scatter", ax=axes[grid_row, grid_col])

In [None]:
# Split the encoded frame into the feature matrix and the target.
# y is kept as a one-column DataFrame (not a Series): the coefficient
# printouts for LinearRegression and Ridge unpack a 2-D coef_ array, which is
# what a 2-D target produces.
X = df2.loc[:, df2.columns != 'price']
y = df2.loc[:, df2.columns == 'price']
# Fixed seed so the split, and every model fitted on it, is reproducible
# after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Ordinary least-squares baseline fitted on the training split.
model_LR = LinearRegression()
model_LR.fit(X_train, y_train)

# Print each feature name next to its fitted coefficient.
# coef_ holds a single row here because the target is a one-column frame;
# ravel flattens that row so it pairs up with the column names.
for feature_name, weight in zip(X.columns, np.ravel(model_LR.coef_)):
    print(feature_name, weight)

In [None]:
# Ridge regression with default settings on the same training split.
model_ridge = Ridge()
model_ridge.fit(X_train, y_train)

# Print each feature name next to its fitted coefficient.
# coef_ holds a single row here because the target is a one-column frame;
# ravel flattens that row so it pairs up with the column names.
for feature_name, weight in zip(X.columns, np.ravel(model_ridge.coef_)):
    print(feature_name, weight)

In [None]:
# Lasso regression with default settings; fit() returns the estimator itself.
model_lasso = Lasso().fit(X_train, y_train)

# Print each feature name next to its fitted coefficient.
for feature_name, weight in zip(X.columns, model_lasso.coef_):
    print(feature_name, weight)

In [None]:
# Decision tree settings, gathered in one place so they are easy to tune.
# NOTE(review): this is a *classifier* fitted on the price target, so every
# distinct price value is treated as its own class, while the other models in
# this notebook regress price. Confirm whether DecisionTreeRegressor was intended.
tree_params = dict(
    random_state=42,
    # impurity function ('gini' or 'entropy')
    criterion='gini',
    # maximum tree depth (original note: "+5-5", presumably a tuning range — TODO confirm)
    max_depth=10,
    # minimum number of samples in a node required to split it (may be a fraction)
    min_samples_split=5,
    # minimum number of samples in a leaf (may be a fraction)
    min_samples_leaf=5,
    # minimum impurity decrease required for a split (left at the library default):
    # min_impurity_decrease=0,
    # class weights (lets errors on selected classes be penalised more heavily);
    # the 'balanced' option is supported
    class_weight=None,
)

# Initialise the decision tree model
model = DecisionTreeClassifier(**tree_params)

# Train the model
model.fit(X_train, y_train)

In [None]:
# Rank the features by the importance the fitted tree assigned to them.
feature_names = df2.columns[df2.columns != 'price']
importance_table = pd.DataFrame({'feature': feature_names,
                                 'importance': model.feature_importances_})
importance_table.sort_values('importance', ascending=False)

In [None]:
# Tree predictions for the training split and the test split, in that order.
y_pred_train, y_pred_test = (model.predict(split) for split in (X_train, X_test))

In [None]:
# Confusion matrix of the tree's test-set predictions
# (rows are true labels, columns are predicted labels).
conf_mat = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_test)
conf_mat

In [None]:
# Random forest regressor (seeded) fitted on the training split, then applied to the test split.
rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train)
predict_rf = rf.predict(X_test)

# CatBoost regressor with default settings on the same split.
cb = CatBoostRegressor()
cb.fit(X_train, y_train)
predict_cb = cb.predict(X_test)

In [None]:
# Columns fed to the clustering models: every column of the encoded frame, 'price' included.
cluster_tags = df2.columns

In [None]:
t = 20  # exclusive upper bound of the cluster counts to try
# Clustering input; note this rebinds X, which earlier held the feature matrix without 'price'.
X = df2[cluster_tags]
# WCSS (within-cluster sum of squares, our error measure), one entry per cluster count
wcss = []
for n_clusters in range(1, t):
    kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=2000,
                    n_init=10, random_state=42).fit(X)
    # inertia_ is the fitted model's total within-cluster error
    wcss.append(kmeans.inertia_)

In [None]:
# Elbow plot: WCSS against the number of clusters tried.
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 6))
elbow_ax.plot(range(1, t), wcss)

elbow_ax.set_title('Выбор количества кластеров методом локтя')
elbow_ax.set_xlabel('Количество кластеров')
elbow_ax.set_ylabel('WCSS')

In [None]:
# Fit KMeans for every cluster count from 2 to t-1 and record three
# diagnostics per count: the negated KMeans score, the silhouette coefficient
# and the Davies-Bouldin index.
km_scores = []
km_silhouette = []
vmeasure_score = []  # stays empty: the V-measure block below is disabled
db_score = []
X_scaled = X.copy()
for i in range(2, t):
    km = KMeans(n_clusters=i, random_state=0).fit(X_scaled)
    preds = km.predict(X_scaled)

    # Score the data once and reuse the value for both the printout and the list.
    score = km.score(X_scaled)
    print("Score for number of cluster(s) {}: {}".format(i, score))
    km_scores.append(-score)

    silhouette = silhouette_score(X_scaled, preds)
    km_silhouette.append(silhouette)
    print("Silhouette score for number of cluster(s) {}: {}".format(i, silhouette))

    db = davies_bouldin_score(X_scaled, preds)
    db_score.append(db)
    print("Davies Bouldin score for number of cluster(s) {}: {}".format(i, db))

    # Disabled V-measure evaluation, kept for reference:
    # v_measure = v_measure_score(y,preds)
    # vmeasure_score.append(v_measure)
    # print("V-measure score for number of cluster(s) {}: {}".format(i,v_measure))
    print("-"*100)

In [None]:
# Negated KMeans score for each cluster count tried (elbow view).
cluster_counts = list(range(2, t))
plt.figure(figsize=(7, 4))
plt.scatter(x=cluster_counts, y=km_scores, s=150, edgecolor='k')
plt.title("The elbow method for determining number of clusters\n", fontsize=16)
plt.xlabel("Number of clusters", fontsize=14)
plt.ylabel("K-means score", fontsize=15)
plt.xticks(cluster_counts, fontsize=14)
plt.yticks(fontsize=15)
plt.grid(True)
plt.show()

In [None]:
# Silhouette coefficient for each cluster count tried.
cluster_counts = list(range(2, t))
plt.figure(figsize=(7, 4))
plt.scatter(x=cluster_counts, y=km_silhouette, s=150, edgecolor='k')
plt.title("The silhouette coefficient method \nfor determining number of clusters\n", fontsize=16)
plt.xlabel("Number of clusters", fontsize=14)
plt.ylabel("Silhouette score", fontsize=15)
plt.xticks(cluster_counts, fontsize=10)
plt.yticks(fontsize=15)
plt.grid(True)
plt.show()

In [None]:
# Davies-Bouldin index for each cluster count tried.
# The x axis carries the cluster counts and the y axis the index, so the
# labels name them accordingly.
plt.scatter(x=list(range(2, t)), y=db_score, s=150, edgecolor='k')
plt.grid(True)
plt.xlabel("Number of clusters")
plt.ylabel("Davies-Bouldin score")
plt.show()

In [27]:
# Fit a Gaussian mixture for every component count from 2 to t-1 and record
# the negated BIC and the average log-likelihood of each fit.
gm_bic = []
gm_score = []
for i in range(2, t):
    # Seeded so repeated runs give the same mixtures.
    gm = GaussianMixture(n_components=i, n_init=10, tol=1e-3, max_iter=2000,
                         random_state=0).fit(X_scaled)
    # Each metric is a full pass over the data, so evaluate it once and reuse it.
    bic = gm.bic(X_scaled)
    log_likelihood = gm.score(X_scaled)
    print("BIC for number of cluster(s) {}: {}".format(i, bic))
    print("Log-likelihood score for number of cluster(s) {}: {}".format(i, log_likelihood))
    print("-"*100)
    gm_bic.append(-bic)
    gm_score.append(log_likelihood)


BIC for number of cluster(s) 3: -4917081.732864491
Log-likelihood score for number of cluster(s) 3: 45.70209224740143
----------------------------------------------------------------------------------------------------


KeyboardInterrupt: 

In [None]:
# Log of the absolute BIC for each component count that was actually evaluated.
# The x values are derived from the number of recorded BIC entries, so the plot
# still works when the fitting loop stopped early and gm_bic is shorter than
# range(2, t).
evaluated_counts = list(range(2, 2 + len(gm_bic)))
plt.figure(figsize=(7,4))
plt.title("The Gaussian Mixture model BIC \nfor determining number of clusters\n",fontsize=16)
plt.scatter(x=evaluated_counts,y=np.log(np.abs(gm_bic)),s=150,edgecolor='k')
plt.grid(True)
plt.xlabel("Number of clusters",fontsize=14)
plt.ylabel("Log of Gaussian mixture BIC score",fontsize=15)
plt.xticks(evaluated_counts,fontsize=14)
plt.yticks(fontsize=15)
plt.show()

In [None]:
# KMeans with 6 and with 13 clusters on the same frame.
# Both models are seeded so the cluster labels (and the plots coloured by
# them) are reproducible across runs.
model_cluster_6 = KMeans(n_clusters=6, random_state=42)
model_cluster_6.fit(X_scaled)
y_pred_km_6 = model_cluster_6.predict(X_scaled)
model_cluster_13 = KMeans(n_clusters=13, random_state=42)
model_cluster_13.fit(X_scaled)
y_pred_km_13 = model_cluster_13.predict(X_scaled)

In [None]:
# Price against carat, coloured by the 6-cluster KMeans labels.
# The frame is built from diamonds.csv, which has no 'sqft_living' column
# (that name belongs to a house-price dataset), so indexing it raises KeyError.
# 'carat' is assumed to be the diamond weight column — confirm against the CSV header.
x6 = X_scaled['carat']
y6 = X_scaled['price']
plt.figure(figsize=(20,10))
plt.title("Визуализация кластеров\n",fontsize=16)
plt.scatter(x=x6,y=y6,s=15,c=y_pred_km_6)
plt.grid(True)
plt.xlabel("Вес, карат",fontsize=14)
plt.ylabel("Стоимость",fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=15)
plt.show()

In [None]:
# Price against depth, coloured by the 6-cluster KMeans labels.
# The frame is built from diamonds.csv, which has no 'sqft_lot' column
# (that name belongs to a house-price dataset), so indexing it raises KeyError.
# 'depth' is assumed to be a numeric column of the diamonds data and is used
# as the second feature to inspect — confirm against the CSV header.
x6 = X_scaled['depth']
y6 = X_scaled['price']
plt.figure(figsize=(20,10))
plt.title("Визуализация кластеров\n",fontsize=16)
plt.scatter(x=x6,y=y6,s=15,c=y_pred_km_6)
plt.grid(True)
plt.xlabel("Глубина",fontsize=14)
plt.ylabel("Стоимость",fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=15)
plt.show()

In [None]:
# Price against depth, coloured by the 13-cluster KMeans labels.
# The frame is built from diamonds.csv, which has no 'sqft_lot' column
# (that name belongs to a house-price dataset), so indexing it raises KeyError.
# 'depth' is assumed to be a numeric column of the diamonds data and is used
# as the second feature to inspect — confirm against the CSV header.
x13 = X_scaled['depth']
y13 = X_scaled['price']
plt.figure(figsize=(20,10))
plt.title("Визуализация кластеров\n",fontsize=16)
plt.scatter(x=x13,y=y13,s=15,c=y_pred_km_13)
plt.grid(True)
plt.xlabel("Глубина",fontsize=14)
plt.ylabel("Стоимость",fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=15)
plt.show()

In [None]:
# DBSCAN on the same frame that was used for KMeans.
# NOTE(review): eps=10000 is large relative to the feature columns, which were
# divided by their column maximum earlier; confirm the value suits this data.
dbscan_params = {'eps': 10000, 'min_samples': 100}
model_dbscan = DBSCAN(**dbscan_params)

# Fit and label in one step.
y_pred_dbscan = model_dbscan.fit_predict(X_scaled)

In [None]:
# Price against carat, coloured by the DBSCAN labels.
# The frame is built from diamonds.csv, which has no 'sqft_living' column
# (that name belongs to a house-price dataset), so indexing it raises KeyError.
# 'carat' is assumed to be the diamond weight column — confirm against the CSV header.
xdbs = X_scaled['carat']
ydbs = X_scaled['price']
plt.figure(figsize=(20,10))
plt.title("Визуализация кластеров\n",fontsize=16)
plt.scatter(x=xdbs,y=ydbs,s=15,c=y_pred_dbscan)
plt.grid(True)
plt.xlabel("Вес, карат",fontsize=14)
plt.ylabel("Стоимость",fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=15)
plt.show()

In [None]:
# The actual scaling happens here: every column is min-max scaled on its own,
# and the fitted scaler of each column is kept in `mms` under the column name.
from sklearn.preprocessing import MinMaxScaler
dfs = X_scaled.copy(deep=True)
mms = {}
for col in X_scaled.columns:
    # scalers expect a 2-D input, hence the single-column reshape
    col_values = dfs[col].values.reshape(-1, 1)
    scaler = MinMaxScaler().fit(col_values)
    mms[col] = scaler
    dfs[col] = scaler.transform(col_values)


In [None]:
# Show the min-max scaled frame as the cell output.
dfs

In [None]:
# 6-cluster KMeans on the min-max scaled frame.
# Fit on the feature columns only, so re-running this cell never clusters on a
# label column stored by a previous run.
cluster_features = dfs.drop(columns='Кластер', errors='ignore')
# Seeded so the labels are reproducible across runs.
model_cluster_6 = KMeans(n_clusters=6, random_state=42)
model_cluster_6.fit(cluster_features)
y_pred_km_6 = model_cluster_6.predict(cluster_features)
# Store the labels of THIS model. `kmeans.labels_` would be the labels of the
# last model left over from the elbow loop: a different cluster count, fitted
# on the frame before min-max scaling.
dfs['Кластер'] = y_pred_km_6

In [None]:
# Carat against price on the min-max scaled frame, coloured by the 6-cluster labels.
# The frame is built from diamonds.csv, which has no 'sqft_living' column
# (that name belongs to a house-price dataset), so indexing it raises KeyError.
# 'carat' is assumed to be the diamond weight column — confirm against the CSV header.
x6 = dfs['carat']
y6 = dfs['price']
plt.figure(figsize=(20,10))
plt.title("Визуализация кластеров\n",fontsize=16)
# price goes on the x axis here, the feature on the y axis
plt.scatter(x=y6,y=x6,s=15,c=y_pred_km_6)
plt.grid(True)
plt.xlabel("Стоимость",fontsize=14)
# the y axis shows the feature values, not cluster ids, so it is labelled as the feature
plt.ylabel("Вес, карат",fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=15)
plt.show()

In [None]:
# Summary statistics of the rows labelled as cluster 0.
in_cluster_0 = dfs['Кластер'].eq(0)
dfs[in_cluster_0].describe()

In [None]:
# DBSCAN on the min-max scaled frame.
# By this point dfs also carries the 'Кластер' label column added after the
# KMeans fit; it is excluded so the labels do not act as a clustering feature.
model_dbscan_s = DBSCAN(eps=0.0001, min_samples=100)
y_pred_dbscan_s = model_dbscan_s.fit_predict(dfs.drop(columns='Кластер', errors='ignore'))

In [None]:
# Show the full parameter set of the DBSCAN model as the cell output.
model_dbscan_s.get_params()

In [None]:
# Price against carat on the min-max scaled frame, coloured by the DBSCAN labels.
# The frame is built from diamonds.csv, which has no 'sqft_living' column
# (that name belongs to a house-price dataset), so indexing it raises KeyError.
# 'carat' is assumed to be the diamond weight column — confirm against the CSV header.
xdbs = dfs['carat']
ydbs = dfs['price']
plt.figure(figsize=(20,10))
plt.title("Визуализация кластеров\n",fontsize=16)
plt.scatter(x=xdbs,y=ydbs,s=15,c=y_pred_dbscan_s)
plt.grid(True)
plt.xlabel("Вес, карат",fontsize=14)
plt.ylabel("Стоимость",fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=15)
plt.show()

In [None]:
# Repeat the KMeans diagnostics on the min-max scaled frame: negated KMeans
# score, silhouette coefficient and Davies-Bouldin index per cluster count.
km_scores = []
km_silhouette = []
vmeasure_score = []  # stays empty: the V-measure block below is disabled
db_score = []

# dfs also carries the 'Кластер' label column added after the earlier KMeans
# fit; it is excluded so the stored labels do not act as a clustering feature.
scaled_features = dfs.drop(columns='Кластер', errors='ignore')

for i in range(2, t):
    km = KMeans(n_clusters=i, random_state=0).fit(scaled_features)
    preds = km.predict(scaled_features)

    # Score the data once and reuse the value for both the printout and the list.
    score = km.score(scaled_features)
    print("Score for number of cluster(s) {}: {}".format(i, score))
    km_scores.append(-score)

    silhouette = silhouette_score(scaled_features, preds)
    km_silhouette.append(silhouette)
    print("Silhouette score for number of cluster(s) {}: {}".format(i, silhouette))

    db = davies_bouldin_score(scaled_features, preds)
    db_score.append(db)
    print("Davies Bouldin score for number of cluster(s) {}: {}".format(i, db))

    # Disabled V-measure evaluation, kept for reference:
    # v_measure = v_measure_score(y,preds)
    # vmeasure_score.append(v_measure)
    # print("V-measure score for number of cluster(s) {}: {}".format(i,v_measure))
    print("-"*100)

In [None]:
# Negated KMeans score for each cluster count tried (elbow view).
cluster_counts = list(range(2, t))
plt.figure(figsize=(7, 4))
plt.scatter(x=cluster_counts, y=km_scores, s=150, edgecolor='k')
plt.title("The elbow method for determining number of clusters\n", fontsize=16)
plt.xlabel("Number of clusters", fontsize=14)
plt.ylabel("K-means score", fontsize=15)
plt.xticks(cluster_counts, fontsize=14)
plt.yticks(fontsize=15)
plt.grid(True)
plt.show()

In [None]:
# Silhouette coefficient for each cluster count tried.
cluster_counts = list(range(2, t))
plt.figure(figsize=(7, 4))
plt.scatter(x=cluster_counts, y=km_silhouette, s=150, edgecolor='k')
plt.title("The silhouette coefficient method \nfor determining number of clusters\n", fontsize=16)
plt.xlabel("Number of clusters", fontsize=14)
plt.ylabel("Silhouette score", fontsize=15)
plt.xticks(cluster_counts, fontsize=10)
plt.yticks(fontsize=15)
plt.grid(True)
plt.show()

In [None]:
# Davies-Bouldin index for each cluster count tried.
# The x axis carries the cluster counts and the y axis the index, so the
# labels name them accordingly.
plt.scatter(x=list(range(2, t)), y=db_score, s=150, edgecolor='k')
plt.grid(True)
plt.xlabel("Number of clusters")
plt.ylabel("Davies-Bouldin score")
plt.show()

In [None]:
# 8-cluster KMeans on the min-max scaled frame.
# dfs also carries the 'Кластер' label column added after the earlier KMeans
# fit; it is excluded so the stored labels do not act as a clustering feature.
features_8 = dfs.drop(columns='Кластер', errors='ignore')
# Seeded so the labels are reproducible across runs.
model_cluster_8 = KMeans(n_clusters=8, random_state=42)
model_cluster_8.fit(features_8)
y_pred_km_8 = model_cluster_8.predict(features_8)

In [None]:
# Price against carat on the min-max scaled frame, coloured by the 8-cluster KMeans labels.
# The frame is built from diamonds.csv, which has no 'sqft_living' column
# (that name belongs to a house-price dataset), so indexing it raises KeyError.
# 'carat' is assumed to be the diamond weight column — confirm against the CSV header.
x13 = dfs['carat']
y13 = dfs['price']
plt.figure(figsize=(20,10))
plt.title("Визуализация кластеров\n",fontsize=16)
plt.scatter(x=x13,y=y13,s=15,c=y_pred_km_8)
plt.grid(True)
plt.xlabel("Вес, карат",fontsize=14)
plt.ylabel("Стоимость",fontsize=15)
plt.xticks(fontsize=14)
plt.yticks(fontsize=15)
plt.show()