# 基本设置

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# load data

In [10]:
cor_table = pd.read_csv('datasets_fea/days_data_fea/days_data_cor_table.csv', header = None)
print cor_table.shape
used_data = pd.read_excel('datasets_fea/days_data_fea/days_data.xlsx', header = None)
print used_data.shape

(7120, 7120)
(7112, 1)


In [11]:
cor_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7110,7111,7112,7113,7114,7115,7116,7117,7118,7119
0,1.0,0.989583,0.916667,0.989583,0.989583,0.9375,0.989583,0.989583,0.979167,0.989583,...,1.0,0.979167,0.927083,1.0,0.989583,0.989583,1.0,1.0,1.0,0.895833
1,0.989583,1.0,0.90625,0.979167,0.979167,0.927083,0.979167,1.0,0.989583,0.979167,...,0.989583,0.96875,0.916667,0.989583,1.0,1.0,0.989583,0.989583,0.989583,0.885417
2,0.916667,0.90625,1.0,0.927083,0.927083,0.895833,0.90625,0.90625,0.895833,0.927083,...,0.916667,0.916667,0.864583,0.916667,0.90625,0.90625,0.916667,0.916667,0.916667,0.916667
3,0.989583,0.979167,0.927083,1.0,1.0,0.927083,0.979167,0.979167,0.96875,0.979167,...,0.989583,0.96875,0.916667,0.989583,0.979167,0.979167,0.989583,0.989583,0.989583,0.885417
4,0.989583,0.979167,0.927083,1.0,1.0,0.927083,0.979167,0.979167,0.96875,0.979167,...,0.989583,0.96875,0.916667,0.989583,0.979167,0.979167,0.989583,0.989583,0.989583,0.885417


# cluster

## DBSCAN

In [12]:
import numpy as np
from sklearn.cluster import DBSCAN

# Compute DBSCAN
db = DBSCAN(eps=0.2, min_samples=10).fit(cor_table)

In [13]:
predict_label = db.labels_  # 类别标签
label_indice = db.core_sample_indices_ # 标签位置
print '---------- predict_label ' , len(predict_label)
# print predict_label
print '---------- label_indice ' , len(label_indice)
# print label_indice

---------- predict_label  7120
---------- label_indice  2991


### 评价

In [14]:
from sklearn import metrics
from datetime import datetime

print 'Calinski-Harabaz Index：', metrics.calinski_harabaz_score(cor_table, predict_label)  # 分数值越大则聚类效果越好
print '轮廓系数：', metrics.silhouette_score(cor_table, predict_label, metric='euclidean')  # 分数值越大则聚类效果越好 [-1， 1]

Calinski-Harabaz Index： 31.962796020928906
轮廓系数： 0.13451277113162272


In [56]:
from sklearn import metrics
from datetime import datetime

print 'Calinski-Harabaz Index：', metrics.calinski_harabaz_score(cor_table, predict_label)  # 分数值越大则聚类效果越好
print '轮廓系数：', metrics.silhouette_score(cor_table, predict_label, metric='euclidean')  # 分数值越大则聚类效果越好 [-1， 1]

Calinski-Harabaz Index： 99.59247767409852
轮廓系数： 0.25576044660447456


## 参数寻优

In [2]:
eps_list = np.linspace(0.3, 1.7, num=15)
min_samples_list = range(3, 15, 2)

In [None]:
scores=[]
models=[]
pram_res = []  # elapsed_time, eps, min_samples, Calinski, silhouette, label_num, label_class
for eps in eps_list:
    for min_samples in min_samples_list:
        start_time = datetime.now()
        model=DBSCAN(eps = eps,min_samples = min_samples).fit(cor_table)
        labels=model.labels_
        elapsed_time = '%0.2f'%((datetime.now() - start_time).seconds)
        label_class = np.unique(labels)
        label_num=len(label_class)
        if label_num>1: # 需要判断label种类，因为如果只有一个label，silhouette_score报错
            silhouette_score = metrics.silhouette_score(cor_table, model.labels_)
            calinski_score = metrics.calinski_harabaz_score(cor_table, model.labels_)
            scores.append(silhouette_score)
            models.append(model)
        else:
            silhouette_score = 0
            calinski_score = 0
            scores.append(silhouette_score)
            models.append(None)

        pram_res.append([elapsed_time, eps, min_samples, calinski_score, silhouette_score, label_num, label_class])
        print [elapsed_time, eps, min_samples, calinski_score, silhouette_score, label_num] # , label_class
optimal_id = scores.index(max(scores))
optimal_eps = pram_res[optimal_id][1]
optimal_min_samples = pram_res[optimal_id][2]

['227.00', 0.3, 3, 70.88914067985118, 0.37404538949205673, 367]
['225.00', 0.3, 5, 129.47602507162645, 0.24770973128117285, 170]
['225.00', 0.3, 7, 192.95475960616847, 0.19251761397017303, 106]
['225.00', 0.3, 9, 26.842238571935333, 0.14148227158802396, 74]
['225.00', 0.3, 11, 34.49182246594606, 0.12905797840569572, 53]
['227.00', 0.3, 13, 41.001255932404824, 0.15657661051750296, 43]
['272.00', 0.39999999999999997, 3, 73.61745922594012, 0.3661793831631241, 359]
['271.00', 0.39999999999999997, 5, 131.65704979238942, 0.2483687875691595, 171]
['271.00', 0.39999999999999997, 7, 190.4938053274833, 0.19687122504574686, 110]
['271.00', 0.39999999999999997, 9, 27.104792559036436, 0.14057109867800133, 75]
['271.00', 0.39999999999999997, 11, 35.2918197052232, 0.12684887977689785, 53]
['271.00', 0.39999999999999997, 13, 41.27730111795277, 0.1544636682660492, 44]
['321.00', 0.5, 3, 96.53358938178174, 0.33568939876788073, 324]
['325.00', 0.5, 5, 161.78595746390297, 0.24203406342583808, 164]
['323.0

In [17]:
def get_optimal_eps(dataset,eps_list, min_samples_list):
    '''get optimal eps param for DBSCAN 
       params: 
            dataset: the whole dataset.
            eps_list: must be in np.linspace() format or list format.
        return:
            three values：optimal eps value, 
                optimal model with optimal eps
                silhouette_scores of all candidate eps.
        '''
    scores=[]
    models=[]
    pram_res = []  # elapsed_time, eps, min_samples, Calinski, silhouette, label_num, label_class
    for eps in eps_list:
        for min_samples in min_samples_list:
            start_time = datetime.now()
            model=DBSCAN(eps = eps,min_samples = min_samples).fit(dataset)
            labels=model.labels_
            elapsed_time = '%0.2f'%((datetime.now() - start_time).seconds)
            label_class = np.unique(labels)
            label_num=len(label_class)
            if label_num>1: # 需要判断label种类，因为如果只有一个label，silhouette_score报错
                silhouette_score = metrics.silhouette_score(dataset, model.labels_)
                calinski_score = metrics.calinski_harabaz_score(dataset, model.labels_)
                scores.append(silhouette_score)
                models.append(model)
            else:
                silhouette_score = 0
                calinski_score = 0
                scores.append(silhouette_score)
                models.append(None)
            
            pram_res.append([elapsed_time, eps, min_samples, calinski_score, silhouette_score, label_num, label_class])
            print [elapsed_time, eps, min_samples, calinski_score, silhouette_score, label_num, label_class]
    optimal_id = scores.index(max(scores))
    optimal_eps = pram_res[optimal_id][1]
    optimal_min_samples = pram_res[optimal_id][2]
    return optimal_eps, optimal_min_samples, models[optimal_id], max(scores), pram_res

In [18]:
eps_list = np.linspace(0.3, 1.7, num=15)
min_samples_list = range(3, 15, 2)

optimal_eps, optimal_min_samples, optimal_model,scores, pram_res = get_optimal_eps(cor_table,eps_list, min_samples_list)

['239.00', 0.3, 3, 70.88914067985118, 0.37404538949205673, 367, array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,
        77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
       129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141,
       142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154,
       155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167,


['277.00', 0.39999999999999997, 11, 35.2918197052232, 0.12684887977689785, 53, array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51])]
['273.00', 0.39999999999999997, 13, 41.27730111795277, 0.1544636682660492, 44, array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42])]
['327.00', 0.5, 3, 96.53358938178174, 0.33568939876788073, 324, array([ -1,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,
        12,  13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,
        25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,
        38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,
        51,  52,  53,  54,  55,  56,  57,  58,  5

['407.00', 0.7, 5, 753.5412710185648, 0.21523900623133937, 54, array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52])]
['408.00', 0.7, 7, 1113.6409309444805, 0.2075495778785533, 35, array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33])]
['1011.00', 0.7, 9, 145.35257985712454, 0.19242927643276284, 27, array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25])]


MemoryError: 

In [3]:
print 'eps_list: ',eps_list 
print 'min_samples_list: ',min_samples_list 
print 'scores: %s, optimal_eps: %s, optimal_min_samples: %s'%(scores, optimal_eps, optimal_min_samples)
print 'optimal_model: ',optimal_model 
# pram_res[]

eps_list:  [0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.  1.1 1.2 1.3 1.4 1.5 1.6 1.7]
min_samples_list:  [3, 5, 7, 9, 11, 13]


NameError: name 'scores' is not defined

In [None]:
pram_res = pd.DataFrame(pram_res, 
                        columns = 'elapsed_time, eps, min_samples, Calinski, silhouette, label_num, label_class'.split(', '))
pram_res

In [None]:
pram_res.columns

In [None]:
pram_res['Calinski'].plot()

In [None]:
pram_res['silhouette'].plot()