# Trials of Flat and Hierarchical Clusterings with Titanic Data
[Demo of DBSCAN clustering algorithm — scikit-learn 0.19.0 documentation](http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html)

In [None]:
print(__doc__)

import numpy as np

from sklearn.cluster import DBSCAN
from sklearn import metrics
# make_blobs is imported from the public `sklearn.datasets` namespace: the
# private `sklearn.datasets.samples_generator` module is no longer shipped by
# current scikit-learn releases, while this import path works on old and new
# versions alike.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler


# #############################################################################
# Generate sample data
# 750 two-dimensional points drawn around three centres (fixed seed), with the
# generating blob index of every point kept in labels_true.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4,
                            random_state=0)

# Standardize the features before clustering.
X = StandardScaler().fit_transform(X)

# #############################################################################
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
labels = db.labels_

# Boolean mask that is True exactly at the indices DBSCAN reports as core
# samples.
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Count the distinct labels, leaving out the noise label (-1) when it occurs.
n_clusters_ = len(set(labels) - {-1})

print('Estimated number of clusters: %d' % n_clusters_)

# Scores that compare the predicted labels with the generating labels; each
# entry pairs the report format with the metric that fills it in.
ground_truth_reports = (
    ("Homogeneity: %0.3f", metrics.homogeneity_score),
    ("Completeness: %0.3f", metrics.completeness_score),
    ("V-measure: %0.3f", metrics.v_measure_score),
    ("Adjusted Rand Index: %0.3f", metrics.adjusted_rand_score),
    ("Adjusted Mutual Information: %0.3f", metrics.adjusted_mutual_info_score),
)
for report_format, score_fn in ground_truth_reports:
    print(report_format % score_fn(labels_true, labels))

# The silhouette coefficient is computed from the data and the predicted
# labels only.
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))

# #############################################################################
# Plot result
import matplotlib.pyplot as plt

# One Spectral colour per distinct label; the noise label (-1) is drawn in
# black instead of its Spectral colour.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
noise_rgba = [0, 0, 0, 1]

for cluster_label, spectral_rgba in zip(unique_labels, colors):
    face_color = tuple(noise_rgba if cluster_label == -1 else spectral_rgba)
    class_member_mask = (labels == cluster_label)

    # Core samples are drawn with large markers, the remaining members of the
    # same label with small ones.
    for subset_mask, marker_size in ((core_samples_mask, 14),
                                     (~core_samples_mask, 6)):
        xy = X[class_member_mask & subset_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=face_color,
                 markeredgecolor='k', markersize=marker_size)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

In [None]:
# Display the standardized sample matrix built in the demo cell.
X

In [None]:
# Number of rows (samples) in X.
len(X)

[sklearn.cluster.DBSCAN — scikit-learn 0.15-git documentation](http://scikit-learn.org/0.15/modules/generated/sklearn.cluster.DBSCAN.html)

In [None]:
from sklearn.cluster import DBSCAN

# Sweep DBSCAN over eps = 0.1 * (1..29) and minPts = 1..19 on X and list every
# sample that was assigned to a cluster.
for eps in range(1, 30, 1):
    for minPts in range(1, 20):
        dbscan = DBSCAN(eps=eps * 0.1, min_samples=minPts).fit(X)
        labels = y_dbscan = dbscan.labels_

        # Show the results: print the label and the coordinates of each
        # sample, skipping the ones labelled as noise (-1).
        for cluster_id, sample in zip(labels, X):
            if cluster_id == -1:
                continue
            print(cluster_id, sample)


# タイタニックデータ

In [None]:
#https://pythonprogramming.net/static/downloads/machine-learning-data/titanic.xls
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import numpy as np
from sklearn.cluster import KMeans
from sklearn import preprocessing
import pandas as pd

# df0 holds the spreadsheet as loaded; the clustering cells later use it as the
# table that cluster labels are attached to for export.
df0 = pd.read_excel('titanic.xls')

# Working frame for clustering, without the 'body' and 'name' columns.
# `axis` is passed by keyword because current pandas releases reject it as a
# positional argument.
df = df0.drop(['body', 'name'], axis=1)

# NOTE: the former `df.convert_objects(convert_numeric=True)` statement was
# dropped.  Its return value was never assigned, so it did not change `df`,
# and the method no longer exists in current pandas (the call raised
# AttributeError).  Non-numeric columns are encoded by
# handle_non_numerical_data() in the next cell instead.

# Replace missing values by 0 so that every cell holds a value.
df.fillna(0, inplace=True)

# 名義尺度を数値に置き換える。

In [None]:
def handle_non_numerical_data(df):
    """Replace every non-numeric column of ``df`` by integer codes.

    Each distinct value of a non-numeric column is numbered in order of first
    appearance (0, 1, 2, ...), so the same input always yields the same
    encoding.  Columns with any numeric dtype are left untouched.

    Args:
        df: pandas DataFrame to encode.  It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    for column in df.columns.values:
        # Skip every numeric dtype, not only int64/float64: an int32 column
        # (for example) must keep its values instead of being re-numbered.
        if pd.api.types.is_numeric_dtype(df[column]):
            continue

        values = df[column].tolist()

        # Number the values in order of first appearance.  Numbering them in
        # set-iteration order would make the codes for strings differ from
        # one interpreter run to the next.
        codes = {}
        for value in values:
            codes.setdefault(value, len(codes))

        # Build the codes from the same sequence the mapping was built from,
        # so every lookup is guaranteed to hit.
        df[column] = [codes[value] for value in values]

    return df

# Encode the non-numeric columns of df as integers; the function modifies df
# and returns it.
df = handle_non_numerical_data(df)

In [None]:
# Feature matrix for clustering: every column of df except 'survived', cast to
# float.  `axis` is passed by keyword because current pandas releases reject
# it as a positional argument.
X = np.array(df.drop(['survived'], axis=1).astype(float))
X

# DBSCAN
[CSCE 420 Communication Project - DBSCAN - YouTube](https://www.youtube.com/watch?v=5E097ZLE9Sg&feature=youtu.be)  
[Brian Kent: Density Based Clustering in Python - YouTube](https://www.youtube.com/watch?v=5cOhL4B5waU&feature=youtu.be)

In [None]:
from sklearn.cluster import DBSCAN
import csv
#import codecs
# Sweep DBSCAN over eps / minPts on the feature matrix X and record every run
# that yields between 3 and 19 distinct labels (the noise label -1 counts as
# one of them):
#   * DBSCAN_summary.csv - one row per recorded run with the group sizes
#   * DBSCAN.xlsx        - one sheet per eps: a copy of df0 plus one label
#                          column per recorded minPts
trial_no = 0

# Both outputs are context-managed so they are closed on every exit path; for
# the workbook this replaces the explicit ExcelWriter.save() call, which
# current pandas releases no longer provide.
with pd.ExcelWriter('DBSCAN.xlsx') as writer:
    with open('DBSCAN_summary.csv', 'w', newline='') as csv_fd:
        csv_writer = csv.writer(csv_fd)
        csv_writer.writerow(['eps', 'minPts', 'label count', 'unclustered', 'cluster 1', 'cluster 2', '...'])

        # 1. Density is expressed by the distance to neighbouring points (eps).
        for eps in range(1, 100):
            # df1: frame that is written out for this eps.  It is a fresh copy
            # so that label columns neither accumulate in df0 nor carry over
            # from a previous eps into this sheet.
            df1 = df0.copy()
            sheet_has_labels = False

            # 2. minPts is DBSCAN's min_samples; samples that end up in no
            #    group are labelled -1.
            # NOTE(review): the list used to be [10, 100, 10], which fitted
            # minPts=10 twice with identical results; range(10, 100, 10) may
            # have been intended - confirm.
            for minPts in (10, 100):
                dbscan = DBSCAN(eps=eps, min_samples=minPts).fit(X)
                labels = dbscan.labels_
                label_count = len(set(labels))

                # Record only runs with 3 to 19 distinct labels.  Two labels
                # would just be "unclustered" plus a single cluster.
                if not 2 < label_count < 20:
                    continue

                df1['minPts={}'.format(minPts)] = labels
                sheet_has_labels = True
                trial_no += 1
                print('{}: eps = {:.1f}, minPts = {}, 異なり数 = {}'.format(trial_no, eps, minPts, label_count))

                # Shifting the labels by one puts the noise count (label -1)
                # at index 0 and cluster k at index k + 1.  The noise count is
                # therefore always present, even when it is 0, which keeps the
                # row aligned with the 'unclustered' header column.
                count_list = np.bincount(labels + 1).tolist()
                summary_row = [eps, minPts, label_count] + count_list
                csv_writer.writerow(summary_row)
                print(repr(summary_row))

            # One sheet per eps that produced at least one recorded run.
            # TODO: also write the group sizes to the sheet as a list.
            if sheet_has_labels:
                df1.to_excel(writer, sheet_name='eps={}'.format(eps))

# Hierarchical Clustering

In [None]:
# Number of rows in the feature matrix X.
len(X)

[sklearn.cluster.AgglomerativeClustering — scikit-learn 0.19.0 documentation](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html)

affinity : string or callable, default: “euclidean”
Metric used to compute the linkage. Can be “euclidean”, “l1”, “l2”, “manhattan”, “cosine”, or ‘precomputed’. If linkage is “ward”, only “euclidean” is accepted.

linkage : {“ward”, “complete”, “average”}, optional, default: “ward”
Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observations. The algorithm will merge the pairs of clusters that minimize this criterion.
ward minimizes the variance of the clusters being merged.
average uses the average of the distances of each observation of the two sets.
complete or maximum linkage uses the maximum distances between all observations of the two sets.

In [None]:
from sklearn.cluster import AgglomerativeClustering
import csv
import collections

# Run agglomerative (bottom-up) hierarchical clustering on X for every
# affinity / linkage pair that passes the filter below and for
# n_clusters = 20 down to 2:
#   * AgglomerativeClustering_summary_<affinity>_<linkage>.csv - one row per
#     n_clusters with the size of each cluster
#   * AgglomerativeClustering.xlsx - one sheet per pair: a copy of df0 plus
#     one label column per n_clusters
trial_no = 0

print("Dataset X")

affinity_list = ['euclidean', 'l1', 'l2', 'manhattan', 'cosine']
linkage_list = ['ward', 'complete', 'average']

# The workbook is context-managed so it is written and closed on exit; this
# replaces the explicit ExcelWriter.save() call, which current pandas releases
# no longer provide.
with pd.ExcelWriter('AgglomerativeClustering.xlsx') as writer:
    for affinity in affinity_list:
        for linkage in linkage_list:
            # 'euclidean' is run with 'ward' only and 'ward' with 'euclidean'
            # only; every other affinity is run with 'complete' and 'average'.
            if (affinity == 'euclidean') != (linkage == 'ward'):
                continue
            print('affinity = {}, linkage = {}'.format(affinity, linkage))

            # df1: frame that is written out for this pair.  It is a fresh
            # copy so that label columns from other cells or other pairs do
            # not end up in this sheet and df0 itself is left unchanged.
            df1 = df0.copy()

            with open('AgglomerativeClustering_summary_{}_{}.csv'.format(affinity, linkage), 'w', newline='') as csv_fd:
                csv_writer = csv.writer(csv_fd)
                csv_writer.writerow(['n_clusters', 'cluster 1', 'cluster 2', '...'])
                for n in range(20, 1, -1):
                    # NOTE(review): newer scikit-learn releases rename the
                    # `affinity` keyword to `metric` - verify against the
                    # installed version before upgrading.
                    labels = AgglomerativeClustering(n_clusters=n, affinity=affinity,
                                                     linkage=linkage).fit_predict(X)
                    df1['n={}'.format(n)] = labels
                    print(len(labels))
                    print('n_clusters: {}'.format(n))

                    # Size of each cluster 0 .. n-1, counted once and reused
                    # for the printout and the summary row.
                    count_list = np.bincount(labels, minlength=n).tolist()
                    print(*["  Cluster "+str(i)+": "+ str(size) for i, size in enumerate(count_list)], sep='\n')

                    # Summary information
                    trial_no += 1
                    csv_writer.writerow([n] + count_list)

            # Sheet name: <affinity>_<linkage>
            df1.to_excel(writer, sheet_name='{}_{}'.format(affinity, linkage))

In [None]:
# Print pandas' summary of df0.
df0.info()