In [None]:
from passengers import Passengers, Line

# One entry per railway line: (spreadsheet path, line name, kept row).
# Rows 0..kept_row-1 and the three rows after kept_row are skipped when
# reading; kept_row itself is retained (presumably the header row -- TODO
# confirm against the spreadsheets).
LINE_SOURCES = [
    ('data/t091307.xlsx', '田園都市線', 11),
    ('data/t091306.xlsx', '東横線', 12),
    ('data/t091305.xlsx', '京浜急行線', 12),
    ('data/t091304.xlsx', '相模鉄道線', 12),
    ('data/t091303.xlsx', 'みなとみらい線', 11),
    ('data/t091302.xlsx', '金沢シーサイドライン', 11),
    ('data/t091203.xlsx', 'グリーンライン', 11),
    ('data/t091202.xlsx', 'ブルーライン', 11),
]

# Build one Line description per source, keyed by line name.
lines = {}
for file_name, line_name, kept_row in LINE_SOURCES:
    # Fresh lists for every Line so no two Line objects share an argument.
    skip_rows = list(range(0, kept_row)) + list(range(kept_row + 1, kept_row + 4))
    use_cols = list(range(1, 100))
    lines[line_name] = Line(file_name, line_name, skip_rows, use_cols)

# Load every line into a single Passengers container and preview the result.
ps = Passengers()
for line in lines.values():
    ps.read_csv(line=line)
display(ps.passengers.head())

In [None]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans, KShape
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

In [None]:
# Gather every column whose name matches the pattern below, then drop them
# from ps.passengers in a single call.
total_columns = [
    name for name in ps.passengers.columns
    if re.match(".*総数.*", name)
]
ps.passengers = ps.passengers.drop(total_columns, axis=1)

In [None]:
import logging
# Install the default handler on the root logger, then raise the root level
# so that only WARNING and above are emitted (logger.info calls are silent).
logging.basicConfig()
logger = logging.getLogger()
logger.setLevel(logging.WARNING)

import itertools
import calendar

from collections import Counter

class Visualizer(object):
    """Draw the columns of a DataFrame as scatter or line charts.

    Each column name is given a marker/color pair the first time it is
    drawn and keeps that pair for later ``draw_graphs`` calls made on the
    same instance.
    """

    def __init__(self):
        # Endless cycles of matplotlib marker and color codes.
        self._markers = itertools.cycle([
            'o', 'v', '^', '<', '>', '1', '2', '3', '4', '8',
            's', 'p', '*', 'h', 'H', '+', 'x', 'D', 'd', 'P'])
        self._colors = itertools.cycle(['b', 'g', 'r', 'c', 'm', 'y'])

        # column name -> {'marker': ..., 'color': ...}
        self._name_to_marker = {}

    def _make_xticks_and_xlabels(self, xs_ticks, xtick_step, xs_labels):
        """Pick the x ticks and labels that fall on a step boundary.

        Args:
            xs_ticks: Tick positions, paired element-wise with ``xs_labels``.
            xtick_step: ``'h'`` keeps entries whose minute is 0, ``'d'``
                additionally requires hour 0, ``'m'`` additionally requires
                day 1. Any other value selects nothing.
            xs_labels: Objects exposing ``minute``, ``hour``, ``day`` and
                ``weekday()`` (for example ``datetime.datetime``).

        Returns:
            Tuple ``(shown_xticks, shown_xlabels)``; each label is
            ``'<str(label)> (<weekday abbreviation>)'``.
        """
        shown_xticks = []
        shown_xlabels = []

        for xtick, xlabel in zip(xs_ticks, xs_labels):
            on_minute = xlabel.minute == 0
            on_hour = xlabel.hour == 0
            on_day = xlabel.day == 1

            # Each coarser step requires all finer boundaries as well.
            selected = {
                'h': on_minute,
                'd': on_minute and on_hour,
                'm': on_minute and on_hour and on_day,
            }.get(xtick_step, False)

            if selected:
                shown_xticks.append(xtick)
                shown_xlabels.append(
                    '{} ({})'.format(str(xlabel), calendar.day_abbr[xlabel.weekday()]))
                logger.info('Adding xtick, xlabel: {}, {}'.format(xtick, xlabel))

        return shown_xticks, shown_xlabels

    def draw_graphs(self, df, cnt=None, scatter=True,
                    xlabel='Time', xtick_step='m',
                    ylabel='Power Consumption', ylog=False,
                    figsize=(12,8), dotsize=3, alpha=0.3,
                    xlim=False):
        """Plot every column of ``df`` against ``df.index`` on a new figure.

        Args:
            df: DataFrame to plot. Its index entries are also used to build
                the x ticks, so they must expose ``minute``, ``hour``,
                ``day`` and ``weekday()``.
            cnt: Optional mapping from column name to a value appended to
                the legend entry as ``'<name>: <value>'``. Columns missing
                from the mapping are shown with 0.
            scatter: Draw with ``plt.scatter`` when true, ``plt.plot``
                otherwise.
            xlabel: Label of the x axis.
            xtick_step: Tick granularity, see ``_make_xticks_and_xlabels``.
            ylabel: Label of the y axis.
            ylog: Use a logarithmic y axis when true.
            figsize: Figure size passed to ``plt.subplots``.
            dotsize: Marker size.
            alpha: Opacity of the drawn series.
            xlim: Passed to ``plt.xlim`` when truthy.
        """
        plt.subplots(1, figsize=figsize)

        # Optional logarithmic y axis.
        if ylog:
            plt.yscale('log')

        xs_labels = df.index

        for ys_col in df.columns.tolist():
            ys = df[ys_col]

            # Remember the marker/color chosen for this series name.
            if ys_col not in self._name_to_marker:
                self._name_to_marker[ys_col] = {
                    'marker': next(self._markers),
                    'color': next(self._colors)}
            style = self._name_to_marker[ys_col]

            # Append the extra per-series information when cnt is given.
            # A series missing from cnt (e.g. a cluster with no members)
            # is reported as 0 instead of raising KeyError.
            if cnt:
                ys_label = '{}: {}'.format(ys_col, cnt.get(ys_col, 0))
            else:
                ys_label = ys_col

            if scatter:
                plt.scatter(
                    xs_labels, ys,
                    marker=style['marker'],
                    c=style['color'],
                    s=dotsize, alpha=alpha,
                    label=ys_label)
            else:
                plt.plot(
                    xs_labels, ys,
                    marker=style['marker'],
                    c=style['color'],
                    markersize=dotsize, alpha=alpha,
                    label=ys_label)

            logger.info('Plotting graph: {}'.format(ys_label))

        # The series are plotted against the index values themselves, so
        # the index values also serve as tick positions.
        shown_xticks, shown_xlabels = self._make_xticks_and_xlabels(
            xs_labels, xtick_step, xs_labels)
        plt.xticks(shown_xticks, shown_xlabels, rotation=90)
        plt.xlabel(xlabel)
        plt.ylabel(ylabel)

        if xlim:
            plt.xlim(xlim)

        plt.legend(loc='upper left', ncol=2, markerscale=3)

        plt.tight_layout()

        # Render the finished figure.
        plt.show()
 
# Create the Visualizer instance used by the plotting cells below.
viz = Visualizer()

# 標準化する場合

In [None]:
# Transpose so that every column of ps.passengers becomes one time series,
# then rescale each series with the mean/variance scaler (mu=0, std=1).
yss = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(
    to_time_series_dataset(ps.passengers.values.T)
)

In [None]:
# Clustering settings; the cells below read n_clusters, n_init and rand_seed.
n_clusters, n_init, rand_seed = 8, 3, 13

# k-Shape clustering of the series held in yss.
ks = KShape(
    n_clusters=n_clusters,
    n_init=n_init,
    verbose=True,
    random_state=rand_seed,
)

ks.fit(yss)

In [None]:
# One column per k-Shape centroid, one row per time step. Only the first
# dimension of every centroid point is kept.
columns = ['cluster-{}'.format(idx) for idx in range(n_clusters)]
clusters = np.asarray(ks.cluster_centers_)[:, :, 0].T
df_clusters = pd.DataFrame(clusters, columns=columns,
                           index=ps.passengers.index)

# Number of series assigned to each cluster. Every cluster index gets an
# entry (0 when nothing was assigned) so the legend lookup cannot fail.
cnt = Counter(ks.labels_)
cluster_labels = {
    'cluster-{}'.format(idx): cnt[idx] for idx in range(n_clusters)
}

# Plot all centroids together. The y label is passed explicitly because
# draw_graphs defaults to 'Power Consumption'.
viz.draw_graphs(df_clusters, cnt=cluster_labels,
                xtick_step='m',
                ylabel='Cluster centroid (passengers)',
                alpha=0.4)

In [None]:
# Draw every centroid on its own figure. Iterate over n_clusters rather
# than a literal so the loop follows the clustering configuration.
for i in range(n_clusters):
    target_clusters = ['cluster-{}'.format(i)]
    filtered_df_clusters = df_clusters[target_clusters]

    # The y label is passed explicitly because draw_graphs defaults to
    # 'Power Consumption'.
    viz.draw_graphs(filtered_df_clusters,
                    cnt=cluster_labels,
                    xtick_step='m',
                    scatter=False,
                    ylabel='Cluster centroid (passengers)',
                    alpha=0.4)

In [None]:
# List the ps.passengers columns assigned to each k-Shape cluster.
# labels_ holds one label per series, in the same order as the columns.
for i in range(n_clusters):
    print("cluster " + str(i))
    members = [
        column
        for column, label in zip(ps.passengers.columns, ks.labels_)
        if label == i
    ]
    print(members)

In [None]:
# Distance used by the k-means variant below.
clustering_metric = 'euclidean'

# Time-series k-means on the same data, reusing the settings defined for
# the k-Shape run (n_clusters, n_init, rand_seed).
km = TimeSeriesKMeans(
    n_clusters=n_clusters,
    n_init=n_init,
    metric=clustering_metric,
    random_state=rand_seed,
)

km.fit(yss)

In [None]:
# One column per k-means centroid, one row per time step. Only the first
# dimension of every centroid point is kept.
columns = ['cluster-{}'.format(idx) for idx in range(n_clusters)]
clusters = np.asarray(km.cluster_centers_)[:, :, 0].T
df_clusters = pd.DataFrame(clusters, columns=columns,
                           index=ps.passengers.index)

# Number of series assigned to each cluster. Every cluster index gets an
# entry (0 when nothing was assigned) so the legend lookup cannot fail.
cnt = Counter(km.labels_)
cluster_labels = {
    'cluster-{}'.format(idx): cnt[idx] for idx in range(n_clusters)
}

# Plot all centroids together. The y label is passed explicitly because
# draw_graphs defaults to 'Power Consumption'.
viz.draw_graphs(df_clusters, cnt=cluster_labels,
                xtick_step='m',
                ylabel='Cluster centroid (passengers)',
                alpha=0.4)

In [None]:
# Draw every centroid on its own figure. Iterate over n_clusters rather
# than a literal so the loop follows the clustering configuration.
for i in range(n_clusters):
    target_clusters = ['cluster-{}'.format(i)]
    filtered_df_clusters = df_clusters[target_clusters]

    # The y label is passed explicitly because draw_graphs defaults to
    # 'Power Consumption'.
    viz.draw_graphs(filtered_df_clusters,
                    cnt=cluster_labels,
                    xtick_step='m',
                    scatter=False,
                    ylabel='Cluster centroid (passengers)',
                    alpha=0.4)

In [None]:
# List the ps.passengers columns assigned to each k-means cluster.
# labels_ holds one label per series, in the same order as the columns.
for i in range(n_clusters):
    print("cluster " + str(i))
    members = [
        column
        for column, label in zip(ps.passengers.columns, km.labels_)
        if label == i
    ]
    print(members)

# 標準化しない場合

In [None]:
# Rebuild the dataset from the raw values: transposing makes every column of
# ps.passengers one time series.
yss = to_time_series_dataset(ps.passengers.values.T)
# Standardization is intentionally left disabled for this section:
# yss = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0).fit_transform(yss)

In [None]:
# Clustering settings; the cells below read n_clusters, n_init and rand_seed.
n_clusters, n_init, rand_seed = 8, 3, 13

# k-Shape clustering of the series held in yss.
# NOTE(review): yss is not standardized in this section, while k-Shape is
# normally applied to z-normalized series -- confirm this is intended.
ks = KShape(
    n_clusters=n_clusters,
    n_init=n_init,
    verbose=True,
    random_state=rand_seed,
)

ks.fit(yss)

In [None]:
# One column per k-Shape centroid, one row per time step. Only the first
# dimension of every centroid point is kept.
columns = ['cluster-{}'.format(idx) for idx in range(n_clusters)]
clusters = np.asarray(ks.cluster_centers_)[:, :, 0].T
df_clusters = pd.DataFrame(clusters, columns=columns,
                           index=ps.passengers.index)

# Number of series assigned to each cluster. Every cluster index gets an
# entry (0 when nothing was assigned) so the legend lookup cannot fail.
cnt = Counter(ks.labels_)
cluster_labels = {
    'cluster-{}'.format(idx): cnt[idx] for idx in range(n_clusters)
}

# Plot all centroids together. The y label is passed explicitly because
# draw_graphs defaults to 'Power Consumption'.
viz.draw_graphs(df_clusters, cnt=cluster_labels,
                xtick_step='m',
                ylabel='Cluster centroid (passengers)',
                alpha=0.4)

In [None]:
# Draw every centroid on its own figure. Iterate over n_clusters rather
# than a literal so the loop follows the clustering configuration.
for i in range(n_clusters):
    target_clusters = ['cluster-{}'.format(i)]
    filtered_df_clusters = df_clusters[target_clusters]

    # The y label is passed explicitly because draw_graphs defaults to
    # 'Power Consumption'.
    viz.draw_graphs(filtered_df_clusters,
                    cnt=cluster_labels,
                    xtick_step='m',
                    scatter=False,
                    ylabel='Cluster centroid (passengers)',
                    alpha=0.4)

In [None]:
# List the ps.passengers columns assigned to each k-Shape cluster.
# labels_ holds one label per series, in the same order as the columns.
for i in range(n_clusters):
    print("cluster " + str(i))
    members = [
        column
        for column, label in zip(ps.passengers.columns, ks.labels_)
        if label == i
    ]
    print(members)

In [None]:
# Distance used by the k-means variant below.
clustering_metric = 'euclidean'

# Time-series k-means on the same data, reusing the settings defined for
# the k-Shape run (n_clusters, n_init, rand_seed).
km = TimeSeriesKMeans(
    n_clusters=n_clusters,
    n_init=n_init,
    metric=clustering_metric,
    random_state=rand_seed,
)

km.fit(yss)

In [None]:
# One column per k-means centroid, one row per time step. Only the first
# dimension of every centroid point is kept.
columns = ['cluster-{}'.format(idx) for idx in range(n_clusters)]
clusters = np.asarray(km.cluster_centers_)[:, :, 0].T
df_clusters = pd.DataFrame(clusters, columns=columns,
                           index=ps.passengers.index)

# Number of series assigned to each cluster. Every cluster index gets an
# entry (0 when nothing was assigned) so the legend lookup cannot fail.
cnt = Counter(km.labels_)
cluster_labels = {
    'cluster-{}'.format(idx): cnt[idx] for idx in range(n_clusters)
}

# Plot all centroids together. The y label is passed explicitly because
# draw_graphs defaults to 'Power Consumption'.
viz.draw_graphs(df_clusters, cnt=cluster_labels,
                xtick_step='m',
                ylabel='Cluster centroid (passengers)',
                alpha=0.4)

In [None]:
# Draw every centroid on its own figure. Iterate over n_clusters rather
# than a literal so the loop follows the clustering configuration.
for i in range(n_clusters):
    target_clusters = ['cluster-{}'.format(i)]
    filtered_df_clusters = df_clusters[target_clusters]

    # The y label is passed explicitly because draw_graphs defaults to
    # 'Power Consumption'.
    viz.draw_graphs(filtered_df_clusters,
                    cnt=cluster_labels,
                    xtick_step='m',
                    scatter=False,
                    ylabel='Cluster centroid (passengers)',
                    alpha=0.4)

In [None]:
# List the ps.passengers columns assigned to each k-means cluster.
# labels_ holds one label per series, in the same order as the columns.
for i in range(n_clusters):
    print("cluster " + str(i))
    members = [
        column
        for column, label in zip(ps.passengers.columns, km.labels_)
        if label == i
    ]
    print(members)