# clustering-based analysis

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab runtime so the CSV paths under /content/drive resolve.
drive.mount('/content/drive')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def minMaxScaler(numArr):
    """Rescale values linearly so the smallest maps to 0 and the largest to 1.

    A pandas DataFrame is scaled column by column, so features measured in
    different units each end up in [0, 1]. Any other input (NumPy array,
    Series, list) is scaled with its single overall minimum and maximum.

    Params:
        numArr: DataFrame or array-like of numbers
    Returns: a new object holding the scaled values; the input is not modified.
        Where the range is zero (all values equal) the result is 0 instead
        of NaN.
    """
    if isinstance(numArr, pd.DataFrame):
        # Be explicit about axis=0: np.min(DataFrame) forwards axis=None, which
        # pandas >= 2.0 reduces over the whole frame instead of per column.
        minx = numArr.min(axis=0)
        span = numArr.max(axis=0) - minx
        # A constant column has zero range; divide by 1 so it scales to 0.
        span = span.mask(span == 0, 1)
    else:
        minx = np.min(numArr)
        span = np.max(numArr) - minx
        if span == 0:
            span = 1
    return (numArr - minx) / span

from sklearn.cluster import MiniBatchKMeans as MBK
from sklearn.cluster import KMeans

from ipywidgets import widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display
from IPython.core.interactiveshell import InteractiveShell

In [None]:
# Load the dataset; the first CSV column becomes the DataFrame index.
df = pd.read_csv('/content/drive/Shared drives/gh_new_zone/--.csv', index_col=0)

In [None]:
# Select the columns used as clustering features.
input_columns = ['start_time', 'rev_total', 'utime']

# For per-user analysis the groupby column is 'member_id';
# for per-zone analysis the groupby column is 'zone_id'.
# The feature columns are selected BEFORE averaging so that non-numeric columns
# never reach mean() (pandas >= 2.0 raises TypeError instead of dropping them).
groupby_df = df.groupby('member_id')[input_columns].mean()

# groupby_df keeps the raw per-group means; scaled_df holds the min-max scaled copy.
scaled_df = groupby_df.copy()
scaled_df[input_columns] = minMaxScaler(groupby_df[input_columns])

# A scatter preview is only drawn when there are exactly two features;
# it shows the first 1000 groups.
if len(input_columns) == 2:
    plt.scatter(groupby_df[input_columns[0]][:1000], groupby_df[input_columns[1]][:1000])

In [None]:
def optimalK(data, nrefs=3, maxClusters=7):
    """
    Estimates the KMeans cluster count using the Gap Statistic from
    Tibshirani, Walther, Hastie.

    For every k in 1..maxClusters the data is clustered with MiniBatchKMeans
    and its inertia is compared with the mean inertia of `nrefs` random
    reference datasets drawn uniformly from the bounding box of `data`.

    Params:
        data: ndarray of shape (n_samples, n_features)
        nrefs: number of sample reference datasets to create for each k
        maxClusters: Maximum number of clusters to test for
    Returns: (optimalK, resultsdf)
        optimalK: the k with the largest gap; if the gap never decreases
            between consecutive k, the fallback min(4, maxClusters)
        resultsdf: DataFrame with columns 'clusterCount' and 'gap',
            one row per tested k
    """
    data = np.asarray(data)
    cluster_counts = range(1, maxClusters + 1)
    gaps = np.zeros(len(cluster_counts))

    # The reference distribution must span the same per-feature range as the
    # observed data; sampling the unit cube is only valid for data already
    # scaled to [0, 1].
    lows = data.min(axis=0)
    highs = data.max(axis=0)

    for gap_index, k in enumerate(cluster_counts):
        # Dispersion (inertia) of each random reference dataset for this k.
        refDisps = np.zeros(nrefs)
        for i in range(nrefs):
            randomReference = np.random.uniform(lows, highs, size=data.shape)
            refDisps[i] = MBK(k).fit(randomReference).inertia_

        # Dispersion of the real data for the same k.
        origDisp = MBK(k).fit(data).inertia_

        # Gap statistic: how much tighter the real clustering is than the
        # reference clustering, on a log scale.
        gaps[gap_index] = np.log(np.mean(refDisps)) - np.log(origDisp)

    # Build the result frame in one go (DataFrame.append was removed in
    # pandas 2.0). clusterCount stays float, as the append-based frame had it.
    resultsdf = pd.DataFrame({
        'clusterCount': np.arange(1, maxClusters + 1, dtype=float),
        'gap': gaps,
    })

    if (np.diff(gaps) < 0).any():
        # Plus 1 because index 0 means 1 cluster is optimal, index 2 = 3 clusters.
        return (int(gaps.argmax()) + 1, resultsdf)
    # The gap never decreased, so there is no peak to pick; fall back to a
    # fixed guess (4, or 5), capped so it never exceeds the tested range.
    return (min(4, maxClusters), resultsdf)

# Estimate k on the min-max scaled features, i.e. the same feature space the
# KMeans fits in the following cells use (they cluster the values of scaled_df).
k, gapdf = optimalK(scaled_df[input_columns].to_numpy(), nrefs=1, maxClusters=7)
print('Optimal k is: ', k)
# Gap value for each tested cluster count (row 0 corresponds to k=1).
plt.plot(gapdf['gap'])

In [None]:
# Per-group feature vectors keyed by the group id (the index of the grouped frames):
#   dict_s -> min-max scaled features, dict_t -> unscaled features.
dict_s = {key: row for key, row in zip(scaled_df.index, scaled_df[input_columns].values)}
dict_t = {key: row for key, row in zip(groupby_df.index, groupby_df[input_columns].values)}

In [None]:
def auto_clust(a):
    """
    Cluster the groups into `a` KMeans clusters and display a summary table.

    KMeans is fitted on the scaled feature vectors in `dict_s`. The summary is
    computed from the unscaled vectors in `dict_t`, matched to the labels by
    position (both dicts are built from frames sharing the same index order).

    Params:
        a: number of clusters
    Returns: the result of display(); the table itself has one column per
        cluster ('cluster1'..'cluster<a>') and these rows:
        the mean of each feature, 'SD of <feature>' (sample standard
        deviation), and '# of member' (cluster size, as a string).
    """
    # A : Behavior
    X = list(dict_s.values())
    kmeans = KMeans(n_clusters=a, random_state=0).fit(X)

    values = np.array(list(dict_t.values()))
    feature_names = list(groupby_df.columns)[:values.shape[1]]
    members = pd.DataFrame(values, columns=feature_names)

    # Summarise per label and reindex by label so that every statistic,
    # including the member count, lines up with the right cluster column
    # (value_counts(sort=False) does not guarantee label order).
    cluster_ids = range(a)
    grouped = members.groupby(kmeans.labels_)
    means = grouped.mean().reindex(cluster_ids)
    sds = grouped.std().reindex(cluster_ids)
    counts = grouped.size().reindex(cluster_ids, fill_value=0)

    column_names = ['cluster' + str(i + 1) for i in cluster_ids]

    mean_rows = means.T
    mean_rows.columns = column_names

    sd_rows = sds.T
    sd_rows.columns = column_names
    sd_rows.index = ['SD of ' + name for name in feature_names]

    # Counts are kept as strings so they are not shown as floats in the table.
    count_row = pd.DataFrame(
        [counts.astype(str).tolist()],
        index=['# of member'],
        columns=column_names,
    )

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    cluster_info = pd.concat([mean_rows, sd_rows, count_row])
    return display(cluster_info)

# Slider-driven view of auto_clust: 2 to 10 clusters, starting at the k estimated above.
# (The slider label is Korean for "number of clusters".)
factory = interactive(
    auto_clust,
    a=widgets.IntSlider(min=2, max=10, value=k, description="클러스터 수"),
)
display(factory)

# clustering analysis by user

In [None]:
# Clustering dataframe for the per-user drill-down.
# NOTE(review): `scope` is not defined in the cells visible here. It is used as
# scope[0] = number of clusters and scope[1] = 1-based number of the cluster to
# inspect — confirm where it is set before running this cell.
X = list(dict_s.values())
kmeans = KMeans(n_clusters=scope[0], random_state=0).fit(X)
cluster_df = pd.DataFrame(np.array(list(dict_t.values())), index=list(dict_t))
cluster_df['cluster'] = kmeans.labels_
# KMeans labels are 0-based, hence scope[1] - 1.
id_list = cluster_df.loc[cluster_df['cluster'] == scope[1] - 1].index.tolist()
# All rows of the raw dataset that belong to members of the selected cluster.
scoped_df = df[df['member_id'].isin(id_list)]

In [None]:
# Cluster by region: row counts per region inside the selected cluster next to
# the counts over the whole dataset.
region2_df = pd.concat(
    [scoped_df['region'].value_counts(), df['region'].value_counts()],
    axis=1,
)
region2_df.columns = ['cluster', 'total']
# ratio = percentage of each region's rows that fall in the selected cluster;
# the regions with the highest share come first.
region2_df = region2_df.assign(
    ratio=100 * region2_df['cluster'] / region2_df['total']
).sort_values(by='ratio', ascending=False)
region2_df.head(20)

In [None]:
# Mean age inside the selected cluster vs. over the whole dataset, using only
# rows with age < 60.
# NOTE(review): the reason for the 60 cutoff is not stated here — presumably it
# drops outlier or invalid ages; confirm.
# Printed labels are Korean for "mean age of cluster ?" and
# "mean age of all usage records".
print("클러스터 ?번의 평균연령은 ", scoped_df.loc[scoped_df['age'] < 60, 'age'].mean(), sep='')
print("전체 이용건의 평균연령은 ", df.loc[df['age'] < 60, 'age'].mean(), sep='')

In [None]:
# Cluster by car model.
# Fraction of the selected cluster's rows per car model.
car_model_df = scoped_df['car_model'].value_counts() / len(scoped_df)

# Cars that appear in the selected cluster, computed once.
cluster_car_ids = scoped_df['car_id'].drop_duplicates()

# Percentage of rows per car model:
#   'cluster' -> within the selected cluster (filtering scoped_df by its own
#                car ids would be a no-op, so it is used directly)
#   'total'   -> over the whole dataset, restricted to cars seen in the cluster
# The keys give the two columns distinct labels; otherwise both Series carry
# the same name and the columns cannot be told apart.
pd.concat(
    [
        scoped_df['car_model'].value_counts() * 100 / len(scoped_df),
        df.loc[df['car_id'].isin(cluster_car_ids), 'car_model'].value_counts() * 100 / len(df),
    ],
    axis=1,
    keys=['cluster', 'total'],
)

# clustering analysis by zone

In [None]:
# Clustering dataframe for the per-zone drill-down, fixed at 15 clusters.
# NOTE(review): presumably the grouping cell must have been run with 'zone_id'
# so that the dict keys (and this index) are zone ids — confirm.
X = list(dict_s.values())
kmeans = KMeans(n_clusters=15, random_state=0).fit(X)
cluster_df = pd.DataFrame(
    np.array(list(dict_t.values())),
    index=list(dict_t),
    columns=input_columns,
)
cluster_df['cluster'] = kmeans.labels_

In [None]:
# Zone table; its 'id' column is matched against the index of cluster_df.
zone_df = pd.read_csv('/content/drive/Shared drives/gh_new_zone/--.csv')

# Ten most frequent regions among the zones of two clusters.
# NOTE(review): ㅇㅅㅇ and ㅎㅅㅎ are placeholder names that are not defined in the
# cells visible here — replace them with the two cluster labels to compare.
# NOTE(review): with default notebook settings only the last expression of a
# cell is shown, so the first result is presumably discarded — confirm.
zone_df.loc[
    zone_df['id'].isin(cluster_df.loc[cluster_df['cluster'] == ㅇㅅㅇ].index),
    'region',
].value_counts()[:10]
zone_df.loc[
    zone_df['id'].isin(cluster_df.loc[cluster_df['cluster'] == ㅎㅅㅎ].index),
    'region',
].value_counts()[:10]

In [None]:
# Side-by-side 'district_type' counts for the zones of two clusters.
# NOTE(review): ㅇㅅㅇ and ㅎㅅㅎ are placeholder names that are not defined in the
# cells visible here — replace them with the two cluster labels to compare.
pd.concat(
    [
        zone_df.loc[
            zone_df['id'].isin(cluster_df.loc[cluster_df['cluster'] == ㅇㅅㅇ].index),
            'district_type',
        ].value_counts(),
        zone_df.loc[
            zone_df['id'].isin(cluster_df.loc[cluster_df['cluster'] == ㅎㅅㅎ].index),
            'district_type',
        ].value_counts(),
    ],
    axis=1,
)