# Import Libraries

In [1]:
import configparser
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import operator
import warnings
from datetime import datetime
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Read Config File

In [2]:
config = configparser.ConfigParser()
config.read('config.ini')
input_directory = config['DEFAULT']['Input-Files-Directory']
input_file = config['DEFAULT']['Input_Data']
pca_no = config['DEFAULT']['PCA_Component_No']
kmeans_ini_status = config['DEFAULT']['Kmeans_Init_Status']
kmeans_cluster_no = config['DEFAULT']['Kmean_Cluster_No']
ref_cluster_result_file = config['DEFAULT']['Ref-Cluster-Result-File']
iter_num = config['DEFAULT']['no_iteration']
output_file = config['DEFAULT']['Output-File']

# Read Input Files

In [3]:
df = pd.read_csv("{}/{}.csv".format(input_directory, input_file), sep=',', encoding='utf-8')
ref_df = pd.read_csv("{}.csv".format(ref_cluster_result_file), sep=',', encoding='utf-8')

# Apply Principal Component Analysis (PCA)

In [4]:
features = df.columns.tolist()
features.remove('code')
features.remove('pops')
features.remove('hhs')
features.remove('p15')
features.remove('Median Age')
features.remove('Median Monthly Mortgage')
features.remove('Median Ind Income')
features.remove('Median Weekly Rent')
features.remove('Median Fam Income')
features.remove('Median Hhd Income')
features.remove('Av Household Size')

x = df.loc[:, features].values

# set number of components
num = int(pca_no)

pca = PCA(n_components=num)
pc_columns = []
for i in range(1,num+1): 
    pc_columns.append('pc{}'.format(i))

principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, columns = pc_columns)

# K-means clustering

In [5]:
def get_cluster_name(df, ref_df, cluster_name):
    """ rename cluster by refering exist clustered data """
    cluster_name_dict = {}
    for i in range(len(set(df.cluster))): 
        selected_df1 = df.loc[df.cluster == i].astype(str) # covert data type as str
        max_sa1_matched = 0
        matched_cluster = cluster_name[0]
        for c in cluster_name:
            selected_df2 = ref_df.loc[ref_df.cluster == c].astype(str) # covert data type as str
            common_sa1 = list(set(selected_df1.code.tolist()).intersection(selected_df2.sa1_7digitcode_2016.tolist()))
            if max_sa1_matched < len(common_sa1):
                max_sa1_matched = len(common_sa1)
                matched_cluster = c
        cluster_name_dict[i] = matched_cluster
        
    return cluster_name_dict

In [6]:
def replace_cluster_number_to_name(df, ref_df, cluster_name, i):
    """ replace cluster number to cluster name. E.g. 1 -> Country Towns """
    cluster_name_dict = get_cluster_name(df, ref_df, cluster_name)
    for index, row in df.iterrows():
        df['cluster'].iloc[index] = cluster_name_dict[row['cluster']]
    df.rename(columns={"cluster": "result_{}".format(i)}, inplace=True)
    return df

In [7]:
def running_kmeans(principalDf, iter_num, df, ref_df, kmeans_ini_status, kmeans_cluster_no):
    cluster_name = list(set(ref_df.cluster))
    result_df = pd.DataFrame()
    result_df['code'] = df['code']
    start_time = datetime.now()

    for i in range(0, int(iter_num)):
        kmeans = KMeans(
            init=kmeans_ini_status,
            n_clusters= int(kmeans_cluster_no),
        )

        kmeans.fit(principalDf)
        result_df['cluster'] = kmeans.labels_ 
        result_df = replace_cluster_number_to_name(result_df, ref_df, cluster_name, i)
        end_time = datetime.now()
        print("Applyig {} kmeans with {} second proceeding times".format(i, end_time - start_time, 2))
    return result_df
    

In [8]:
def search_max_matching_cluster(principalDf, iter_num, df, ref_df, kmeans_ini_status, kmeans_cluster_no):
    """ Iterate kmean algoritm for searching max matching SA1 for cluster """
    result_df = running_kmeans(principalDf, iter_num, df, ref_df, kmeans_ini_status, kmeans_cluster_no)
    #find the most frequent value of each row 
    result_df['most_freq_cluster'] = ''
    result_df['most_freq_cluster'] = result_df.mode(axis=1) 
    return result_df

In [9]:
result_df = search_max_matching_cluster(principalDf, iter_num, df, ref_df, kmeans_ini_status, kmeans_cluster_no)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Applyig 0 kmeans with 0:00:20.142117 second proceeding times


  warn(f"Unable to sort modes: {err}")
  warn(f"Unable to sort modes: {err}")


In [11]:
final_df = result_df[['code', 'most_freq_cluster']]
final_df.rename(columns={"code": "sa1_7digitcode_2016", "most_freq_cluster": "cluster"}, inplace=True)
final_df.to_csv('{}.csv'.format(output_file), sep=',', encoding='utf-8', index=False)