# Import Libraries

In [1]:
import configparser
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import operator
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Read Config File

In [2]:
config = configparser.ConfigParser()
config.read('config.ini')
input_directory = config['DEFAULT']['Input-Files-Directory']
input_file = config['DEFAULT']['Input_Data']
pca_no = config['DEFAULT']['PCA_Component_No']
kmeans_ini_status = config['DEFAULT']['Kmeans_Init_Status']
kmeans_cluster_no = config['DEFAULT']['Kmean_Cluster_No']
kmeans_random_state = config['DEFAULT']['Kmeans_Random_State']
ref_cluster_result_file = config['DEFAULT']['Ref-Cluster-Result-File']
output_file = config['DEFAULT']['Output-File']

# Apply Principal Component Analysis (PCA)

In [3]:
df = pd.read_csv("{}/{}.csv".format(input_directory, input_file), sep=',', encoding='utf-8')
features = df.columns.tolist()
features.remove('code')
features.remove('pops')
features.remove('hhs')
features.remove('p15')
features.remove('Median Age')
features.remove('Median Monthly Mortgage')
features.remove('Median Ind Income')
features.remove('Median Weekly Rent')
features.remove('Median Fam Income')
features.remove('Median Hhd Income')
features.remove('Av Household Size')

x = df.loc[:, features].values

# set number of components
num = int(pca_no)

pca = PCA(n_components=num)
pc_columns = []
for i in range(1,num+1): 
    pc_columns.append('pc{}'.format(i))

principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, columns = pc_columns)

# K-means clustering

In [4]:
kmeans = KMeans(
    init = kmeans_ini_status,
    n_clusters = int(kmeans_cluster_no),
    random_state = int(kmeans_random_state)
)

kmeans.fit(principalDf)

# save kmean clustering result to df
df['cluster'] = kmeans.labels_

# convert data types
df['code'] = df['code'].astype(str)
df['cluster'] = df['cluster'].astype(int)
df.rename(columns={'code':'sa1_7digitcode_2016'}, inplace=True)

# renmae clusters for visualizing clusters on map by using NationalMap
for index, row in df.iterrows():
    cluster_no = row['cluster']
    df['cluster'].iloc[index] = "{}_cluster".format(cluster_no)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


# Change cluster number to meaningful name

In [5]:
def get_cluster_name(df, LW_df, cluster_name):
    """ rename cluster by refering exist clustered data """
    cluster_name_dict = {}
    for i in range(len(set(df.cluster))): 
        selected_df1 = df.loc[df.cluster == "{}_cluster".format(i)]
        max_sa1_matched = 0
        for c in cluster_name:
            selected_df2 = LW_df.loc[LW_df.cluster == c].astype(str) # covert data type as str
            common_sa1 = list(set(selected_df1.sa1_7digitcode_2016.tolist()).intersection(selected_df2.sa1_7digitcode_2016.tolist()))
            if max_sa1_matched < len(common_sa1):
                max_sa1_matched = len(common_sa1)
                matched_cluster = c
        cluster_name_dict["{}_cluster".format(i)] = matched_cluster
        
    return cluster_name_dict

In [6]:
def replace_cluster_number_to_name(df, LW_df, cluster_name):
    cluster_name_dict = get_cluster_name(df, LW_df, cluster_name)

    # replace cluster number to name
    for index, row in df.iterrows():
        df['cluster'].iloc[index] = cluster_name_dict[row['cluster']]
    return df

In [7]:
LW_df = pd.read_csv('{}.csv'.format(ref_cluster_result_file), sep=',', encoding='utf-8')
cluster_name = list(set(LW_df.cluster))
LW_df.sort_values(by=['sa1_7digitcode_2016'], inplace=True)
df.sort_values(by=['sa1_7digitcode_2016'], inplace=True)
df = replace_cluster_number_to_name(df, LW_df, cluster_name)

# Save Result

In [8]:
df.to_csv('{}.csv'.format(output_file), sep=',', encoding='utf-8', index=False)

# Evaluate result by comparing exist cluster result

In [9]:
new_df = pd.read_csv('{}.csv'.format(output_file), sep=',', encoding='utf-8')
new_df.sort_values(by=['sa1_7digitcode_2016'], inplace=True)

exist_df = pd.read_csv('{}.csv'.format(ref_cluster_result_file), sep=',', encoding='utf-8')
exist_df.sort_values(by=['sa1_7digitcode_2016'], inplace=True)

merged_df = pd.merge(exist_df, new_df, on='sa1_7digitcode_2016')
count = 0
for index, row in merged_df.iterrows():
    if row['cluster_x'] != row['cluster_y']:
        count += 1

print("{}% of SA1 have identical cluster in the two results".format(round((1 - (count/len(merged_df)))*100, 2)))

95.82% of SA1 have identical cluster in the two results
