# Import Libraries

In [7]:
import configparser
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import operator
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Read Config File

In [8]:
config = configparser.ConfigParser()
config.read('config.ini')
input_directory = config['DEFAULT']['Input-Files-Directory']
input_file = config['DEFAULT']['Input_Data']
pca_no = config['DEFAULT']['PCA_Component_No']
kmeans_ini_status = config['DEFAULT']['Kmeans_Init_Status']
kmeans_cluster_no = config['DEFAULT']['Kmean_Cluster_No']
kmeans_ini_no = config['DEFAULT']['Kmeans_Init_No']
kmeans_max = config['DEFAULT']['Kmeans_Max_iter']
kmeans_random_state = config['DEFAULT']['Kmeans_Random_State']
output_file = config['DEFAULT']['Output-File']

# Apply Principal Component Analysis (PCA)

In [9]:
df = pd.read_csv("{}/{}.csv".format(input_directory, input_file), sep=',', encoding='utf-8')
features = df.columns.tolist()
features.remove('code')
features.remove('pops')
features.remove('hhs')
features.remove('p15')
features.remove('Median Age')
features.remove('Median Monthly Mortgage')
features.remove('Median Ind Income')
features.remove('Median Weekly Rent')
features.remove('Median Fam Income')
features.remove('Median Hhd Income')
features.remove('Av Household Size')

x = df.loc[:, features].values

# set number of components
num = int(pca_no)

pca = PCA(n_components=num)
pc_columns = []
for i in range(1,num+1): 
    pc_columns.append('pc{}'.format(i))

principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(data = principalComponents, columns = pc_columns)

# K-means clustering

In [10]:
kmeans = KMeans(
    init = kmeans_ini_status,
    n_clusters = int(kmeans_cluster_no),
    n_init = int(kmeans_ini_no),
    max_iter = int(kmeans_max),
    random_state = int(kmeans_random_state)
)

kmeans.fit(principalDf)

# save kmean clustering result to df
df['cluster'] = kmeans.labels_

# convert data types
df['code'] = df['code'].astype(str)
df['cluster'] = df['cluster'].astype(int)
df.rename(columns={'code':'sa1_7digitcode_2016'}, inplace=True)

# renmae clusters for visualizing clusters on map by using NationalMap
for index, row in df.iterrows():
    cluster_no = row['cluster']
    df['cluster'].iloc[index] = "{}_cluster".format(cluster_no)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


# Change cluster number to meaningful name

In [11]:
for index, row in df.iterrows():
    if row['cluster'] == '0_cluster':
        df['cluster'].iloc[index] = 'Remote'
    elif row['cluster'] == '1_cluster':
        df['cluster'].iloc[index] = 'Semi Rural'
    elif row['cluster'] == '2_cluster':
        df['cluster'].iloc[index] = 'Country Towns'
    elif row['cluster'] == '3_cluster':
        df['cluster'].iloc[index] = 'Fringe_Growth'
    elif row['cluster'] == '4_cluster':
        df['cluster'].iloc[index] = 'Migrants'
    elif row['cluster'] == '5_cluster':
        df['cluster'].iloc[index] = 'Rural'
    elif row['cluster'] == '6_cluster':
        df['cluster'].iloc[index] = 'Inner city'
    elif row['cluster'] == '7_cluster':
        df['cluster'].iloc[index] = 'Leafy Suburbs'
    elif row['cluster'] == '8_cluster':
        df['cluster'].iloc[index] = 'Inner Suburbs'
    elif row['cluster'] == '9_cluster':
        df['cluster'].iloc[index] = 'CBD'

# Save Result

In [12]:
df.to_csv('{}.csv'.format(output_file), sep=',', encoding='utf-8', index=False)