# Import Libraries

In [1]:
import configparser
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import operator
import warnings
from datetime import datetime
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Read Config File

In [18]:
config = configparser.ConfigParser()
config.read('config.ini')
input_directory = config['DEFAULT']['Input-Files-Directory']
CODE = config['DEFAULT']['Code']
input_file = config['DEFAULT']['Input_Data']
pca_no = config['DEFAULT']['PCA_Component_No']
kmeans_ini_status = config['DEFAULT']['Kmeans_Init_Status']
kmeans_cluster_no = int(config['DEFAULT']['Kmean_Cluster_No'])
ref_cluster_result_file = config['DEFAULT']['Ref-Cluster-Result-File']
aggre_cols_file = config['DEFAULT']['Aggre_cols_File']
iter_num = int(config['DEFAULT']['no_iteration'])
output_file = config['DEFAULT']['Output-File']

# Read Input Files

In [19]:
df = pd.read_csv("{}/{}.csv".format(input_directory, input_file), sep=',', encoding='utf-8')
ref_df = pd.read_csv("{}.csv".format(ref_cluster_result_file), sep=',', encoding='utf-8')
aggregate_df = pd.read_csv("{}.txt".format(aggre_cols_file), delimiter = ",", comment='#', header=0)

# Apply Principal Component Analysis (PCA)

In [4]:
features = df.columns.tolist()
features.remove(CODE)
features.remove('pops')
features.remove('hhs')
features.remove('p15')
features.remove('Median Age')
features.remove('Median Monthly Mortgage')
features.remove('Median Ind Income')
features.remove('Median Weekly Rent')
features.remove('Median Fam Income')
features.remove('Median Hhd Income')
features.remove('Av Household Size')

x = df.loc[:, features].values

# set number of components
num = int(pca_no)

pca = PCA(n_components=num)
pc_columns = []
for i in range(1,num+1): 
    pc_columns.append('pc{}'.format(i))

principalComponents = pca.fit_transform(x)
pca_df = pd.DataFrame(data = principalComponents, columns = pc_columns)

# K-means clustering

In [5]:
def get_cluster_name(df, ref_df, cluster_name):
    """ rename cluster by refering exist clustered data """
    cluster_name_dict = {}
    for i in range(len(set(df.cluster))): 
        selected_df1 = df.loc[df.cluster == i].astype(str) # covert data type as str
        max_sa1_matched = 0
        matched_cluster = cluster_name[0]
        for c in cluster_name:
            selected_df2 = ref_df.loc[ref_df.cluster == c].astype(str) # covert data type as str
            common_sa1 = list(set(selected_df1[CODE].tolist()).intersection(selected_df2[CODE].tolist()))
            if max_sa1_matched < len(common_sa1):
                max_sa1_matched = len(common_sa1)
                matched_cluster = c
        cluster_name_dict[i] = matched_cluster
        
    return cluster_name_dict

In [6]:
def replace_cluster_number_to_name(df, ref_df, cluster_name, i):
    """ replace cluster number to cluster name. E.g. 1 -> Country Towns """
    cluster_name_dict = get_cluster_name(df, ref_df, cluster_name)
    for index, row in df.iterrows():
        df['cluster'].iloc[index] = cluster_name_dict[row['cluster']]
    df.rename(columns={"cluster": "result_{}".format(i)}, inplace=True)
    return df

In [7]:
def running_kmeans(kmeans_cluster_no, kmeans_ini_status, pca_df, iter_num, df, ref_df):
    cluster_name = list(set(ref_df.cluster))
    result_df = pd.DataFrame()
    result_df[CODE] = df[CODE]
    start_time = datetime.now()

    for i in range(0, iter_num):
        kmeans = KMeans(
            init=kmeans_ini_status,
            n_clusters=kmeans_cluster_no,
        )
        kmeans.fit(pca_df)
        result_df['cluster'] = kmeans.labels_ 
        result_df = replace_cluster_number_to_name(result_df, ref_df, cluster_name, i)
        end_time = datetime.now()
        if i % 10 == 0:
            print("Applyig {} kmeans with {} second proceeding times".format(i, end_time - start_time, 2))
    return result_df
    

In [8]:
def find_top_matched_communities(df):
    """ read clustering results and save top matching communities in a new dataframe """
    new_df = pd.DataFrame()
    for index, row in df.iterrows():
        count_dic = {}
        for i in range(0, len(df.columns.tolist())-1):
            # read 100 times kmean clustering results and save into dictionary
            matched_community = row['result_{}'.format(i)]
            if matched_community not in count_dic.keys():
                count_dic[matched_community] = 1
            else:
                count_dic[matched_community] += 1 
        # sort the dictionary by value and save into new DF
        matching_result = sorted(count_dic.items(), key=lambda item: item[1], reverse=True)
        if len(matching_result) > 1:
            new_df = new_df.append({CODE: int(row[CODE]),
                                   'top1_community':matching_result[0][0],
                                    'top2_community':matching_result[1][0],
                                    'top1_community_rate':matching_result[0][1],
                                    'top2_community_rate':matching_result[1][1],
                                   }, ignore_index=True)
        else:
            new_df = new_df.append({CODE: int(row[CODE]),
                                   'top1_community':matching_result[0][0],
                                    'top2_community':'',
                                    'top1_community_rate':matching_result[0][1],
                                    'top2_community_rate':0,
                                   }, ignore_index=True)
    return new_df

In [9]:
clustering_result_df = running_kmeans(kmeans_cluster_no, kmeans_ini_status, pca_df, iter_num, df, ref_df)
updated_clustering_result_df = find_top_matched_communities(clustering_result_df)
result_df = pd.merge(df, updated_clustering_result_df, on=CODE)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Applyig 0 kmeans with 0:00:23.513353 second proceeding times


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, val

# Aggregate columns

After we get clustering results, we aggregate some variables(columns) to improve our analysability on clustered census data (Australia Community). 

In [20]:
def aggregate_columns(aggregate_df, df):
    copied_df = df.copy()
    row = 0
    for columns in aggregate_df['columns_to_aggregate']:
        total_n = 0
        for col in columns.split(','):
            total_n += copied_df[col]
            # drop column
            copied_df.drop(columns=[col], inplace=True)
        copied_df[aggregate_df['aggregated_column'][row]]  = total_n
        row += 1
    return copied_df

In [21]:
aggregated_df = aggregate_columns(aggregate_df, result_df)

### Change 'Remote' to 'Remote or Disadvantaged'

In [22]:
for index, row in aggregated_df.iterrows():
    if row['top1_community'] == 'Remote':
        aggregated_df['top1_community'].iloc[index] = 'Remote or Disadvantaged'
    elif row['top2_community'] == 'Remote':
        aggregated_df['top2_community'].iloc[index] = 'Remote or Disadvantaged'

In [25]:
aggregated_df.to_csv('{}.csv'.format(output_file), sep=',', encoding='utf-8', index=False)