# In [None]:
import csv
import json
import numpy as np
import math,random
from scipy.spatial.distance import pdist, squareform
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from pandas.tools.plotting import parallel_coordinates
import seaborn as sns

# Create a 16 x 9 figure with a grey background.
# NOTE(review): presumably the heatmap drawn at the end of the script lands
# on this figure because it is the current one - confirm.
fig = plt.figure(figsize=(16.0, 9.0), facecolor='grey')

def VAT(R):
    """
    Reorder a dissimilarity matrix with the VAT algorithm.

    VAT (Visual Assessment of cluster Tendency) orders the objects by growing
    a minimum spanning tree one object at a time (Prim's algorithm), so that
    objects which are close to each other end up next to each other in the
    reordered matrix.  Adapted from the matlab version:
    http://www.ece.mtu.edu/~thavens/code/VAT.m

    Args:
        R (n*n double): Dissimilarity data input
        R (n*D double): vector input; any non-square input is converted to a
            pairwise Euclidean distance matrix with ``pdist``.  A square
            input is always treated as a dissimilarity matrix, so n vectors
            with exactly n features must be converted by the caller.
    Returns:
        RV (n*n list of lists): VAT-reordered dissimilarity data
        C (n int): Connection indexes of MST in [0,n).  C[k] is the position
            in I of the already ordered object that I[k] is closest to; the
            first two entries are 0.
        I (n int): Reordered indexes of R, the input data in [0,n)
    Raises:
        ValueError: if R is not a non-empty 2-D array.
    """
    R = np.array(R)
    if R.ndim != 2 or R.shape[0] == 0:
        raise ValueError(
            "VAT expects a non-empty 2-D array, got shape %r" % (R.shape,))
    N, M = R.shape
    if N != M:
        R = squareform(pdist(R))

    if N == 1:
        # A single object has nothing to be ordered against.
        return R.tolist(), [0], [0]

    # Seed the ordering with one end of the largest dissimilarity.
    col_max = np.max(R, axis=0)
    row_of_max = np.argmax(R, axis=0)
    first = int(row_of_max[np.argmax(col_max)])

    # Objects not placed yet; always kept in ascending index order.
    remaining = list(range(N))
    del remaining[first]

    # The second object is the nearest neighbour of the seed.
    nearest = int(np.argmin(R[first, remaining]))
    I = [first, remaining.pop(nearest)]

    # Positions are 0-based, so the first two objects both connect to
    # position 0 (the matlab original stores the 1-based value 1 here).
    C = [0, 0]

    # Repeatedly append the unplaced object that is closest to any object
    # already placed, remembering which placed object it connects to.
    while remaining:
        # rows: placed objects (in order), columns: unplaced objects
        sub = R[I, :][:, remaining]
        closest_pos = np.argmin(sub, axis=0)
        j = int(np.argmin(np.min(sub, axis=0)))
        C.append(int(closest_pos[j]))
        I.append(remaining.pop(j))

    # Apply the ordering to both rows and columns.
    RV = R[I, :][:, I]

    return RV.tolist(), C, I

# Read the population records from the JSON file
with open('population_parse.json') as json_file:
    data = json.load(json_file)

# Read the school list from the CSV file, keeping two columns per school
school_name = []  # column 3 of every row: school name
s_suburb = []     # column 8 of every row: suburb the school is located in
with open('allschoolslist2016.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    # Ignore the header row
    next(reader)
    for row in reader:
        # append() grows the lists in place; `lst = lst + [item]` would copy
        # the whole list for every row.
        school_name.append(row[3])
        s_suburb.append(row[8])

# Lower-case the suburb of every school, then keep both the distinct names
# and the full sorted list.
lowered_suburb = [name.lower() for name in s_suburb]
unique_suburb = set(lowered_suburb)
sorted_suburb = sorted(lowered_suburb)

# Count the schools of every suburb in a single pass.
school_count = {}
for name in sorted_suburb:
    school_count[name] = school_count.get(name, 0) + 1

# loc_sch -> one [suburb, number of schools in that suburb] pair per distinct
# suburb, in the iteration order of unique_suburb.
loc_sch = [[name, school_count[name]] for name in unique_suburb]
#------------------------------------------------------------------------------#
# Strip every space from the suburb names (updates loc_sch in place).
for pair in loc_sch:
    pair[0] = pair[0].replace(' ', '')

# Split the pairs into two parallel lists: suburb names and school counts.
sch_loc = [pair[0] for pair in loc_sch]
sch_num = [pair[1] for pair in loc_sch]

# loc_and_sch -> [list of suburbs, list of school counts]
loc_and_sch = [sch_loc, sch_num]
    
loc = []
pop = []
# Collect the cleaned suburb name and the population of every record whose
# population is not "0".
for feature in data['melbourne']:
    # Keep both fields as text.  Encoding them to UTF-8 would produce bytes
    # on Python 3, where the comparison with "0" is always unequal and
    # .replace(' ', '') raises TypeError.
    p_suburb = feature['suburb']
    people = feature['population']
    if people != "0":
        # Remove spaces, dashes, the 'Vic.' marker and parentheses (in that
        # order), then lower-case the name.
        p_suburb = p_suburb.replace(' ','').replace('-','')\
            .replace('Vic.','').replace('(','').replace(')','').lower()
        loc.append(p_suburb)
        pop.append(int(people))
# loc_and_pop -> [list of suburbs, list of populations]
loc_and_pop = [loc, pop]

#------------------------------------------------------------------------------#
# Number of schools per suburb name, looked up by name.  Values are paired
# with their names by position (not with list.index(), which always returns
# the first occurrence); if the same name appears more than once its counts
# are added together.
schools_by_suburb = {}
for suburb, count in zip(loc_and_sch[0], loc_and_sch[1]):
    schools_by_suburb[suburb] = schools_by_suburb.get(suburb, 0) + count

# Keep every suburb of the population data that also appears in the school
# data, as a [population, number of schools] row, in population-data order.
pop_sch = []
for suburb, people in zip(loc_and_pop[0], loc_and_pop[1]):
    if suburb in schools_by_suburb:
        pop_sch.append([people, schools_by_suburb[suburb]])
# 274 suburbs in 'loc'
# 954 suburbs in 'sch_loc'
# ====> 181 suburbs match
#------------------------------------------------------------------------------#

# Standardise both columns of pop_sch (population, number of schools) before
# the distances are computed.
scaler = StandardScaler()
pop_sch_std = scaler.fit_transform(pop_sch)

# Apply the VAT algorithm and visualise the reordered matrix as a heatmap.
# VAT returns (reordered matrix, connection indexes, ordering); the second
# value is bound to the name R here.
RV, R, I = VAT(pop_sch_std)
x = sns.heatmap(RV, cmap='magma', xticklabels=False, yticklabels=False)
# NOTE(review): RV is a square matrix whose rows and columns both follow the
# rows of pop_sch in VAT order - confirm that these axis labels are the
# intended wording.
x.set_xlabel('School')
x.set_ylabel('population')
plt.show()