## Viz and PCA

This notebook contains code to generate visualisations for manual export. The intended end-user experience is via the web app.

In [1]:
# imports

# libraries

import pandas as pd
import geopandas as gpd
import numpy as np
import plotly.express as px
import requests
from io import BytesIO

# finalised data sources back to wgs84 as necessary

def _read_final_wgs84(level):
    """Read data/final/final_<level>.feather and reproject it to EPSG:4326."""
    return gpd.read_feather(f'data/final/final_{level}.feather').to_crs(epsg=4326)

lga_gdf = _read_final_wgs84('lga')
poa_gdf = _read_final_wgs84('poa')
sal_gdf = _read_final_wgs84('sal')
sa1_gdf = _read_final_wgs84('sa1')
#nodes_gdf = gpd.read_feather('data/final/final_nodes.feather').to_crs(epsg=4326)

# the last one is a bit too big to stick on github but you can read it from here instead to run the below viz without doing the prior processing steps
r = requests.get('https://tompisel.com/data/walkability_by_node.parquet', timeout=60)
# Stop here with a clear HTTP error if the download failed. Previously a
# non-200 response was skipped silently and nodes_gdf was never defined, so
# the failure only surfaced later as a NameError in an unrelated cell.
r.raise_for_status()
nodes_gdf = gpd.read_parquet(BytesIO(r.content))


In [2]:
# parquet writes

# One file per geography level, written in the same order as before:
# data/final/parquet/walkability_by_<suffix>.parquet
_parquet_exports = {
    'LGA': lga_gdf,
    'POA': poa_gdf,
    'SAL': sal_gdf,
    'SA1': sa1_gdf,
    'node': nodes_gdf,
}
for _suffix, _frame in _parquet_exports.items():
    _frame.to_parquet(f'data/final/parquet/walkability_by_{_suffix}.parquet')





https://tompisel.com/data/walkability_by_LGA.parquet
https://tompisel.com/data/walkability_by_POA.parquet
https://tompisel.com/data/walkability_by_SAL.parquet
https://tompisel.com/data/walkability_by_SA1.parquet
https://tompisel.com/data/walkability_by_node.parquet

In [3]:
# Quick size check of the node-level table: (rows, columns).
nodes_gdf.shape

(1689061, 105)

In [None]:
# create geojsons for uploading to kepler -- try it: https://kepler.gl/demo

# sal_gdf.to_file("data/final/sal.json", driver='GeoJSON')
# sa1_gdf.to_file("data/final/sa1.json", driver='GeoJSON')
# poa_gdf.to_file("data/final/poa.json", driver='GeoJSON')
# lga_gdf.to_file("data/final/lga.json", driver='GeoJSON')


In [20]:
# sample the point data for parquet

print(f'rows: {nodes_gdf.shape[0]:,}')

# Fixed seed: without it every run exports a different 100k-row sample, so the
# files written below could not be regenerated from this notebook.
out_nodes = nodes_gdf.sample(100_000, random_state=42)

# Full node table and the sample as parquet, plus the sample as GeoJSON.
nodes_gdf.to_parquet('data/final/nodes.parquet')
out_nodes.to_parquet('data/final/sample_nodes.parquet')

out_nodes.to_file("data/final/sample_nodes.json", driver='GeoJSON')

out_nodes.head(10)

rows: 1,689,061


Unnamed: 0,osmid,x,y,geometry,node_weight,restaurant - within 2km,restaurant - within 1km,restaurant - within 500m,restaurant - within 200m,restaurant - closest,...,museum - within 2km,museum - within 1km,museum - within 500m,museum - within 200m,museum - closest,coffee_available - within 2km,coffee_available - within 1km,coffee_available - within 500m,coffee_available - within 200m,coffee_available - closest
1350803,9078922964,295622.427415,5801789.0,POINT (144.67521 -37.90849),5.55,81,2,0,0,554.682983,...,0,0,0,0,3397.437012,20,1,0,0,554.682983
148765,718586261,327161.277949,5819404.0,POINT (145.03796 -37.75637),1.293515,67,8,3,3,22.163,...,0,0,0,0,,18,2,0,0,800.705017
742933,1707986129,327564.990274,5821443.0,POINT (145.04302 -37.73807),2.195021,28,6,1,0,491.222992,...,0,0,0,0,,10,0,0,0,1078.189941
302222,5421659951,335043.793088,5801446.0,POINT (145.12330 -37.91959),1.891566,88,33,1,0,447.507996,...,0,0,0,0,,22,6,1,0,447.507996
1215227,2561069620,300952.665391,5808072.0,POINT (144.73753 -37.85309),3.764706,5,2,0,0,583.73999,...,0,0,0,0,,1,0,0,0,1893.432007
439310,9279635953,329481.847153,5810590.0,POINT (145.06222 -37.83620),3.297521,146,69,3,1,197.626007,...,0,0,0,0,,62,26,3,0,229.531998
1537159,5993324909,306690.388208,5821493.0,POINT (144.80628 -37.73345),1.513333,65,14,1,1,93.594002,...,0,0,0,0,,11,3,0,0,707.804016
1321768,8979766607,300427.361033,5802521.0,POINT (144.73003 -37.90297),3.110553,26,11,0,0,668.195007,...,0,0,0,0,,8,5,0,0,668.195007
464536,9557441104,323586.282099,5820847.0,POINT (144.99775 -37.74268),1.852732,163,63,4,1,113.829002,...,0,0,0,0,2611.657959,60,18,0,0,604.302002
227787,1925197568,347711.340516,5804536.0,POINT (145.26801 -37.89396),4.343434,20,0,0,0,1223.115967,...,0,0,0,0,,6,0,0,0,1501.319946


In [22]:
# Preview the geometry column of sal_gdf (converted to EPSG:4326 when it was loaded).
sal_gdf['geometry']

0      POLYGON ((145.00195 -37.79665, 145.00190 -37.7...
1      POLYGON ((144.89576 -37.76514, 144.89547 -37.7...
2      POLYGON ((144.67179 -37.72628, 144.67022 -37.7...
3      POLYGON ((144.87979 -37.71565, 144.88016 -37.7...
4      POLYGON ((144.76926 -37.74082, 144.77088 -37.7...
                             ...                        
343    POLYGON ((145.10068 -37.72707, 145.09982 -37.7...
344    POLYGON ((145.16108 -37.55947, 145.16004 -37.5...
345    POLYGON ((145.16657 -37.62474, 145.16668 -37.6...
346    POLYGON ((144.88986 -37.80977, 144.88992 -37.8...
347    POLYGON ((144.85250 -37.55800, 144.85303 -37.5...
Name: geometry, Length: 348, dtype: geometry

In [3]:
# maps

def px_point_map(gdf,color_column,invert_colors=False):
    """Show the point geometries of ``gdf`` on a dark basemap, coloured by ``color_column``.

    Parameters
    ----------
    gdf : GeoDataFrame
        Frame whose ``geometry`` exposes ``.x`` (used as longitude) and ``.y``
        (used as latitude).
    color_column : str or array-like
        Passed straight through as plotly's ``color`` argument.
    invert_colors : bool, default False
        Use the reversed Plasma scale (``Plasma_r``) instead of ``Plasma``.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    points = gdf.geometry
    palette = px.colors.sequential.Plasma
    if invert_colors:
        palette = px.colors.sequential.Plasma_r

    fig = px.scatter_mapbox(
        gdf,
        lat=points.y,
        lon=points.x,
        color=color_column,
        color_continuous_scale=palette,
        opacity=0.8,
        size_max=1,
        zoom=10,
        height=1200,
    )

    # Dark basemap, with the map filling the whole figure (no margins).
    fig.update_layout(
        mapbox_style="carto-darkmatter",
        margin=dict(r=0, t=0, l=0, b=0),
    )

    fig.show()

# Fixed seed so the maps below show the same 100k nodes on every Restart & Run All.
downsampled = nodes_gdf.sample(100000, random_state=42)

## Distance to a library

In [None]:
# Values above 2000 are replaced with NaN before being used as the colour.
library_closest = downsampled['library - closest']
px_point_map(downsampled, library_closest.mask(library_closest > 2000), invert_colors=True)

## Distance to a cafe

In [None]:
# Values above 2000 are replaced with NaN before being used as the colour.
cafe_closest = downsampled['cafe - closest']
px_point_map(downsampled, cafe_closest.mask(cafe_closest > 2000), invert_colors=True)

## Places without supermarkets in walking distance

In [7]:
# Colour only the nodes with NO grocery/supermarket within 1km (count == 0);
# every other node becomes NaN. The previous filter tested `== 1`, which
# selected nodes with exactly one supermarket nearby -- the opposite of what
# this section's heading describes.
supermarkets_1km = downsampled['grocery or supermarket - within 1km']
px_point_map(downsampled, supermarkets_1km.where(supermarkets_1km == 0), False)

In [11]:
def px_scatter(gdf,x,y,c,invert_colors=False):
    """Dark-themed scatter plot of ``y`` against ``x``, coloured by ``c``.

    ``x``, ``y`` and ``c`` are forwarded to ``px.scatter`` as its x, y and
    colour arguments. ``invert_colors`` switches the continuous colour scale
    from 'Plasma' to 'Plasma_r'. The figure is displayed with ``fig.show()``;
    nothing is returned.
    """
    scale = 'Plasma_r' if invert_colors else 'Plasma'
    fig = px.scatter(gdf, x=x, y=y, color=c, color_continuous_scale=scale)

    # Near-black background with white text.
    background = 'rgb(17,17,17)'
    fig.update_layout(
        plot_bgcolor=background,
        paper_bgcolor=background,
        font=dict(color='white'),
    )

    # Both axes share the same line/grid styling.
    axis_style = dict(showline=True, linewidth=2, linecolor='white', gridcolor='grey')
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)
    fig.show()



px_scatter(
    sa1_gdf,
    x='median_rent_weekly',
    y='cafe - within 500m',
    c='average_household_size',
)


In [13]:

px_scatter(
    sa1_gdf,
    x='bar or pub - within 500m',
    y='park area - closest',
    c='median_rent_weekly',
)

In [29]:
px_scatter(
    sa1_gdf,
    x='child care - within 2km',
    y='pct_owner_occupiers',
    c='median_age',
)

In [56]:
px_scatter(
    sa1_gdf,
    x='pct_households_wo_cars',
    y='restaurant - within 1km',
    c='pct_apartments',
)

In [33]:
# Pairwise correlation between all "... - within 1km" columns of the SA1 table.
cols = [name for name in sa1_gdf.columns if name.endswith(' - within 1km')]
corr_matrix = sa1_gdf[cols].corr()

fig = px.imshow(
    corr_matrix,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    labels={'x': "Variable", 'y': "Variable", 'color': "Correlation"},
    color_continuous_scale='Plasma',
    aspect="auto",
    text_auto=True,
)

# Dark theme: near-black background, white text and axis lines, grey grid.
fig.update_layout(
    plot_bgcolor='rgb(17,17,17)',
    paper_bgcolor='rgb(17,17,17)',
    font=dict(color='white'),
)
for style_axes in (fig.update_xaxes, fig.update_yaxes):
    style_axes(showline=True, linewidth=2, linecolor='white', gridcolor='grey')

fig.show()


In [52]:
# Columns used on the x-axis of the heatmap ("Geographic statistic").
stat_cols = [
    'median_age',
    'median_mortgage_repayment_monthly',
    'median_rent_weekly',
    'average_persons_per_bedroom',
    'median_household_income_weekly',
    'average_household_size',
    'pct_dwellings_unoccupied',
    'pct_households_wo_cars',
    'pct_renters',
    'pct_owner_occupiers',
    'pct_apartments',
    'pct_houses',
    'pct_townhouses',
    'population_density',
    'dwelling_density',
]

# Columns used on the y-axis ("Walkability measure"): every "... - within 1km" column.
dist_cols = [name for name in sa1_gdf.columns if name.endswith(' - within 1km')]

# Correlate everything at once, then keep only the walkability-by-statistic block.
full_corr = sa1_gdf[stat_cols + dist_cols].corr()
corr_matrix = full_corr.loc[dist_cols, stat_cols]

fig = px.imshow(
    corr_matrix,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    labels={'x': "Geographic statistic", 'y': "Walkability measure", 'color': "Correlation"},
    color_continuous_scale='Plasma',
    aspect="auto",
    text_auto=True,
)

# Dark theme: near-black background with white text.
fig.update_layout(
    plot_bgcolor='rgb(17,17,17)',
    paper_bgcolor='rgb(17,17,17)',
    font=dict(color='white'),
)

fig.show()

In [49]:
# Show the correlation values as a table: rows are dist_cols, columns are stat_cols.
corr_matrix.loc[dist_cols,stat_cols]

Unnamed: 0,median_age,median_mortgage_repayment_monthly,median_rent_weekly,average_persons_per_bedroom,median_household_income_weekly,average_household_size,pct_dwellings_unoccupied,pct_households_wo_cars,pct_renters,pct_owner_occupiers,pct_apartments,pct_houses,pct_townhouses,population_density,dwelling_density
restaurant - closest,0.001967,-0.067985,-0.032984,-0.219098,0.076554,0.417703,-0.307843,-0.360649,-0.335406,0.366716,-0.364283,0.475415,-0.286273,-0.143383,-0.134689
grocery or supermarket - closest,-0.007285,-0.036773,-0.003679,-0.217179,0.123805,0.399529,-0.28085,-0.363673,-0.322543,0.359031,-0.346208,0.441587,-0.258973,-0.146348,-0.136732
cafe - closest,-0.059312,-0.165959,-0.104541,-0.18601,-0.016234,0.49773,-0.363928,-0.398198,-0.35166,0.367948,-0.420398,0.524149,-0.293911,-0.153818,-0.149532
bar or pub - closest,-0.042814,-0.109149,-0.065716,-0.209556,-0.002123,0.49328,-0.364841,-0.41769,-0.355141,0.386127,-0.445838,0.52802,-0.259749,-0.169205,-0.164861
place of worship - closest,-0.082046,-0.253318,-0.172835,-0.196778,-0.071552,0.452765,-0.364982,-0.378964,-0.291777,0.326854,-0.421235,0.512061,-0.271735,-0.140725,-0.13755
tourist attraction - closest,-0.029266,-0.213877,-0.122083,-0.19011,-0.080369,0.501209,-0.394793,-0.405734,-0.343019,0.360237,-0.455734,0.543335,-0.273081,-0.169539,-0.168077
community area - closest,0.019554,-0.121453,-0.056887,-0.183904,-0.004511,0.337118,-0.259448,-0.309716,-0.300678,0.309362,-0.310263,0.425654,-0.286424,-0.129274,-0.120595
aged care - closest,-0.178593,-0.127402,-0.078193,-0.086792,0.063795,0.353927,-0.154239,-0.190158,-0.135682,0.187297,-0.193449,0.335594,-0.301495,-0.005783,-0.003106
park area - closest,0.092183,-0.031898,-0.013626,-0.068319,0.010664,0.040491,0.029321,-0.046922,-0.100559,0.084513,-0.036482,0.105098,-0.14058,-0.001442,0.012359
school - closest,0.101312,-0.016531,0.006463,-0.210961,0.067127,0.182113,-0.167997,-0.260328,-0.232025,0.244397,-0.230381,0.272739,-0.146535,-0.10976,-0.097604


In [15]:
# Ten rows of sal_gdf with the highest 'coffee_available - within 200m' value.
coffee_col = 'coffee_available - within 200m'
(
    sal_gdf[['geography_name', coffee_col]]
    .sort_values([coffee_col], ascending=False)
    .head(10)
)

Unnamed: 0,geography_name,coffee_available - within 200m
202,Melbourne,6.038803
132,Fitzroy (Vic.),3.923972
65,Carlton (Vic.),3.550501
108,Docklands,3.542965
278,South Yarra,3.381208
275,South Melbourne,2.890231
91,Cremorne (Vic.),2.879659
286,St Kilda (Vic.),2.551097
331,West Melbourne,2.256206
21,Balaclava (Vic.),2.130121


In [28]:
#pca

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Feature set: every "... - within 1km" column of the SA1 table.
cols = [name for name in sa1_gdf.columns if name.endswith(' - within 1km')]
data = sa1_gdf[cols]

# Standardise the features before PCA.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Fit a two-component PCA, then project the scaled data onto those components.
component_names = ['PC1', 'PC2']
pca = PCA(n_components=2)
pca.fit(data_scaled)
data_pca = pca.transform(data_scaled)
data_pca_df = pd.DataFrame(data_pca, columns=component_names)

print(pca.explained_variance_ratio_)

# components_ is transposed so that each row is one input column and each
# DataFrame column is one principal component.
loadings = pca.components_
loadings_df = pd.DataFrame(loadings.T, columns=component_names, index=data.columns)

print(loadings_df)

[0.43876309 0.09305747]
                                          PC1       PC2
restaurant - within 1km              0.325466  0.075778
grocery or supermarket - within 1km  0.277125 -0.103115
cafe - within 1km                    0.301489  0.180474
bar or pub - within 1km              0.307257  0.099128
place of worship - within 1km        0.239556 -0.115263
tourist attraction - within 1km      0.142525 -0.137810
community area - within 1km          0.216821 -0.220012
aged care - within 1km               0.055433 -0.349011
park area - within 1km               0.070281 -0.495748
school - within 1km                  0.206176 -0.168558
child care - within 1km              0.213718 -0.241546
library - within 1km                 0.179352 -0.214491
emergency services - within 1km      0.227510 -0.013816
medical facility - within 1km        0.149204 -0.180438
entertainment centre - within 1km    0.212811  0.237095
swimming pool - within 1km           0.158399  0.017966
tertiary institution - w


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.


is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.

