In [1]:
from itertools import chain
import pandas as pd
import data_transforms
import from_r_gen
import viz
import similarity
import map_fi_plot
import matplotlib.pyplot as plt
%matplotlib inline
import os
# Prefix prepended to the CSV file names when the data is loaded below; the empty
# string means the files are looked up relative to the current working directory.
filepath_prefix = ''
# Move the working directory one level up from wherever the kernel started.
# NOTE(review): the path is relative, so re-running this cell without restarting
# the kernel climbs one more directory each time.
os.chdir('../')

In [None]:
# Load the two extracts: absolute counts and relative shares.
data = from_r_gen.load_r_data(filepath_prefix + 'paavo_counts.csv')
data2 = from_r_gen.load_r_data(filepath_prefix + 'paavo_shares.csv')

# Keep the identifier columns plus the variables of interest from each extract.
data = data.loc[:, ['pono', 'pono.level', 'vuosi', 'nimi'] + list(data_transforms.NOMINAL_VARS)]
data2 = data2.loc[:, ['pono', 'vuosi', 'nimi'] + list(data_transforms.SHARES_VARS)]

# Add only the share columns that the counts frame does not already have,
# matching rows of the two frames on their index.
cols_to_use = data2.columns.difference(data.columns)
data = pd.merge(data, data2[cols_to_use], left_index=True, right_index=True, how='outer')

# Turn the numeric postcode prefixes into zero-padded five-character strings.
# The column is cast to object first: writing strings into an integer column
# through .loc relies on an implicit upcast that pandas has deprecated.
# NOTE(review): level 2 is scaled by 100, the same as level 3 -- confirm this is
# intended rather than 1000.
data['pono'] = data['pono'].astype(object)
for level, scale in ((5, 1), (3, 100), (2, 100)):
    at_level = data['pono.level'] == level
    data.loc[at_level, 'pono'] = [format(code * scale, '05d') for code in data.loc[at_level, 'pono']]

# These columns may use a decimal comma; normalise to a decimal point and
# convert the values to float.
to_format = ['he_kika', 'tr_mtu', 'ra_as_kpa', 'hr_pi_tul', 'hr_ke_tul', 'hr_hy_tul',
             'pt_tyoll', 'pt_tyott', 'pt_tyovu', 'pt_0_14', 'pt_opisk', 'pt_elakel', 'hr_ovy',
             'tr_pi_tul', 'tr_ke_tul', 'tr_hy_tul', 'te_nuor', 'te_eil_np', 'te_laps', 'te_aik',
             'te_elak', 'te_omis_as', 'te_vuok_as', 'te_takk', 'te_as_valj']
for c in to_format:
    data[c] = data[c].astype(str).str.replace(",", ".", regex=False).astype(float)

# Missing values are replaced with zero.
data = data.fillna(0)

# Select only one year.
data = data.loc[data['vuosi'] == 2018, :]

Ok, data has been loaded.

Let's explore how much of the variance each PCA component explains, to see how many dimensions we might want to use.

In [None]:
# Build the PCA inputs from the prepared frame.
# NOTE(review): 2018 and 5 presumably select the year and the postcode level
# (they match the 'vuosi' filter and the five-digit 'pono.level' used elsewhere
# in this notebook) -- confirm against viz.get_pca_data.
X, y, target_names = viz.get_pca_data(data, 2018, 5)
# Renumber target_names 0..n-1 in place.
target_names.index = range(len(target_names))
# NOTE(review): 20 is presumably the number of components to examine -- confirm
# against viz.exploratory_pca.
viz.exploratory_pca(X, 20)

Well, based on the plot it looks like we might need quite a few dimensions to capture the variance! I was expecting the data to reduce to fewer dimensions quite nicely, but perhaps not. Let's plot a two-factor PCA to see what it looks like:

In [None]:
# Run the project's PCA helper on X; it returns the transformed data and a
# second object kept as `pipe`.
# NOTE(review): the 2 is presumably the number of components -- confirm against
# viz.do_pca.
X_pca, pipe = viz.do_pca(X, 2)
# Plot the projection; y is flattened to one dimension before being passed on.
viz.pca_plot(X_pca, target_names, y.ravel())

Yeah, it really looks like there is some structure in the data that is not captured in the two dimensions.

In [None]:
# Recompute the 2-component projection inside this cell so the result does not
# depend on which X_pca the most recently run cell left in the kernel.
X_pca, pipe = viz.do_pca(X, 2)

# For every row of X_pca, take the position of its largest value and attach it
# to the level-5 rows as 'max_factor'. The array is assigned positionally: a
# pd.DataFrame built from it would carry a fresh 0..n-1 index, and pandas would
# align that on index labels against the labels `data` kept from before the
# year filter, leaving NaN or shifted values wherever they differ.
# NOTE(review): assumes the rows of X are in the same order as the level-5 rows
# of `data` -- confirm against viz.get_pca_data. A length mismatch now raises
# instead of being filled with NaN.
data_l2 = data.loc[data['pono.level'] == 5, :].assign(max_factor=X_pca.argmax(axis=1))
map_fi_plot.map_fi_postinumero(data_l2, "Highest factors per area", color_var='max_factor')


It's not that great, is it? All the cities have the same max loading factor, but there are a ton of other areas with the same property, too. I'm guessing we could do better.

Let's try with five dimensions.

In [None]:
# Five-component projection; X_pca from this cell is also what the similarity
# cells further down operate on when the notebook is run top to bottom.
X_pca, pipe = viz.do_pca(X, 5)

# For every row of X_pca, take the position of its largest value and attach it
# to the level-5 rows as 'max_factor'. The array is assigned positionally: a
# pd.DataFrame built from it would carry a fresh 0..n-1 index, and pandas would
# align that on index labels against the labels `data` kept from before the
# year filter, leaving NaN or shifted values wherever they differ.
# NOTE(review): assumes the rows of X are in the same order as the level-5 rows
# of `data` -- confirm against viz.get_pca_data. A length mismatch now raises
# instead of being filled with NaN.
data_l5 = data.loc[data['pono.level'] == 5, :].assign(max_factor=X_pca.argmax(axis=1))
map_fi_plot.map_fi_postinumero(data_l5, "Highest factors per area", color_var='max_factor')

Excellent! This looks much better already! It seems that five dimensions are enough to capture the differences between cities, their surrounding boroughs, and the countryside.

The functions also allow for printing and plotting postcode areas that are "most similar" to a given postcode area. For example, which areas are most similar to Otaniemi?

In [None]:
# Distances between every pair of rows of X_pca (the same array is passed as both
# inputs), using the 'euclidean' option of the project's helper.
d = similarity.pairwise_distances(X_pca, X_pca, 'euclidean')
# Look up the areas closest to the one named "Otaniemi".
# NOTE(review): 15 is presumably the number of neighbours returned -- confirm
# against similarity.get_n_most_similar_with_name.
names = similarity.get_n_most_similar_with_name("Otaniemi", d, target_names, 15)
print(names)

In [None]:
# Draw the map with the areas in `names` highlighted; 'Otaniemi' is passed
# separately as the reference area.
map_fi_plot.map_with_highlights_names(data_l5, "15 areas most similar to Otaniemi", 'Otaniemi', names)


All the areas above are at least plausible on their face: each postcode area is somewhat close to a university.

In [None]:
# Same lookup as for Otaniemi, reusing the distance matrix `d` computed earlier;
# this overwrites `names` with the result for "Kallio".
names = similarity.get_n_most_similar_with_name("Kallio", d, target_names, 15)
print(names)

In [None]:
# Draw the map with the areas in `names` highlighted; 'Kallio' is passed
# separately as the reference area.
map_fi_plot.map_with_highlights_names(data_l5, "15 areas most similar to Kallio", 'Kallio', names)

This result is maybe a little bit more surprising. We get areas close in geography, like Sörnäinen and Alppila, and also some central areas of other cities. But there is also Kamppi (dense city but richer) and Kirkonkylä from Vantaa. Lastly, a close match is the factory district of Pitäjänmäki, no idea why.