In [35]:
import pandas as pd
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

import scikitplot as skp

%matplotlib inline

## Read the Data

In [95]:
# Read the pickled, cleaned dataset. Opening the file in a `with` block
# guarantees the handle is closed once loading finishes, even on error.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load files produced by this project's own pipeline.
pickle_path = os.path.join(os.getcwd(), '..', 'data', 'processed', 'clean_data.pickle')
with open(pickle_path, 'rb') as pickle_data:
    data = pickle.load(pickle_data)
# rename the columns for plotting
data.columns = ['Year', 'Income Index', 'Education Index', 'Percent White',
                'Rent Index', 'Value Index', 'Percent Employed']
# first column is the year, which is kept out of the feature columns
cols = data.columns[1:]

## Standardization

In [93]:
# Scale the feature columns (everything in `cols`) with a RobustScaler.
rs = RobustScaler()
# fit_transform returns a bare array, so keep it in a named intermediate ...
scaled_values = rs.fit_transform(data[cols])
# ... and re-attach the original row index and feature names
data_rs = pd.DataFrame(scaled_values, index=data.index, columns=cols)

## Dimension Reduction (Plotting)

In [31]:
# create instance of PCA class (no n_components given, so the number of
# output columns is decided by PCA from the input)
pca = PCA()
# fit and transform the scaled data and put back into a dataframe
data_pca = pd.DataFrame(pca.fit_transform(data_rs), index=data.index)
# name the components pca1..pcaN, deriving N from the transformed data rather
# than hard-coding 6 so the cell keeps working if the feature set changes
data_pca.columns = ['pca{}'.format(i) for i in range(1, data_pca.shape[1] + 1)]

In [32]:
# create tsne instance; t-SNE is stochastic, so pin random_state to make the
# embedding repeatable across kernel restarts
tsne = TSNE(n_components=2, init='pca', random_state=42)
# fit and transform the scaled data and put back into a dataframe,
# naming the two embedding columns at construction time
data_tsne = pd.DataFrame(tsne.fit_transform(data_rs), index=data.index,
                         columns=['tsne1', 'tsne2'])

## KMeans Clustering (6 clusters)

In [96]:
# create kmeans model; a fixed random_state makes the fitted partition (and
# the raw label numbering) repeatable across runs on the same data
kmeans = KMeans(n_clusters=6, random_state=42)
# fit the model on the scaled features and predict the labels
labels = kmeans.fit_predict(data_rs)
# assign the clusters to a new column in the original dataset
data['clusters'] = labels
data.head()

Unnamed: 0_level_0,Year,Income Index,Education Index,Percent White,Rent Index,Value Index,Percent Employed,clusters
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
48021950100,2000,73599.981153,3.617289,86.192698,723.087018,153375.72937,98.308865,1
48021950200,2000,67668.904461,3.154098,67.591995,752.448335,118748.232408,96.715168,3
48021950300,2000,78678.9354,3.669384,81.097069,808.205215,178808.676101,97.032326,1
48021950400,2000,75438.539903,4.075207,76.605317,869.478211,158917.899778,98.340471,1
48021950600,2000,67172.153161,3.465056,88.921283,737.822784,134719.422878,96.891342,1


## Clustering Analysis

### Ordering the Clusters
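I'll start by putting the clusters in a logical order, from lowest to highest, based on the per-cluster median of each feature: each feature's medians are ranked across clusters, the ranks are averaged (with home value weighted slightly more, mainly to break ties), and the clusters are relabeled by that average rank. The reason for this is that the numbering of k-means cluster labels is arbitrary and can change from one model run to the next. Ordering them logically will come in handy when seeing how neighborhoods move between clusters.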

In [97]:
# Build an ordering of the clusters from "lowest" to "highest" and turn it
# into a {raw kmeans label -> ordered label} map.

# median of every feature column within each cluster (one row per cluster);
# the index name is dropped so the table displays with an unnamed index
clust_medians = data.groupby('clusters')[list(cols)].median().rename_axis(None)
display(clust_medians)
# rank the medians by value, column by column (1 = lowest median)
feature_ranks = clust_medians.rank()
# weighting home value rank more since it's the best indicator
# of price inequality; using this to break ties basically
feature_ranks['Value Index'] *= 1.1
# take the rank mean for each cluster
mean_rank = feature_ranks.mean(axis=1).to_frame()
# rank the rank mean and convert to int; method='first' gives every cluster a
# distinct whole-number rank even if two mean ranks tie (the default 'average'
# method would yield x.5 values that truncate to duplicate labels)
clust_rank = mean_rank.rank(method='first').astype('int')
display(clust_rank)
# now create the map to apply to the clusters (to_frame() names the column 0)
clust_map = clust_rank[0].to_dict()

Unnamed: 0,Income Index,Education Index,Percent White,Rent Index,Value Index,Percent Employed
0,119974.157156,5.174074,85.690739,1651.891428,270575.695857,95.902872
1,76820.601767,4.463532,81.273048,1126.865136,192838.470812,94.499811
2,171051.091105,5.918578,89.250163,1633.785775,662578.417609,95.994718
3,53574.019778,3.449301,55.980087,964.68327,137092.582313,93.516633
4,97763.340782,5.399161,88.518223,1175.66055,386811.604218,95.493763
5,52016.719047,3.685408,70.942672,949.121473,153772.489388,87.495723


Unnamed: 0,0
0,5
1,3
2,6
3,1
4,4
5,2


In [98]:
# map the new cluster labels
data['clusters'] = data['clusters'].map(clust_map)
data.head()

Unnamed: 0_level_0,Year,Income Index,Education Index,Percent White,Rent Index,Value Index,Percent Employed,clusters
geoid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
48021950100,2000,73599.981153,3.617289,86.192698,723.087018,153375.72937,98.308865,3
48021950200,2000,67668.904461,3.154098,67.591995,752.448335,118748.232408,96.715168,1
48021950300,2000,78678.9354,3.669384,81.097069,808.205215,178808.676101,97.032326,3
48021950400,2000,75438.539903,4.075207,76.605317,869.478211,158917.899778,98.340471,3
48021950600,2000,67172.153161,3.465056,88.921283,737.822784,134719.422878,96.891342,3
