In [None]:
# import packages
import pickle
import pandas as pd
import numpy as np

# widen pandas' display limits so large frames and long cell values are shown untruncated
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_colwidth', 10000)

In [None]:
# city whose intersections are processed; it is embedded in the input and output file names
city = 'Tucson'
# directory (relative to the notebook) that the pickles are read from and written to
data_dir = '../data/'

In [None]:
# load the per-intersection feature frame for the selected city
# NOTE(review): unpickling executes arbitrary code, so only load files you produced yourself
intersections_df_features = ''.join([data_dir, city, '_intersections_df_features.pkl'])
intersections_df = pd.read_pickle(intersections_df_features)

In [None]:
# take only those intersections which have 3 or more ways
# .copy() detaches the filtered rows from the frame they were selected from, so
# adding columns to the result later does not raise pandas' SettingWithCopyWarning
intersections_df = intersections_df[intersections_df['no_of_ways'] > 2].copy()
len(intersections_df)

In [None]:
# add feature - number of distinct way names meeting at a node
intersections_df['no_of_unique_ways'] = intersections_df['name'].map(
    lambda way_names: len(set(way_names))
)

In [None]:
# way-level columns considered for expansion; listed alphabetically,
# de-duplicated through a set and exposed as a list (order is not significant)
ways_features_names = list({
    u'abutters', u'access', u'agriculture',
    u'bicycle', u'bicycle_road', u'bridge', u'busway',
    u'charge', u'cycleway',
    u'driving_side',
    u'electrified', u'embankment',
    u'ford',
    u'hgv', u'highway',
    u'ice_road', u'incline',
    u'junction',
    u'lanes', u'layer', u'lit', u'location',
    u'maxheight', u'maxlength', 'maxspeed', u'maxstay', u'maxweight', u'maxwidth',
    u'minspeed', u'motorroad', u'mountain_pass',
    u'mtb:description', u'mtb:scale', u'mtb:scale:uphill',
    'name', u'narrow', u'noexit',
    u'oneway', u'overtaking',
    u'parking:condition', u'parking:lane', u'parking_places', u'place',
    u'railway', u'route',
    u'sac_scale', u'service', u'sidewalk', u'surface',
    u'tactile_paving', u'tourism', u'tracktype', u'traffic_calming',
    u'trail_visibility', u'tunnel',
    u'width', u'winter_road',
})

## Features (expanded with values for each category in a feature)

In [None]:
import collections
import copy

In [None]:
# work on an independent copy so the expansion below leaves intersections_df untouched
intersections_expanded_df = intersections_df.copy(deep=True)

# function to count how many ways possess each category at the given intersection
def convert_to_bag_of_categories(x, categories):
    """Count how often each category occurs among the values in ``x``.

    Args:
        x: iterable of hashable values (one entry per way).
        categories: sequence of category values that fixes the output order.

    Returns:
        list of int, the same length as ``categories``; position ``i`` holds
        the number of times ``categories[i]`` appears in ``x`` (0 if absent).
    """
    occurrences = collections.Counter(x)
    # a Counter yields 0 for keys it has not seen, so no membership test is needed
    return [occurrences[category] for category in categories]
    
    
categories_all = []

# features not to be expanded
skip_features = ['name','no_of_ways']

for feature in ways_features_names:
    if feature in skip_features:
        continue
    # Gather every distinct value seen for this feature across all intersections.
    # Sorting (by string form, so mixed value types cannot break the comparison)
    # fixes the position of each category in the count vector; a bare set would
    # give an arbitrary order that can change from one run to the next.
    categories = sorted(
        {value for values in intersections_expanded_df[feature] for value in values},
        key=str
    )
    # replace each list of raw values with its per-category count vector
    intersections_expanded_df[feature] = intersections_expanded_df[feature].apply(
        convert_to_bag_of_categories, args=(categories,)
    )

# preview the expanded vectors of one feature
intersections_expanded_df['highway'].head(10)

# Clustering

## Flatten features

In [None]:
# Columns that make up the clustering feature vector.
# The names are de-duplicated and sorted so that the column order of the
# feature vector is the same on every run (plain set iteration order is
# arbitrary); the duplicated u'place' entry has been removed.
clustering_feature_names = sorted(set([
    u'hgv', u'lanes', u'oneway', u'bicycle', 'maxspeed',
    u'no_of_unique_ways', u'traffic_sign', u'highway', u'bridge', u'layer', u'cycleway', u'sidewalk',
    u'busway', u'abutters', u'bicycle_road', u'driving_side', u'incline', u'junction', u'lit', u'motorroad',
    u'overtaking', u'service', u'surface', u'tactile_paving', u'tracktype', u'maxheight', u'maxlength',
    u'maxstay', u'maxwidth', u'maxweight', u'minspeed', u'noexit', u'place', u'crossing', u'toll',
    u'traffic_calming', u'trail_visibility', u'winter_road', u'railway', u'electrified',
    u'embankment', u'route', u'tourism',
    u'charge', u'location', u'narrow', u'tunnel', u'width', u'access', u'ford', u'ice_road',
    u'agriculture', u'parking:condition', u'parking:lane', u'parking_places', u'sac_scale',
    u'mountain_pass', u'mtb:scale', u'mtb:scale:uphill', u'mtb:description', u'no_of_ways']))

## Create a dataframe for clustering
1. Create one feature vector by combining all features in `clustering_feature_names`.
2. Split this feature vector to form a column for each property in `clustering_feature_names`.

In [None]:
# one row per intersection; each cell holds that row's selected feature values as a list
feature_rows = intersections_expanded_df[clustering_feature_names].values.tolist()
clustering_df = pd.DataFrame({'feature_vector': feature_rows})

In [None]:
def flatten(nested):
    """Flatten one level of list nesting.

    Args:
        nested: iterable whose items are either lists or other values.

    Returns:
        A new list in which every list item of ``nested`` is replaced by its
        elements, while all other items are kept as they are. Only the first
        level is expanded; lists inside those lists stay nested.
    """
    flattened_list = []
    for item in nested:
        # isinstance works on Python 2 and 3 alike; comparing str(type(item))
        # with "<type 'list'>" only ever matched on Python 2
        if isinstance(item, list):
            flattened_list.extend(item)
        else:
            flattened_list.append(item)
    return flattened_list

# collapse each row's list-of-lists into a single flat feature vector
clustering_df['feature_vector'] = clustering_df['feature_vector'].map(flatten)

In [None]:
def split_vectors(x):
    """Turn one feature vector into a Series with one string entry per element.

    Each element is converted with ``str`` directly. Joining the elements on a
    delimiter and splitting the result again would break apart any value that
    happens to contain the delimiter.

    Args:
        x: iterable of feature values.

    Returns:
        pandas.Series holding ``str(element)`` for every element of ``x``,
        in the same order.
    """
    return pd.Series([str(element) for element in x])
# expand every feature vector into its own row of columns (one column per vector position)
feature_vectors = clustering_df['feature_vector']
clustering_df_sep = feature_vectors.apply(split_vectors)

In [None]:
# replace the values of every column by integer category codes
# NOTE(review): the values are strings at this point, so codes presumably follow
# lexical order (e.g. '10' before '2') - confirm this is acceptable for counts
colnames = clustering_df_sep.columns
for col in colnames:
    as_category = clustering_df_sep[col].astype('category')
    clustering_df_sep[col] = as_category.cat.codes

In [None]:
# create feature matrix
# DataFrame.as_matrix() was deprecated and later removed from pandas;
# .values yields the same 2-D ndarray and is available in every pandas version
intersections_feature_matrix = clustering_df_sep.values
intersections_feature_matrix.shape

## Clustering Algorithms

In [None]:
import matplotlib
# render figures inside the notebook output
%matplotlib inline
# NOTE(review): selecting the non-interactive 'Agg' backend after `%matplotlib inline`
# may override the inline backend on recent matplotlib versions, so figures would no
# longer be displayed - confirm which of the two is actually wanted
matplotlib.use('Agg')
# NOTE(review): `metrics` and `plt` are not used in the cells visible here - verify they are needed
from sklearn import metrics
import matplotlib.pyplot as plt

## K-Means

In [None]:
# base name (without extension) of the pickle that stores the k-means result
kmeans_name = ''.join(['kmeans', city, '_clustered_intersection_df'])

In [None]:
from sklearn.cluster import KMeans

# number of clusters
k = 20
# random_state makes the centroid initialisation, and therefore the labels,
# reproducible across runs; n_init=10 pins the number of restarts explicitly
# because the library default differs between scikit-learn releases
kmeans_labels = KMeans(n_clusters=k, max_iter=500, n_init=10,
                       random_state=42).fit_predict(intersections_feature_matrix)

In [None]:
# attach the k-means cluster label of every intersection
intersections_expanded_df = intersections_expanded_df.assign(cluster_no=kmeans_labels)

In [None]:
# persist the frame labelled with the k-means clusters
kmeans_pickle_path = data_dir + kmeans_name + '.pkl'
intersections_expanded_df.to_pickle(kmeans_pickle_path)

## Spectral Clustering

In [None]:
# base name (without extension) of the pickle that stores the spectral clustering result
spectral_name = ''.join(['spectral', city, '_clustered_intersection_df'])

In [None]:
from sklearn.cluster import SpectralClustering

# random_state fixes the randomised parts of the algorithm (including the
# k-means label assignment) so that the labels are reproducible across runs
spectral_labels = SpectralClustering(n_clusters=30, affinity='nearest_neighbors',
                                     assign_labels='kmeans',
                                     random_state=42).fit_predict(intersections_feature_matrix)

In [None]:
# overwrite the cluster label column with the spectral clustering result
intersections_expanded_df = intersections_expanded_df.assign(cluster_no=spectral_labels)

In [None]:
# persist the frame labelled with the spectral clusters
spectral_pickle_path = data_dir + spectral_name + '.pkl'
intersections_expanded_df.to_pickle(spectral_pickle_path)

## DBSCAN

In [None]:
# base name (without extension) of the pickle that stores the DBSCAN result
dbscan_name = ''.join(['dbscan', city, '_clustered_intersection_df_10'])

In [None]:
from sklearn.cluster import DBSCAN

# configure the estimator first, then fit it and read the labels in a separate step
dbscan_model = DBSCAN(
    eps=0.02,
    min_samples=30,
    metric='euclidean',
    metric_params=None,
    algorithm='auto',
    leaf_size=30,
    p=None,
    n_jobs=1,
)
dbscan_labels = dbscan_model.fit_predict(intersections_feature_matrix)

In [None]:
# overwrite the cluster label column with the DBSCAN result
intersections_expanded_df = intersections_expanded_df.assign(cluster_no=dbscan_labels)

In [None]:
# persist the frame labelled with the DBSCAN clusters; the target directory comes
# from data_dir (as for the other saved results) rather than a repeated literal path
intersections_expanded_df.to_pickle(data_dir + dbscan_name + '.pkl')