In [1]:
# import packages
import pickle
import pandas as pd
import numpy as np

# Raise pandas' display limits so wide/long frames and long cell values are not truncated
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_colwidth', 10000)

In [2]:
# Load the pickled intersections feature table from ../data.
# NOTE(review): unpickling can execute arbitrary code stored in the file --
# only load pickles that come from a trusted source.
intersections_df = pd.read_pickle('../data/Tucson_intersections_df_features.pkl')

In [3]:
# Keep only the intersections where 3 or more ways meet (no_of_ways > 2).
# .copy() makes the result an independent frame, so the column assignments made
# in later cells write to it directly instead of to a slice of the unfiltered
# frame (which pandas reports as SettingWithCopyWarning).
intersections_df = intersections_df[intersections_df['no_of_ways']>2].copy()
len(intersections_df)

4690

In [5]:
# Add feature: number of distinct way names meeting at each node
# (len(set(...)) also counts a missing name, None, as one distinct value).
intersections_df['no_of_unique_ways'] = intersections_df['name'].map(lambda names: len(set(names)))
# Distinct-name count minus total way count, shown for the first ten rows
(intersections_df['no_of_unique_ways'] - intersections_df['no_of_ways']).head(10)

31    -2
32    -2
42    -2
46    -2
88    -1
97    -2
99     0
101   -1
117   -2
121    0
dtype: int64

In [6]:
# Names of the per-way columns that the following cells iterate over.
# list(set(...)) drops duplicate names but leaves the order of the list unspecified.
ways_features_names = list(set([u'hgv', u'lanes', u'oneway', u'bicycle',
       u'highway', u'bridge', u'layer', u'cycleway', u'sidewalk','maxspeed',
       u'busway', u'abutters', u'bicycle_road', u'driving_side', u'ford',
       u'ice_road', u'incline', u'junction', u'lit', u'motorroad','name',
       u'mountain_pass', u'mtb:scale', u'mtb:scale:uphill', u'mtb:description',
       u'overtaking', u'parking:condition', u'parking:lane', u'parking_places',
       u'sac_scale', u'service', u'surface', u'tactile_paving', u'tracktype',
       u'traffic_calming', u'trail_visibility', u'winter_road', u'place',
       u'railway', u'electrified', u'embankment', u'route', u'tourism',
       u'charge', u'location', u'narrow', u'tunnel', u'width', u'access',
       u'agriculture', u'maxheight', u'maxlength', u'maxstay', u'maxwidth',
       u'maxweight', u'minspeed', u'noexit']))

## Features (expanded with values for each category in a feature)

In [7]:
import collections
import copy

In [8]:
# Work on a deep copy so that expanding the feature columns does not modify intersections_df
intersections_expanded_df = copy.deepcopy(intersections_df)

def convert_to_bag_of_categories(x, categories):
    """Count how often each category occurs in ``x``.

    Parameters
    ----------
    x : iterable of hashable values
        The observed values (e.g. one tag value per way).
    categories : list
        The categories to count; fixes the length and order of the result.

    Returns
    -------
    list of int
        ``result[i]`` is the number of occurrences of ``categories[i]`` in
        ``x`` (0 when it does not occur). Values of ``x`` that are not in
        ``categories`` are ignored.
    """
    occurrences = collections.Counter(x)
    # A Counter yields 0 for keys it has never seen, so no membership test is needed.
    return [occurrences[category] for category in categories]
    
    
categories_all = []
skip_features = ['name','no_of_ways']

# Replace each remaining way-feature column by a vector of per-category counts.
for feature in ways_features_names:
    if feature not in skip_features:
        # Distinct values occurring anywhere in this column. The order comes from
        # a set, so it is arbitrary; it is only available through `categories`.
        categories = list(set(
            value
            for way_values in intersections_expanded_df[feature]
            for value in way_values))
        intersections_expanded_df[feature] = intersections_expanded_df[feature].apply(
            convert_to_bag_of_categories, args=(categories,))

intersections_expanded_df['highway'].head(10)

31     [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
32     [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
42     [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
46     [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
88     [0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
97     [0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
99     [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
101    [0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
117    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
121    [0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: highway, dtype: object

## Padded Dataframe without expansion

In [None]:
#padded dataframe
# Pad every way-feature list with None up to the longest list found in its column.
# A copy of the frame is taken and new lists are built, so intersections_df and
# the lists stored in it stay unchanged (the earlier in-place extend modified
# them as well, because intersections_padded_df was only a second name for
# intersections_df).
intersections_padded_df = intersections_df.copy()
for fname in ways_features_names:
    # max_length was not defined anywhere in the visible notebook (NameError on a
    # fresh kernel); it is derived per column here.
    # TODO confirm this matches the originally intended target length.
    max_length = intersections_padded_df[fname].apply(len).max()
    intersections_padded_df[fname] = intersections_padded_df[fname].apply(
        lambda x: x + [None] * (max_length - len(x)))

In [None]:
#Consider intersections with 3 or more ways
print(len(intersections_padded_df))
# .copy() so the 'name' assignment below writes to an independent frame, not a slice
intersections_padded_df = intersections_padded_df[intersections_padded_df['no_of_ways']>2].copy()
# Report the size of the frame that was just filtered (this used to print
# len(intersections_df), i.e. not the result of the filter above).
print(len(intersections_padded_df))

# UTF-8 encode the way names; a missing name (None) becomes the string 'None'
intersections_padded_df['name'] = intersections_padded_df['name'].apply(lambda x: [i.encode('utf-8') if i is not None else 'None' for i in x ])

In [None]:
# Print, for every feature column, its name followed by the length of its longest list
for fname in ways_features_names:
    print(fname)
    print(intersections_padded_df[fname].map(len).max())

# Clustering

## Flatten features

In [9]:
# Columns that make up the clustering feature vector.
# list(set(...)) drops duplicate names but leaves the order of the list -- and with it
# the order of the features inside each vector -- unspecified.
clustering_feature_names = list(set([u'crossing', u'toll', u'traffic_sign',
       u'hgv', u'lanes', u'oneway', u'bicycle','maxspeed', u'no_of_unique_ways',
       u'highway', u'bridge', u'layer', u'cycleway', u'sidewalk',
       u'busway', u'abutters', u'bicycle_road', u'driving_side', u'ford',
       u'ice_road', u'incline', u'junction', u'lit', u'motorroad',
       u'mountain_pass', u'mtb:scale', u'mtb:scale:uphill', u'mtb:description',
       u'overtaking', u'parking:condition', u'parking:lane', u'parking_places',
       u'sac_scale', u'service', u'surface', u'tactile_paving', u'tracktype',
       u'traffic_calming', u'trail_visibility', u'winter_road', u'place',
       u'railway', u'electrified', u'embankment', u'route', u'tourism',
       u'charge', u'location', u'narrow', u'tunnel', u'width', u'access',
       u'agriculture', u'maxheight', u'maxlength', u'maxstay', u'maxwidth',
       u'maxweight', u'minspeed', u'noexit']))
# Not part of the list above: 'name','no_of_ways'

In [10]:
# One row per intersection: the selected feature columns of that row, collected into
# a single (still nested) Python list. The new frame gets a fresh default index.
clustering_df = pd.DataFrame({
    'feature_vector': intersections_expanded_df[clustering_feature_names].values.tolist(),
})

In [11]:
def flatten(nested):
    """Flatten one level of list nesting.

    Parameters
    ----------
    nested : iterable
        Items that are lists are spliced into the result; every other item
        is kept as a single element.

    Returns
    -------
    list
        A new list. Lists nested more than one level deep are not expanded
        further.
    """
    flat = []
    for item in nested:
        # isinstance replaces the comparison of str(type(item)) with
        # "<type 'list'>", which only matched on Python 2 and skipped subclasses.
        if isinstance(item, list):
            flat.extend(item)
        else:
            flat.append(item)
    return flat

# Flatten each row's mix of scalars and lists into one flat list per intersection
clustering_df['feature_vector'] = clustering_df['feature_vector'].apply(flatten)

In [12]:
def split_vectors(x):
    """Return the items of ``x`` as a pandas Series of strings, one entry per item.

    Each item is converted with ``str``. The earlier implementation joined the
    items with '#*#' and split the result again, which broke any item whose
    text contained that delimiter; converting item by item avoids this.
    """
    return pd.Series([str(item) for item in x])
# Spread every feature vector over separate columns, one column per vector position
clustering_df_sep = clustering_df['feature_vector'].apply(split_vectors)
print(clustering_df_sep.head())

  0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16  17  18   \
0   1   2   1   0   2   0   0   0   0   0   0   0   0   0   0   0   3   0   0   
1   1   2   1   0   2   0   0   0   0   0   0   0   0   0   0   0   3   0   0   
2   0   3   0   0   3   0   0   0   0   0   0   3   0   0   0   0   0   0   0   
3   0   3   0   0   3   0   0   0   0   0   0   2   0   0   0   0   1   0   0   
4   0   3   0   0   3   0   0   0   0   0   0   0   0   0   0   0   3   0   0   

  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37   \
0   0   3   0   0   0   0   0   0   3   3   3   0   3   0   0   0   3   3   3   
1   0   3   0   0   0   0   0   0   3   3   3   0   3   0   0   0   3   3   3   
2   0   3   0   0   0   0   0   0   3   3   3   0   3   0   0   0   3   3   3   
3   0   2   0   0   0   1   0   0   3   3   3   0   3   0   0   0   3   3   3   
4   0   0   0   0   0   3   0   0   3   3   3   0   3   0   0   0   3   3   3   

  38  39  40  41  42  43  

In [13]:
# Replace the values of every column by their integer category codes.
# NOTE(review): the columns hold strings at this point, so the codes presumably follow
# string ordering rather than numeric ordering -- confirm this is acceptable for K-Means.
colnames = clustering_df_sep.columns
for col in colnames:
    clustering_df_sep[col] = clustering_df_sep[col].astype('category').cat.codes

In [14]:
#Create feature matrix
# .values replaces DataFrame.as_matrix(), which is deprecated and no longer exists in
# pandas 1.0+; .values returns the same NumPy array and is what this notebook already
# uses when building the feature vectors.
intersections_feature_matrix = clustering_df_sep.values
intersections_feature_matrix.shape

(4690, 176)

## Clustering Algorithms

## K-Means

In [15]:
import matplotlib
%matplotlib inline
matplotlib.use('Agg')
from sklearn import metrics
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [16]:
k = 20
# Fixed seed: K-Means starts from randomly chosen centroids, so without it the cluster
# numbering (which later cells refer to by number) changes from run to run.
KMEANS_SEED = 42
kmeans = KMeans(n_clusters=k, max_iter=500, random_state=KMEANS_SEED).fit(intersections_feature_matrix)

In [17]:
# Cluster labels assigned to the first 100 rows of the feature matrix
kmeans.labels_[:100]

array([10, 10, 10, 10, 18, 18, 13, 18, 19, 18, 18, 18, 18,  2, 18,  4, 18,
       18,  7, 18, 18, 18, 10,  4,  2,  4,  4,  6, 18, 18, 18, 18,  6,  6,
       18, 18,  6,  7, 18, 18, 18, 13,  0, 18, 10,  8, 18,  4, 18, 18,  8,
       13,  4, 18, 18, 13,  3,  0,  6, 19, 18, 18,  6, 18, 15, 16, 10, 12,
       18,  5,  6, 18, 11, 18, 18,  4, 13, 18, 18, 18, 18, 10, 18,  6, 18,
       18, 18,  4, 18,  7, 18, 13, 18, 18,  4, 11, 18,  4, 10, 13], dtype=int32)

In [18]:
# Attach the cluster label of every intersection. kmeans.labels_ is a plain array, so
# labels are matched to rows by position; this relies on the feature matrix rows being
# in the same order as the rows of intersections_df.
intersections_df['cluster_no'] = kmeans.labels_

In [21]:
# Show the first five intersections that were assigned to cluster 6
intersections_df[intersections_df['cluster_no'] == 6][:5]

Unnamed: 0,crossing,id,lat,lon,toll,traffic_sign,version,ways,name,hgv,lanes,oneway,bicycle,highway,bridge,layer,cycleway,sidewalk,busway,abutters,bicycle_road,driving_side,ford,ice_road,incline,junction,lit,motorroad,mountain_pass,mtb:scale,mtb:scale:uphill,mtb:description,overtaking,parking:condition,parking:lane,parking_places,sac_scale,service,surface,tactile_paving,tracktype,traffic_calming,trail_visibility,winter_road,place,railway,electrified,embankment,route,tourism,charge,location,narrow,tunnel,width,access,agriculture,maxheight,maxspeed,maxlength,maxstay,maxwidth,maxweight,minspeed,noexit,no_of_ways,no_of_unique_ways,cluster_no
398,,175030459,32.318116,-110.9750406,,,1,"[16876230, 16876231, 16888483]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[service, service, service]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]",3,1,6
467,,175040919,32.2766913,-111.0013212,,,1,"[16876848, 16876849, 16884124]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[service, service, service]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[private, private, private]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]",3,1,6
471,,175040931,32.277493,-111.0012546,,,1,"[16876849, 16884124, 159790255]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[service, service, service]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[private, private, private]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]",3,1,6
490,,175045483,32.2702737,-110.9929719,,,1,"[16877117, 217178836, 343197347]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[residential, service, service]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]",3,1,6
774,,175077768,32.2708872,-110.9885107,,,1,"[16879284, 16939784, 343198608]","[None, Columbia Avenue, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[service, residential, service]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]","[None, None, None]",3,2,6


In [None]:
# NOTE(review): selects rows whose id equals the empty string. The ids shown in the
# output above look like integers, so this presumably matches nothing -- looks like a
# leftover lookup template; confirm before keeping.
intersections_df[intersections_df['id']=='']