In [None]:
import csv
import math
import statistics

import folium # map rendering library
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt # plotting library
import numpy as np # library to handle data in a vectorized manner
import pandas as pd
from sklearn.cluster import KMeans

# backend for rendering plots within the browser
%matplotlib inline 


# Printed last, so seeing this message means every import in this cell succeeded.
print('Libraries imported.')


In [None]:
FILENAME = 'scraped_venues_tiled.csv'

# Load the scraped venues and keep a single row per venue id.  The read and
# the de-duplication are chained so that `df_all` is a fresh frame (no
# in-place mutation of an intermediate object).
df_all = pd.read_csv(FILENAME).drop_duplicates(subset='Id')
print('{} unique venues in total'.format(len(df_all)))

# Scale each longitude by cos(latitude).  One degree of longitude shrinks by
# that factor relative to one degree of latitude, so the scaled column can be
# used next to 'Latitude' in a Euclidean distance.  Computed as one vectorized
# expression instead of a Python loop over list copies of the columns.
df_all['Longtitude_equidistant'] = (
    df_all['Longtitude'] * np.cos(np.radians(df_all['Latitude']))
)

# Preview only; the full row count is already printed above.
df_all.head()



In [None]:
# Number of neighbourhoods to split all venues (every city together) into.
# NOTE(review): 991 is unexplained in the notebook; the value is kept as is.
n_clusters = 991

# Fixed seed: k-means starts from random centroids, so without it the
# 'Neighborhood_label' values (and every CSV exported from them) would change
# on each run of the notebook.
RANDOM_STATE = 0

k_means_all = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE, verbose=2)
# Cluster on latitude and the cos(latitude)-scaled longitude column.
k_means_all.fit(df_all[['Latitude', 'Longtitude_equidistant']])

# labels_ is in the row order of the frame passed to fit(), so it lines up
# with df_all row by row.
df_all['Neighborhood_label'] = k_means_all.labels_

# Preview only.
df_all.head()

In [3]:
# One boolean mask per city: True where the 'City' text contains the city name.
london_rows, manchester_rows, birmingham_rows, berlin_rows, rome_rows = (
    df_all['City'].str.contains(city_name)
    for city_name in ('London', 'Manchester', 'Birmingham', 'Berlin', 'Rome')
)

# Per-city frames, each re-indexed from 0.
df_london = df_all.loc[london_rows].reset_index(drop=True)
df_manchester = df_all.loc[manchester_rows].reset_index(drop=True)
df_birmingham = df_all.loc[birmingham_rows].reset_index(drop=True)
df_berlin = df_all.loc[berlin_rows].reset_index(drop=True)
df_rome = df_all.loc[rome_rows].reset_index(drop=True)

# Report how many venues ended up in each city frame.
print('Scraped venue counts:')
count_lines = (
    ('London: {}', df_london),
    ('Manchester: {}', df_manchester),
    ('Birmingham: {}', df_birmingham),
    ('Berlin: {}', df_berlin),
    ('Rome: {}', df_rome),
)
for template, city_frame in count_lines:
    print(template.format(len(city_frame)))





Scraped venue counts:
London: 40635
Manchester: 12296
Birmingham: 11570
Berlin: 19713
Rome: 14859


In [14]:
# Shape of the one-hot encoding of the 'Categories' column: one row per venue
# and one column per distinct category value.  The empty prefix and separator
# leave the raw category value as the column name.
pd.get_dummies(
    df_all[['Categories']],
    prefix='',
    prefix_sep='',
).shape

(99073, 673)

In [None]:
# Mean position of the Berlin venues in each k-means neighbourhood.
# The coordinate columns are selected *before* averaging: calling .mean() on
# the whole grouped frame also tries to average the text columns (City,
# Categories, ...), which raises TypeError on pandas >= 2.0.  Grouping once
# also avoids computing the same aggregation twice.
centroids_berlin = (
    df_berlin.groupby('Neighborhood_label')[['Latitude', 'Longtitude']].mean()
)

mean_latitude_list_berlin = centroids_berlin['Latitude'].tolist()
mean_longtitude_list_berlin = centroids_berlin['Longtitude'].tolist()

# Unweighted mean of the centroids; used as the map centre in the next cell.
# `statistics` is imported in the notebook's import cell.
mean_latitude_berlin = statistics.mean(mean_latitude_list_berlin)
mean_longtitude_berlin = statistics.mean(mean_longtitude_list_berlin)



In [None]:
# Berlin map centred on the mean of the neighbourhood centroids.
map_clusters = folium.Map(
    location=[mean_latitude_berlin, mean_longtitude_berlin],
    zoom_start=11,
)

# Two marker layers, drawn in this order:
#   1. neighbourhood centroids - radius 5, blue outline
#   2. the first 2000 Berlin venues - radius 2, red outline
marker_layers = (
    (zip(mean_latitude_list_berlin, mean_longtitude_list_berlin), 5, 'blue'),
    (zip(df_berlin['Latitude'].head(2000), df_berlin['Longtitude'].head(2000)), 2, 'red'),
)

for points, marker_radius, outline_color in marker_layers:
    for lat, lng in points:
        marker = folium.CircleMarker(
            [lat, lng],
            radius=marker_radius,
            color=outline_color,
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False)
        marker.add_to(map_clusters)

map_clusters

In [None]:
# Mean position of the Rome venues in each k-means neighbourhood.
# The coordinate columns are selected *before* averaging: calling .mean() on
# the whole grouped frame also tries to average the text columns (City,
# Categories, ...), which raises TypeError on pandas >= 2.0.  Grouping once
# also avoids computing the same aggregation twice.
centroids_rome = (
    df_rome.groupby('Neighborhood_label')[['Latitude', 'Longtitude']].mean()
)

mean_latitude_list_rome = centroids_rome['Latitude'].tolist()
mean_longtitude_list_rome = centroids_rome['Longtitude'].tolist()

# Unweighted mean of the centroids, used as the map centre.
# `statistics` is imported in the notebook's import cell.
mean_latitude_rome = statistics.mean(mean_latitude_list_rome)
mean_longtitude_rome = statistics.mean(mean_longtitude_list_rome)

map_clusters_rome = folium.Map(
    location=[mean_latitude_rome, mean_longtitude_rome],
    zoom_start=11,
)

# Two marker layers, drawn in this order:
#   1. neighbourhood centroids - radius 5, blue outline
#   2. the first 2000 Rome venues - radius 2, red outline
#      (presumably capped to keep the rendered map responsive)
marker_layers_rome = (
    (zip(mean_latitude_list_rome, mean_longtitude_list_rome), 5, 'blue'),
    (zip(df_rome['Latitude'].head(2000), df_rome['Longtitude'].head(2000)), 2, 'red'),
)

for points, marker_radius, outline_color in marker_layers_rome:
    for lat, lng in points:
        folium.CircleMarker(
            [lat, lng],
            radius=marker_radius,
            color=outline_color,
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(map_clusters_rome)

map_clusters_rome

In [None]:
# Mean position of the London venues in each k-means neighbourhood.
# The coordinate columns are selected *before* averaging: calling .mean() on
# the whole grouped frame also tries to average the text columns (City,
# Categories, ...), which raises TypeError on pandas >= 2.0.  Grouping once
# also avoids computing the same aggregation twice.
centroids_london = (
    df_london.groupby('Neighborhood_label')[['Latitude', 'Longtitude']].mean()
)

mean_latitude_list_london = centroids_london['Latitude'].tolist()
mean_longtitude_list_london = centroids_london['Longtitude'].tolist()

# Unweighted mean of the centroids, used as the map centre.
# `statistics` is imported in the notebook's import cell.
mean_latitude_london = statistics.mean(mean_latitude_list_london)
mean_longtitude_london = statistics.mean(mean_longtitude_list_london)

map_clusters_london = folium.Map(
    location=[mean_latitude_london, mean_longtitude_london],
    zoom_start=11,
)

# Two marker layers, drawn in this order:
#   1. neighbourhood centroids - radius 5, blue outline
#   2. the first 2000 London venues - radius 2, red outline
#      (presumably capped to keep the rendered map responsive)
marker_layers_london = (
    (zip(mean_latitude_list_london, mean_longtitude_list_london), 5, 'blue'),
    (zip(df_london['Latitude'].head(2000), df_london['Longtitude'].head(2000)), 2, 'red'),
)

for points, marker_radius, outline_color in marker_layers_london:
    for lat, lng in points:
        folium.CircleMarker(
            [lat, lng],
            radius=marker_radius,
            color=outline_color,
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(map_clusters_london)

map_clusters_london


In [None]:
# Write the labelled venues to disk: one CSV per city plus the full set.
# The row index is never written.
# NOTE(review): 'clustered_birmigham.csv' looks misspelled; the name is kept
# unchanged because other code may read the file under this exact name.
single_exports = {
    'clustered_london.csv': df_london,
    'clustered_manchester.csv': df_manchester,
    'clustered_birmigham.csv': df_birmingham,
    'clustered_berlin.csv': df_berlin,
    'clustered_rome.csv': df_rome,
    'clustered_all.csv': df_all,
}
for csv_name, venues in single_exports.items():
    venues.to_csv(csv_name, index=False)

# Multi-city groupings, built by stacking the per-city frames.
df_uk = pd.concat([df_london, df_manchester, df_birmingham])
df_capitals = pd.concat([df_london, df_berlin, df_rome])
df_b2uk = pd.concat([df_berlin, df_birmingham, df_manchester])

combined_exports = {
    'clustered_uk.csv': df_uk,
    'clustered_capitals.csv': df_capitals,
    'clustered_b2uk.csv': df_b2uk,
}
for csv_name, venues in combined_exports.items():
    venues.to_csv(csv_name, index=False)

