# Capstone Project - Clustering Paris Neighborhoods

### Part 1 - Geographical Dataframe

In [1]:
#defining current working directory
import os
os.chdir(r"***")

We will use data provided by Paris data. <br>
Geographical csv data : https://opendata.paris.fr/explore/dataset/quartier_paris/download/?format=csv&timezone=Europe/Berlin&lang=fr&use_labels_for_header=true&csv_separator=%3B

In [2]:
import pandas as pd

In [3]:
geographical_df = pd.read_csv("quartier_paris.csv", sep=';') #importing data form csv to dataframe

In [4]:
geographical_df = geographical_df[['N_SQ_QU', 'C_QUINSEE', 'C_AR', 'L_QU',   'Geometry X Y']]
geographical_df.rename(columns={"C_QUINSEE" : "Code INSEE", "L_QU": "Neighborhood", "C_AR" : "Borough", 'N_SQ_QU' : 'Id_seq'}, inplace=True)
geographical_df.head() #conserving only useful data

Unnamed: 0,Id_seq,Code INSEE,Borough,Neighborhood,Geometry X Y
0,750000023,7510603,6,Notre-Dame-des-Champs,"48.846427594,2.32735687823"
1,750000055,7511403,14,Petit-Montrouge,"48.8266526255,2.32643699922"
2,750000074,7511902,19,Pont-de-Flandre,"48.8955557746,2.38477722927"
3,750000062,7511602,16,Muette,"48.8632745438,2.2599358317"
4,750000064,7511604,16,Chaillot,"48.8684336145,2.29167904274"


In [5]:
#getting neighborhood coordinates
transitory = geographical_df['Geometry X Y'].str.split(",", n=1, expand = True)
geographical_df['Latitude'] = transitory[0]
geographical_df['Longitude'] = transitory[1]
geographical_df.drop(columns=['Geometry X Y'], inplace=True)
geographical_df['Code INSEE'] = geographical_df['Code INSEE']+0.0 #uniformisation of data, necessary when merging
geographical_df[['Latitude', 'Longitude']] = geographical_df[['Latitude', 'Longitude']].apply(pd.to_numeric) #will be necessary when creating a map
geographical_df.head()

Unnamed: 0,Id_seq,Code INSEE,Borough,Neighborhood,Latitude,Longitude
0,750000023,7510603.0,6,Notre-Dame-des-Champs,48.846428,2.327357
1,750000055,7511403.0,14,Petit-Montrouge,48.826653,2.326437
2,750000074,7511902.0,19,Pont-de-Flandre,48.895556,2.384777
3,750000062,7511602.0,16,Muette,48.863275,2.259936
4,750000064,7511604.0,16,Chaillot,48.868434,2.291679


In [6]:
geographical_df = geographical_df[geographical_df.Neighborhood != 'Bel-Air']
geographical_df = geographical_df[geographical_df.Neighborhood != 'Picpus']

#retriving appendix neighborhoods creating clutsters itself

### Part 2 - Finding venues from foursquare

In [7]:
CLIENT_ID = '***' # your Foursquare ID
CLIENT_SECRET = '***' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [8]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

import requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

from pandas.io.json import json_normalize

import folium # map rendering library

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
paris_venues = getNearbyVenues(names=geographical_df['Neighborhood'],
                                   latitudes=geographical_df['Latitude'],
                                   longitudes=geographical_df['Longitude']
                                  )
#extracting venues arround each neighborhood from foursquare

In [12]:
print('There are {} uniques categories.'.format(len(paris_venues['Venue Category'].unique())))
paris_venues[['Neighborhood','Venue']].groupby('Neighborhood').count().head()
#checking extracted data from foursquare

There are 298 uniques categories.


Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
Amérique,14
Archives,100
Arsenal,68
Arts-et-Métiers,100
Auteuil,17


### Part 3 - First clustering

In [13]:
# one hot encoding
paris_onehot = pd.get_dummies(paris_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
paris_onehot['Neighborhood'] = paris_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [paris_onehot.columns[-1]] + list(paris_onehot.columns[:-1])
paris_onehot = paris_onehot[fixed_columns]

paris_grouped = paris_onehot.groupby('Neighborhood').mean().reset_index()

In [14]:
import numpy as np

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 20

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = paris_grouped['Neighborhood']

for ind in np.arange(paris_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(paris_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head() #checing data treatment

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,Amérique,Pool,French Restaurant,Supermarket,Park,Bistro,Plaza,Street Art,Bed & Breakfast,Café,Asian Restaurant,Health Food Store,Empanada Restaurant,English Restaurant,Ethiopian Restaurant,Exhibit,Fast Food Restaurant,Falafel Restaurant,Farmers Market,Fish & Chips Shop,Fish Market
1,Archives,French Restaurant,Hotel,Italian Restaurant,Bookstore,Burger Joint,Bar,Coffee Shop,Bistro,Clothing Store,Plaza,Cocktail Bar,Art Gallery,Restaurant,Tapas Restaurant,Japanese Restaurant,Sandwich Place,Pizza Place,Falafel Restaurant,Pastry Shop,Deli / Bodega
2,Arsenal,French Restaurant,Hotel,Tapas Restaurant,Plaza,Park,Italian Restaurant,Thai Restaurant,Seafood Restaurant,Cocktail Bar,Museum,Pedestrian Plaza,Vegetarian / Vegan Restaurant,Gastropub,Boat or Ferry,Garden,Gym,Spa,Ice Cream Shop,Brazilian Restaurant,Southwestern French Restaurant
3,Arts-et-Métiers,French Restaurant,Hotel,Bar,Italian Restaurant,Restaurant,Cocktail Bar,Wine Bar,Chinese Restaurant,Vietnamese Restaurant,Moroccan Restaurant,Japanese Restaurant,Steakhouse,Seafood Restaurant,Café,Bakery,Theater,Wine Shop,Vegetarian / Vegan Restaurant,Art Gallery,Coffee Shop
4,Auteuil,Tennis Court,Stadium,Garden,Outdoors & Recreation,Museum,Racecourse,Botanical Garden,French Restaurant,Office,Sporting Goods Shop,Department Store,Design Studio,Flower Shop,Costume Shop,Coworking Space,Flea Market,Fish Market,Creperie,Fish & Chips Shop,Cultural Center


In [15]:
# set number of clusters
kclusters = 4

paris_grouped_clustering = paris_grouped.drop('Neighborhood', 1)

In [16]:
# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(paris_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:30]

array([2, 2, 1, 2, 0, 1, 2, 2, 2, 3, 3, 2, 3, 1, 1, 1, 3, 2, 1, 1, 1, 3,
       2, 2, 2, 2, 1, 3, 1, 1])

In [17]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

paris_merged = geographical_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
paris_merged = paris_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

paris_merged = paris_merged.dropna() #make we are able to assign a cluster to every Neighborhood

In [18]:
paris_merged.head() #find cluster labels

Unnamed: 0,Id_seq,Code INSEE,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue,15th Most Common Venue,16th Most Common Venue,17th Most Common Venue,18th Most Common Venue,19th Most Common Venue,20th Most Common Venue
0,750000023,7510603.0,6,Notre-Dame-des-Champs,48.846428,2.327357,1,French Restaurant,Hotel,Bakery,Japanese Restaurant,Italian Restaurant,Bistro,Creperie,Steakhouse,Bagel Shop,Ice Cream Shop,Pizza Place,Cosmetics Shop,Chocolate Shop,Café,Tea Room,Theater,Pub,Art Gallery,Cajun / Creole Restaurant,Korean Restaurant
1,750000055,7511403.0,14,Petit-Montrouge,48.826653,2.326437,3,Hotel,French Restaurant,Italian Restaurant,Supermarket,Bakery,Discount Store,Sandwich Place,Fast Food Restaurant,Food & Drink Shop,Sushi Restaurant,Bistro,Plaza,Japanese Restaurant,African Restaurant,Szechuan Restaurant,EV Charging Station,Dessert Shop,Garden,Restaurant,Organic Grocery
2,750000074,7511902.0,19,Pont-de-Flandre,48.895556,2.384777,2,Restaurant,Hotel,French Restaurant,Movie Theater,Café,Bistro,Rock Club,Supermarket,Asian Restaurant,Tram Station,Spanish Restaurant,Boat or Ferry,Dive Spot,Steakhouse,Food & Drink Shop,Music Venue,Cocktail Bar,Metro Station,Mexican Restaurant,Science Museum
3,750000062,7511602.0,16,Muette,48.863275,2.259936,1,French Restaurant,Pool,Boat or Ferry,Lake,Diner,Gym / Fitness Center,Cycle Studio,Empanada Restaurant,Food & Drink Shop,Food,Fondue Restaurant,Flower Shop,Flea Market,Creperie,Fish Market,Fish & Chips Shop,Fast Food Restaurant,Farmers Market,Falafel Restaurant,Exhibit
4,750000064,7511604.0,16,Chaillot,48.868434,2.291679,3,French Restaurant,Hotel,Italian Restaurant,Bakery,Café,Art Museum,Chinese Restaurant,Salad Place,Japanese Restaurant,Pub,Plaza,Seafood Restaurant,Food & Drink Shop,Burger Joint,Spa,Brasserie,Molecular Gastronomy Restaurant,Lounge,Mediterranean Restaurant,Cantonese Restaurant


### Part 4 - Geographical display

In [19]:
address = 'Paris'

geolocator = Nominatim(user_agent="paris_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Paris are {}, {}.'.format(latitude, longitude))

map_paris = folium.Map(location=[latitude, longitude], zoom_start=12)

The geograpical coordinate of Paris are 48.8566969, 2.3514616.


In [20]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters+1)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(paris_merged['Latitude'], paris_merged['Longitude'], paris_merged['Neighborhood'], paris_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Part 5 - Second clustering

Suppose we are considering a move from Petit-Montrouge neighborhood. To inform our choice, we consider additional data.

We will consider population data to cluster remaining neighborhoods.

Population data collected by INSEE from apur (https://opendata.apur.org/datasets/recensement-commune-population/data).

In [21]:
current_neighborhood = 'Petit-Montrouge' 
paris_merged.set_index(['Neighborhood'], inplace=True)  #necessary to extract label
current_label = paris_merged.loc[current_neighborhood, 'Cluster Labels']
paris_merged = paris_merged[['Borough', 'Cluster Labels', 'Code INSEE', 'Id_seq']]
paris_merged = paris_merged[paris_merged['Cluster Labels'] == current_label] #considering only neighborhoods similar to Petit-Montrouge
paris_merged.drop(['Cluster Labels'], axis=1, inplace=True) #not relevant anymore
paris_merged['Code INSEE commune'] = round(paris_merged['Code INSEE'], -2)/100 #necessar to merge data
paris_merged.head()

Unnamed: 0_level_0,Borough,Code INSEE,Id_seq,Code INSEE commune
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Petit-Montrouge,14,7511403.0,750000055,75114.0
Chaillot,16,7511604.0,750000064,75116.0
Necker,15,7511502.0,750000058,75115.0
Place-Vendôme,1,7510104.0,750000004,75101.0
Saint-Germain-des-Prés,6,7510604.0,750000024,75106.0


In [22]:
population_df = pd.read_csv("RECENSEMENT_COMMUNE_POPULATION.csv", sep=',')
population_df = population_df[['c_cainsee', 'nb_densite', 'nb_pop', 'nb_p_age_40']] #considering only relevant data
population_df.rename(columns={"c_cainsee" : "Code INSEE commune", "nb_densite" : "Density", "nb_p_age_40" : "Population under 40", "nb_pop" : "Population"}, inplace=True)
population_df['Code INSEE commune'] = round(population_df['Code INSEE commune'],0)
population_df.head()

Unnamed: 0,Code INSEE commune,Density,Population,Population under 40
0,91479.0,12.112378,7437.0,483.014882
1,91432.0,27.1375,13026.0,910.976631
2,93030.0,27.005141,10505.0,772.679731
3,92035.0,166.752809,29682.0,2344.732662
4,92073.0,128.139842,48565.0,4184.0182


In [23]:
#Cartesian product
paris_merged.reset_index(inplace=True) #necessary to keep information on Neighborhood
paris1_df = pd.merge(paris_merged, population_df,on='Code INSEE commune')
paris1_df.head() #merging geographical and population related data

Unnamed: 0,Neighborhood,Borough,Code INSEE,Id_seq,Code INSEE commune,Density,Population,Population under 40
0,Petit-Montrouge,14,7511403.0,750000055,75114.0,248.212766,139992.0,8510.003368
1,Plaisance,14,7511404.0,750000056,75114.0,248.212766,139992.0,8510.003368
2,Chaillot,16,7511604.0,750000064,75116.0,101.091631,165487.0,10713.616248
3,Necker,15,7511502.0,750000058,75115.0,277.115566,234994.0,15453.596566
4,Grenelle,15,7511503.0,750000059,75115.0,277.115566,234994.0,15453.596566


In [24]:
#Uniformization of data to make sure one won't be dominant when clustering
paris1_df['Pourcentage population under 40'] = paris1_df['Population under 40']/paris1_df['Population']
density_max = int(paris1_df[['Density']].max())+1
paris1_df['Density index'] = paris1_df['Density'].div(density_max)
paris1_df = paris1_df[['Id_seq', 'Code INSEE commune', 'Neighborhood', 'Density index', 'Pourcentage population under 40']]
paris1_df.head()

Unnamed: 0,Id_seq,Code INSEE commune,Neighborhood,Density index,Pourcentage population under 40
0,750000055,75114.0,Petit-Montrouge,0.832929,0.060789
1,750000056,75114.0,Plaisance,0.832929,0.060789
2,750000064,75116.0,Chaillot,0.339234,0.06474
3,750000058,75115.0,Necker,0.929918,0.065762
4,750000059,75115.0,Grenelle,0.929918,0.065762


In [25]:
#clustering
from sklearn.preprocessing import StandardScaler

X = paris1_df.values[:,3:]
X = np.nan_to_num(X)
Clus_dataSet = StandardScaler().fit_transform(X)
Clus_dataSet

clusterNum = 3
k_means = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12)
k_means.fit(X)
labels = k_means.labels_
paris1_df["Cluster Labels 2"] = labels

In [26]:
paris1_df.set_index(['Neighborhood'], inplace=True)
current_label_2 = paris1_df.iloc[1]['Cluster Labels 2']
paris1_df = paris1_df[['Id_seq', 'Code INSEE commune', 'Cluster Labels 2']]
paris1_df = paris1_df[paris1_df['Cluster Labels 2'] == current_label_2]
paris1_df.drop(['Cluster Labels 2'], axis=1, inplace=True)
paris1_df #final selection

Unnamed: 0_level_0,Id_seq,Code INSEE commune
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1
Petit-Montrouge,750000055,75114.0
Plaisance,750000056,75114.0
Necker,750000058,75115.0
Grenelle,750000059,75115.0
Plaine de Monceaux,750000066,75117.0
Saint-Georges,750000033,75109.0
Chaussée-d'Antin,750000034,75109.0


In [27]:
paris1_df.index

Index(['Petit-Montrouge', 'Plaisance', 'Necker', 'Grenelle',
       'Plaine de Monceaux', 'Saint-Georges', 'Chaussée-d'Antin'],
      dtype='object', name='Neighborhood')

To conclude, as a data scientist, we advise the company to move headquarters to one of the 6 neighborhoods previously highlighted.