# Final project
## Identifying the best neighborhood for opening Korean restaurant
First we import libraries and packages

In [1]:
# importing necessery libraries and packages for the project
import numpy as np
import pandas as pd

!pip install bs4
from bs4 import BeautifulSoup

!pip install requests
import requests

!pip install ibm-db
import ibm_db

from sklearn.cluster import KMeans 



## Data Collection
 We will collect the data about average and mean annual income of each neighborhood from webpage by webscrapping using beautifulsoup package

In [2]:
# creating url and beautiful soup instance for further webscrapping
avg_income_url = "https://www03.cmhc-schl.gc.ca/hmip-pimh/en/TableMapChart/TableMatchingCriteria?GeographyType=MetropolitanMajorArea&GeographyId=2270&CategoryLevel1=Population%2C%20Households%20and%20Housing%20Stock&CategoryLevel2=Household%20Income&ColumnField=HouseholdIncomeRange&RowField=Neighbourhood&SearchTags%5B0%5D.Key=Households&SearchTags%5B0%5D.Value=Number&SearchTags%5B1%5D.Key=Statistics&SearchTags%5B1%5D.Value=AverageAndMedian"
avg_income_data = requests.get(avg_income_url).text
soup = BeautifulSoup(avg_income_data, 'html5lib')
tables_list = soup.find_all('table')
# identifying the number of tables in the webpage
print('There are {} table(s) in this web page'.format(len(tables_list)))

There are 1 table(s) in this web page


In [3]:
# Getting the first table html code
avg_income_table = tables_list[0]

In [4]:
# Webscrapping the table
columns = ['Neigbhorhood']
first_row = avg_income_table.find('tr')

ths = first_row.find_all('th')
for th in ths:
    col = th.find('span').text
    columns.append(col)
print(columns)

['Neigbhorhood', 'Average Household Income Before Taxes', 'Median Household Income Before Taxes', 'Average Household Income After Taxes', 'Median Household Income After Taxes']


In [5]:
#creating the income dataframe
income_df = pd.DataFrame(columns = columns)
income_df


Unnamed: 0,Neigbhorhood,Average Household Income Before Taxes,Median Household Income Before Taxes,Average Household Income After Taxes,Median Household Income After Taxes


In the code below we iterating over rows of the table on the webpage to retreive the data in each cell of the table and insert it into our income dataframe

In [6]:
rows = avg_income_table.find_all('tr')
rows = rows[1:]
for row in rows:
    col = row.find_all("td")
    if (col != []):
        neighborhood = row.find('th').text
        avg_inc_before = col[0].text
        mean_inc_before = col[1].text
        avg_inc_after = col[2].text.strip()
        mean_inc_after = col[3].text.strip()
        income_df = income_df.append({"Neighborhood":neighborhood,
                                      "Average Household Income Before Taxes":avg_inc_before, 
                                      "Median Household Income Before Taxes":mean_inc_before,
                                      "Average Household Income After Taxes":avg_inc_after, 
                                      "Median Household Income After Taxes":mean_inc_after}, ignore_index=True)
income_df = income_df.dropna(axis = 1)

In [7]:
income_df = income_df[['Neighborhood', 'Average Household Income Before Taxes',
                       'Median Household Income Before Taxes', 'Average Household Income After Taxes',
                       'Median Household Income After Taxes']]
print('The size of the table: {}'.format(income_df.shape))
income_df.to_csv(r'C:\Users\Beket\Documents\GitHub\Coursera_Capstone\income_df.csv', index = False, header = True)
pd.set_option('display.max_rows', None)
print(income_df)

The size of the table: (135, 5)
                                      Neighborhood  \
0                                          Toronto   
1                                Agincourt/Malvern   
2                                   Ajax/Pickering   
3                                        Alderwood   
4                                           Aurora   
5                     Banbury-Don Mills/York Mills   
6                                   Bathurst Manor   
7                              Bay Street Corridor   
8                                  Bayview Village   
9          Bayview Woods-Steeles/Hillcrest Village   
10                            Bedford Park-Nortown   
11                         Beechborough-Greenbrook   
12                                         Bendale   
13                           Birchcliffe-Cliffside   
14                                     Black Creek   
15         Bradford/West Gwillimbury/New Tecumseth   
16                                 Brampton (East)

<b>In the code below we accessing the infromation about the Business Improvement areas of Toronto from OpenData Toronto portal through their API</b>

In [8]:
url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
params = { "id": "9edb9628-1213-42bd-8352-5c4ed28e9e42"}
package = requests.get(url, params = params).json()
print(package["result"])
 
# Get the data by passing the resource_id to the datastore_search endpoint
# See https://docs.ckan.org/en/latest/maintaining/datastore.html for detailed parameters options
# For example, to retrieve the data content for the first resource in the datastore:
 
for idx, resource in enumerate(package["result"]["resources"]):
    if resource["datastore_active"]:
        url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/datastore_search"
        p = { "id": resource["id"] }
        data = requests.get(url, params = p).json()
        bia_df = pd.DataFrame(data["result"]["records"])
        break
bia_df

{'license_title': 'Open Government Licence – Toronto', 'owner_unit': None, 'relationships_as_object': [], 'topics': 'Business,Locations and mapping', 'owner_email': 'opendata@toronto.ca', 'excerpt': 'Boundaries for the active Business Improvement Areas within the City of Toronto.', 'private': False, 'owner_division': 'Economic Development & Culture', 'num_tags': 8, 'id': '9edb9628-1213-42bd-8352-5c4ed28e9e42', 'metadata_created': '2019-07-23T16:45:34.043754', 'refresh_rate': 'Weekly', 'title': 'Business Improvement Areas', 'license_url': 'https://open.toronto.ca/open-data-license/', 'state': 'active', 'information_url': None, 'license_id': 'open-government-licence-toronto', 'type': 'dataset', 'resources': [{'cache_last_updated': None, 'package_id': '9edb9628-1213-42bd-8352-5c4ed28e9e42', 'datastore_active': True, 'id': 'd173e644-ace0-45e0-be43-8ba02fb116eb', 'size': None, 'format': 'GeoJSON', 'state': 'active', 'hash': '', 'description': '', 'is_preview': True, 'last_modified': '2021-0

Unnamed: 0,_id,AREA_ID,DATE_EFFECTIVE,AREA_ATTR_ID,PARENT_AREA_ID,AREA_SHORT_CODE,AREA_LONG_CODE,AREA_NAME,AREA_DESC,X,Y,LONGITUDE,LATITUDE,OBJECTID,Shape__Area,Shape__Length,geometry
0,7252,2482435,2020-12-16T20:42:00,26007418,,009-01,009-01,Riverside District,Riverside District,,,,,17577025,158831.5,3143.628503,"{""type"": ""Polygon"", ""coordinates"": [[[-79.3459..."
1,7253,2482434,2020-12-16T20:42:00,26007417,,041-03,041-03,Regal Heights Village,Regal Heights Village,,,,,17577041,130974.7,2639.776967,"{""type"": ""Polygon"", ""coordinates"": [[[-79.4324..."
2,7254,2482433,2020-12-16T20:42:00,26007416,,084-00,084-00,Queen Street West,Queen Street West,,,,,17577057,279383.8,4602.998896,"{""type"": ""Polygon"", ""coordinates"": [[[-79.3874..."
3,7255,2482432,2020-12-16T20:42:00,26007415,,006-00,006-00,Parkdale Village,Parkdale Village,,,,,17577073,275427.8,4980.837329,"{""type"": ""Polygon"", ""coordinates"": [[[-79.4457..."
4,7256,2482431,2020-12-16T20:42:00,26007414,,024-00,024-00,Pape Village,Pape Village,,,,,17577089,94993.89,2015.592108,"{""type"": ""Polygon"", ""coordinates"": [[[-79.3484..."
5,7257,2482430,2020-12-16T20:42:00,26007413,,049-03,049-03,Little Portugal On Dundas,Little Portugal On Dundas,,,,,17577105,382227.4,7628.898987,"{""type"": ""Polygon"", ""coordinates"": [[[-79.4377..."
6,7258,2482429,2020-12-16T20:42:00,26007412,,020-01,020-01,Little Italy,Little Italy,,,,,17577121,232341.6,3917.542802,"{""type"": ""Polygon"", ""coordinates"": [[[-79.4205..."
7,7259,2482428,2020-12-16T20:42:00,26007411,,042-01,042-01,Liberty Village,Liberty Village,,,,,17577137,797292.1,4400.913504,"{""type"": ""Polygon"", ""coordinates"": [[[-79.4246..."
8,7260,2482427,2020-12-16T20:42:00,26007410,,093-01,093-01,Leslieville,Leslieville,,,,,17577153,351302.9,6457.749078,"{""type"": ""Polygon"", ""coordinates"": [[[-79.3240..."
9,7261,2482426,2020-12-16T20:42:00,26007409,,094-03,094-03,Lawrence Ingram Keele,Lawrence Ingram Keele,,,,,17577169,2108682.0,7158.573022,"{""type"": ""Polygon"", ""coordinates"": [[[-79.4790..."


<b>Here we collecting data about the neighborhoods by webscrapping Wikipedia page</b>

In [9]:

neighborhood_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
neighborhood_data = requests.get(neighborhood_url).text
soup = BeautifulSoup(neighborhood_data, 'html5lib')
tables_list = soup.find_all('table')

print("There are {} tables in the webpage".format(len(tables_list)))

There are 3 tables in the webpage


In [10]:
for ind,table in enumerate(tables_list):
    if ("M1A" in str(table)):
        table_ind = ind
print("The table we want is of index {}".format(table_ind))
neigh_table = tables_list[0]

The table we want is of index 0


In [11]:
# Creating an empty list to store the data
table_contents = []

#Iterating through the each table cell that is inside the <td> tag
for row in neigh_table.find_all("td"):
    # Creating an empty cell dict
    cell = {}
    
    # Checking the condition of cell containing "Not assigned"
    if row.span.text == 'Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[0:3] # Getting the PostalCode value, which is the first 3 char values in each cell
        cell['Borough'] = row.span.text.split('(')[0] # Getting Borough data 
        #Getting the Neighborhood data
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ') 
        
        #Adding the cell values into the list
        table_contents.append(cell)
        
# Transforming the list into the dataframe 
neigh_df=pd.DataFrame(table_contents)

# Make adjustments
neigh_df['Borough']=neigh_df['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

print("Shape of table is {}".format(neigh_df.shape))
pd.set_option('display.max_rows', None)
print(neigh_df)

Shape of table is (103, 3)
    PostalCode                 Borough  \
0          M3A              North York   
1          M4A              North York   
2          M5A        Downtown Toronto   
3          M6A              North York   
4          M7A            Queen's Park   
5          M9A               Etobicoke   
6          M1B             Scarborough   
7          M3B              North York   
8          M4B               East York   
9          M5B        Downtown Toronto   
10         M6B              North York   
11         M9B               Etobicoke   
12         M1C             Scarborough   
13         M3C              North York   
14         M4C               East York   
15         M5C        Downtown Toronto   
16         M6C                    York   
17         M9C               Etobicoke   
18         M1E             Scarborough   
19         M4E            East Toronto   
20         M5E        Downtown Toronto   
21         M6E                    York   
22     

### Part 2: Getting the geo data for each Neighborhood

In [12]:
#installing the package
!pip install pgeocode
import pgeocode



<b>Identifying the latitude and longitude values of each neighborhood address using the geolocator and storing the values in the respective variables</b>

In [13]:
postalCodes = neigh_df['PostalCode'].tolist() # Converting postal codes to the list

geolocator = pgeocode.Nominatim('ca')# Define the geolocator

latitudes = [] # Creating the list for latitude data
longitudes = [] # Creating the list for longitude data

for i, postalCode in enumerate(postalCodes): # Iterating through the postal codes to get their latitude and longitude
    
    # Getting the specific location
    g = geolocator.query_postal_code(postalCode)
    
    # Get latitude and longitude
    if not g.empty:
        latitudes.append(g.latitude)
        longitudes.append(g.longitude)
    else:
        latitudes.append("Not found")
        longitudes.append("Not found")

In [14]:
# Inserting the latitude and longitude data into the neighborhoods dataframe
neigh_df['Latitude'] = latitudes
neigh_df['Longitude'] = longitudes
neigh_df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7545,-79.33
1,M4A,North York,Victoria Village,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7223,-79.4504
4,M7A,Queen's Park,Ontario Provincial Government,43.6641,-79.3889


## Data preparation 
<b>In this part of the project we simplifying our dataframes by dropping unnecessery columns and data</b>

In [15]:
# Dropping none columns
bia_df.drop(['PARENT_AREA_ID', '_id','AREA_DESC','X', 'Y', 'LONGITUDE', 'OBJECTID','LATITUDE', 'AREA_ATTR_ID','DATE_EFFECTIVE','AREA_LONG_CODE','Shape__Area','Shape__Length', 'geometry'], axis = 1, inplace = True)
bia_df.to_csv(r'C:\Users\Beket\Documents\GitHub\Coursera_Capstone\bia_df.csv', index = False, header = True)
bia_df.head()

Unnamed: 0,AREA_ID,AREA_SHORT_CODE,AREA_NAME
0,2482435,009-01,Riverside District
1,2482434,041-03,Regal Heights Village
2,2482433,084-00,Queen Street West
3,2482432,006-00,Parkdale Village
4,2482431,024-00,Pape Village


In [16]:
# Dropping the 76th row because there is a nan value there
neigh_df.drop(76, inplace=True)
neigh_df.reset_index(inplace=True)
print("The shape of the dataframe is {}".format(neigh_df.shape))

The shape of the dataframe is (102, 6)


In [17]:
# Merging the income and neighborhood dataframes to work with it later
merged_df = income_df.merge(neigh_df, left_on = 'Neighborhood', right_on = 'Neighborhood')
merged_df.head()

Unnamed: 0,Neighborhood,Average Household Income Before Taxes,Median Household Income Before Taxes,Average Household Income After Taxes,Median Household Income After Taxes,index,PostalCode,Borough,Latitude,Longitude
0,Bayview Village,96616,67355,76983,58341,39,M2K,North York,43.7797,-79.3813
1,Humewood-Cedarvale,112354,61110,86327,54557,16,M6C,York,43.6915,-79.4307
2,Rosedale,423428,114625,275010,92347,91,M4W,Downtown Toronto,43.6827,-79.373
3,Scarborough Village,67599,49568,58858,46658,32,M1J,Scarborough,43.7464,-79.2323
4,The Beaches,166462,100365,121388,82713,19,M4E,East Toronto,43.6784,-79.2941


In [18]:
neigh_df.shape

(102, 6)

In [19]:
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

## Foursquare API
<b> In this part of the project we use foursquare API in order to access to the data about the number of restaurants and store it in our merged dataframe</b>

In [20]:
# Foursquare client credentials
CLIENT_ID = 'G1ZNGFWYJGBC3NBIMUUXMNKXK03CKMQJO44AWAAWQQAJZ41J' # your Foursquare ID
CLIENT_SECRET = '550RG0M2CN4D5DDSJLEAWRQBCVU0FQBYFK4HLLXN4IAZD44O' # your Foursquare Secret
ACCESS_TOKEN = '0F0MVWAHOCRIWNT1TULUPPHBSV5QDIKRZCRULT2HCORXMRZP' # your FourSquare Access Token
VERSION = '20210403'
LIMIT = 100
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: G1ZNGFWYJGBC3NBIMUUXMNKXK03CKMQJO44AWAAWQQAJZ41J
CLIENT_SECRET:550RG0M2CN4D5DDSJLEAWRQBCVU0FQBYFK4HLLXN4IAZD44O


<b>In order to get the number of restaurants in radius of 500 meters we make empty list and iterate over the rows in our merged dataframe to get the latitude and longitude data of each neighborhood and then using those values, using explore reguest for Foursquare API we get the number of restaurants in each neighborhood and store them in the list

In [21]:
ns_restaurants = [] # empty list
for i in range(9):
    category_id = '4d4b7105d754a06374d81259'
    radius = 500
    latitude = merged_df.loc[i, 'Latitude']
    longitude = merged_df.loc[i, 'Longitude']
    LIMIT = 100
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&categoryid={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, category_id, radius, LIMIT)
    results = requests.get(url).json()
    ns_restaurants.append(len(results['response']['groups'][0]['items']))
    
ns_restaurants

[5, 10, 4, 5, 15, 50, 8, 4, 3]

<b> We insert the data about the number of restaurants to the new column in our dataframe

In [22]:
merged_df['Number_of_Restaurants'] = ns_restaurants

<b> Explore dataframe

In [23]:
merged_df

Unnamed: 0,Neighborhood,Average Household Income Before Taxes,Median Household Income Before Taxes,Average Household Income After Taxes,Median Household Income After Taxes,index,PostalCode,Borough,Latitude,Longitude,Number_of_Restaurants
0,Bayview Village,96616,67355,76983,58341,39,M2K,North York,43.7797,-79.3813,5
1,Humewood-Cedarvale,112354,61110,86327,54557,16,M6C,York,43.6915,-79.4307,10
2,Rosedale,423428,114625,275010,92347,91,M4W,Downtown Toronto,43.6827,-79.373,4
3,Scarborough Village,67599,49568,58858,46658,32,M1J,Scarborough,43.7464,-79.2323,5
4,The Beaches,166462,100365,121388,82713,19,M4E,East Toronto,43.6784,-79.2941,15
5,Thorncliffe Park,56502,46595,50262,43628,29,M4H,East York,43.7059,-79.3464,50
6,Victoria Village,66525,51867,57242,46841,1,M4A,North York,43.7276,-79.3148,8
7,Weston,60230,45696,52849,42405,64,M9N,York,43.7068,-79.517,4
8,Woburn,70455,58921,62213,53645,22,M1G,Scarborough,43.7712,-79.2144,3


# Modelling our clustering model
<b> For this project I will use the K Means clustering model because it is most suitable model for this kind of business problem

## Preprocessing
<b> In this part we preprocess our dataframe in order to be able to cluster it </b>

In the code below we saving our string values and dropping them from the dataframe

In [24]:
indexes = merged_df['index']
neighborhoods = merged_df['Neighborhood']
latitudes = merged_df['Latitude']
longitudes = merged_df['Longitude']
merged_df.drop(['Neighborhood', 'PostalCode', 'Borough', 'index', 'Latitude', 'Longitude'], axis = 1, inplace = True)
x_df = merged_df
x_df

Unnamed: 0,Average Household Income Before Taxes,Median Household Income Before Taxes,Average Household Income After Taxes,Median Household Income After Taxes,Number_of_Restaurants
0,96616,67355,76983,58341,5
1,112354,61110,86327,54557,10
2,423428,114625,275010,92347,4
3,67599,49568,58858,46658,5
4,166462,100365,121388,82713,15
5,56502,46595,50262,43628,50
6,66525,51867,57242,46841,8
7,60230,45696,52849,42405,4
8,70455,58921,62213,53645,3


<b> Because our values in the columns of average and mean annual income are string we need to format them into float to be able to work with them in our cluster model

In [25]:
float_list = []
for item in x_df['Average Household Income Before Taxes'].items():
    float_list.append(float(item[1].replace(',', '.')))
float_list1 = []
for item in x_df['Median Household Income Before Taxes'].items():
    float_list1.append(float(item[1].replace(',', '.')))
float_list2 = []
for item in x_df['Average Household Income After Taxes'].items():
    float_list2.append(float(item[1].replace(',', '.')))
float_list3 = []
for item in x_df['Median Household Income After Taxes'].items():
    float_list3.append(float(item[1].replace(',', '.')))
x_df['Average Household Income Before Taxes'] = float_list
x_df['Median Household Income Before Taxes'] = float_list1
x_df['Average Household Income After Taxes'] = float_list2
x_df['Median Household Income After Taxes'] = float_list3
x_df

Unnamed: 0,Average Household Income Before Taxes,Median Household Income Before Taxes,Average Household Income After Taxes,Median Household Income After Taxes,Number_of_Restaurants
0,96.616,67.355,76.983,58.341,5
1,112.354,61.11,86.327,54.557,10
2,423.428,114.625,275.01,92.347,4
3,67.599,49.568,58.858,46.658,5
4,166.462,100.365,121.388,82.713,15
5,56.502,46.595,50.262,43.628,50
6,66.525,51.867,57.242,46.841,8
7,60.23,45.696,52.849,42.405,4
8,70.455,58.921,62.213,53.645,3


<b> Here we normalizing the float values in our dataframe in order to cluster model to properly work with them

In [26]:
from sklearn.preprocessing import StandardScaler
X = x_df.values[:,1:]
X = np.nan_to_num(X)
Clus_dataSet = StandardScaler().fit_transform(X)
Clus_dataSet

array([[ 0.04816388, -0.24392037,  0.02610052, -0.46637766],
       [-0.22004687, -0.10558724, -0.19984741, -0.11066589],
       [ 2.07831956,  2.68776773,  2.0566463 , -0.53752002],
       [-0.71575359, -0.51225169, -0.67150774, -0.46637766],
       [ 1.46587996,  0.41347287,  1.4813867 ,  0.24504589],
       [-0.84343823, -0.63951106, -0.85243328,  2.73502832],
       [-0.61701596, -0.53617574, -0.66058056, -0.2529506 ],
       [-0.88204855, -0.60121185, -0.92546032, -0.53752002],
       [-0.31406021, -0.46258264, -0.25430421, -0.60866237]])

<b> Here we are setting up our K Means clusterring model

In [27]:
clusterNum = 3
k_means = KMeans(init = "k-means++", n_clusters = clusterNum, n_init = 12)
k_means.fit(X)
labels = k_means.labels_
print(labels)

[0 0 1 0 2 0 0 0 0]


<b>We get our clustered labels for each neighborhood. We are observing the mean of the values of each cluster

In [28]:
x_df['cluster_label'] = labels
x_df.groupby('cluster_label').mean()

Unnamed: 0_level_0,Average Household Income Before Taxes,Median Household Income Before Taxes,Average Household Income After Taxes,Median Household Income After Taxes,Number_of_Restaurants
cluster_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,75.754429,54.444571,63.533429,49.439286,12.142857
1,423.428,114.625,275.01,92.347,4.0
2,166.462,100.365,121.388,82.713,15.0


<b> We inserting our labels into dataframe and changing labels for more understandable ones

In [29]:
merged_df['cluster_label'] = labels

In [30]:
merged_df.loc[(merged_df.cluster_label == 0), ['cluster_label']] = 'good'
merged_df.loc[(merged_df.cluster_label == 1), ['cluster_label']] = 'great'
merged_df.loc[(merged_df.cluster_label == 2), ['cluster_label']] = 'bad'
merged_df

Unnamed: 0,Average Household Income Before Taxes,Median Household Income Before Taxes,Average Household Income After Taxes,Median Household Income After Taxes,Number_of_Restaurants,cluster_label
0,96.616,67.355,76.983,58.341,5,good
1,112.354,61.11,86.327,54.557,10,good
2,423.428,114.625,275.01,92.347,4,great
3,67.599,49.568,58.858,46.658,5,good
4,166.462,100.365,121.388,82.713,15,bad
5,56.502,46.595,50.262,43.628,50,good
6,66.525,51.867,57.242,46.841,8,good
7,60.23,45.696,52.849,42.405,4,good
8,70.455,58.921,62.213,53.645,3,good


<b> returning the saved and dropped columns back to our dataframe

In [31]:
merged_df['neigbhorhood_index'] = indexes
merged_df['Neighborhood'] = neighborhoods
merged_df['Latitude'] = latitudes 
merged_df['Longitude'] = longitudes
merged_df

Unnamed: 0,Average Household Income Before Taxes,Median Household Income Before Taxes,Average Household Income After Taxes,Median Household Income After Taxes,Number_of_Restaurants,cluster_label,neigbhorhood_index,Neighborhood,Latitude,Longitude
0,96.616,67.355,76.983,58.341,5,good,39,Bayview Village,43.7797,-79.3813
1,112.354,61.11,86.327,54.557,10,good,16,Humewood-Cedarvale,43.6915,-79.4307
2,423.428,114.625,275.01,92.347,4,great,91,Rosedale,43.6827,-79.373
3,67.599,49.568,58.858,46.658,5,good,32,Scarborough Village,43.7464,-79.2323
4,166.462,100.365,121.388,82.713,15,bad,19,The Beaches,43.6784,-79.2941
5,56.502,46.595,50.262,43.628,50,good,29,Thorncliffe Park,43.7059,-79.3464
6,66.525,51.867,57.242,46.841,8,good,1,Victoria Village,43.7276,-79.3148
7,60.23,45.696,52.849,42.405,4,good,64,Weston,43.7068,-79.517
8,70.455,58.921,62.213,53.645,3,good,22,Woburn,43.7712,-79.2144


<b> Rearring the order of columns of a dataframe for appropriate view

In [32]:
cols = list(merged_df.columns.values)
cols

['Average Household Income Before Taxes',
 'Median Household Income Before Taxes',
 'Average Household Income After Taxes',
 'Median Household Income After Taxes',
 'Number_of_Restaurants',
 'cluster_label',
 'neigbhorhood_index',
 'Neighborhood',
 'Latitude',
 'Longitude']

In [33]:
cols = ['neigbhorhood_index', 'Neighborhood','Average Household Income Before Taxes',
 'Median Household Income Before Taxes',
 'Average Household Income After Taxes',
 'Median Household Income After Taxes',
 'Number_of_Restaurants',
 'Latitude',
 'Longitude', 'cluster_label']
merged_df = merged_df[cols]
merged_df

Unnamed: 0,neigbhorhood_index,Neighborhood,Average Household Income Before Taxes,Median Household Income Before Taxes,Average Household Income After Taxes,Median Household Income After Taxes,Number_of_Restaurants,Latitude,Longitude,cluster_label
0,39,Bayview Village,96.616,67.355,76.983,58.341,5,43.7797,-79.3813,good
1,16,Humewood-Cedarvale,112.354,61.11,86.327,54.557,10,43.6915,-79.4307,good
2,91,Rosedale,423.428,114.625,275.01,92.347,4,43.6827,-79.373,great
3,32,Scarborough Village,67.599,49.568,58.858,46.658,5,43.7464,-79.2323,good
4,19,The Beaches,166.462,100.365,121.388,82.713,15,43.6784,-79.2941,bad
5,29,Thorncliffe Park,56.502,46.595,50.262,43.628,50,43.7059,-79.3464,good
6,1,Victoria Village,66.525,51.867,57.242,46.841,8,43.7276,-79.3148,good
7,64,Weston,60.23,45.696,52.849,42.405,4,43.7068,-79.517,good
8,22,Woburn,70.455,58.921,62.213,53.645,3,43.7712,-79.2144,good


<b>From the clustering our dataframe we see that the best neighborhood for opening Korean Restaurant is Rosedale, because there are 4 restaurants in neighborhood and the highest annual income in comparison to other neighborhoods