# A Recommender System for Groceries Contractor

In [82]:
# importing libraries
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
from bs4 import BeautifulSoup
import requests # library to handle requests
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
import geopy.geocoders # convert an address into latitude and longitude values

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries are imported.')

Libraries are imported.


## Postal Codes in Toronto

In [2]:
# Loading the dataset which is about postal codes in Toronto
# This dataset was created in week 3. 
df_toronto = pd.read_csv('data1.csv')
df_toronto.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,0,0,M1B,Scarborough,"Rouge, Malvern",43.81165,-79.195561
1,1,1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785605,-79.158701
2,2,2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175299
3,3,3,M1G,Scarborough,Woburn,43.768216,-79.21761
4,4,4,M1H,Scarborough,Cedarbrae,43.769608,-79.23944


## Create a Map of Toronto City (with its Postal Codes' Regions)

In [83]:
# for the city Toronto, latitude and longtitude are manually extracted via google search
toronto_latitude = 43.6932; toronto_longitude = -79.3832
map_toronto = folium.Map(location = [toronto_latitude, toronto_longitude], zoom_start = 10.7)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    

map_toronto

## Focusing on the "Scarorough" Borough in Toronto (its neighborhoods)

In [4]:

# df_toronto['Borough'] == 'Scarborough'

# selecting only neighborhoods regarding to "Scarborough" borough.
scarborough_data = df_toronto[df_toronto['Borough'] == 'Scarborough']
scarborough_data = scarborough_data.reset_index(drop=True).drop(columns = 'Unnamed: 0')
scarborough_data.head()

Unnamed: 0,Unnamed: 0.1,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,0,M1B,Scarborough,"Rouge, Malvern",43.81165,-79.195561
1,1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785605,-79.158701
2,2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.76569,-79.175299
3,3,M1G,Scarborough,Woburn,43.768216,-79.21761
4,4,M1H,Scarborough,Cedarbrae,43.769608,-79.23944


## Create a Map of Scarborough and Its Neighbourhoods

In [5]:
address_scar = 'Scarborough, Toronto'
latitude_scar = 43.773077
longitude_scar = -79.257774
print('The geograpical coordinate of "Scarborough" are: {}, {}.'.format(latitude_scar, longitude_scar))

map_Scarborough = folium.Map(location=[latitude_scar, longitude_scar], zoom_start=11.5)

# add markers to map
for lat, lng, label in zip(scarborough_data['Latitude'], scarborough_data['Longitude'], scarborough_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 10,
        popup = label,
        color ='blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7).add_to(map_Scarborough)  
    
map_Scarborough

The geograpical coordinate of "Scarborough" are: 43.773077, -79.257774.


In [6]:
def foursquare_crawler (postal_code_list, neighborhood_list, lat_list, lng_list, LIMIT = 500, radius = 1000):
    result_ds = []
    counter = 0
    for postal_code, neighborhood, lat, lng in zip(postal_code_list, neighborhood_list, lat_list, lng_list):
         
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, 
            lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        tmp_dict = {}
        tmp_dict['Postal Code'] = postal_code; tmp_dict['Neighborhood(s)'] = neighborhood; 
        tmp_dict['Latitude'] = lat; tmp_dict['Longitude'] = lng;
        tmp_dict['Crawling_result'] = results;
        result_ds.append(tmp_dict)
        counter += 1
        print('{}.'.format(counter))
        print('Data is Obtained, for the Postal Code {} (and Neighborhoods {}) SUCCESSFULLY.'.format(postal_code, neighborhood))
    return result_ds;

In [7]:
# @hiddel_cell
CLIENT_ID = '0MJA3NYYG3U2ZY1LTZN2OYEHS3Y3WVSON2GBSO3IL4EDYVIR' # your Foursquare ID
CLIENT_SECRET = 'WGWSAF2TKVUQPE3PD0N3EOITFVBY5EYP1VCZI3BMUG0ROUS5' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

## Crawling Internet (in fact only Foursquare database) for

## Venues in the Neighborhoods inside "Scarborough"

In [8]:

print('Crawling different neighborhoods inside "Scarborough"')
Scarborough_foursquare_dataset = foursquare_crawler(list(scarborough_data['Postalcode']),
                                                   list(scarborough_data['Neighborhood']),
                                                   list(scarborough_data['Latitude']),
                                                   list(scarborough_data['Longitude']),)

Crawling different neighborhoods inside "Scarborough"
1.
Data is Obtained, for the Postal Code M1B (and Neighborhoods Rouge, Malvern) SUCCESSFULLY.
2.
Data is Obtained, for the Postal Code M1C (and Neighborhoods Highland Creek, Rouge Hill, Port Union) SUCCESSFULLY.
3.
Data is Obtained, for the Postal Code M1E (and Neighborhoods Guildwood, Morningside, West Hill) SUCCESSFULLY.
4.
Data is Obtained, for the Postal Code M1G (and Neighborhoods Woburn) SUCCESSFULLY.
5.
Data is Obtained, for the Postal Code M1H (and Neighborhoods Cedarbrae) SUCCESSFULLY.
6.
Data is Obtained, for the Postal Code M1J (and Neighborhoods Scarborough Village) SUCCESSFULLY.
7.
Data is Obtained, for the Postal Code M1K (and Neighborhoods East Birchmount Park, Ionview, Kennedy Park) SUCCESSFULLY.
8.
Data is Obtained, for the Postal Code M1L (and Neighborhoods Clairlea, Golden Mile, Oakridge) SUCCESSFULLY.
9.
Data is Obtained, for the Postal Code M1M (and Neighborhoods Cliffcrest, Cliffside, Scarborough Village West) 

# Breakpoint:
    
    
 ## Saving results of Foursquare, so that we would not need to connect every time to Foursquare (and use our portions) .

In [9]:
import pickle
with open("Scarborough_foursquare_dataset.txt", "wb") as fp:   #Pickling
    pickle.dump(Scarborough_foursquare_dataset, fp)
print('Received Data from Internet is Saved to Computer.')

Received Data from Internet is Saved to Computer.


In [10]:
with open("Scarborough_foursquare_dataset.txt", "rb") as fp:   # Unpickling
    Scarborough_foursquare_dataset = pickle.load(fp)
# print(type(Scarborough_foursquare_dataset))
# Scarborough_foursquare_dataset

## Cleaning the RAW Data Received from Foursquare Database

In [11]:

# This function is created to connect to the saved list which is the received database. It will extract each venue 
# for every neighborhood inside the database

def get_venue_dataset(foursquare_dataset):
    result_df = pd.DataFrame(columns = ['Postal Code', 'Neighborhood', 
                                           'Neighborhood Latitude', 'Neighborhood Longitude',
                                          'Venue', 'Venue Summary', 'Venue Category', 'Distance'])
    # print(result_df)
    
    for neigh_dict in foursquare_dataset:
        postal_code = neigh_dict['Postal Code']; neigh = neigh_dict['Neighborhood(s)']
        lat = neigh_dict['Latitude']; lng = neigh_dict['Longitude']
        print('Number of Venuse in Coordination "{}" Posal Code and "{}" Negihborhood(s) is:'.format(postal_code, neigh))
        print(len(neigh_dict['Crawling_result']))
        
        for venue_dict in neigh_dict['Crawling_result']:
            summary = venue_dict['reasons']['items'][0]['summary']
            name = venue_dict['venue']['name']
            dist = venue_dict['venue']['location']['distance']
            cat =  venue_dict['venue']['categories'][0]['name']
            
            
            # print({'Postal Code': postal_code, 'Neighborhood': neigh, 
            #                   'Neighborhood Latitude': lat, 'Neighborhood Longitude':lng,
            #                   'Venue': name, 'Venue Summary': summary, 
            #                   'Venue Category': cat, 'Distance': dist})
            
            result_df = result_df.append({'Postal Code': postal_code, 'Neighborhood': neigh, 
                              'Neighborhood Latitude': lat, 'Neighborhood Longitude':lng,
                              'Venue': name, 'Venue Summary': summary, 
                              'Venue Category': cat, 'Distance': dist}, ignore_index = True)
            # print(result_df)
    
    return(result_df)

In [12]:
scarborough_venues = get_venue_dataset(Scarborough_foursquare_dataset)

Number of Venuse in Coordination "M1B" Posal Code and "Rouge, Malvern" Negihborhood(s) is:
5
Number of Venuse in Coordination "M1C" Posal Code and "Highland Creek, Rouge Hill, Port Union" Negihborhood(s) is:
5
Number of Venuse in Coordination "M1E" Posal Code and "Guildwood, Morningside, West Hill" Negihborhood(s) is:
19
Number of Venuse in Coordination "M1G" Posal Code and "Woburn" Negihborhood(s) is:
17
Number of Venuse in Coordination "M1H" Posal Code and "Cedarbrae" Negihborhood(s) is:
25
Number of Venuse in Coordination "M1J" Posal Code and "Scarborough Village" Negihborhood(s) is:
10
Number of Venuse in Coordination "M1K" Posal Code and "East Birchmount Park, Ionview, Kennedy Park" Negihborhood(s) is:
19
Number of Venuse in Coordination "M1L" Posal Code and "Clairlea, Golden Mile, Oakridge" Negihborhood(s) is:
26
Number of Venuse in Coordination "M1M" Posal Code and "Cliffcrest, Cliffside, Scarborough Village West" Negihborhood(s) is:
16
Number of Venuse in Coordination "M1N" Pos

## Showing Venues for Each Neighborhood in Scarborough

In [13]:
scarborough_venues.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
0,M1B,"Rouge, Malvern",43.81165,-79.195561,Canadiana exhibit,This spot is popular,Zoo Exhibit,724
1,M1B,"Rouge, Malvern",43.81165,-79.195561,Wendy's,This spot is popular,Fast Food Restaurant,545
2,M1B,"Rouge, Malvern",43.81165,-79.195561,Grizzly Bear Exhibit,This spot is popular,Zoo Exhibit,622
3,M1B,"Rouge, Malvern",43.81165,-79.195561,simba safari lodge,This spot is popular,Zoo Exhibit,999
4,M1B,"Rouge, Malvern",43.81165,-79.195561,Lee Valley,This spot is popular,Hobby Shop,1001


In [14]:
scarborough_venues.tail()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
329,M1W,"L'Amoreaux West, Steeles West",43.800698,-79.32074,Pizza Pizza,This spot is popular,Pizza Place,375
330,M1W,"L'Amoreaux West, Steeles West",43.800698,-79.32074,Warden Park,This spot is popular,Other Great Outdoors,456
331,M1W,"L'Amoreaux West, Steeles West",43.800698,-79.32074,Pharmacy Ave. & Finch Ave. E,This spot is popular,Intersection,815
332,M1W,"L'Amoreaux West, Steeles West",43.800698,-79.32074,Tim Hortons,This spot is popular,Coffee Shop,979
333,M1W,"L'Amoreaux West, Steeles West",43.800698,-79.32074,Nicey's Food Mart,This spot is popular,Grocery Store,989


# Breakpoint:

## End of Processing the Retrieved Information from Foursquare

## Saving a Cleaned Version of DataFrame as the Results from Foursquare

In [15]:
scarborough_venues.to_csv('scarborough_venues.csv')

## Loading Data from File (Saved "Foursquare " DataFrame for Venues)

In [16]:
scarborough_venues = pd.read_csv('scarborough_venues.csv')

## Some Summary Information about Neighborhoods inside "Scarborough"

In [17]:
neigh_list = list(scarborough_venues['Neighborhood'].unique())
print('Number of Neighborhoods inside Scarborough:')
print(len(neigh_list))
print('List of Neighborhoods inside Scarborough:')
neigh_list

Number of Neighborhoods inside Scarborough:
16
List of Neighborhoods inside Scarborough:


['Rouge, Malvern',
 'Highland Creek, Rouge Hill, Port Union',
 'Guildwood, Morningside, West Hill',
 'Woburn',
 'Cedarbrae',
 'Scarborough Village',
 'East Birchmount Park, Ionview, Kennedy Park',
 'Clairlea, Golden Mile, Oakridge',
 'Cliffcrest, Cliffside, Scarborough Village West',
 'Birch Cliff, Cliffside West',
 'Dorset Park, Scarborough Town Centre, Wexford Heights',
 'Maryvale, Wexford',
 'Agincourt',
 "Clarks Corners, Sullivan, Tam O'Shanter",
 "Agincourt North, L'Amoreaux East, Milliken, Steeles East",
 "L'Amoreaux West, Steeles West"]

## Some Summary Information about Neighborhoods inside "Scarborough" Cont'd

In [18]:
neigh_venue_summary = scarborough_venues.groupby('Neighborhood').count()
neigh_venue_summary.drop(columns = ['Unnamed: 0']).head()

Unnamed: 0_level_0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Agincourt,43,43,43,43,43,43,43
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",26,26,26,26,26,26,26
"Birch Cliff, Cliffside West",11,11,11,11,11,11,11
Cedarbrae,25,25,25,25,25,25,25
"Clairlea, Golden Mile, Oakridge",26,26,26,26,26,26,26


In [19]:
print('There are {} uniques categories.'.format(len(scarborough_venues['Venue Category'].unique())))

print('Here is the list of different categories:')
list(scarborough_venues['Venue Category'].unique())

There are 110 uniques categories.
Here is the list of different categories:


['Zoo Exhibit',
 'Fast Food Restaurant',
 'Hobby Shop',
 'Italian Restaurant',
 'Burger Joint',
 'Breakfast Spot',
 'Bar',
 'Food & Drink Shop',
 'Liquor Store',
 'Smoothie Shop',
 'Discount Store',
 'Pharmacy',
 'Gym / Fitness Center',
 'Park',
 'Sports Bar',
 'Athletics & Sports',
 'Gym',
 'Convenience Store',
 'Restaurant',
 'Salon / Barbershop',
 'Garden',
 'Video Game Store',
 'Supermarket',
 'Bus Line',
 'Electronics Store',
 'Coffee Shop',
 'Indian Restaurant',
 'Juice Bar',
 'Vietnamese Restaurant',
 'Thrift / Vintage Store',
 'Chinese Restaurant',
 'Department Store',
 'Sandwich Place',
 'Clothing Store',
 'Pizza Place',
 'Filipino Restaurant',
 'Bakery',
 'Hakka Restaurant',
 'Caribbean Restaurant',
 'Music Store',
 'Thai Restaurant',
 'Bank',
 'Fried Chicken Joint',
 'Lounge',
 'Flower Shop',
 'German Restaurant',
 'Big Box Store',
 'Train Station',
 'Grocery Store',
 'Light Rail Station',
 'Asian Restaurant',
 'Rental Car Location',
 'Photography Studio',
 'Vegetarian / Veg

In [20]:
# Just for fun and deeper understanding
print(type(scarborough_venues[['Venue Category']]))

print(type(scarborough_venues['Venue Category']))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


## One-hot Encoding the "categroies" Column into Every Unique Categorical Feature.

In [59]:

# one hot encoding
scarborough_onehot = pd.get_dummies(data = scarborough_venues, drop_first  = False, 
                              prefix = "", prefix_sep = "", columns = ['Venue Category'])
scarborough_onehot.head()

Unnamed: 0.1,Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Distance,American Restaurant,Asian Restaurant,Athletics & Sports,Automotive Shop,BBQ Joint,Badminton Court,Bakery,Bank,Bar,Beer Store,Big Box Store,Bistro,Breakfast Spot,Bubble Tea Shop,Burger Joint,Bus Line,Bus Station,Café,Cantonese Restaurant,Caribbean Restaurant,Chinese Restaurant,Clothing Store,Coffee Shop,College Stadium,Convenience Store,Cosmetics Shop,Department Store,Dim Sum Restaurant,Diner,Discount Store,Electronics Store,Fast Food Restaurant,Filipino Restaurant,Fish Market,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store,Garden,General Entertainment,German Restaurant,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hakka Restaurant,Hardware Store,Hobby Shop,Hong Kong Restaurant,Hookah Bar,Hotel,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Juice Bar,Korean Restaurant,Latin American Restaurant,Light Rail Station,Liquor Store,Lounge,Malay Restaurant,Market,Mediterranean Restaurant,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Motorcycle Shop,Music Store,Noodle House,Other Great Outdoors,Park,Pet Store,Pharmacy,Photography Studio,Pizza Place,Pool,Pool Hall,Print Shop,Pub,Rental Car Location,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shanghai Restaurant,Shop & Service,Shopping Mall,Skating Rink,Smoke Shop,Smoothie Shop,Soccer Field,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Taiwanese Restaurant,Thai Restaurant,Thrift / Vintage Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wings Joint,Zoo Exhibit
0,0,M1B,"Rouge, Malvern",43.81165,-79.195561,Canadiana exhibit,This spot is popular,724,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1,M1B,"Rouge, Malvern",43.81165,-79.195561,Wendy's,This spot is popular,545,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,M1B,"Rouge, Malvern",43.81165,-79.195561,Grizzly Bear Exhibit,This spot is popular,622,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,3,M1B,"Rouge, Malvern",43.81165,-79.195561,simba safari lodge,This spot is popular,999,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,4,M1B,"Rouge, Malvern",43.81165,-79.195561,Lee Valley,This spot is popular,1001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Manually Selecting (Subsetting) Related Features for the Groceries Contractor

In [60]:

# This list is created manually 
important_list_of_features = [ 'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'African Restaurant', 'American Restaurant', 'Asian Restaurant', 'BBQ Joint', 
                              'Bakery', 'Breakfast Spot', 'Burger Joint', 'Cajun / Creole Restaurant', 'Cantonese Restaurant', 'Caribbean Restaurant', 'Chinese Restaurant','Diner', 
                              'Fast Food Restaurant',  'Filipino Restaurant', 'Fish Market', 'Food & Drink Shop', 'Fried Chicken Joint', 'Fruit & Vegetable Store', 'Greek Restaurant',
                              'Grocery Store', 'Hakka Restaurant', 'Hong Kong Restaurant', 'Hotpot Restaurant', 'Indian Restaurant', 'Italian Restaurant', 'Japanese Restaurant', 
                              'Korean Restaurant', 'Latin American Restaurant', 'Malay Restaurant', 'Mediterranean Restaurant', 'Mexican Restaurant', 'Middle Eastern Restaurant',
                              'Noodle House', 'Pizza Place', 'Restaurant', 'Sandwich Place', 'Seafood Restaurant', 'Shanghai Restaurant', 'Sushi Restaurant', 'Taiwanese Restaurant',
                              'Thai Restaurant', 'Vegetarian / Vegan Restaurant',  'Vietnamese Restaurant', 'Wings Joint']

## Updating the One-hot Encoded DataFrame and
 ## Grouping the Data by Neighborhoods

In [66]:
scarborough_onehot = scarborough_onehot.drop(
    columns = ['Neighborhood Latitude' , 'Neighborhood Longitude']).groupby(
    'Neighborhood').sum()

scarborough_onehot.head()

Unnamed: 0_level_0,Unnamed: 0,Distance,American Restaurant,Asian Restaurant,Athletics & Sports,Automotive Shop,BBQ Joint,Badminton Court,Bakery,Bank,Bar,Beer Store,Big Box Store,Bistro,Breakfast Spot,Bubble Tea Shop,Burger Joint,Bus Line,Bus Station,Café,Cantonese Restaurant,Caribbean Restaurant,Chinese Restaurant,Clothing Store,Coffee Shop,College Stadium,Convenience Store,Cosmetics Shop,Department Store,Dim Sum Restaurant,Diner,Discount Store,Electronics Store,Fast Food Restaurant,Filipino Restaurant,Fish Market,Flower Shop,Food & Drink Shop,Fried Chicken Joint,Furniture / Home Store,Garden,General Entertainment,German Restaurant,Golf Course,Greek Restaurant,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hakka Restaurant,Hardware Store,Hobby Shop,Hong Kong Restaurant,Hookah Bar,Hotel,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Juice Bar,Korean Restaurant,Latin American Restaurant,Light Rail Station,Liquor Store,Lounge,Malay Restaurant,Market,Mediterranean Restaurant,Metro Station,Mexican Restaurant,Middle Eastern Restaurant,Motorcycle Shop,Music Store,Noodle House,Other Great Outdoors,Park,Pet Store,Pharmacy,Photography Studio,Pizza Place,Pool,Pool Hall,Print Shop,Pub,Rental Car Location,Restaurant,Salon / Barbershop,Sandwich Place,Seafood Restaurant,Shanghai Restaurant,Shop & Service,Shopping Mall,Skating Rink,Smoke Shop,Smoothie Shop,Soccer Field,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Sushi Restaurant,Taiwanese Restaurant,Thai Restaurant,Thrift / Vintage Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Warehouse Store,Wings Joint,Zoo Exhibit
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1
Agincourt,9933,26156,1,0,0,0,1,1,1,1,0,0,0,0,1,1,0,0,0,0,2,0,8,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,1,0,1,0,0,0,1,0,1,1,1,1,0,0,1,0,2,1,1,0,3,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",7813,23373,0,0,0,0,3,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,5,0,0,0,0,0,0,1,0,1,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
"Birch Cliff, Cliffside West",1617,5130,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cedarbrae,1450,19760,0,0,1,0,0,0,3,1,0,0,0,0,0,0,1,1,0,0,0,1,1,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,4,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
"Clairlea, Golden Mile, Oakridge",2925,17253,0,0,0,0,0,0,2,0,0,1,0,0,0,0,0,2,2,0,0,0,0,0,2,0,1,0,0,0,1,0,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


## Integrating Different Restaurants and Different Joints
### (Assuming Different Resaturants Use the Same Raw Groceries)
### This Assumption is made for simplicity and due to not having very large dataset about neighborhoods

In [67]:
feat_name_list = list(scarborough_onehot.columns)
restaurant_list = []


for counter, value in enumerate(feat_name_list):
    if value.find('Restaurant') != (-1):
        restaurant_list.append(value)
        
scarborough_onehot['Total Restaurants'] = scarborough_onehot[restaurant_list].sum(axis = 1)
scarborough_onehot = scarborough_onehot.drop(columns = restaurant_list)


feat_name_list = list(scarborough_onehot.columns)
joint_list = []


for counter, value in enumerate(feat_name_list):
    if value.find('Joint') != (-1):
        joint_list.append(value)
        
scarborough_onehot['Total Joints'] = scarborough_onehot[joint_list].sum(axis = 1)
scarborough_onehot = scarborough_onehot.drop(columns = joint_list)

## Showing the Fully-Processed DataFrame about Neighborhoods inside Scarborrough.
   This Dataset is Ready for any Machine Learning Algorithm.

In [68]:
scarborough_onehot.head()

Unnamed: 0_level_0,Unnamed: 0,Distance,Athletics & Sports,Automotive Shop,Badminton Court,Bakery,Bank,Bar,Beer Store,Big Box Store,Bistro,Breakfast Spot,Bubble Tea Shop,Bus Line,Bus Station,Café,Clothing Store,Coffee Shop,College Stadium,Convenience Store,Cosmetics Shop,Department Store,Diner,Discount Store,Electronics Store,Fish Market,Flower Shop,Food & Drink Shop,Furniture / Home Store,Garden,General Entertainment,Golf Course,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hardware Store,Hobby Shop,Hookah Bar,Hotel,Intersection,Juice Bar,Light Rail Station,Liquor Store,Lounge,Market,Metro Station,Motorcycle Shop,Music Store,Noodle House,Other Great Outdoors,Park,Pet Store,Pharmacy,Photography Studio,Pizza Place,Pool,Pool Hall,Print Shop,Pub,Rental Car Location,Salon / Barbershop,Sandwich Place,Shop & Service,Shopping Mall,Skating Rink,Smoke Shop,Smoothie Shop,Soccer Field,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Thrift / Vintage Store,Trail,Train Station,Video Game Store,Warehouse Store,Zoo Exhibit,Total Restaurants,Total Joints
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
Agincourt,9933,26156,0,0,1,1,1,0,0,0,0,1,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,1,1,1,1,0,0,0,2,0,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,20,1
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",7813,23373,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,10,3
"Birch Cliff, Cliffside West",1617,5130,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Cedarbrae,1450,19760,1,0,0,3,1,0,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,2
"Clairlea, Golden Mile, Oakridge",2925,17253,0,0,0,2,0,0,1,0,0,0,0,2,2,0,0,2,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,2,0,0,0,0,0,1,0,0,0,0,2,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,4,0


# Run k-means to Cluster Neighborhoods into 5 Clusters

In [69]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# run k-means clustering
kmeans = KMeans(n_clusters = 5, random_state = 0).fit(scarborough_onehot)

## Showing Centers of Each Cluster

In [70]:
means_df = pd.DataFrame(kmeans.cluster_centers_)
means_df.columns = scarborough_onehot.columns
means_df.index = ['G1','G2','G3','G4','G5']
means_df['Total Sum'] = means_df.sum(axis = 1)
means_df.sort_values(axis = 0, by = ['Total Sum'], ascending=False)

Unnamed: 0.1,Unnamed: 0,Distance,Athletics & Sports,Automotive Shop,Badminton Court,Bakery,Bank,Bar,Beer Store,Big Box Store,Bistro,Breakfast Spot,Bubble Tea Shop,Bus Line,Bus Station,Café,Clothing Store,Coffee Shop,College Stadium,Convenience Store,Cosmetics Shop,Department Store,Diner,Discount Store,Electronics Store,Fish Market,Flower Shop,Food & Drink Shop,Furniture / Home Store,Garden,General Entertainment,Golf Course,Grocery Store,Gym,Gym / Fitness Center,Gym Pool,Hardware Store,Hobby Shop,Hookah Bar,Hotel,Intersection,Juice Bar,Light Rail Station,Liquor Store,Lounge,Market,Metro Station,Motorcycle Shop,Music Store,Noodle House,Other Great Outdoors,Park,Pet Store,Pharmacy,Photography Studio,Pizza Place,Pool,Pool Hall,Print Shop,Pub,Rental Car Location,Salon / Barbershop,Sandwich Place,Shop & Service,Shopping Mall,Skating Rink,Smoke Shop,Smoothie Shop,Soccer Field,Spa,Sporting Goods Shop,Sports Bar,Supermarket,Thrift / Vintage Store,Trail,Train Station,Video Game Store,Warehouse Store,Zoo Exhibit,Total Restaurants,Total Joints,Total Sum
G1,7502.6,23926.0,0.0,0.2,0.2,1.2,0.4,0.2,0.2,0.0,0.0,0.4,0.2,0.0,0.2,0.0,0.0,1.2,0.0,0.2,0.0,0.2,0.0,0.4,0.2,0.2,0.0,0.0,0.4,0.0,0.0,0.2,0.8,0.2,0.0,0.0,0.0,0.4,0.2,0.0,0.4,0.0,0.2,0.2,0.2,0.2,0.0,0.2,0.0,0.2,0.0,0.8,0.4,1.4,0.0,1.6,0.2,0.2,0.2,0.0,0.0,0.0,0.6,0.2,0.8,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.4,0.0,0.2,0.0,0.2,0.2,0.0,13.4,1.6,31460.8
G4,1578.666667,17578.333333,0.666667,0.0,0.0,1.666667,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,1.333333,0.666667,0.0,0.0,1.333333,0.0,0.666667,0.0,0.0,0.333333,0.333333,0.333333,0.0,0.333333,0.333333,0.0,0.333333,0.333333,0.0,0.0,0.333333,0.666667,0.0,0.333333,0.0,0.0,0.0,0.666667,0.0,0.0,0.333333,0.333333,0.0,0.333333,0.0,0.333333,0.0,0.0,1.0,0.0,1.0,0.0,0.333333,0.0,0.0,0.0,0.333333,0.0,0.333333,0.333333,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0,0.333333,0.0,0.0,5.0,0.666667,19180.333333
G3,6470.0,9683.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,16173.0
G5,1169.5,13038.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,1.5,0.0,0.5,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,1.0,0.0,0.0,0.0,0.0,0.5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.5,0.0,0.0,0.0,5.0,0.0,14225.5
G2,910.6,5012.4,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.2,0.2,0.4,0.0,0.0,0.0,0.2,0.0,0.4,0.2,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0,0.2,0.0,0.2,0.0,0.0,0.2,0.0,0.2,0.0,0.2,0.0,0.2,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.6,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.6,1.8,0.6,5932.4


# Result:

### Best Group is G1;

### Second Best Group is G4;

### Third Best Group is G3;

