# First Task

#### 1. Install BeautifulSoup4 library

We need to install beautifulsoup4 library in the first place for web scrapping in this task.

In [1]:
# conda install -c anaconda beautifulsoup4

#### 2. Import all needed libraries
All necessary library imported in this project.

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests

#### 3. Web Scrapping the wikipedia page containing list of neighborhoods in Toronto
The following lines of code are for web scrapping the wikipedia page that contains list of neighborhoods in the city of Toronto.

In [3]:
# URL of the wikipedia page
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

# Create beautiful soup object
soup = BeautifulSoup(source, 'lxml')

# Find table that contains the neighborhoods data in the page
table = soup.find('table').tbody

# Get all columns name
columns = [item.text.replace("\n", "") for item in table.find_all("th")]

# Get all neighborhood data
neighborhoods_data = []
for i,row in enumerate(table.find_all('tr')):
    neighborhoods_data.append([])
    for j, item in enumerate(row.find_all('td')):
        neighborhoods_data[i].append(item.text.replace("\n", ""))

neighborhoods_data = neighborhoods_data[1:]
neighborhoods_data = list(map(list, zip(*neighborhoods_data)))

# Create Dataframe
tor_neigh_df = pd.DataFrame({columns[i]: neighborhoods_data[i] for i in range(3)})
tor_neigh_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### 4. Rename PostalCode Column
We want to rename one column in our dataframe.

In [4]:
tor_neigh_df.rename(columns={"Postcode": "PostalCode"}, inplace=True)
tor_neigh_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### 5. Drop Not Assigned value in Borough Column
All rows with not assigned borough value will be dropped.

In [5]:
tor_neigh_df = tor_neigh_df[tor_neigh_df["Borough"]!="Not assigned"]
tor_neigh_df.reset_index(drop=True, inplace=True)
tor_neigh_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


#### 6. Cell with Not Assigned neighborhood will be filled with the same value as the borough 

In [6]:
# Function to check neighborhood value and process it if it is still not assigned
def replace_neighborhood_na(row):
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
    return row

In [7]:
tor_neigh_df = tor_neigh_df.apply(replace_neighborhood_na, axis=1)
tor_neigh_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor


#### 7. Combine rows with same PostalCode

In [8]:
neigh_df = tor_neigh_df.groupby(by="PostalCode").agg({"Borough": 'first', "Neighborhood": ", ".join})
neigh_df.reset_index(inplace=True)
neigh_df.tail()

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
102,M9W,Etobicoke,Northwest


shape of the Dataframe

In [9]:
neigh_df.shape

(103, 3)

# Second Task

#### 1. Install geocoder library
This library is needed for getting longitude and latitude of each neighborhood.

In [10]:
# conda install -c conda-forge geocoder

#### 2. Get all longitude and latitude of each neighborhood
##### a. Using Geocoder

In [11]:
# import geocoder # import geocoder

# latitudes = []
# longitudes = []

# for index, row in neigh_df.iterrows():
#     print("Getting latitude and longitude of {}, {}, {}".format(row["PostalCode"], row["Borough"], row["Neighborhood"]))
#     # initialize your variable to None
#     lat_lng_coords = None
    
#     # loop until you get the coordinates
#     while(lat_lng_coords is None):
#       g = geocoder.google('{}, {}, {}'.format(row["PostalCode"], row["Borough"], row["Neighborhood"]))
#       lat_lng_coords = g.latlng

#     latitudes.append(lat_lng_coords[0])
#     longitudes.append(lat_lng_coords[1])
#     print("                                                      [DONE]")

##### b. Using available longitude and latitude data in csv format

In [12]:
ll_df = pd.read_csv("Geospatial_Coordinates.csv")
ll_df.rename(columns={"Postal Code": "PostalCode"}, inplace=True) #rename postal code column
ll_df.set_index("PostalCode", inplace=True)
ll_df.head()

Unnamed: 0_level_0,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


#### 3. Merge these two dataframes
We want to assign latitude and longitude of every neighborhood.

In [13]:
neigh_df.set_index("PostalCode", inplace=True)
neigh_df.head()

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [14]:
neigh_df = neigh_df.join(ll_df)
neigh_df.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [15]:
neigh_df.reset_index(inplace=True)
neigh_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Third Task

#### 1. Import all necessary libraries for clustering and visualization

In [16]:
# !conda install scikit-learn

In [17]:
from sklearn.cluster import k_means
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

In [18]:
CLIENT_ID = 'VQG3LESIQX2EBRDST02BYBPDNHRXFVVVUHD3S1WEWWISTOPG' # your Foursquare ID
CLIENT_SECRET = '3QQSABWTS1WHHYFWIAB4UXPKMVDRPEKNKUHEI4SCVZSBHMM1' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: VQG3LESIQX2EBRDST02BYBPDNHRXFVVVUHD3S1WEWWISTOPG
CLIENT_SECRET:3QQSABWTS1WHHYFWIAB4UXPKMVDRPEKNKUHEI4SCVZSBHMM1


In [20]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [21]:
def getNearbyVenues(postalcodes, boroughs, neighborhoods, latitudes, longitudes, radius=500):
    LIMIT = 50
    venues_list=[]
    for postalcode, borough, neighborhood, lat, lng in zip(postalcodes, boroughs, neighborhoods, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']
        except Esception as e:
            print(results)
            print(e)
            
        # return only relevant information for each nearby venue
        venues_list.append([(
            postalcode,
            borough,
            neighborhood,
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['PostalCode', 
                  'Borough',
                  'Neighborhood', 
                  'Latitude', 
                  'Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [22]:
toronto_venues = getNearbyVenues(postalcodes=neigh_df['PostalCode'],
                                 boroughs=neigh_df["Borough"],
                                 neighborhoods=neigh_df["Neighborhood"],
                                   latitudes=neigh_df['Latitude'],
                                   longitudes=neigh_df['Longitude']
                                  )

In [23]:
toronto_venues.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Interprovincial Group,43.80563,-79.200378,Print Shop
2,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
4,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place


In [25]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.tail()

Unnamed: 0,Yoga Studio,Accessories Store,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store
1690,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1691,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1692,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1693,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1694,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
