# Welcome to my Neighborhood Segmentation and Clustering of Toronto

## Let's get started

### Before we begin, we need to install the tools.

In [None]:
!conda install -c anaconda beautifulsoup4

In [2]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import json

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 

import requests
from pandas.io.json import json_normalize 

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium 

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |      conda_forge           3 KB  conda-forge
    _openmp_mutex-4.5          |           1_llvm           5 KB  conda-forge
    _py-xgboost-mutex-2.0      |            cpu_0           8 KB  conda-forge
    _pytorch_select-0.2        |            gpu_0           2 KB
    absl-py-0.11.0          

libssh2-1.9.0        | 225 KB    | ##################################### | 100% 
libevent-2.1.10      | 1.1 MB    | ##################################### | 100% 
bzip2-1.0.8          | 484 KB    | ##################################### | 100% 
psycopg2-2.8.6       | 168 KB    | ##################################### | 100% 
jupyter_client-6.1.1 | 76 KB     | ##################################### | 100% 
python_abi-3.7       | 4 KB      | ##################################### | 100% 
zstd-1.4.8           | 702 KB    | ##################################### | 100% 
charls-2.2.0         | 138 KB    | ##################################### | 100% 
libpq-13.1           | 2.7 MB    | ##################################### | 100% 
libstdcxx-ng-9.3.0   | 4.0 MB    | ##################################### | 100% 
pcre-8.44            | 261 KB    | ##################################### | 100% 
jeepney-0.6.0        | 32 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB

tifffile-2021.2.1    | 126 KB    | ##################################### | 100% 
jinja2-2.11.3        | 93 KB     | ##################################### | 100% 
notebook-6.2.0       | 6.2 MB    | ##################################### | 100% 
llvm-openmp-11.0.1   | 4.7 MB    | ##################################### | 100% 
ninja-1.10.2         | 2.4 MB    | ##################################### | 100% 
xlrd-2.0.1           | 92 KB     | ##################################### | 100% 
async_generator-1.10 | 18 KB     | ##################################### | 100% 
importlib-metadata-3 | 20 KB     | ##################################### | 100% 
pthread-stubs-0.4    | 5 KB      | ##################################### | 100% 
dask-core-2021.2.0   | 681 KB    | ##################################### | 100% 
pyyaml-5.4.1         | 189 KB    | ##################################### | 100% 
oauthlib-3.0.1       | 82 KB     | ##################################### | 100% 
lerc-2.2.1           | 213 K

bokeh-2.2.3          | 7.0 MB    | ##################################### | 100% 
kiwisolver-1.3.1     | 78 KB     | ##################################### | 100% 
libxml2-2.9.10       | 1.3 MB    | ##################################### | 100% 
ipython-7.20.0       | 1.1 MB    | ##################################### | 100% 
libev-4.33           | 104 KB    | ##################################### | 100% 
keras-applications-1 | 30 KB     | ##################################### | 100% 
protobuf-3.15.1      | 346 KB    | ##################################### | 100% 
cytoolz-0.11.0       | 403 KB    | ##################################### | 100% 
nltk-3.4.4           | 1.1 MB    | ##################################### | 100% 
pymssql-2.1.5        | 236 KB    | ##################################### | 100% 
urllib3-1.26.3       | 99 KB     | ##################################### | 100% 
libzopfli-1.0.3      | 164 KB    | ##################################### | 100% 
pandocfilters-1.4.2  | 9 KB 

done


AttributeError: module 'numpy.linalg.lapack_lite' has no attribute '_ilp64'

### Step 1. Get the Data

First, we have to send a request to get the HTML of the Wikipedia page. Then, using BeautifulSoup, we parse that HTML and store the result in `soup`.

In [4]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout (in seconds) stops the cell from hanging forever if the server never answers.
wiki = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
# Display the HTTP status code of the response.
wiki.status_code

200

In [5]:
from bs4 import BeautifulSoup

# Parse the raw bytes of the response with Python's built-in HTML parser.
soup = BeautifulSoup(wiki.content, features='html.parser')

Next, we have to sift through the HTML to get down to the table. Let's search the soup for the table element with the wikitable sortable class (as it is coded in the HTML file).

In [6]:
# Find the postal-code table through its class string.
table = soup.find('table', class_='wikitable sortable')

# The table's second child node (index 1) is taken as the table body.
table_nodes = list(table.children)
tablebody = table_nodes[1]

# Display the body's child nodes: <tr> rows interleaved with '\n' strings.
list(tablebody.children)

[<tr>
 <th>Postal Code
 </th>
 <th>Borough
 </th>
 <th>Neighbourhood
 </th></tr>,
 '\n',
 <tr>
 <td>M1A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>,
 '\n',
 <tr>
 <td>M2A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>,
 '\n',
 <tr>
 <td>M3A
 </td>
 <td>North York
 </td>
 <td>Parkwoods
 </td></tr>,
 '\n',
 <tr>
 <td>M4A
 </td>
 <td>North York
 </td>
 <td>Victoria Village
 </td></tr>,
 '\n',
 <tr>
 <td>M5A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Regent Park, Harbourfront
 </td></tr>,
 '\n',
 <tr>
 <td>M6A
 </td>
 <td>North York
 </td>
 <td>Lawrence Manor, Lawrence Heights
 </td></tr>,
 '\n',
 <tr>
 <td>M7A
 </td>
 <td>Downtown Toronto
 </td>
 <td>Queen's Park, Ontario Provincial Government
 </td></tr>,
 '\n',
 <tr>
 <td>M8A
 </td>
 <td>Not assigned
 </td>
 <td>Not assigned
 </td></tr>,
 '\n',
 <tr>
 <td>M9A
 </td>
 <td>Etobicoke
 </td>
 <td>Islington Avenue, Humber Valley Village
 </td></tr>,
 '\n',
 <tr>
 <td>M1B
 </td>
 <td>Scarborough
 </td>
 <td>Malv

In [7]:
# Collect the stripped text of every <td> cell, producing one list per table row (<tr>).
table_rows = tablebody.find_all('tr')
rows = [[cell.text.strip() for cell in tr.find_all('td')] for tr in table_rows]

# The first row is the header: it holds <th> cells only, so its list is empty. Drop it.
rows = rows[1:]

# Build the DataFrame in a single call rather than growing it one row at a time
# through df.loc[len(df)].
df = pd.DataFrame(rows, columns=['Postal Code','Borough','Neighbourhood'])

In [8]:
# Raise pandas' overall display width and per-column width limits to 500 characters,
# then show the frame's (rows, columns).
pd.set_option('display.width', 500)
pd.set_option('display.max_colwidth', 500)
df.shape

(180, 3)

### Step 2: Clean the Data

Now we need to clean the data up as suggested by the assignment:

* Only process the cells that have an assigned borough. Ignore cells whose borough is "Not assigned".
* If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.


In [9]:
# Rule 1: keep only the rows whose borough is assigned. .copy() yields an independent
# frame, so the assignment below does not write into a slice of the unfiltered data.
df = df[df['Borough'] != 'Not assigned'].copy()

# Rule 2: a row that has a borough but a 'Not assigned' neighbourhood takes the
# borough's name. When this notebook was last run no such rows remained after rule 1,
# so this step changed nothing then; it is applied explicitly in case the page changes.
unnamed = df['Neighbourhood'] == 'Not assigned'
df.loc[unnamed, 'Neighbourhood'] = df.loc[unnamed, 'Borough']

In [10]:
# Show the size of the cleaned frame as (rows, columns); .shape is an attribute, not a method.
df.shape

(103, 3)

## PART 2 : Geocoding

In [15]:
!conda install -c conda-forge geocoder
import geocoder # import geocoder

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python-3.7-main

  added / updated specs:
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          59 KB

The following NEW packages will be INSTALLED:

  geocoder           conda-forge/noarch::geocoder-1.38.1-py_1
  ratelim            conda-forge/noarch::ratelim-0.1.6-py_2



Downloading and Extracting Packages
ratelim-0.1.6        | 6 KB      | ##################################### | 100% 
geocoder-1.38.1      | 53 KB     | ##################################### | 1

In [None]:
# NOTE: this online lookup did not work for the author, even for a single postal code,
# so the coordinates actually used come from the CSV provided by the course (loaded in
# the following cell). The loop is kept for reference, with two defects fixed:
#   * it retried forever whenever the geocoder returned no coordinates;
#   * it wrote through df[postal_code], which looks up a *column* named after the
#     postal code and raises KeyError.

MAX_GEOCODE_ATTEMPTS = 5  # give up on a postal code after this many failed lookups

geocoded_rows = []
for postal_code in df['Postal Code']:
    lat_lng_coords = None

    # retry a bounded number of times instead of looping until success
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google(f'{postal_code}, Toronto, Ontario')
        lat_lng_coords = g.latlng
        if lat_lng_coords:  # treats both None and an empty result as "not found"
            break

    # record NaN when every attempt failed, so the postal code still gets a row
    if lat_lng_coords:
        latitude = lat_lng_coords[0]
        longitude = lat_lng_coords[1]
    else:
        latitude = longitude = float('nan')
    geocoded_rows.append(
        {'Postal Code': postal_code, 'latitude': latitude, 'longitude': longitude}
    )

# The results live in their own frame instead of being written into df, so df keeps
# exactly the columns that the later merge with the course CSV expects.
geocoded = pd.DataFrame(geocoded_rows, columns=['Postal Code', 'latitude', 'longitude'])
geocoded.head(5)

In [18]:
# Load the course-provided CSV of coordinates per postal code and preview the first rows.
newdf = pd.read_csv(filepath_or_buffer='https://cocl.us/Geospatial_data')
newdf.head(5)

AttributeError: 'NoneType' object has no attribute 'items'

  Postal Code   Latitude  Longitude
0         M1B  43.806686 -79.194353
1         M1C  43.784535 -79.160497
2         M1E  43.763573 -79.188711
3         M1G  43.770992 -79.216917
4         M1H  43.773136 -79.239476

In [19]:
# Preview the cleaned postal-code table; its 'Postal Code' column is the key shared with newdf.
df.head()

AttributeError: 'NoneType' object has no attribute 'items'

  Postal Code           Borough                                Neighbourhood
2         M3A        North York                                    Parkwoods
3         M4A        North York                             Victoria Village
4         M5A  Downtown Toronto                    Regent Park, Harbourfront
5         M6A        North York             Lawrence Manor, Lawrence Heights
6         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government

In [None]:
# df[postal_code] and newdf[postal_code] select *columns*, so the original row-by-row
# copy raised KeyError on the first postal code. Look the coordinates up by postal code
# instead. df itself is left untouched: the merge in the next cell is what attaches
# Latitude/Longitude to it, and writing them here as well would duplicate the columns.
coords_by_code = newdf.set_index('Postal Code')[['Latitude', 'Longitude']]

# One row of coordinates per postal code in df, in df's order (NaN where newdf has no match).
coords_for_df = coords_by_code.reindex(df['Postal Code'])

In [20]:
# Left-join the coordinates onto the neighbourhood table by postal code,
# keeping every row of df, then preview the combined frame.
df2 = pd.merge(df, newdf, on='Postal Code', how='left')
df2.head()

AttributeError: 'NoneType' object has no attribute 'items'

  Postal Code           Borough                                Neighbourhood   Latitude  Longitude
0         M3A        North York                                    Parkwoods  43.753259 -79.329656
1         M4A        North York                             Victoria Village  43.725882 -79.315572
2         M5A  Downtown Toronto                    Regent Park, Harbourfront  43.654260 -79.360636
3         M6A        North York             Lawrence Manor, Lawrence Heights  43.718518 -79.464763
4         M7A  Downtown Toronto  Queen's Park, Ontario Provincial Government  43.662301 -79.389494