# Scraping and preparing Toronto data

### Import necessary libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Fetch data from Wikipedia in text format

In [2]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() turns an HTTP error response into an exception instead
# of letting an error page be parsed as if it were the real article.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()

# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL.
website_url = response.text
soup = BeautifulSoup(website_url, 'lxml')

### Define and instantiate dataframe with required columns

In [3]:
# Column layout of the table that the scraping cells below will fill in.
column_names = [
    'Postal Code',
    'Borough',
    'Neighborhood',
]

# Start with an empty frame so the column order is fixed up front;
# the bare expression at the end displays it for a quick sanity check.
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods

Unnamed: 0,Postal Code,Borough,Neighborhood


### Scrape required data from text we fetched from the web

In [4]:
# Headings (in order) that identify the postal-code table on the page.
expected_headings = ['Postal Code', 'Borough', 'Neighbourhood']

tables = soup.find_all('table', class_='sortable')
# Search through the sortable tables for the one whose first three headings match.
for table in tables:
    ths = table.find_all('th')
    headings = [th.text.strip() for th in ths]
    if headings[:3] == expected_headings:
        break
else:
    # The for/else branch runs only when the loop finished without `break`,
    # i.e. no table matched. Fail loudly here rather than letting later cells
    # scrape whichever table happened to be last (or hit an undefined `table`).
    raise ValueError(
        'No sortable table with headings %r was found on the page' % (expected_headings,)
    )

### Store data in dataframe we defined earlier

In [5]:
# Collect one record per table row and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration. Building from a list of records
# also makes this cell idempotent: re-running it does not duplicate the rows.
records = []
for tr in table.find_all('tr'):
    tds = tr.find_all('td')
    if not tds:
        # Rows without <td> cells (e.g. the header row) carry no data.
        continue
    PostalCode, Borough, Neighbourhood = [td.text.strip() for td in tds[:3]]
    records.append({'Postal Code': PostalCode,
                    'Borough': Borough,
                    'Neighborhood': Neighbourhood})

# `column_names` fixes the column order, and keeps the columns if no rows were found.
neighborhoods = pd.DataFrame(records, columns=column_names)


### Check the data in dataframe 

In [6]:
# Display the first rows of the scraped table as a quick visual check.
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned

<span style="color:blue"><b>1. Check distinct values in the "Borough" column</b></span>

In [7]:
# List every distinct borough label, including any placeholder values.
neighborhoods.loc[:, 'Borough'].unique()

array(['Not assigned', 'North York', 'Downtown Toronto', 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

<span style="color:blue"><b>2. Remove rows whose "Borough" value is "Not assigned", then check the unique values again to verify the data</b></span>

In [8]:
# Keep only the rows whose borough was actually assigned, then renumber
# the remaining rows from zero.
assigned_mask = neighborhoods['Borough'].ne('Not assigned')
neighborhoods = neighborhoods.loc[assigned_mask].reset_index(drop=True)

# The values printed here should no longer include 'Not assigned'.
print('Unique data in Borough column :', neighborhoods['Borough'].unique())
neighborhoods.head()

Unique data in Borough column : ['North York' 'Downtown Toronto' 'Etobicoke' 'Scarborough' 'East York'
 'York' 'East Toronto' 'West Toronto' 'Central Toronto' 'Mississauga']


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Merge same postal code rows in one row with comma separated values in neighborhood columns

<span style="color:blue"><b>First check whether the "Postal Code" column has duplicate values</b></span>

In [9]:
# `is_unique` is True when no postal code repeats, so its negation flags
# the presence of duplicates (approach credited to @Carsten).
boolean = not neighborhoods["Postal Code"].is_unique
print("Duplicate exists" if boolean else "No duplicate exists")
    

No duplicate exists


Since the column does not contain any duplicate values, no merging is required.

### If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

<span style="color:blue"><b>Check if there is a "Not assigned" neighborhood</b></span>

In [10]:
# Show any rows whose neighborhood is still the 'Not assigned' placeholder;
# an empty result means there is nothing to backfill from the borough.
neighborhoods.loc[neighborhoods['Neighborhood'].eq('Not assigned')]

Unnamed: 0,Postal Code,Borough,Neighborhood


As there are no "Not assigned" neighborhoods, no action is required.

<span style="color:blue"><b>Number of rows in the dataframe:</b></span>

In [11]:
# (rows, columns) of the cleaned table.
neighborhoods.shape

(103, 3)