#### Importing libraries

In [0]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

##### Setting pandas options

In [0]:
# Lift pandas' display limits so frames are rendered without row/column truncation
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, None)

In [3]:
# Download the Wikipedia page that holds the postal code table
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging indefinitely if the server never answers
page = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx answer instead of parsing an error page further down
page.raise_for_status()
print(page.status_code)

200


#### Creating BeautifulSoup instance

In [0]:
# Parse the raw response body with the 'html.parser' backend
html_bytes = page.content
soup = BeautifulSoup(html_bytes, features='html.parser')

#### Searching for the table on the Wikipedia page

In [0]:
# First <table> element carrying the "wikitable" CSS class
tb = soup.find('table', class_='wikitable')
if tb is None:
    # find() returns None when nothing matches; stop with a clear message
    # rather than an AttributeError in a later cell
    raise ValueError('No table with class "wikitable" found in the page')

In [0]:
#for link in tb.find_all('tr'):
#   name = link.find('td')

#### Creating list with all table rows

In [0]:
# All <tr> elements of the table, header row included.
# find_all is the current Beautiful Soup 4 name; findAll is a deprecated alias.
t_row = tb.find_all('tr')

In [8]:
t_row[5].find_all('td')  # view of single row (find_all replaces the deprecated findAll alias)

[<td>M5A
 </td>, <td>Downtown Toronto
 </td>, <td>Regent Park / Harbourfront
 </td>]

In [0]:
#@title Form 1
#list=[]
#for x in t_row[1:3]:
#    list.append(x)

In [0]:
#@title Form 2
#l2=[]
#for l in list:
#    l2.append(str(l).replace('<tr>',''))
#l2

#### Creating a list with clean data in rows

In [0]:
# One list of cell texts per data row; t_row[0] is the header row and is skipped.
# The <td> elements are read through the parser instead of string-replacing the
# serialized HTML, so tag attributes, nested tags or different whitespace in the
# markup no longer leak into the values.
final = [
    [cell.get_text().strip() for cell in row.find_all('td')]
    for row in t_row[1:]
]

#### Checking the preview of the list items

In [12]:
final[:3]  # first three cleaned rows

[['M1A', 'Not assigned', ''],
 ['M2A', 'Not assigned', ''],
 ['M3A', 'North York', 'Parkwoods']]

#### Extracting column names

In [13]:
# Column labels are the stripped texts of the <th> cells in the first table row.
# They are read through the parser rather than by string-replacing the raw HTML,
# which breaks as soon as a <th> carries attributes or nested tags.
columns_df = [header.get_text().strip() for header in t_row[0].find_all('th')]
columns_df

['Postal code', 'Borough', 'Neighborhood']

#### Creating raw data frame

In [14]:
# Unfiltered frame: one row per scraped table row, labelled with the scraped headers
table = pd.DataFrame(data=final, columns=columns_df)
table.head(5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [15]:
# (rows, columns) of the raw frame
len(table.index), len(table.columns)

(180, 3)

#### Delete rows where column "Borough" is "Not assigned"

In [16]:
# Keep only rows whose Borough is assigned. reset_index is chained rather than
# applied with inplace=True on the filtered slice, so `table` is a fresh frame
# with a 0..n-1 index and is not flagged as a copy of the unfiltered data.
table = table[table['Borough'] != 'Not assigned'].reset_index(drop=True)
table

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,Malvern / Rouge
7,M3B,North York,Don Mills
8,M4B,East York,Parkview Hill / Woodbine Gardens
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [17]:
# (rows, columns) once the "Not assigned" boroughs are removed
len(table.index), len(table.columns)

(103, 3)

#### Replacing "/" with "," in column "Neighborhood"

In [0]:
# Turn the " /" separator between neighborhood names into ",".
# The whole column is replaced in one vectorized step: the former per-row
# chained assignment (table['Neighborhood'][i] = t) raises SettingWithCopyWarning
# and does not update the frame when pandas copy-on-write is active.
# regex=False makes ' /' a literal substring rather than a pattern.
table = table.assign(
    Neighborhood=table['Neighborhood'].str.replace(' /', ',', regex=False)
)

In [19]:
table.head(n=5)  # preview of the table after the changes

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [20]:
# Number of distinct postal codes, shown as a one-element tuple
(table['Postal code'].nunique(),)

(103,)

In [21]:
# Per-column count of rows whose Neighborhood is an empty string
has_blank_neighborhood = table['Neighborhood'].eq('')
table.loc[has_blank_neighborhood].count()

Postal code     0
Borough         0
Neighborhood    0
dtype: int64

In [22]:
# Report how many rows the cleaned table holds
row_total = len(table)
print('My table has {} rows.'.format(row_total))

My table has 103 rows.


#### Save table to csv file

In [0]:
# Write the DataFrame to a CSV file; the row index is left out of the file
csv_path = '/content/drive/My Drive/Colab Notebooks/Coursera_Capstone/Table_part1.csv'
table.to_csv(csv_path, index=False)