3.1 For this assignment, I will parse a table from Wikipedia (the list of Canadian postal codes beginning with M) and build a pandas DataFrame from it.

# Set-up

In [270]:
import urllib.request
import warnings
from io import StringIO

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
# NOTE(review): this silences *every* warning for the rest of the notebook,
# including pandas' SettingWithCopyWarning. Consider narrowing the filter.
warnings.filterwarnings('ignore')

# Download the raw page. The `with` block closes the connection even if
# read() raises, which the previous manual fp.close() did not guarantee.
with urllib.request.urlopen("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M") as fp:
    mybytes = fp.read()

# Decode the response bytes to text
html_doc = mybytes.decode("utf8")

# Parse the HTML with the lxml backend
soup = BeautifulSoup(html_doc, 'lxml')


# Data Prep

In [274]:
# Locate the postal-code table (class name found with the browser's Inspect Element)
My_table = soup.find('table', {'class': 'wikitable sortable'})
if My_table is None:
    # Fail here with a clear message instead of letting read_html choke on 'None'
    raise ValueError("No <table class='wikitable sortable'> found on the page")

# read_html returns a list with one DataFrame per <table>; we passed exactly one.
# - StringIO: passing literal HTML text directly is deprecated in recent pandas.
# - header=0: the first table row is always consumed as the header, so it can
#   never end up as a data row (older pandas left it in the data, which is why
#   this cell used to drop row 0 unconditionally).
df = pd.read_html(StringIO(str(My_table)), header=0)[0]

# Column labels (spelling kept as-is: it appears in the recorded outputs)
headers = ['Postcode', 'Bourough', 'Neighborhood']
df.columns = headers

# Dealing with missing values
df = (
    df.replace("Not assigned", np.nan)   # "Not assigned" cells become NaN in every column
      .dropna(subset=["Bourough"])       # drop only rows whose Bourough is missing
      .reset_index(drop=True)            # renumber rows 0..n-1
)

# Inefficient beautiful soup method
#S1=soup.body.find('div', class_='mw-body-content',id='bodyContent')
#S2= S1.find('div', id='mw-content-text')
#S3= S2.find('div', class_='mw-parser-output')
#S4= S3.find('table')#, class_='mw-parser-output')
#S5= S4.tbody.tr #[''#.th#.text#.text

print(df.head())


  Postcode          Bourough      Neighborhood
0      M3A        North York         Parkwoods
1      M4A        North York  Victoria Village
2      M5A  Downtown Toronto      Harbourfront
3      M5A  Downtown Toronto       Regent Park
4      M6A        North York  Lawrence Heights


# Collapsing Data Algorithm

In [276]:
# Step 1: Unique postcodes, in order of first appearance
PostAr = df.Postcode.unique()

# Step 2: Collect each postcode's distinct neighborhoods in a single pass.
#   - Names are compared by exact match. The previous substring test against
#     the accumulated text would skip e.g. "Rouge" once "Rouge Hill" was present.
#   - str() turns a missing neighborhood (NaN) into the text 'nan', which the
#     next cell looks for when filling in missing neighborhoods.
names_by_post = {}
for post, name in zip(df['Postcode'], map(str, df['Neighborhood'])):
    names = names_by_post.setdefault(post, [])
    if name not in names:
        names.append(name)

# Step 3: Build x as [postcode, "Name1, Name2, ..."] pairs
x = [[post, ', '.join(names_by_post.get(post, []))] for post in PostAr]


#SAMPLE Text

#for i,square in enumerate(x):
#    print(type(x[i][1]))
 #   x[i][1]= x[i][1] + ', ' + df.loc[index][2]
    #print(x[i][1] + 'a')
    #for index, row in df.iterrows(): 
    
#for index, row in df.iterrows():
    #print(index) #print index number
    #print(type(df.loc[index][0]))
  #  print(df.loc[index][2])
    #print(df[row])
#df.iloc[0, 0]
 

[['M3A', 'Parkwoods'], ['M4A', 'Victoria Village'], ['M5A', 'Harbourfront, Regent Park'], ['M6A', 'Lawrence Heights, Lawrence Manor'], ['M7A', 'nan'], ['M9A', 'Islington Avenue'], ['M1B', 'Rouge, Malvern'], ['M3B', 'Don Mills North'], ['M4B', 'Woodbine Gardens, Parkview Hill'], ['M5B', 'Ryerson, Garden District'], ['M6B', 'Glencairn'], ['M9B', 'Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park'], ['M1C', 'Highland Creek, Rouge Hill, Port Union'], ['M3C', 'Flemingdon Park, Don Mills South'], ['M4C', 'Woodbine Heights'], ['M5C', 'St. James Town'], ['M6C', 'Humewood-Cedarvale'], ['M9C', 'Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe'], ['M1E', 'Guildwood, Morningside, West Hill'], ['M4E', 'The Beaches'], ['M5E', 'Berczy Park'], ['M6E', 'Caledonia-Fairbanks'], ['M1G', 'Woburn'], ['M4G', 'Leaside'], ['M5G', 'Central Bay Street'], ['M6G', 'Christie'], ['M1H', 'Cedarbrae'], ['M2H', 'Hillcrest Village'], ['M3H', 'Bathurst Manor, Downsview North, Wilson Heig

# Recreating Dataframe

In [277]:
# Keep only Postcode and Bourough, one row per distinct pair.
# drop_duplicates/reset_index return a new frame, so df2 is independent of df.
df1 = df.iloc[0:, [0, 1]]
df2 = df1.drop_duplicates().reset_index(drop=True)

# Attach the collapsed neighborhood text from x by postcode lookup.
# (Replaces a nested loop that assigned through chained indexing,
#  df2.loc[index][2] = ..., which may write to a temporary copy.)
neighborhood_by_post = {post: names for post, names in x}
df2['Neighborhood'] = df2['Postcode'].map(neighborhood_by_post)

# Replace missing Neighborhood (the text 'nan') with that row's Bourough,
# rather than a hard-coded borough name.
missing = df2['Neighborhood'] == 'nan'
df2.loc[missing, 'Neighborhood'] = df2.loc[missing, 'Bourough']

# Sanity check: no placeholder 'nan' text is left behind
assert not (df2['Neighborhood'] == 'nan').any()

#Complete
print(df2)


    Postcode          Bourough  \
0        M3A        North York   
1        M4A        North York   
2        M5A  Downtown Toronto   
3        M6A        North York   
4        M7A      Queen's Park   
5        M9A         Etobicoke   
6        M1B       Scarborough   
7        M3B        North York   
8        M4B         East York   
9        M5B  Downtown Toronto   
10       M6B        North York   
11       M9B         Etobicoke   
12       M1C       Scarborough   
13       M3C        North York   
14       M4C         East York   
15       M5C  Downtown Toronto   
16       M6C              York   
17       M9C         Etobicoke   
18       M1E       Scarborough   
19       M4E      East Toronto   
20       M5E  Downtown Toronto   
21       M6E              York   
22       M1G       Scarborough   
23       M4G         East York   
24       M5G  Downtown Toronto   
25       M6G  Downtown Toronto   
26       M1H       Scarborough   
27       M2H        North York   
28       M3H  