**Web scraping a website for travel destinations using BeautifulSoup &
Forward geocoding for finding place coordinates and addresses using Nominatim**

In [1]:
#importing required packages
import requests
# requests is to get the html source code of any website
from bs4 import BeautifulSoup
# the main package bs4, from which we get the function BeautifulSoup()
import pandas as pd
import numpy as np

In [2]:
#PlanetWare url loading
# page address (Portsmouth tourist attractions, per the URL path) that the
# following cells download and parse
url1='https://www.planetware.com/tourist-attractions-/portsmouth-eng-hmp-po.html'

In [3]:
# Download the page's HTML source.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() turns an HTTP error response (404, 503, ...) into an
# exception instead of letting an error page be parsed as if it were real data.
response = requests.get(url1, timeout=30)
response.raise_for_status()
attractions_html = response.text

# Preview only the start of the document; printing the whole page floods the output.
print(attractions_html[:500])

In [4]:
# Parse the raw HTML string with the lxml parser so the document can be searched
html_parser = 'lxml'
soup = BeautifulSoup(markup=attractions_html, features=html_parser)

soup

# list the attributes and methods available on the parsed document
print(dir(soup))

In [5]:
# collect every <figcaption> element found in the parsed page
figcaption_tag = 'figcaption'
names = soup.find_all(figcaption_tag)
len(names)

18

In [6]:
# gather the text of every matched element into a named pandas Series
caption_texts = [caption.text for caption in names]
attraction_names = pd.Series(data=caption_texts, name="Tourist_Attraction")

In [7]:
# discard the caption at position 0 and keep everything after it
# (note: each re-run of this cell removes one more leading row)
attraction_names = attraction_names.iloc[1:]

In [8]:
attraction_names

1         HMS Victory | Photo Copyright: Bryan Dearsley
2     Mary Rose Museum | Photo Copyright: Bryan Dear...
3                                          HMS Warrior 
4         HMS Warrior | Photo Copyright: Bryan Dearsley
5     Portsmouth Historic Dockyard | Photo Copyright...
6     Spinnaker Tower | Photo Copyright: Bryan Dearsley
7                                      Spinnaker Tower 
8     The D-Day Story | Photo Copyright: Bryan Dearsley
9      Gunwharf Quays | Photo Copyright: Bryan Dearsley
10    Royal Navy Submarine Museum |  Comrade Foot / ...
11            Round and Square Towers in Old Portsmouth
12                                Portsmouth Cathedral 
13                  Charles Dickens' Birthplace Museum 
14                    Portsmouth Museum and Art Gallery
15                Southsea Beach and Pier in Portsmouth
16    Southsea Rock Gardens | Photo Copyright: Bryan...
17                                        Stansted Park
Name: Tourist_Attraction, dtype: object

In [9]:
#series to dataframe
# The DataFrame constructor accepts either a named Series (producing a single
# 'Tourist_Attraction' column) or an existing DataFrame, so this cell can be
# re-run safely; calling .to_frame() a second time would fail because a
# DataFrame has no such method.
attraction_names = pd.DataFrame(attraction_names)

In [10]:
# cleaning data: strip one specific photo-credit suffix from every caption
photo_credit = '| Photo Copyright: Bryan Dearsley'
caption_column = attraction_names['Tourist_Attraction']
attraction_names_1 = caption_column.map(lambda caption: caption.replace(photo_credit, ''))

In [11]:
#cleaning data: removing suffixes 2
# Keep only the text before the first '|' (this drops every credit variant, not
# just one photographer) and trim surrounding whitespace so that identical names
# compare equal when duplicates are dropped later.
# The result is stored as a one-column DataFrame; assigning a whole Series to a
# single label of another Series would nest it as one object-valued element.
attraction_names_1 = (
    attraction_names['Tourist_Attraction']
    .str.split('|').str[0]
    .str.strip()
    .to_frame()
)

In [12]:
#cleaning data: removing unwanted suffixes 3
# split each name on '|' and keep the leading piece
name_pieces = attraction_names_1['Tourist_Attraction'].str.split('|')
attraction_names_2 = name_pieces.str.get(0)

In [13]:
# convert the cleaned Series of names into a one-column DataFrame
placestogo=attraction_names_2.to_frame()

In [14]:
# remove repeated rows, keeping the first occurrence of each one
placestogo = placestogo.drop_duplicates(keep='first')

In [15]:
# renumber the rows from 0 after de-duplication; drop=True discards the old labels
placestogo=placestogo.reset_index(drop=True)
placestogo

Unnamed: 0,Tourist_Attraction
0,HMS Victory
1,Mary Rose Museum
2,HMS Warrior
3,Portsmouth Historic Dockyard
4,Spinnaker Tower
5,The D-Day Story
6,Gunwharf Quays
7,Royal Navy Submarine Museum
8,Round and Square Towers in Old Portsmouth
9,Portsmouth Cathedral


In [16]:
# Export the scraped place names to PlanetWare_placesextracted.csv
# (the row index is written too, as the first column of the file)
csv_path = 'PlanetWare_placesextracted.csv'
placestogo.to_csv(path_or_buf=csv_path, index=True)

# Returning address information using Nominatim

In [17]:
#installing and importing geocoding packages

#!pip install geopandas
#!pip install geopy


import geopandas as gpd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [18]:
#Setting up Nominatim
# user_agent identifies this application to the service; timeout is in seconds
locator = Nominatim(user_agent="Travelops", timeout=20)
# Rate-limited reverse geocoder. The public Nominatim service permits at most
# one request per second, so wait at least 1 s between calls; a delay as small
# as 0.001 s provides no real throttling.
rgeocode = RateLimiter(locator.reverse, min_delay_seconds=1)

In [19]:
#function forward geocoding
#takes place name and finds address and coordinates
def location_info(x):
    """Forward-geocode a place name using the module-level Nominatim ``locator``.

    Parameters
    ----------
    x : str
        Free-text place name to look up. Any other query object is passed to
        ``locator.geocode`` unchanged.

    Returns
    -------
    geopy.location.Location or None
        Whatever ``locator.geocode`` returns for the query. ``None`` is
        returned without contacting the service when ``x`` is ``None``, NaN,
        or a blank string.
    """
    # Missing values (None / float NaN) cannot be geocoded; skip the request.
    if x is None or (isinstance(x, float) and x != x):
        return None

    if isinstance(x, str):
        # Scraped names may carry stray surrounding whitespace.
        x = x.strip()
        if not x:
            return None

    return locator.geocode(x)

In [20]:
#calling functions for geocoding of above extracted places
# The public Nominatim service permits at most one request per second, so the
# lookups are throttled to one call per second. RateLimiter also retries a call
# that fails with a geopy service error before giving up on that place.
throttled_location_info = RateLimiter(location_info, min_delay_seconds=1)
location_info_df = placestogo['Tourist_Attraction'].apply(throttled_location_info)
location_info_df

0     (HMS Victory, Main Road, Portsea, Portsmouth, ...
1     (Mary Rose Museum, Main Road, Portsea, Portsmo...
2     (HMS Warrior, Gosport Ferry Piazza, Portsea, P...
3     (Portsmouth Historic Dockyard, The Hard, Ports...
4     (Spinnaker Tower, The Canalside, Gunwharf Quay...
5                                                  None
6     (Gunwharf Quays, Portsea, Portsmouth, England,...
7     (Royal Navy Submarine Museum, Haslar Road, Sea...
8                                                  None
9     (Cathedral Church of St Thomas of Canterbury, ...
10    (Charles Dickens Birthplace Museum, 393, Old C...
11                                                 None
12                                                 None
13    (Southsea Rock Gardens, Clarence Esplanade, So...
14    (Stansted Park, Stansted Mountfitchet, Uttlesf...
Name: Tourist_Attraction, dtype: object

In [21]:
# join place names with their geocoding results, side by side
# (both inputs share the same row index, so rows line up one-to-one)
frames_to_join = [placestogo, location_info_df]
df_locations = pd.concat(frames_to_join, axis='columns')
df_locations

Unnamed: 0,Tourist_Attraction,Tourist_Attraction.1
0,HMS Victory,"(HMS Victory, Main Road, Portsea, Portsmouth, ..."
1,Mary Rose Museum,"(Mary Rose Museum, Main Road, Portsea, Portsmo..."
2,HMS Warrior,"(HMS Warrior, Gosport Ferry Piazza, Portsea, P..."
3,Portsmouth Historic Dockyard,"(Portsmouth Historic Dockyard, The Hard, Ports..."
4,Spinnaker Tower,"(Spinnaker Tower, The Canalside, Gunwharf Quay..."
5,The D-Day Story,
6,Gunwharf Quays,"(Gunwharf Quays, Portsea, Portsmouth, England,..."
7,Royal Navy Submarine Museum,"(Royal Navy Submarine Museum, Haslar Road, Sea..."
8,Round and Square Towers in Old Portsmouth,
9,Portsmouth Cathedral,"(Cathedral Church of St Thomas of Canterbury, ..."


In [22]:
# cleaning data: give the two columns distinct, descriptive names
new_column_names = ['place', 'address']
df_locations.columns = new_column_names

In [23]:
# look at the geocoding result stored in the row labelled 1
location_row_1 = df_locations.loc[1, 'address']
print(type(location_row_1[1]))
location_row_1[1]
# element 1 of a geocoding result is a tuple holding the coordinates

<class 'tuple'>


(50.80220235, -1.1089264299156816)

In [24]:
# look at the geocoding result stored in the row labelled 1
location_row_1 = df_locations.loc[1, 'address']
print(type(location_row_1[0]))
location_row_1[0]
# element 0 of the same geocoding result is the address as a str

<class 'str'>


'Mary Rose Museum, Main Road, Portsea, Portsmouth, England, PO1 3LU, United Kingdom'

In [25]:
type(df_locations['address'])
#address is series type

pandas.core.series.Series

In [26]:
#resetting index
# drop=True discards the existing index labels instead of keeping them as a column
df_locations=df_locations.reset_index(drop=True)

In [27]:
# checking datatypes before more cleaning: the frame, its two columns,
# then the first geocoding result and its two elements
address_column = df_locations['address']
print(type(df_locations))
print(type(df_locations['place']))
print(type(address_column))
first_location = address_column[0]
print(type(first_location))
for position in (0, 1):
    print(type(first_location[position]))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'geopy.location.Location'>
<class 'str'>
<class 'tuple'>


In [28]:
#number of rows or places
len(df_locations)

15

In [29]:
# Extract the address text of each geocoding result into a new column.
# Places that were not found keep a real missing value (None); converting the
# whole column with astype('str') would turn them into the literal text 'None'.
df_locations['address_'] = df_locations['address'].map(
    lambda location: location.address if location is not None else None
)

In [30]:
# Extract the coordinate tuple (element 1) of every geocoding result into a
# list, keeping None for places that were not found.
# `is not None` is an identity check; `!= None` would instead be routed through
# the result object's own comparison methods.
coord = [
    location[1] if location is not None else None
    for location in df_locations['address']
]

In [31]:
#checking data type for coord
type(coord)

list

In [32]:
#changing list to series
# wrap the Python list built in the previous step in a pandas Series
coord=pd.Series(coord)

In [33]:
# store the coordinates in a new 'coordinates' column of df_locations;
# the underlying array is assigned, so values are placed row by row in order
coordinate_values = coord.values
df_locations['coordinates'] = coordinate_values

In [34]:
df_locations.head()

Unnamed: 0,place,address,address_,coordinates
0,HMS Victory,"(HMS Victory, Main Road, Portsea, Portsmouth, ...","HMS Victory, Main Road, Portsea, Portsmouth, E...","(50.80181325, -1.1096133411749904)"
1,Mary Rose Museum,"(Mary Rose Museum, Main Road, Portsea, Portsmo...","Mary Rose Museum, Main Road, Portsea, Portsmou...","(50.80220235, -1.1089264299156816)"
2,HMS Warrior,"(HMS Warrior, Gosport Ferry Piazza, Portsea, P...","HMS Warrior, Gosport Ferry Piazza, Portsea, Po...","(50.7982217, -1.1092544924090513)"
3,Portsmouth Historic Dockyard,"(Portsmouth Historic Dockyard, The Hard, Ports...","Portsmouth Historic Dockyard, The Hard, Portse...","(50.80046565, -1.1094840008522384)"
4,Spinnaker Tower,"(Spinnaker Tower, The Canalside, Gunwharf Quay...","Spinnaker Tower, The Canalside, Gunwharf Quays...","(50.795596450000005, -1.1084489003112004)"


In [35]:
# remove the 'address' column now that its contents live in other columns
df_locations = df_locations.drop(columns='address')

In [36]:
df_locations.head()

Unnamed: 0,place,address_,coordinates
0,HMS Victory,"HMS Victory, Main Road, Portsea, Portsmouth, E...","(50.80181325, -1.1096133411749904)"
1,Mary Rose Museum,"Mary Rose Museum, Main Road, Portsea, Portsmou...","(50.80220235, -1.1089264299156816)"
2,HMS Warrior,"HMS Warrior, Gosport Ferry Piazza, Portsea, Po...","(50.7982217, -1.1092544924090513)"
3,Portsmouth Historic Dockyard,"Portsmouth Historic Dockyard, The Hard, Portse...","(50.80046565, -1.1094840008522384)"
4,Spinnaker Tower,"Spinnaker Tower, The Canalside, Gunwharf Quays...","(50.795596450000005, -1.1084489003112004)"


In [37]:
# Replace every literal string 'None' in the address text with a real missing value.
# One .loc assignment on the frame replaces the row-by-row chained assignment
# (df[col][i] = ...), which pandas warns may write to a temporary copy and
# leave the frame unchanged.
is_none_text = df_locations['address_'] == 'None'
df_locations.loc[is_none_text, 'address_'] = None

In [38]:
# export to Excel for further cleaning through Alteryx
excel_path = 'PortsmouthPlaces.xlsx'
df_locations.to_excel(excel_writer=excel_path)

**Now move to the other notebook (Portsmouth Destination List and Mapping), where we scraped data from another website (Expedia), combined the two data sets, used Nominatim again with missing-value treatment, and then visualised the locations on an interactive map.**