**Webscrapping a website for travel destinations using BeautifulSoup &
Forward Geocoding for finidng place coordinates and address using Nominatim & Generating Interactive Map Visualisation with Follium**

In [1]:
#importing packages
import requests
# requests is to get the html source code of any website
from bs4 import BeautifulSoup
# the main package bs4, from which we get the function BeautifulSoup()
import pandas as pd
import numpy as np

#importing geocoding libraries
import geopandas as gpd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

#if not downlaoded download the packages
#pip install follium


# import the library
import folium

In [2]:
url='https://www.expedia.co.uk/Things-To-Do-In-Portsmouth.d553248634335196573.Travel-Guide-Activities'

In [3]:
attractions_html=requests.get(url).text

In [4]:
soup = BeautifulSoup(attractions_html, 'lxml')
# we convert the above string into a bs4, lxml object so that the function understands the page

In [5]:
#print(soup)

In [6]:
#print(dir(soup))

In [7]:
# extracting from rquired attributes
names1 = soup.findAll('span',{'class':"is-visually-hidden"})
len(names1)

22

In [8]:
names1

[<span class="is-visually-hidden">Explore map</span>,
 <span class="is-visually-hidden">Gunwharf Quays</span>,
 <span class="is-visually-hidden">Portsmouth Historic Dockyard</span>,
 <span class="is-visually-hidden">Portsmouth International Port</span>,
 <span class="is-visually-hidden">HMS Victory</span>,
 <span class="is-visually-hidden">Portsmouth Guildhall</span>,
 <span class="is-visually-hidden">Mary Rose Museum</span>,
 <span class="is-visually-hidden">Sally Port</span>,
 <span class="is-visually-hidden">Square Tower</span>,
 <span class="is-visually-hidden">The Point</span>,
 <span class="is-visually-hidden">Portsmouth City Centre</span>,
 <span class="is-visually-hidden">Cosham</span>,
 <span class="is-visually-hidden">Old Portsmouth</span>,
 <span class="is-visually-hidden">Buckland</span>,
 <span class="is-visually-hidden"></span>,
 <span class="is-visually-hidden"></span>,
 <span class="is-visually-hidden"></span>,
 <span class="is-visually-hidden"></span>,
 <span class="is

In [9]:
attraction_names = []
for name in names1:
    attraction_names.append(name.text)
attraction_names = pd.Series(attraction_names, name = "Tourist_Attraction")

In [10]:
####Important
#Note: change numer of rows when you load as site gets update
attraction_names=attraction_names[13:26:]

In [11]:
attraction_names

13    Buckland
14            
15            
16            
17            
18            
19            
20            
21            
Name: Tourist_Attraction, dtype: object

In [12]:
placestogo= attraction_names.to_frame()
placestogo

Unnamed: 0,Tourist_Attraction
13,Buckland
14,
15,
16,
17,
18,
19,
20,
21,


In [13]:
#Importing first list generate from Portsmouth Places to Visit-1 ipynb file
placestogo1=pd.read_excel('PortsmouthPlaces.xlsx')

In [14]:
#dropping additional columns
placestogo1=placestogo1.drop(columns=['Unnamed: 0','address_','coordinates'])

In [15]:
placestogo1

Unnamed: 0,place
0,HMS Victory
1,Mary Rose Museum
2,HMS Warrior
3,Portsmouth Historic Dockyard
4,Spinnaker Tower
5,The D-Day Story
6,Gunwharf Quays
7,Royal Navy Submarine Museum
8,Round and Square Towers in Old Portsmouth
9,Portsmouth Cathedral


In [16]:
#concatenating two lists together- More websites More Places More Options
touristattr=pd.concat([placestogo1['place'],placestogo['Tourist_Attraction']],ignore_index=True)

In [17]:
touristattr

0                                  HMS Victory 
1                             Mary Rose Museum 
2                                  HMS Warrior 
3                 Portsmouth Historic Dockyard 
4                              Spinnaker Tower 
5                              The D-Day Story 
6                               Gunwharf Quays 
7                  Royal Navy Submarine Museum 
8     Round and Square Towers in Old Portsmouth
9                         Portsmouth Cathedral 
10          Charles Dickens' Birthplace Museum 
11            Portsmouth Museum and Art Gallery
12        Southsea Beach and Pier in Portsmouth
13                       Southsea Rock Gardens 
14                                Stansted Park
15                                     Buckland
16                                             
17                                             
18                                             
19                                             
20                                      

In [18]:
#converting to dataframe
touristattr=touristattr.to_frame()

In [19]:
#cleaning: renaming columns 1
touristattr=touristattr.rename(columns={0:'place'})

In [20]:
#cleaning: changing all to capitalize case
touristattr['place'] =touristattr['place'].str.capitalize()

In [21]:
#removing spaces left and right
touristattr['place'] =touristattr['place'].str.lstrip()
touristattr['place'] =touristattr['place'].str.rstrip()

In [22]:
#dropping duplicates
touristattr['place']=touristattr['place'].drop_duplicates()

In [23]:
#dropping null values and reindexing
touristattr=touristattr.dropna(inplace=False)
touristattr=touristattr.reset_index(drop=True)

In [24]:
touristattr

Unnamed: 0,place
0,Hms victory
1,Mary rose museum
2,Hms warrior
3,Portsmouth historic dockyard
4,Spinnaker tower
5,The d-day story
6,Gunwharf quays
7,Royal navy submarine museum
8,Round and square towers in old portsmouth
9,Portsmouth cathedral


In [25]:
#touristattr.add_suffix(', Portsmouth,United Kingdom')
#[str(col) + ', Portsmouth,United Kingdom' for col in touristattr.columns]
#for better analysis as not including this gives places with similar names possibly there is Portsmouth in United States as well
touristattr['place'] = touristattr['place'].astype(str)+ ', Portsmouth,United Kingdom' 

In [26]:
touristattr

Unnamed: 0,place
0,"Hms victory, Portsmouth,United Kingdom"
1,"Mary rose museum, Portsmouth,United Kingdom"
2,"Hms warrior, Portsmouth,United Kingdom"
3,"Portsmouth historic dockyard, Portsmouth,Unite..."
4,"Spinnaker tower, Portsmouth,United Kingdom"
5,"The d-day story, Portsmouth,United Kingdom"
6,"Gunwharf quays, Portsmouth,United Kingdom"
7,"Royal navy submarine museum, Portsmouth,United..."
8,"Round and square towers in old portsmouth, Por..."
9,"Portsmouth cathedral, Portsmouth,United Kingdom"


**Final list is ready and and now we could find coordinates for our places**

# Geocoding  Using Nominatim

In [27]:
#Setting up Nominatim
locator = Nominatim(user_agent="Travelops", timeout=20)
rgeocode = RateLimiter(locator.reverse, min_delay_seconds=0.001)

In [28]:
#function forward geocoding
#takes place name and finds address and coordinates
def location_info(x):

    data = locator.geocode(x) #use this line when using Nominatim
    
    #converted = pd.json_normalize(data).squeeze() #squeeze converts a dataframe to a pandas series
    return data

In [29]:
#expamle
locator.geocode('D-Day Museum, Portsmouth,United Kingdom')

Location(D-Day Museum, Clarence Esplanade, Old Portsmouth, Portsmouth, England, PO5 3AE, United Kingdom, (50.77964605, -1.0893765716108148, 0.0))

In [30]:
#Rule based- D-Day story is informal name should be replaced with real name
touristattr['place'][5]='D-Day Museum, Portsmouth,United Kingdom'

In [31]:
#example
locator.geocode('Square tower, Portsmouth,United Kingdom')

Location(Square Tower, Millennium Promenade, Old Portsmouth, Portsmouth, England, PO1 2ND, United Kingdom, (50.789355, -1.1065100000000005, 0.0))

In [32]:
#Rule based- Square and Round Towers are informal name should be replaced with real name
touristattr['place'][8]='Square tower, Portsmouth,United Kingdom' 

In [33]:
locator.geocode('Portsmouth museum, Portsmouth,United Kingdom')

Location(Portsmouth City Records Office, Museum Road, Old Portsmouth, Portsmouth, England, PO1 2LE, United Kingdom, (50.79144385, -1.0968967552858682, 0.0))

In [34]:
#To allow it to not to confuse with one in U.S.
touristattr['place'][11]='Portsmouth museum, Portsmouth,United Kingdom'

In [35]:
#example
locator.geocode('Southsea beach, Portsmouth,United Kingdom')

Location(Southsea nudist beach, Eastney, Portsmouth, England, PO4 9LP, United Kingdom, (50.7861817, -1.0378028, 0.0))

In [36]:
#replace for better mathc
touristattr['place'][12]='Southsea beach, Portsmouth,United Kingdom'

In [37]:
locator.geocode('Royal navy submarine museum')

Location(Royal Navy Submarine Museum, Haslar Road, Seafield, Clayhall, Gosport, Hampshire, England, PO12 2FG, United Kingdom, (50.78792895, -1.1228231163235636, 0.0))

In [38]:
#this is not in Portsmouth but in Goston easily accessible by ferry so suffix Portsmouth confuses geocoder
touristattr['place'][7]='Royal navy submarine museum'

In [39]:
#generating addresses and coordinates
location_info_df = touristattr['place'].apply(location_info)
location_info_df

0     (HMS Victory, Main Road, Portsea, Portsmouth, ...
1     (Mary Rose Museum, Main Road, Portsea, Portsmo...
2     (HMS Warrior, Gosport Ferry Piazza, Portsea, P...
3     (Portsmouth Historic Dockyard, The Hard, Ports...
4     (Spinnaker Tower, The Canalside, Gunwharf Quay...
5     (D-Day Museum, Clarence Esplanade, Old Portsmo...
6     (Gunwharf Quays, Portsea, Portsmouth, England,...
7     (Royal Navy Submarine Museum, Haslar Road, Sea...
8     (Square Tower, Millennium Promenade, Old Ports...
9     (Cathedral Church of St Thomas of Canterbury, ...
10    (Charles Dickens Birthplace Museum, 393, Old C...
11    (Portsmouth City Records Office, Museum Road, ...
12    (Southsea nudist beach, Eastney, Portsmouth, E...
13    (Southsea Rock Gardens, Clarence Esplanade, So...
14                                                 None
15    (Buckland, Fratton, Portsmouth, England, PO2 7...
16    (Portsmouth, England, United Kingdom, (50.8036...
Name: place, dtype: object

In [40]:
#comining place name with geocode identities
df_locations = pd.concat([touristattr,location_info_df], axis=1)
df_locations.head(10)

Unnamed: 0,place,place.1
0,"Hms victory, Portsmouth,United Kingdom","(HMS Victory, Main Road, Portsea, Portsmouth, ..."
1,"Mary rose museum, Portsmouth,United Kingdom","(Mary Rose Museum, Main Road, Portsea, Portsmo..."
2,"Hms warrior, Portsmouth,United Kingdom","(HMS Warrior, Gosport Ferry Piazza, Portsea, P..."
3,"Portsmouth historic dockyard, Portsmouth,Unite...","(Portsmouth Historic Dockyard, The Hard, Ports..."
4,"Spinnaker tower, Portsmouth,United Kingdom","(Spinnaker Tower, The Canalside, Gunwharf Quay..."
5,"D-Day Museum, Portsmouth,United Kingdom","(D-Day Museum, Clarence Esplanade, Old Portsmo..."
6,"Gunwharf quays, Portsmouth,United Kingdom","(Gunwharf Quays, Portsea, Portsmouth, England,..."
7,Royal navy submarine museum,"(Royal Navy Submarine Museum, Haslar Road, Sea..."
8,"Square tower, Portsmouth,United Kingdom","(Square Tower, Millennium Promenade, Old Ports..."
9,"Portsmouth cathedral, Portsmouth,United Kingdom","(Cathedral Church of St Thomas of Canterbury, ..."


In [41]:
#changing col names
df_locations.columns =['place','address']

In [42]:
df_locations.head()

Unnamed: 0,place,address
0,"Hms victory, Portsmouth,United Kingdom","(HMS Victory, Main Road, Portsea, Portsmouth, ..."
1,"Mary rose museum, Portsmouth,United Kingdom","(Mary Rose Museum, Main Road, Portsea, Portsmo..."
2,"Hms warrior, Portsmouth,United Kingdom","(HMS Warrior, Gosport Ferry Piazza, Portsea, P..."
3,"Portsmouth historic dockyard, Portsmouth,Unite...","(Portsmouth Historic Dockyard, The Hard, Ports..."
4,"Spinnaker tower, Portsmouth,United Kingdom","(Spinnaker Tower, The Canalside, Gunwharf Quay..."


In [43]:
df_locations=df_locations.reset_index(drop=True)

In [44]:
df_locations.head()

Unnamed: 0,place,address
0,"Hms victory, Portsmouth,United Kingdom","(HMS Victory, Main Road, Portsea, Portsmouth, ..."
1,"Mary rose museum, Portsmouth,United Kingdom","(Mary Rose Museum, Main Road, Portsea, Portsmo..."
2,"Hms warrior, Portsmouth,United Kingdom","(HMS Warrior, Gosport Ferry Piazza, Portsea, P..."
3,"Portsmouth historic dockyard, Portsmouth,Unite...","(Portsmouth Historic Dockyard, The Hard, Ports..."
4,"Spinnaker tower, Portsmouth,United Kingdom","(Spinnaker Tower, The Canalside, Gunwharf Quay..."


In [45]:
#extracting the string part as address and storing in new column of dataframe
df_locations['address_']=df_locations['address'].astype('str')

In [46]:
#drop null and reset index
df_locations=df_locations.dropna().reset_index(drop=True)

In [47]:
df_locations.head()

Unnamed: 0,place,address,address_
0,"Hms victory, Portsmouth,United Kingdom","(HMS Victory, Main Road, Portsea, Portsmouth, ...","HMS Victory, Main Road, Portsea, Portsmouth, E..."
1,"Mary rose museum, Portsmouth,United Kingdom","(Mary Rose Museum, Main Road, Portsea, Portsmo...","Mary Rose Museum, Main Road, Portsea, Portsmou..."
2,"Hms warrior, Portsmouth,United Kingdom","(HMS Warrior, Gosport Ferry Piazza, Portsea, P...","HMS Warrior, Gosport Ferry Piazza, Portsea, Po..."
3,"Portsmouth historic dockyard, Portsmouth,Unite...","(Portsmouth Historic Dockyard, The Hard, Ports...","Portsmouth Historic Dockyard, The Hard, Portse..."
4,"Spinnaker tower, Portsmouth,United Kingdom","(Spinnaker Tower, The Canalside, Gunwharf Quay...","Spinnaker Tower, The Canalside, Gunwharf Quays..."


In [48]:
#extracting the tuple part as coordinates and storing in a list
coord=[]
for i in range(len(df_locations['address'])):
    if df_locations['address'][i]!=None:
        coord.append(df_locations['address'][i][1])
    else:
        coord.append(None)

In [49]:
#coordinates list to series
coord=pd.Series(coord)

#adding as col. to dataframe
df_locations['coordinates']=coord.values

In [50]:
df_locations.head()

Unnamed: 0,place,address,address_,coordinates
0,"Hms victory, Portsmouth,United Kingdom","(HMS Victory, Main Road, Portsea, Portsmouth, ...","HMS Victory, Main Road, Portsea, Portsmouth, E...","(50.80181325, -1.1096133411749904)"
1,"Mary rose museum, Portsmouth,United Kingdom","(Mary Rose Museum, Main Road, Portsea, Portsmo...","Mary Rose Museum, Main Road, Portsea, Portsmou...","(50.80220235, -1.1089264299156816)"
2,"Hms warrior, Portsmouth,United Kingdom","(HMS Warrior, Gosport Ferry Piazza, Portsea, P...","HMS Warrior, Gosport Ferry Piazza, Portsea, Po...","(50.7982217, -1.1092544924090513)"
3,"Portsmouth historic dockyard, Portsmouth,Unite...","(Portsmouth Historic Dockyard, The Hard, Ports...","Portsmouth Historic Dockyard, The Hard, Portse...","(50.80046565, -1.1094840008522384)"
4,"Spinnaker tower, Portsmouth,United Kingdom","(Spinnaker Tower, The Canalside, Gunwharf Quay...","Spinnaker Tower, The Canalside, Gunwharf Quays...","(50.795596450000005, -1.1084489003112004)"


In [51]:
#dropping redundant address column
df_locations=df_locations.drop('address',axis=1)

In [52]:
#drop duplicates and reset index
df_locations=df_locations.drop_duplicates().reset_index(drop=True)

df_locations

In [53]:
#converting str 'None' to None vlaues
for i in range(len(df_locations['address_'])):
    if df_locations['address_'][i]=='None':
        df_locations['address_'][i]=None

In [54]:
#exporting final list with geo locations to excel
df_locations.to_excel('PortsmouthPlacesfinal.xlsx')

**Now we have the list of places and their coordinates we would clean them and plot them using follium**

P.S. this file PortsmouthPlacesfinal.xlsx is used in alteryx Workflow to clean and for other spatial operations in alteryx

In [55]:
df_locations.head()

Unnamed: 0,place,address_,coordinates
0,"Hms victory, Portsmouth,United Kingdom","HMS Victory, Main Road, Portsea, Portsmouth, E...","(50.80181325, -1.1096133411749904)"
1,"Mary rose museum, Portsmouth,United Kingdom","Mary Rose Museum, Main Road, Portsea, Portsmou...","(50.80220235, -1.1089264299156816)"
2,"Hms warrior, Portsmouth,United Kingdom","HMS Warrior, Gosport Ferry Piazza, Portsea, Po...","(50.7982217, -1.1092544924090513)"
3,"Portsmouth historic dockyard, Portsmouth,Unite...","Portsmouth Historic Dockyard, The Hard, Portse...","(50.80046565, -1.1094840008522384)"
4,"Spinnaker tower, Portsmouth,United Kingdom","Spinnaker Tower, The Canalside, Gunwharf Quays...","(50.795596450000005, -1.1084489003112004)"


In [56]:
# creating list of lat and lon from coordinates
Lat=[]
Lon=[]
for i in range(len(df_locations)):
    Lat.append(df_locations['coordinates'][i][0])
    Lon.append(df_locations['coordinates'][i][1])
    
#list to pandas series
Lat=pd.Series(Lat)
Lon=pd.Series(Lon)

#adding series as columns to our dataframe
df_locations=df_locations.assign(Lat=Lat.values)
df_locations=df_locations.assign(Lon=Lon.values)

In [57]:
df_locations.head()

Unnamed: 0,place,address_,coordinates,Lat,Lon
0,"Hms victory, Portsmouth,United Kingdom","HMS Victory, Main Road, Portsea, Portsmouth, E...","(50.80181325, -1.1096133411749904)",50.801813,-1.109613
1,"Mary rose museum, Portsmouth,United Kingdom","Mary Rose Museum, Main Road, Portsea, Portsmou...","(50.80220235, -1.1089264299156816)",50.802202,-1.108926
2,"Hms warrior, Portsmouth,United Kingdom","HMS Warrior, Gosport Ferry Piazza, Portsea, Po...","(50.7982217, -1.1092544924090513)",50.798222,-1.109254
3,"Portsmouth historic dockyard, Portsmouth,Unite...","Portsmouth Historic Dockyard, The Hard, Portse...","(50.80046565, -1.1094840008522384)",50.800466,-1.109484
4,"Spinnaker tower, Portsmouth,United Kingdom","Spinnaker Tower, The Canalside, Gunwharf Quays...","(50.795596450000005, -1.1084489003112004)",50.795596,-1.108449


In [58]:
#appending our hotel location to the exisitng data to map
df_locations=df_locations.append({'place':'Hotel Holiday Inn','address_':'Pembroke Rd, Portsmouth PO1 2TA','coordinates':None, 'Lat': 50.7895,'Lon': -1.1027},ignore_index=True)

  df_locations=df_locations.append({'place':'Hotel Holiday Inn','address_':'Pembroke Rd, Portsmouth PO1 2TA','coordinates':None, 'Lat': 50.7895,'Lon': -1.1027},ignore_index=True)


//this hotel list should be more than one hotel or can be taken from a excel file or similarly scrapped from websites to analyse locations of these hotels and thereby picking best hotel//

In [59]:
df_locations.tail()
#see- hotel gets added at the last row

Unnamed: 0,place,address_,coordinates,Lat,Lon
12,"Southsea beach, Portsmouth,United Kingdom","Southsea nudist beach, Eastney, Portsmouth, En...","(50.7861817, -1.0378028)",50.786182,-1.037803
13,"Southsea rock gardens, Portsmouth,United Kingdom","Southsea Rock Gardens, Clarence Esplanade, Sou...","(50.778863, -1.0832390311098703)",50.778863,-1.083239
14,"Buckland, Portsmouth,United Kingdom","Buckland, Fratton, Portsmouth, England, PO2 7P...","(50.8110016, -1.0751747)",50.811002,-1.075175
15,", Portsmouth,United Kingdom","Portsmouth, England, United Kingdom","(50.8036831, -1.075614)",50.803683,-1.075614
16,Hotel Holiday Inn,"Pembroke Rd, Portsmouth PO1 2TA",,50.7895,-1.1027


# Generating Interactive Map

#pip install folium

In [60]:
# Make an empty map
m = folium.Map(location=[20,0], tiles="OpenStreetMap", zoom_start=2)

In [61]:
#function to color hotels in different color
def color(hotel):
    if hotel=='Hotel Holiday Inn':
        col = 'red'
    else:
        col='blue'
    return col

In [62]:
# add marker one by one on the map



for i in range(0,len(df_locations)):
   folium.Marker(
      location=[df_locations.iloc[i]['Lat'], df_locations.iloc[i]['Lon']],
                                    icon= folium.Icon(color=color(df_locations.iloc[i]['place'])),
                                    popup=df_locations.iloc[i]['address_']).add_to(m)

#fits the required portion of map to screen
m.fit_bounds([[50.8418849, -1.0673875], [50.7861817, -1.0378028]])

# Show the map again
m


In [63]:
#saves map as html
m.save('Portsmouthmap.html')

**Yayy!! My hotel is nicely placed in centre of Old Portsmouth near the beach and close to attractions**