# 1/3 First, we build the files that will be uploaded to S3 at the end
## We will build the list of towns with their associated weather.
## And we will build the list of the best hotels (up to 30 per town) for these towns.

In [2]:
import pandas as pd

# Destinations covered by the study; every later cell iterates over this list
# through `df_locations`.
locations = [
    "Mont Saint Michel",
    "St Malo",
    "Bayeux",
    "Le Havre",
    "Rouen",
    "Paris",
    "Amiens",
    "Lille",
    "Strasbourg",
    "Chateau du Haut Koenigsbourg",
    "Colmar",
    "Eguisheim",
    "Besancon",
    "Dijon",
    "Annecy",
    "Grenoble",
    "Lyon",
    "Gorges du Verdon",
    "Bormes les Mimosas",
    "Cassis",
    "Marseille",
    "Aix en Provence",
    "Avignon",
    "Uzes",
    "Nimes",
    "Aigues Mortes",
    "Saintes Maries de la mer",
    "Collioure",
    "Carcassonne",
    "Ariege",
    "Toulouse",
    "Montauban",
    "Biarritz",
    "Bayonne",
    "La Rochelle",
]

# Single-column dataframe ('location'), one row per destination, in list order.
print("Building locations dataframe")
df_locations = pd.DataFrame({'location': locations})
print(f"Location DF contains {df_locations.shape[0]} items.")

Building locations dataframe
Location DF contains 35 items.


In [3]:
# Building GPS coordinates
import requests
from tqdm.notebook import tqdm
print("\nBuilding GPS coordinates for all towns")
# Parallel lists, one entry per row of df_locations (same order). They are
# re-created on every run so the cell can be re-executed safely.
display_names = []
lon = []
lat = []
for row in tqdm(df_locations.itertuples(), total=df_locations.shape[0]):
    location = row.location.replace(' ', '+')
    # NOTE(review): Nominatim's search documentation describes the free-form `q`
    # parameter and structured parameters such as `country` as mutually
    # exclusive, and `countrycodes=fr` as the way to restrict results to a
    # country - confirm against the current API before relying on this URL.
    url = f"https://nominatim.openstreetmap.org/?q={location}&country=France&format=json"
    # A timeout prevents the cell from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    # Test the status code: the textual reason phrase is optional in HTTP and
    # must not be used to detect success.
    if r.status_code != 200:
        raise Exception(url, "NOK !")

    r_json = r.json()
    if not r_json:
        raise Exception(f'No openstreetmap json for {location} location')

    # Only the first result returned by the geocoder is kept.
    best_match = r_json[0]
    display_names.append(best_match['display_name'])
    lat.append(float(best_match['lat']))
    lon.append(float(best_match['lon']))


Building GPS coordinates for all towns


  0%|          | 0/35 [00:00<?, ?it/s]

In [5]:
# Attach the geocoding results (built in the previous cell, one entry per
# location, in the same order) to the main dataframe as new columns.
for column, values in (('display_name', display_names), ('lat', lat), ('lon', lon)):
    df_locations[column] = values
print(df_locations.head())

            location                                       display_name  \
0  Mont Saint Michel  Mont Saint-Michel, Terrasse de l'Abside, Le Mo...   
1            St Malo  Saint-Malo, Ille-et-Vilaine, Bretagne, France ...   
2             Bayeux  Bayeux, Calvados, Normandie, France métropolit...   
3           Le Havre  Le Havre, Seine-Maritime, Normandie, France mé...   
4              Rouen  Rouen, Seine-Maritime, Normandie, France métro...   

         lat       lon  
0  48.635954 -1.511460  
1  48.649518 -2.026041  
2  49.276462 -0.702474  
3  49.493898  0.107973  
4  49.440459  1.093966  


## ![image.png](attachment:223e17f3-83f1-4328-9f96-537af7f560bf.png) The list of locations and their associated GPS coordinates has been created.

## Getting the weather for those towns, using their GPS coordinates

In [6]:
# Building weather dataframe
import os

print("\nBuilding weather dataframe")

# The OpenWeatherMap key is a secret: it is read from the environment instead of
# being committed in the notebook.
API_KEY = os.environ.get("OPENWEATHERMAP_API_KEY")
if not API_KEY:
    raise Exception("OPENWEATHERMAP_API_KEY environment variable is not set")

WEATHER_URL = "https://api.openweathermap.org/data/2.5/forecast"

# One small dataframe per location; they are concatenated once after the loop.
noon_forecasts = []
for row in tqdm(df_locations.itertuples(), total=df_locations.shape[0]):
    location = row.location
    # row.lat / row.lon are used directly: rebinding the notebook-level `lat` and
    # `lon` names here would overwrite the coordinate lists built by the
    # geocoding cell and corrupt df_locations if that cell's successor is re-run.
    r = requests.get(
        WEATHER_URL,
        params={"lat": row.lat, "lon": row.lon, "units": "metric", "lang": "fr", "appid": API_KEY},
        timeout=30,
    )
    if r.status_code != 200:
        # The full request URL is deliberately not reported: it contains the API key.
        raise Exception(f"{WEATHER_URL} ({location}, HTTP {r.status_code})", "NOK !")

    r_json = r.json()
    if not r_json:
        raise Exception(f'No openweathermap json for {location} location')

    df_forecast = pd.json_normalize(r_json['list'])
    # We keep only weather at noon, not every 3h
    at_noon = df_forecast['dt_txt'].str.contains('12:00:00')
    # Only the two columns aggregated below are kept; building a new frame
    # (instead of mutating the filtered slice) avoids SettingWithCopyWarning.
    df_local_weather = (
        df_forecast.loc[at_noon, ['pop', 'main.feels_like']]
        .rename(columns={'main.feels_like': 'perceived_temperature'})
        .assign(location=location)
    )
    noon_forecasts.append(df_local_weather)

# Mean of the noon forecasts per location.
df_weather = (
    pd.concat(noon_forecasts, ignore_index=True)
    .groupby('location', as_index=False)[['pop', 'perceived_temperature']]
    .mean()
)
# Probability of precipitation as an integer percentage. Scale first, then
# round: int(round(x, 2) * 100) truncates float artefacts such as
# 0.29 * 100 == 28.999999999999996 down to 28.
df_weather['pop'] = (df_weather['pop'] * 100).round().astype(int)
# Truncated toward zero (same as int(x)), e.g. -4.9 becomes -4.
df_weather['perceived_temperature'] = df_weather['perceived_temperature'].astype(int)
print(df_weather.head())
print(f"\nWeathers DF contains {len(df_weather.index)} items, with mean weather at noon (probability of precipitation & perceived temperature) of 5 next days.")


Building weather dataframe


  0%|          | 0/35 [00:00<?, ?it/s]

          location  pop  perceived_temperature
0    Aigues Mortes   44                      6
1  Aix en Provence   41                      7
2           Amiens    4                      1
3           Annecy    6                      5
4           Ariege   42                     -4

Weathers DF contains 35 items, with mean weather at noon (probability of precipitation & perceived temperature) of 5 next days.


In [8]:
# Merging locations and weathers
import os

print("\nMerging locations and weathers")
# Inner join on the town name: one row per location, weather columns first.
df_locations_weathers = pd.merge(df_weather, df_locations, on='location')
# Saving as csv. DataFrame.to_csv does not create missing directories, so the
# target folder is created first to keep the notebook runnable on a fresh checkout.
os.makedirs('./csv', exist_ok=True)
df_locations_weathers.to_csv(r'./csv/df_locations_weather.csv', index_label='Id', encoding='utf-8')
print(df_locations_weathers.head())
print(f"Merged DF contains {len(df_locations_weathers.index)} items, with mean weather at noon of 5 next days.")


Merging locations and weathers
          location  pop  perceived_temperature  \
0    Aigues Mortes   44                      6   
1  Aix en Provence   41                      7   
2           Amiens    4                      1   
3           Annecy    6                      5   
4           Ariege   42                     -4   

                                        display_name        lat       lon  
0  Aigues-Mortes, Nîmes, Gard, Occitanie, France ...  43.565823  4.191284  
1  Aix-en-Provence, Bouches-du-Rhône, Provence-Al...  43.529842  5.447474  
2  Amiens, Somme, Hauts-de-France, France métropo...  49.894171  2.295695  
3  Annecy, Haute-Savoie, Auvergne-Rhône-Alpes, Fr...  45.899235  6.128885  
4   Ariège, Occitanie, France métropolitaine, France  42.945537  1.406554  
Merged DF contains 35 items, with mean weather at noon of 5 next days.


## ![image.png](attachment:b3f9acbc-6f6f-49b4-95cd-d4c2c86cee25.png) The list of locations and their associated weather has been created.

## Getting the best hotels dataframe by scraping booking.com

In [51]:
# Building hotels dataframe by scraping booking.com
from bs4 import BeautifulSoup
print("\nBuilding hotels dataframe")

# Headers sent with every booking.com request (search page and hotel pages).
BOOKING_HEADERS = {"Accept-Language" : "fr,en-US;q=0.7,en;q=0.3", "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0)"}


def _get_soup(session, url):
    """Download `url` through `session` and return the parsed HTML page.

    Raises:
        Exception: when the HTTP status code is not 200.
    """
    r = session.get(url, headers=BOOKING_HEADERS, timeout=30)
    # The status code is tested rather than the optional textual reason phrase.
    if r.status_code != 200:
        raise Exception(url, "NOK !")
    return BeautifulSoup(r.text, features="lxml")


# One dataframe per location; they are concatenated once after the loop.
hotels_per_location = []
session = requests.Session()
for row in tqdm(df_locations.itertuples(), total=df_locations.shape[0], desc='Locations'):
    location = row.location

    print(f"\nGetting hotels for Location {location} on booking.com")
    # row.lat / row.lon are used directly so the notebook-level `lat` / `lon`
    # lists built by the geocoding cell are not overwritten.
    url = f"https://www.booking.com/searchresults.fr.html?place_id_lat={row.lat}&place_id_lon={row.lon}&order=score&nflt=distance%3D5000"
    soup = _get_soup(session, url)

    # NOTE(review): the CSS class names below look auto-generated by the site and
    # presumably change over time - re-check them when nothing is found.
    titles = [m.text for m in soup.find_all("div", class_="fcab3ed991 a23c043802")]
    if not titles:
        raise Exception(f'titles not found for {location} location')

    scores = [float(m.text.replace(',', '.')) for m in soup.find_all("div", class_="b5cd09854e d10a6220b4")]
    if not scores:
        raise Exception(f'scores not found for {location} location')

    # Complete missing scores with null value (no score hotels are listed at the end)
    scores.extend([0] * (len(titles) - len(scores)))

    # Query strings are stripped to keep only the canonical hotel page URL.
    urls_hotels = [m.attrs['href'].split('?')[0] for m in soup.find_all("a", class_="e13098a59f")]
    if not urls_hotels:
        raise Exception(f'urls hotels not found for {location} location')

    desc = []
    for m in soup.find_all("div", class_="a1b3f50dcd"):
        # Only a direct child carries the description text.
        n = m.find("div", class_="d8eab2cf7f", recursive=False)
        if n is not None:
            desc.append(n.text)
    if not desc:
        raise Exception(f'desc not found for {location} location')

    # The fields are scraped independently, so they only line up if every list
    # has the same length. Fail fast, before requesting every hotel page.
    sizes = {'title': len(titles), 'score': len(scores), 'desc': len(desc), 'url': len(urls_hotels)}
    if len(set(sizes.values())) != 1:
        raise Exception(f'inconsistent hotel fields for {location} location: {sizes}')

    gps = []
    print(f"{len(urls_hotels)} best hotels found !")
    print(f"Getting hotels data for {location} location")
    for url_hotel in tqdm(urls_hotels, desc='Hotels'):
        hotel_soup = _get_soup(session, url_hotel)
        address = hotel_soup.find("a", id="hotel_address")
        if address is None or 'data-atlas-latlng' not in address.attrs:
            raise Exception(f'GPS not found for {location} location ({url_hotel})')
        # The attribute holds "<lat>,<lon>".
        hotel_lat, hotel_lon = address.attrs['data-atlas-latlng'].split(',')[:2]
        gps.append({'lat' : float(hotel_lat), 'lon' : float(hotel_lon)})

    df_local_hotel = pd.DataFrame(data={'title':titles, 'score':scores, 'desc':desc, 'url':urls_hotels, 'lat':[x['lat'] for x in gps], 'lon':[x['lon'] for x in gps]})
    df_local_hotel['location'] = location
    hotels_per_location.append(df_local_hotel)

# Concatenation of the hotels for all locations (None when there was no location).
df_hotels = pd.concat(hotels_per_location, ignore_index=True) if hotels_per_location else None


Building hotels dataframe


Locations:   0%|          | 0/35 [00:00<?, ?it/s]


Getting hotels for Location Mont Saint Michel on booking.com
30 best hotels found !
Getting hotels data for Mont Saint Michel location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location St Malo on booking.com
30 best hotels found !
Getting hotels data for St Malo location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Bayeux on booking.com
30 best hotels found !
Getting hotels data for Bayeux location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Le Havre on booking.com
30 best hotels found !
Getting hotels data for Le Havre location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Rouen on booking.com
30 best hotels found !
Getting hotels data for Rouen location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Paris on booking.com
30 best hotels found !
Getting hotels data for Paris location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Amiens on booking.com
30 best hotels found !
Getting hotels data for Amiens location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Lille on booking.com
30 best hotels found !
Getting hotels data for Lille location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Strasbourg on booking.com
30 best hotels found !
Getting hotels data for Strasbourg location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Chateau du Haut Koenigsbourg on booking.com
30 best hotels found !
Getting hotels data for Chateau du Haut Koenigsbourg location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Colmar on booking.com
30 best hotels found !
Getting hotels data for Colmar location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Eguisheim on booking.com
30 best hotels found !
Getting hotels data for Eguisheim location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Besancon on booking.com
30 best hotels found !
Getting hotels data for Besancon location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Dijon on booking.com
30 best hotels found !
Getting hotels data for Dijon location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Annecy on booking.com
30 best hotels found !
Getting hotels data for Annecy location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Grenoble on booking.com
30 best hotels found !
Getting hotels data for Grenoble location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Lyon on booking.com
30 best hotels found !
Getting hotels data for Lyon location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Gorges du Verdon on booking.com
11 best hotels found !
Getting hotels data for Gorges du Verdon location


Hotels:   0%|          | 0/11 [00:00<?, ?it/s]


Getting hotels for Location Bormes les Mimosas on booking.com
30 best hotels found !
Getting hotels data for Bormes les Mimosas location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Cassis on booking.com
30 best hotels found !
Getting hotels data for Cassis location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Marseille on booking.com
30 best hotels found !
Getting hotels data for Marseille location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Aix en Provence on booking.com
30 best hotels found !
Getting hotels data for Aix en Provence location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Avignon on booking.com
30 best hotels found !
Getting hotels data for Avignon location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Uzes on booking.com
30 best hotels found !
Getting hotels data for Uzes location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Nimes on booking.com
30 best hotels found !
Getting hotels data for Nimes location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Aigues Mortes on booking.com
30 best hotels found !
Getting hotels data for Aigues Mortes location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Saintes Maries de la mer on booking.com
30 best hotels found !
Getting hotels data for Saintes Maries de la mer location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Collioure on booking.com
30 best hotels found !
Getting hotels data for Collioure location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Carcassonne on booking.com
30 best hotels found !
Getting hotels data for Carcassonne location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Ariege on booking.com
4 best hotels found !
Getting hotels data for Ariege location


Hotels:   0%|          | 0/4 [00:00<?, ?it/s]


Getting hotels for Location Toulouse on booking.com
30 best hotels found !
Getting hotels data for Toulouse location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Montauban on booking.com
30 best hotels found !
Getting hotels data for Montauban location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Biarritz on booking.com
30 best hotels found !
Getting hotels data for Biarritz location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location Bayonne on booking.com
30 best hotels found !
Getting hotels data for Bayonne location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]


Getting hotels for Location La Rochelle on booking.com
30 best hotels found !
Getting hotels data for La Rochelle location


Hotels:   0%|          | 0/30 [00:00<?, ?it/s]

In [52]:
# Preview the first rows of the scraped hotels; as the last expression of the
# cell, the result is rendered by the notebook.
df_hotels.head(n=20)

Unnamed: 0,title,score,desc,url,lat,lon,location
0,le coin des hirondelles,9.7,"Situé à Pontorson, l'établissement le coin des...",https://www.booking.com/hotel/fr/le-coin-des-h...,48.602864,-1.474069,Mont Saint Michel
1,L'ancien Presbytère d'Ardevon,9.6,"Situé à Pontorson, L'ancien Presbytère d'Ardev...",https://www.booking.com/hotel/fr/l-39-ancien-p...,48.603324,-1.476397,Mont Saint Michel
2,"Maison chaleureuse, spacieuse et familiale",9.5,"Dotée d'une connexion Wi-Fi gratuite, la Maiso...",https://www.booking.com/hotel/fr/maison-chaleu...,48.596577,-1.505204,Mont Saint Michel
3,Gîtes le Mont Desclos Saint Michel,9.4,"Situé à Beauvoir, à 5,7 km de l'abbaye du Mont...",https://www.booking.com/hotel/fr/gites-le-mont...,48.596656,-1.504449,Mont Saint Michel
4,Résidence Beauvoir le Mont-Saint-Michel (9 gît...,9.3,"Installée à 4,4 km de l'abbaye du Mont-Saint-M...",https://www.booking.com/hotel/fr/residence-bea...,48.597878,-1.508419,Mont Saint Michel
5,Chambres d'Hôtes Les Vieilles Digues,9.3,"Occupant une maison bretonne restaurée, l'étab...",https://www.booking.com/hotel/fr/chambres-d-ha...,48.604188,-1.511599,Mont Saint Michel
6,Maison proche Mont Saint Michel,9.3,"À moins de 23 km de Scriptorial d'Avranches, m...",https://www.booking.com/hotel/fr/maison-proche...,48.597071,-1.504222,Mont Saint Michel
7,Gites Bellevue,9.3,Situé à seulement 2 km du célèbre Mont-Saint-M...,https://www.booking.com/hotel/fr/gites-bellevu...,48.60788,-1.517224,Mont Saint Michel
8,Etoile des Grèves,9.3,"Situé à 8,1 km de l'abbaye du Mont-Saint-Miche...",https://www.booking.com/hotel/fr/etoile-des-gr...,48.615509,-1.491524,Mont Saint Michel
9,La Jacotière,9.3,"Implantée à Ardevon, la maison d'hôtes La Jaco...",https://www.booking.com/hotel/fr/la-jacotia-re...,48.614114,-1.504314,Mont Saint Michel


In [53]:
import os

# DataFrame.to_csv does not create missing directories: make sure the target
# folder exists so the cell also works on a fresh checkout.
os.makedirs('./csv', exist_ok=True)
# The dataframe index is written as the 'Id' column.
df_hotels.to_csv(r'./csv/df_hotels.csv', index_label='Id',encoding='utf-8')

## ![image.png](attachment:96149ed0-1086-4a3a-85f4-7edc3e1fb232.png) The list of the best hotels for the 35 locations has been created.

## Now we upload the 2 saved files to the S3 bucket

In [1]:
import boto3
# Publish the two CSV files produced above to the project's S3 bucket.
BUCKET_NAME = 'ylequere-jedha'
print(f"\n# Saving Kayak csv files under S3 in {BUCKET_NAME} bucket.")
print("\n!!! The credentials are needed to achieve this task. Please contact ylequere@gmail.com to obtain it.")
s3 = boto3.resource("s3")
bucket = s3.Bucket(BUCKET_NAME)

# (message printed before the upload, local file, destination key in the bucket)
uploads = (
    (
        "\n# Uploading locations_weather.csv as https://ylequere-jedha.s3.eu-west-3.amazonaws.com/02-Kayak/locations_weather.csv",
        './csv/df_locations_weather.csv',
        '02-Kayak/locations_weather.csv',
    ),
    (
        "\n# Uploading hotels.csv as https://ylequere-jedha.s3.eu-west-3.amazonaws.com/02-Kayak/hotels.csv",
        './csv/df_hotels.csv',
        '02-Kayak/hotels.csv',
    ),
)
for announcement, local_path, s3_key in uploads:
    print(announcement)
    bucket.upload_file(local_path, Key=s3_key)


# Saving Kayak csv files under S3 in ylequere-jedha bucket.

!!! The credentials are needed to achieve this task. Please contact ylequere@gmail.com to obtain it.

# Uploading locations_weather.csv as https://ylequere-jedha.s3.eu-west-3.amazonaws.com/02-Kayak/locations_weather.csv

# Uploading hotels.csv as https://ylequere-jedha.s3.eu-west-3.amazonaws.com/02-Kayak/hotels.csv


## ![image.png](attachment:96149ed0-1086-4a3a-85f4-7edc3e1fb232.png) The hotels.csv & locations_weather.csv files have been uploaded to the S3 bucket.