# Scrape Online Travel Agent (OTA) Property Prices

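This notebook scrapes property prices from Online Travel Agent (OTA) sites (currently booking.com) into CSV files, loads the CSVs with Spark, extracts the numeric room rate, augments each record with its destination name, and saves the result to the `ota_property_prices` database table.

**TODO (@Nileka):** expand this description of what the notebook offers.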

### Execute the cell below once to configure warnings and the debug flag (set `debug = False` to turn off debugging)

In [1]:
# --- Warning control --------------------------------------------------------
# Pass 'default' to simplefilter() to display warnings, or 'ignore' to hide them.
import warnings
import traceback

warnings.simplefilter('ignore')

# --- Debug flag -------------------------------------------------------------
# True  -> view extended error messages (later cells also reload the project
#          modules when this flag is set).
# False -> debugging mode off.
debug = True

## Initialize the OTA scrape class

In [2]:
import os
import sys
from datetime import datetime, date, timedelta
#import configparser

# Root of the wrangler project; the OTA scraper module is located beneath it.
ROOT_DIR = "/home/nuwan/workspace/rezgate/wrangler"
MODULE_PATH = os.path.join(ROOT_DIR, 'modules/ota/')

# Register the module folder at position 1 of sys.path so the import below resolves.
sys.path.insert(1, MODULE_PATH)
import otaWebScraper as otaws

# Relative location of the scraper data folder (used by later cells).
dataDirPath = "../../data/hospitality/bookings/scraper"

# While debugging, reload the module so the freshly loaded version is the one used.
if debug:
    import importlib
    otaws = importlib.reload(otaws)

# Keyword arguments handed to the scraper class constructor.
kwargs = dict(ROOT_DIR=ROOT_DIR)
clsScraper = otaws.OTAWebScraper(**kwargs)

All OTAWebScraper software packages loaded successfully!
All OTAWebScraper software packages loaded successfully!
Initialing OTAWebScraper class for  data


## Initialize the set of OTA URLs for Scraping

In [3]:
# Build the list of parameterized OTA urls for the requested check-in window.
#
# Inputs : ROOT_DIR, kwargs and clsScraper from the earlier cells.
# Outputs: _otaURLfilePath             - path reported by build_scrape_url_list
#          _ota_url_parameterized_list - the parameterized url instances
DATA_PATH = os.path.join(ROOT_DIR, 'data/hospitality/scraper/')
path = DATA_PATH
file = "otaInputURLs.json"
start_date = date.today()
end_date = start_date + timedelta(days=1)

# Reset the outputs first: if the build below fails, later cells then see
# "nothing built" instead of an undefined name or a stale value left over
# from a previous run of this cell.
_otaURLfilePath = None
_ota_url_parameterized_list = []

try:
    # The window may start today (the check only rejects dates in the past).
    if start_date < date.today():
        raise ValueError("Start date must not be earlier than today: %s" % str(date.today()))
    if end_date <= start_date:
        raise ValueError("End date %s is invalid. It must be greater than Start Date: %s" % (str(end_date),str(start_date)))
    scrape_crietia_dict = {"pageOffset":10,
                           "pageUpperLimit":550,
                           "startDate": start_date,
                           "endDate" : end_date,
             }

    _otaURLfilePath, _ota_url_parameterized_list  = clsScraper.build_scrape_url_list(fileName=file,
                                                                       dirPath=None,
                                                                       **scrape_crietia_dict,
                                                                       **kwargs
                                                                      )
    if _otaURLfilePath:
        print("Data in: %s" % _otaURLfilePath)
    if len(_ota_url_parameterized_list)>0:
        print("Completed parameterizing urls with %d instances." 
              % (len(_ota_url_parameterized_list)))

except Exception as err:
    # Label the failure with the call made in this cell so the message points
    # at the right place.
    _s_fn_id = "Class <OTAWebScraper> Function <build_scrape_url_list>"
    print("[Error]"+_s_fn_id, err)
    print(traceback.format_exc())


Loaded 1 properties to begin scraping OTA data.
Processing booking.com ...
Data in: /home/nuwan/workspace/rezgate/wrangler/data/tmp/build_scrape_url_list.csv
Completed parameterizing urls with 784 instances.


## Prepare folder & file structure

In [4]:
from datetime import datetime, timezone

# Prepare the output folder for this search and the timestamp recorded with it.
#
# Outputs: _current_search_data_dir - folder returned by get_search_data_dir_path
#          _search_dt               - ISO-8601 string of the search time in UTC
dirPath = None

# Naive local wall-clock time; handed unchanged to the scraper class below.
_search_dt = datetime.now()
kwargs = {'searchDateTime': _search_dt,
          'storageLocation': "local",   # values can be "local" or "AWS_S3"
         }

# Include the timezone. astimezone() treats the naive value as system-local
# time and converts it to UTC, so the "+00:00" suffix describes the real
# instant. (replace(tzinfo=timezone.utc) would only relabel the local
# wall-clock reading as UTC without converting it.)
_search_dt = _search_dt.astimezone(timezone.utc).isoformat()

_current_search_data_dir = clsScraper.get_search_data_dir_path(dirPath, **kwargs)
print("Extracting data into %s for search datetime: %s" % (_current_search_data_dir,str(_search_dt)))

Extracting data into /home/nuwan/workspace/rezgate/wrangler/data/hospitality/bookings/scraper/rates/2022-9-15-7-30/ for search datetime: 2022-09-15T07:37:45.918980+00:00


## Scrape data into CSVs
___TODO:___ halt if the internet connection times out; might be possible with Airflow.

In [55]:
import pandas as pd

# Scrape every parameterized url and write the results as CSV files into
# _current_search_data_dir.
#
# Inputs : _otaURLfilePath, _search_dt, _current_search_data_dir (earlier cells).
# Outputs: _l_saved_files - value returned by scrape_url_list.

# Stop early with a clear message when no url file is available; otherwise the
# call below would hit an undefined _otaURLParamDictList, or silently reuse a
# list left in the kernel by an earlier run.
if not _otaURLfilePath:
    raise ValueError("No parameterized URL file available; "
                     "run the URL-building cell successfully before scraping.")

urlDF = pd.read_csv(_otaURLfilePath, sep=",")
# One dict per CSV row.
_otaURLParamDictList = urlDF.to_dict('records')

_l_saved_files = clsScraper.scrape_url_list(
    otaURLlist =_otaURLParamDictList,
#    otaURLfile = None,
    searchDT = _search_dt,
    dirPath =_current_search_data_dir)
print("Scraping completed and data saved in %s!" % _current_search_data_dir)

loading parameterized urls from list 784 records
/home/nuwan/workspace/rezgate/wrangler/data/hospitality/bookings/scraper/rates/2022-9-14-17-30//booking.com.20079110.2022-09-14.000.csv
/home/nuwan/workspace/rezgate/wrangler/data/hospitality/bookings/scraper/rates/2022-9-14-17-30//booking.com.20079110.2022-09-14.010.csv
/home/nuwan/workspace/rezgate/wrangler/data/hospitality/bookings/scraper/rates/2022-9-14-17-30//booking.com.20079110.2022-09-14.020.csv
/home/nuwan/workspace/rezgate/wrangler/data/hospitality/bookings/scraper/rates/2022-9-14-17-30//booking.com.20079110.2022-09-14.030.csv
/home/nuwan/workspace/rezgate/wrangler/data/hospitality/bookings/scraper/rates/2022-9-14-17-30//booking.com.20079110.2022-09-14.040.csv
/home/nuwan/workspace/rezgate/wrangler/data/hospitality/bookings/scraper/rates/2022-9-14-17-30//booking.com.20079110.2022-09-14.050.csv
/home/nuwan/workspace/rezgate/wrangler/data/hospitality/bookings/scraper/rates/2022-9-14-17-30//booking.com.20079110.2022-09-14.060.csv

KeyboardInterrupt: 

## Use Spark to Load Data into Database

### Declare sparkworkload class

In [35]:
import os
import sys
from datetime import datetime, date, timedelta

# Root of the wrangler project; the Spark utility module is located beneath it.
ROOT_DIR = "/home/nuwan/workspace/rezgate/wrangler/"
UTILS_PATH = os.path.join(ROOT_DIR, 'utils/')

# Register the utils folder at position 1 of sys.path so the import below resolves.
sys.path.insert(1, UTILS_PATH)
import sparkWorkLoads as spark

# While debugging, reload the module so the freshly loaded version is the one used.
if debug:
    import importlib
    spark = importlib.reload(spark)

# Keyword arguments handed to the Spark workload class constructor.
kwargs = dict(ROOT_DIR=ROOT_DIR)

clsSparkWL = spark.SparkWorkLoads(name="ota prices", **kwargs)
#_session = clsSparkWL.get_spark_session()

All packages in SparkWorkLoads loaded successfully!


### Read CSV into spark dataframe

In [13]:
# Load the scraped CSV files from a fixed rates folder and drop duplicate rows.
# To read the folder produced by the scrape cells instead, enable the next line.
#dirPath = _current_search_data_dir
dirPath = "../../data/hospitality/bookings/scraper/rates/2022-9-14-21-0/"
_search_sdf = clsSparkWL.read_csv_to_sdf(filesPath=dirPath).distinct()
print("Spark loaded %d rows" % _search_sdf.count())

[Stage 39:>                                                         (0 + 2) / 2]

Spark loaded 330 rows


                                                                                

### Extract room rate decimal value

In [14]:
from pyspark.sql.functions import substring,lit,col,regexp_replace
from pyspark.sql.types import StringType,BooleanType,DateType,DecimalType,FloatType, IntegerType,LongType, ShortType, TimestampType

# Split the scraped room_rate text into a currency column and a numeric rate.
#
# Input/Output: _search_sdf (overwritten in place with the converted frame).
# NOTE: room_rate is rewritten from itself, so run this cell once per load;
# re-running it on the already converted frame slices the numeric text again.

# Every row is labelled with the same hard-coded currency.
_search_sdf=_search_sdf.withColumn("currency", lit("US$"))

# Keep the 10 characters that start at position 4 (i.e. skip a 3-character
# prefix), then remove thousands separators: a value such as "1,234" cannot be
# cast to a float and would otherwise end up as null.
_search_sdf=_search_sdf.withColumn(
    'room_rate',
    regexp_replace(substring('room_rate', 4,10), ',', ''))

# Reset data types to match table.
_search_sdf = _search_sdf.withColumn("destination_id",col("destination_id").cast(StringType())) \
    .withColumn("room_rate",col("room_rate").cast(FloatType()))
#    .withColumn("search_datetime",col("search_datetime").cast(DateType()))

_search_sdf.printSchema()
#_search_sdf.show(n=2, vertical=True, truncate=False)
print("Split and Extraction complete!")

root
 |-- search_dt: timestamp (nullable = true)
 |-- checkin_date: timestamp (nullable = true)
 |-- property_name: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- room_rate: float (nullable = true)
 |-- review_score: double (nullable = true)
 |-- destination_id: string (nullable = true)
 |-- location_desc: string (nullable = true)
 |-- other_info: string (nullable = true)
 |-- currency: string (nullable = false)

Split and Extraction complete!


## Cleanup and Save data to table

In [15]:
# --- Get destination id dictionary -------------------------------------------
# Read the destination CSVs, keep the city / destinationID columns under the
# names used by the search frame, and cast both columns to strings.
destfilesPath = "../../data/hospitality/bookings/scraper/destinations/"
destinations_sdf = clsSparkWL.read_csv_to_sdf(filesPath=destfilesPath)
destinations_sdf = destinations_sdf.selectExpr(
    "city as destination_name",
    "destinationID as destination_id",
)
destinations_sdf = (
    destinations_sdf
    .withColumn("destination_name", col("destination_name").cast(StringType()))
    .withColumn("destination_id", col("destination_id").cast(StringType()))
)

destinations_sdf.printSchema()
#destinations_sdf.show(n=2, vertical=True, truncate=False)
print("Destination dictionary loarded!")

# --- Lookup & augment destination name ---------------------------------------
# Join the dictionary with the search frame on destination_id.
aug_search_sdf = destinations_sdf.join(_search_sdf, "destination_id")
aug_search_sdf.show(n=2, vertical=True, truncate=False)
print("Destination names augmented to dataframe!")

root
 |-- destination_name: string (nullable = true)
 |-- destination_id: string (nullable = true)

Destination dictionary loarded!
-RECORD 0-------------------------------------------------------------------------------------------------------------------------------
 destination_id   | 20079110                                                                                                            
 destination_name | Las Vegas                                                                                                           
 search_dt        | 2022-09-15 05:05:41.233021                                                                                          
 checkin_date     | 2022-09-14 00:00:00                                                                                                 
 property_name    | Westgate Flamingo Bay Resort                                                                                        
 room_type        | One-Bedroom Villa with Sof

## Save cleaned SDF to Tmp File

In [37]:
# Write the augmented frame to a temporary CSV and keep the returned file name
# for the database-load cell.
_tmp_fname = clsSparkWL.save_sdf_to_csv(aug_search_sdf)
print(
    "%d rows saved to %s"
    % (aug_search_sdf.count(), _tmp_fname)
)

                                                                                

330 rows saved to /home/nuwan/workspace/rezgate/wrangler/data/tmp/save_sdf_to_csv.csv


## Read CSV from Tmp and Save to DB Table

In [38]:
# Save dataframe to table: read the temporary CSV back into a Spark dataframe,
# show its schema, then insert it into the target table.
_s_tbl_name = "ota_property_prices"
_get_tmp_sdf = clsSparkWL.read_csv_to_sdf(filesPath=_tmp_fname)
_get_tmp_sdf.printSchema()
count = clsSparkWL.insert_sdf_into_table(
    save_sdf=_get_tmp_sdf,
    dbTable=_s_tbl_name,
)
print("%d Data saved to %s" % (count, _s_tbl_name))

root
 |-- destination_id: integer (nullable = true)
 |-- destination_name: string (nullable = true)
 |-- search_dt: timestamp (nullable = true)
 |-- checkin_date: timestamp (nullable = true)
 |-- property_name: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- room_rate: double (nullable = true)
 |-- review_score: double (nullable = true)
 |-- location_desc: string (nullable = true)
 |-- other_info: string (nullable = true)
 |-- currency: string (nullable = true)

Wait a moment while we insert data int ota_property_prices
Save to ota_property_prices complete!
330 Data saved to ota_property_prices


## Read data from table

In [39]:
# Read the table back and report how many records it returned.
_s_tbl_name = "ota_property_prices"
data = clsSparkWL.get_data_from_table(dbTable=_s_tbl_name)
print(
    "%d records loaded from %s"
    % (data.count(), _s_tbl_name)
)

Wait a moment, retrieving data ...
Loading complete!
7662 records loaded from ota_property_prices


In [4]:
# Load the OTA property list from the input JSON and print the HTML tags the
# scraper derives from it.
#path = dataDirPath
file = "otaInputURLs.json"
property_dict = clsScraper.load_ota_list(
    dirPath=dataDirPath,
    fileName=file,
)
_scrape_tags_df = clsScraper.get_scrape_html_tags(property_dict)
print(_scrape_tags_df)

        variable   tag                   code          ota
0  content_block   div            .d20f4628d0  booking.com
0  property_name  span  fcab3ed991 a23c043802  booking.com
0      room_type  span             df597226dd  booking.com
0      room_rate   div  fcab3ed991 bd73d13072  booking.com
0   review_score   div  b5cd09854e d10a6220b4  booking.com
0       location   div             a1fbd102d9  booking.com
0          Other   div             d22a7c133b  booking.com
