# Scrape Online Travel Agent (OTA) Property Prices

___Changes in this notebook must be migrated to the ```dags/modules/ota/etlotaPropertyPrices.py```___

@Nileka add a description here as to what this notebook offers

### Execute the cell below once to turn off debugging

In [2]:
'''
    WARNING CONTROL to display or ignore all warnings
'''
import warnings; warnings.simplefilter('ignore')     #switch betweeb 'default' and 'ignore'
import traceback

''' Set debug flag to view extended error messages; else set it to False to turn off debugging mode '''
debug = True

## Initialize the OTA scrape class

In [63]:
import os
import sys
from datetime import datetime, date, timedelta

sys.path.insert(1,"/home/nuwan/workspace/rezaware/")
import rezaware as reza
from wrangler.modules.ota.scraper import propertyScrapers as ps, scraperUtils as otasu
from utils.modules.etl.load import sparkwls as spark
from utils.modules.ml.natlang import nlp

''' restart initiate classes '''
if debug:
    import importlib
    reza = importlib.reload(reza)
    ps = importlib.reload(ps)
    otasu = importlib.reload(otasu)
    spark = importlib.reload(spark)
    nlp = importlib.reload(nlp)
    
prop_kwargs = {"WRITE_TO_FILE":True,
              }
''' optional - if not specified class will use the default values '''
# kwargs = {
#     "ROOT_DIR":ROOT_DIR,   # absolute path to the wrangler dir
#     "UTILS_DIR":UTILS_DIR, # abslute path to the generic utils
#     "MODULE_DIR":MODULE_DIR,   # absolute path to the ota module
#     "DATA_DIR":DATA_DIR, # absolute path to the scraper data dir
# }
clsScraper = ps.PropertyScraper(
    desc='scrape hotel prices data from OTAs', **prop_kwargs)
clsSparkWL = spark.SparkWorkLoads(desc="ota prices", **prop_kwargs)
print("\nClass initialization and load complete!")

All python packages in rezaware loaded successfully!
All scraper software packages loaded successfully!
All scraper in ota software packages loaded successfully!
All packages in load loaded successfully!
All packages in nlp loaded successfully!
Initialing scraper class for scraperUtils with instance Utilities class for property data scraping
Initialing scraper class for propertyScraper with instance scrape hotel prices data from OTAs
Data path set to /home/nuwan/workspace/rezaware/wrangler/data/ota/scraper/hospitality/bookings/

Class initialization and load complete!


--- Logging error ---
Traceback (most recent call last):
  File "/usr/lib/python3.8/logging/__init__.py", line 1085, in emit
    msg = self.format(record)
  File "/usr/lib/python3.8/logging/__init__.py", line 929, in format
    return fmt.format(record)
  File "/usr/lib/python3.8/logging/__init__.py", line 668, in format
    record.message = record.getMessage()
  File "/usr/lib/python3.8/logging/__init__.py", line 373, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "/usr/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/nuwan/.local/lib/python3.8/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/home/nuwan/.local/lib/python3.8/site-packages/traitlets/config/application.py", line 976, in launch_instance
    app

## Initialize the set of OTA URL for Scraping

In [64]:
file = "otaInputURLs.json"
start_date = date.today()
end_date = start_date + timedelta(days=1)

try:
    if start_date < date.today():
        raise ValueError("Start date must be greater than today: %s" % str(date.today()))
    if end_date <= start_date:
        raise ValueError("End date %s is invalid. It must be greater than Start Date: %s" % (str(end_date),str(start_date)))
    urls_kwargs = {"pageOffset":10,
                   "pageUpperLimit":10,
                   "startDate": start_date,
                   "endDate" : end_date,
                  }

    _otaURLfilePath, _ota_url_parameterized_list  = clsScraper.build_scrape_url_list(
                                                    file_name=file,  # mandatory to give the inputs json file
                                                    dir_path=None,   # optional to be used iff required
                                                    **urls_kwargs
                                                    )
    if _otaURLfilePath:
        print("Data in: %s" % _otaURLfilePath)
    if len(_ota_url_parameterized_list)>0:
        print("Completed parameterizing urls with %d instances." 
              % (len(_ota_url_parameterized_list)))

except Exception as err:
    _s_fn_id = "Class <WebScraper> Function <read_folder_csv_to_df>"
    print("[Error]"+_s_fn_id, err)
    print(traceback.format_exc())


Loaded 1 properties to begin scraping OTA data.
Processing booking.com ...
28
Data in: /home/nuwan/workspace/rezaware/wrangler/data/ota/scraper/hospitality/bookings/tmp/scraper-build-scrape-url-list.csv
Completed parameterizing urls with 28 instances.


## Prepare folder & file structure

In [30]:
from datetime import datetime, timezone

dirPath = None
_search_dt = datetime.now()
_search_dt = _search_dt + (datetime.min - _search_dt) % timedelta(minutes=15)
''' include the timezone '''
_search_dt = (_search_dt.replace(tzinfo=timezone.utc)).isoformat()

kwargs = {
#    'SEARCH_DATETIME': _search_dt,
    'STORAGE_METHOD': "local",   # values can be "local" or "AWS_S3"
}

_current_search_data_store_dir = clsScraper.make_storage_dir(**kwargs)
print("Extracting data into %s for search datetime: %s" 
      % (_current_search_data_store_dir,str(_search_dt)))

Extracting data into /home/nuwan/workspace/rezaware/wrangler/data/ota/scraper/hospitality/bookings/rates/2022-10-24-16-0/ for search datetime: 2022-10-24T15:45:00+00:00


## Scrape data into CSVs
___TODO:___ hault if internet connection times out; might be possible with airflow.

In [33]:
import pandas as pd

''' TODO change to read csv using utils/sparkwls class '''
if _otaURLfilePath:
    urlDF = pd.read_csv(_otaURLfilePath, sep=",")
    _otaURLParamDictList = urlDF.to_dict('records')

''' TODO change the function parameter names to camel format '''
_l_saved_files = clsScraper.scrape_url_list(
    otasuRLlist =_otaURLParamDictList,
    searchDT = _search_dt,
    data_store_dir =_current_search_data_store_dir)
print("Scraping completed and data saved in %s!" % _current_search_data_store_dir)

loading parameterized urls from list 28 records
[Error]function <_scrape_bookings_to_csv> No data received for https://www.booking.com/searchresults.en-gb.html?ss=Las+Vegas&label=gen173nr-1DCAEoggI46AdIM1gEaIUBiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAK_g7aYBsACAdICJDFiNWFiNzM3LTQ0YmItNDIzOC04NDM0LWRjMDFlNmZhYTUwM9gCBOACAQ&sid=36e124997ccdbec4823b6c98d5931c2e&aid=304142&lang=en-gb&sb=1&src_elem=sb&src=searchresults&dest_id=20079110&dest_type=city&checkin=2022-10-24&checkout=2022-10-25&group_adults=1&no_rooms=1&group_children=0&selected_currency=USD&offset=0
Traceback (most recent call last):
  File "/home/nuwan/workspace/rezaware/wrangler/modules/ota/scraper/propertyScrapers.py", line 444, in _scrape_bookings_to_csv
    for _list in lists:
ValueError: No data received for https://www.booking.com/searchresults.en-gb.html?ss=Las+Vegas&label=gen173nr-1DCAEoggI46AdIM1gEaIUBiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAK_g7aYBsACAdICJDFiNWFiNzM3LTQ0YmItNDIzOC04NDM0LWRjMDFlNmZhYTUwM9gCBOACAQ&sid=36e124997ccdbec

[Error]function <_scrape_bookings_to_csv> No data received for https://www.booking.com/searchresults.en-gb.html?ss=Las+Vegas&label=gen173nr-1DCAEoggI46AdIM1gEaIUBiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAK_g7aYBsACAdICJDFiNWFiNzM3LTQ0YmItNDIzOC04NDM0LWRjMDFlNmZhYTUwM9gCBOACAQ&sid=36e124997ccdbec4823b6c98d5931c2e&aid=304142&lang=en-gb&sb=1&src_elem=sb&src=searchresults&dest_id=20023488&dest_type=city&checkin=2022-10-24&checkout=2022-10-25&group_adults=1&no_rooms=1&group_children=0&selected_currency=USD&offset=10
Traceback (most recent call last):
  File "/home/nuwan/workspace/rezaware/wrangler/modules/ota/scraper/propertyScrapers.py", line 444, in _scrape_bookings_to_csv
    for _list in lists:
ValueError: No data received for https://www.booking.com/searchresults.en-gb.html?ss=Las+Vegas&label=gen173nr-1DCAEoggI46AdIM1gEaIUBiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAK_g7aYBsACAdICJDFiNWFiNzM3LTQ0YmItNDIzOC04NDM0LWRjMDFlNmZhYTUwM9gCBOACAQ&sid=36e124997ccdbec4823b6c98d5931c2e&aid=304142&lang=en-gb&sb=1&sr

[Error]function <_scrape_bookings_to_csv> No data received for https://www.booking.com/searchresults.en-gb.html?ss=Las+Vegas&label=gen173nr-1DCAEoggI46AdIM1gEaIUBiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAK_g7aYBsACAdICJDFiNWFiNzM3LTQ0YmItNDIzOC04NDM0LWRjMDFlNmZhYTUwM9gCBOACAQ&sid=36e124997ccdbec4823b6c98d5931c2e&aid=304142&lang=en-gb&sb=1&src_elem=sb&src=searchresults&dest_id=20135442&dest_type=city&checkin=2022-10-24&checkout=2022-10-25&group_adults=1&no_rooms=1&group_children=0&selected_currency=USD&offset=0
Traceback (most recent call last):
  File "/home/nuwan/workspace/rezaware/wrangler/modules/ota/scraper/propertyScrapers.py", line 444, in _scrape_bookings_to_csv
    for _list in lists:
ValueError: No data received for https://www.booking.com/searchresults.en-gb.html?ss=Las+Vegas&label=gen173nr-1DCAEoggI46AdIM1gEaIUBiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAK_g7aYBsACAdICJDFiNWFiNzM3LTQ0YmItNDIzOC04NDM0LWRjMDFlNmZhYTUwM9gCBOACAQ&sid=36e124997ccdbec4823b6c98d5931c2e&aid=304142&lang=en-gb&sb=1&src

[Error]function <_scrape_bookings_to_csv> No data received for https://www.booking.com/searchresults.en-gb.html?ss=Las+Vegas&label=gen173nr-1DCAEoggI46AdIM1gEaIUBiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAK_g7aYBsACAdICJDFiNWFiNzM3LTQ0YmItNDIzOC04NDM0LWRjMDFlNmZhYTUwM9gCBOACAQ&sid=36e124997ccdbec4823b6c98d5931c2e&aid=304142&lang=en-gb&sb=1&src_elem=sb&src=searchresults&dest_id=20023181&dest_type=city&checkin=2022-10-24&checkout=2022-10-25&group_adults=1&no_rooms=1&group_children=0&selected_currency=USD&offset=10
Traceback (most recent call last):
  File "/home/nuwan/workspace/rezaware/wrangler/modules/ota/scraper/propertyScrapers.py", line 444, in _scrape_bookings_to_csv
    for _list in lists:
ValueError: No data received for https://www.booking.com/searchresults.en-gb.html?ss=Las+Vegas&label=gen173nr-1DCAEoggI46AdIM1gEaIUBiAEBmAEJuAEXyAEM2AED6AEBiAIBqAIDuAK_g7aYBsACAdICJDFiNWFiNzM3LTQ0YmItNDIzOC04NDM0LWRjMDFlNmZhYTUwM9gCBOACAQ&sid=36e124997ccdbec4823b6c98d5931c2e&aid=304142&lang=en-gb&sb=1&sr

## Read CSV into spark dataframe

In [43]:
DATA_DIR="/home/nuwan/workspace/rezaware/wrangler/data/ota/scraper/hospitality/bookings/"
spark_kwargs = {"TO_PANDAS":True,   # change spark dataframe to pandas
                "IS_FOLDER":True,   # if folder then check if folder is empty
               }
# _current_search_data_store_dir = "../../data/hospitality/bookings/scraper/rates/2022-10-5-21-0/"
_current_search_data_store_dir = os.path.join(DATA_DIR, "rates/2022-10-5-3-0/")
# dirPath = _current_search_data_store_dir
_search_sdf, traceback = clsSparkWL.read_csv_to_sdf(filesPath=_current_search_data_store_dir, **spark_kwargs)
if not traceback:
    print("Spark loaded %d rows" % _search_sdf.shape[0])
else:
    print("No data loaded by spark; process failed!")

                                                                                

Spark loaded 3018 rows


## Transform the data
augment the dataframe with: 
1. exctract the room rate decimal numbers
1. matching city names to the codes 
1. categorizing the room types based on the taxonomy
1. setting the data types of the columns
using a transform function in the properties class

### Extract room rate decimal value

In [44]:
''' define room price column to extrac number'''
rate_col_name = "room_rate"
aug_col_name = "room_price"
''' extract the price value from room rate'''
_search_sdf = clsScraper.extract_room_rate(_search_sdf,rate_col_name,aug_col_name)

### Categorize room type by similarity mappring

In [45]:
_save_rcate_to = os.path.join(DATA_DIR,'tmp/similarity_categorized_rooms.csv')
''' categorize the room types '''
emb_kwargs = {
    'LOWER':True,
    'NO_STOP_WORDS':False,
    'METRIC':"COSIN",
    'MAX_SCORES':2,
    'TOLERANCE':0.7,
    'ROOM_CATE_FNAME':"room_descriptions.csv",
}
print("wait a moment this may take a while categorizing %d room type" % (_search_sdf.shape[0]))
_categorized_room_df = clsScraper.merge_similar_room_cate(_search_sdf,emb_kwargs)
_room_cate_count = len(_categorized_room_df.room_cate.unique())
print("Assigned %d room categories" % _room_cate_count)

wait a moment this may take a while categorizing 3018 room type
Assigned 59 room categories


### Assign location city names

In [46]:
_aug_dest_df = clsScraper.assign_lx_name(data_df=_categorized_room_df)
print("merged %d rows with destination name and type" % _aug_dest_df.shape[0])

merged 93606 rows with destination name and type


## Save cleaned SDF to Tmp File

In [49]:
''' convert dataframe from panas to spark '''
_tmp_fname = clsSparkWL.save_sdf_to_csv(_aug_dest_df)
print("transformed data saved to %s" % _tmp_fname)

22/10/24 19:23:10 WARN TaskSetManager: Stage 10 contains a task of very large size (5187 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/24 19:23:12 WARN TaskSetManager: Stage 13 contains a task of very large size (5187 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/24 19:23:14 WARN TaskSetManager: Stage 16 contains a task of very large size (5187 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/10/24 19:23:18 WARN TaskSetManager: Stage 17 contains a task of very large size (5187 KiB). The maximum recommended task size is 1000 KiB.


[Stage 17:>                                                         (0 + 2) / 2]

transformed data saved to /home/nuwan/workspace/rezaware/utils/data/etl/load/tmp/load_save_sdf_to.csv


                                                                                

## Read CSV from Tmp and Save to DB Table

In [61]:
''' Save dataframe to table '''
_s_tbl_name = "ota_property_prices"
_get_tmp_sdfm,traceback = clsSparkWL.read_csv_to_sdf(filesPath=_tmp_fname)
# _get_tmp_sdf.printSchema()
count, saved_df = clsScraper.save_to_db(data_df=_aug_dest_df,table_name = _s_tbl_name)
# count = clsSparkWL.insert_sdf_into_table(save_sdf=_get_tmp_sdf, dbTable=_s_tbl_name)
print("%d Data saved to %s" % (count,_s_tbl_name))

                                                                                

None
22/10/24 19:40:05 WARN TaskSetManager: Stage 41 contains a task of very large size (5510 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

Wait a moment while we insert data int ota_property_prices
22/10/24 19:40:07 WARN TaskSetManager: Stage 44 contains a task of very large size (5510 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

Save to ota_property_prices complete!
22/10/24 19:40:16 WARN TaskSetManager: Stage 45 contains a task of very large size (5510 KiB). The maximum recommended task size is 1000 KiB.


[Stage 45:>                                                         (0 + 2) / 2]

93606 Data saved to ota_property_prices


                                                                                

In [53]:
type(_get_tmp_sdf)

tuple

## Read data from table

In [62]:
_s_tbl_name = "ota_property_prices"
data = clsSparkWL.get_data_from_table(dbTable=_s_tbl_name)
print("%d records loaded from %s" %(data.count(),_s_tbl_name))

# data.select("*").distinct().where(date('created_dt') >= date.today()).sort("search_dt").show(2, vertical=True)

Wait a moment, retrieving data ...
Loading complete!
175944 records loaded from ota_property_prices


In [150]:
''' DEPRECATED -- moved as a function in otaPropertyScraper class '''

''' Get destination id dictionary '''
destDirPath = os.path.join(DATA_PATH, 'destinations/')
destinations_sdf = clsSparkWL.read_csv_to_sdf(filesPath=destDirPath)
destinations_sdf = destinations_sdf.selectExpr("city as destination_name", \
                                                "destinationID as destination_id")
# destinations_sdf = destinations_sdf.withColumn("destination_name",col("destination_name").cast(StringType())) \
#                                 .withColumn("destination_id",col("destination_id").cast(StringType()))
if debug:
    destinations_sdf.printSchema()
    destinations_sdf.show(n=2, vertical=True, truncate=False)
    print("Destination dictionary loarded!")

''' Lookup & augment destination name '''
#aug_search_sdf = destinations_sdf.join(_search_sdf,on='destination_id',how='rightouter')
aug_search_sdf = _search_sdf.join(destinations_sdf,
                                  _search_sdf.destination_id == destinations_sdf.destination_id,
                                  how='leftouter').drop(_search_sdf.destination_id)

if debug:
    print("%d destination names augmented to dataframe!" % (aug_search_sdf.count()))
    aug_search_sdf.show(n=2, vertical=True, truncate=False)

root
 |-- destination_name: string (nullable = true)
 |-- destination_id: integer (nullable = true)

-RECORD 0-------------------------
 destination_name | Las Vegas     
 destination_id   | 20079110      
-RECORD 1-------------------------
 destination_name | New York City 
 destination_id   | 20088325      
only showing top 2 rows

Destination dictionary loarded!


                                                                                

2095 destination names augmented to dataframe!


                                                                                

-RECORD 0-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 search_dt        | 2022-09-24 02:45:00                                                                                                                                                                                                                                     
 checkin_date     | 2022-09-23 00:00:00                                                                                                                                                                                                                                     
 property_name    | Extended Stay America Suites - Orlando - Lake Buena Vista                                                                                                                    

In [8]:
''' DEPRECATED - replaced with a function on otaPropertyScrpaer calss '''

from pyspark.sql.functions import substring,lit,col
from pyspark.sql.types import StringType,BooleanType,DateType,DecimalType,FloatType, IntegerType,LongType, ShortType, TimestampType

# _search_sdf=_search_sdf.withColumn("currency", lit("US$"))
# _search_sdf=_search_sdf.withColumn('room_rate', substring('room_rate', 4,10))

# ''' reset data types to match table '''
# _search_sdf = _search_sdf.withColumn("destination_id",col("destination_id").cast(StringType())) \
#     .withColumn("room_rate",col("room_rate").cast(FloatType()))
# #    .withColumn("search_datetime",col("search_datetime").cast(DateType()))

if debug:
    _search_sdf.printSchema()
    _search_sdf.show(n=2, vertical=True, truncate=False)
print("Split and Extraction complete!")

AttributeError: 'DataFrame' object has no attribute 'printSchema'

In [18]:
''' DEPRECATED -- merge moved to the function in otaPropScraper '''

_categorized_room_df = _search_sdf.merge(_room_type_assign, how='left', left_on=['room_type'], right_on=['room_type'])
print("Merged %d rows with categorized room type information" % _categorized_room_df.shape[0])

_categorized_room_df = new_df
#_categorized_room_df.to_csv(_save_rcate_to)
#_save_rcate_to=clsSparkWL.save_sdf_to_csv(_save_rcate_to)
print("Merged data saved to %s" % (_save_rcate_to))
print("Room type categorization complete!")

Merged data saved to None
Room type categorization complete!


In [9]:
''' DEPRECATED '''
_categorized_room_df = _search_sdf.merge(_room_type_assign, how='left', left_on=['room_type'], right_on=['room_type'])
_categorized_room_df.to_csv(os.path.join(ROOT_DIR,'data/tmp/similarity_scores.csv'))