# Data Source Feeds Storage (Tester)


In [1]:
# WARNING CONTROL: display or ignore all warnings.
# Switch the filter action between 'default' and 'ignore'.
import warnings
warnings.simplefilter('ignore')
import traceback

# Debug flag: True shows extended error messages; set it to False to turn
# debugging mode off.
debug = True


In [13]:
import os
import sys

# Absolute path of the parent of the current working directory.
proj_dir = os.path.abspath(os.pardir)
# Keep everything before the first 'rezaware/' segment (the whole path when the
# segment is absent) and put it on the import path so that the `rezaware`
# package imports below can be resolved.
sys.path.insert(1, proj_dir.partition('rezaware/')[0])

from rezaware.modules.etl.extractor import dataFeedsDB as source
from rezaware.modules.etl.loader import sparkNoSQLwls as nosql

# In debug mode, reload both modules before the classes are instantiated again.
if debug:
    import importlib
    source = importlib.reload(source)
    nosql = importlib.reload(nosql)

# One description string is shared by both workload classes and the status line.
__desc__ = "read and write files from and to a particular source"
clsFeed = source.FeedWorkLoads(desc=__desc__)
clsNoSQL = nosql.NoSQLWorkLoads(desc=__desc__)

print("\n%s class initialization and load complete!" % __desc__)

All functional FEEDWORKLOADS-libraries in EXTRACTOR-package of ETL-module imported successfully!
All functional SPARKNOSQLWLS-libraries in LOADER-package of ETL-module imported successfully!
sparkNoSQLwls Class initialization complete
FeedWorkLoads Class initialization complete
sparkNoSQLwls Class initialization complete

read and write files from and to a particular source class initialization and load complete!


## Data Source Dictionary
Example data source dictionary with the `source`, `context`, `realm`, `uri`, and `get` sections that describe a data feed.

In [16]:
# Query template shared by every URI entry below; the {placeholders} are
# described by each entry's "parameter" mapping.
FLIGHT_QUERY_EXPRESSION = (
    '{arrivalPort}-{departurePort}/{flightDate}/1adults'
    '?a&fs=cfc=1;bfc=1;transportation=transportation_plane'
)

data_feeds = [
    {
        # "id": 'ObjID(9876)',
        "source": {
            "owner": "kayak.com",          # data owner unique identifier (i.e., legal entity name)
            "dates": {
                "activated": '2023-07-07',  # optional date the data source becomes active
                "expires": '2024-07-06',    # optional date the data source becomes inactive
            },
        },
        # Any set of key/value pairs that describe, identify, and distinguish the data feed.
        "context": {
            "summary": 'scraping kayak.com airline booking data for HERO',
            "country": 'canada',
            "scope": 'national',
        },
        "realm": {
            "module": 'OTA',        # a unique realm name, db name prefix
            "entity": 'scraper',    # db name second prefix
            "package": 'Airline',   # collection prefix
            "function": 'Booking',
            # "supplier": 'kayak',
        },
        "uri": [
            {
                "urn": "",                    # e.g. urn:ota:transport:airline:booking (IANA)
                "protocol": 'https',          # FTP, FTPS, HTTP, TELNET
                "domain": 'skyscanner.com',
                "port": '',
                "path": 'airlines',
                "query": {
                    "expression": FLIGHT_QUERY_EXPRESSION,
                    "parameter": {
                        "arrivalPort": 'string',
                        "departurePort": 'string',
                        "flightDate": 'date',
                    },
                },
                "fragments": ['number', 'page'],   # e.g. https://<domain>/<path>/#number
            },
            {
                "urn": "",                    # e.g. urn:ota:transport:airline:booking (IANA)
                "protocol": 'https',          # FTP, FTPS, HTTP, TELNET
                "domain": 'kayak.com',        # https://kayak.com/flights/
                "port": '',
                "path": 'flights',
                "query": {
                    "expression": FLIGHT_QUERY_EXPRESSION,
                    "parameter": {
                        "arrivalPort": 'string',
                        "departurePort": 'string',
                        "flightDate": 'date',
                    },
                },
                "fragments": [],
            },
        ],
        "get": {
            "method": 'scrape',
            "object": 'json',
        },
    },
    # Second example feed, kept disabled; uncomment to store it as well.
    # {
    #     "id": 'ObjID(1234)',
    #     "source": {
    #         "owner": "canada government",
    #         "dates": {
    #             "activated": '2023-07-07',
    #             "expires": '2024-07-06',
    #         },
    #     },
    #     "context": {
    #         "summary": 'download national traveller demographic data for HERO',
    #         "country": 'canada',
    #         "scope": 'national',
    #         "statistics": 'travel',
    #     },
    #     "realm": {
    #         "module": 'traveller',
    #         "entity": 'demography',
    #         "package": 'canada',
    #         "function": 'stats',
    #     },
    #     "uri": [{
    #         "urn": "",
    #         "protocol": 'https',
    #         "domain": '150.stat.can.gc.ca',
    #         "port": '',
    #         "path": 't1/tb/en',
    #         "query": {
    #             "expression": '',
    #             "parameter": {},
    #         },
    #         "fragments": '',
    #     }],
    #     "get": {
    #         "method": 'download',
    #         "object": 'json',
    #     },
    # },
]

import json

# Pretty-print every feed definition for visual inspection.
# The loop variable is deliberately not named `source`: that name is the alias
# of the dataFeedsDB module imported earlier in this notebook, and rebinding it
# to a feed dict would break any later `source.<attr>` access or
# `importlib.reload(source)` until the import is executed again.
for feed in data_feeds:
    print(json.dumps(feed, indent=2))

{
  "source": {
    "owner": "kayak.com",
    "dates": {
      "activated": "2023-07-07",
      "expires": "2024-07-06"
    }
  },
  "context": {
    "summary": "scraping kayak.com airline booking data for HERO",
    "country": "canada",
    "scope": "national"
  },
  "realm": {
    "module": "OTA",
    "entity": "scraper",
    "package": "Airline",
    "function": "Booking"
  },
  "uri": [
    {
      "urn": "",
      "protocol": "https",
      "domain": "skyscanner.com",
      "port": "",
      "path": "airlines",
      "query": {
        "expression": "{arrivalPort}-{departurePort}/{flightDate}/1adults?a&fs=cfc=1;bfc=1;transportation=transportation_plane",
        "parameter": {
          "arrivalPort": "string",
          "departurePort": "string",
          "flightDate": "date"
        }
      },
      "fragments": [
        "number",
        "page"
      ]
    },
    {
      "urn": "",
      "protocol": "https",
      "domain": "kayak.com",
      "port": "",
      "path": "flight

## Store data feed in MongoDB

In [19]:
# Options forwarded to the writer as keyword arguments.
# NOTE(review): FORCEDB presumably forces use/creation of the target database;
# confirm against FeedWorkLoads.write_feeds_to_nosql.
kwargs = dict(FORCEDB=True)

# Store the feed definitions and display what the writer returns.
stored_feeds = clsFeed.write_feeds_to_nosql(data_feeds, **kwargs)
stored_feeds

Total 1 documents, successful insert count = 0 & modify count = 1


[{'database': 'ota_scraper',
  'collection': 'airline_booking',
  '_id': ObjectId('64de98fecb175a0c5fda76cb')}]