# Market Room Type Classifier
In this notebook we experiment the NLP and K-means clustering functions for standarizing the room types from the OTA property scrapped data.

In [6]:
'''
    WARNING CONTROL to display or ignore all warnings
'''
import warnings; warnings.simplefilter('ignore')     #switch betweeb 'default' and 'ignore'
import traceback

''' Set debug flag to view extended error messages; else set it to False to turn off debugging mode '''
debug = True

In [160]:
import os
import sys

sys.path.insert(1,"/home/nuwan/workspace/rezaware/")
import rezaware as reza
from utils.modules.etl.load import sparkFILEwls as files
from utils.modules.ml.natlang import nlp 
from utils.modules.ml.cluster import points 

''' restart initiate classes '''
if debug:
    import importlib
    reza = importlib.reload(reza)
    rw = importlib.reload(files)
    nlp = importlib.reload(nlp)
    points = importlib.reload(points)
    
__desc__ = "cluster and classify room types"
clsRW = files.FileWorkLoads(desc=__desc__)
clsCL = points.ClusterWorkLoads(desc=__desc__)
clsNLP = nlp.NatLanWorkLoads(desc=__desc__)
print("\nClass initialization and load complete!")

All functional APP-libraries in REZAWARE-package of REZAWARE-module imported successfully!
All functional FILEWORKLOADS-libraries in LOAD-package of ETL-module imported successfully!
All functional NATLANWORKLOADS-libraries in NATLANG-package of ML-module imported successfully!
All functional CLUSTERWORKLOADS-libraries in CLUSTER-package of ML-module imported successfully!
FileWorkLoads Class initialization complete
ClusterWorkLoads Class initialization complete
NatLanWorkLoads Class initialization complete

Class initialization and load complete!


## Read data into DataFrame

In [123]:
__fpath__ = os.path.join("/home/nuwan/workspace/rezaware/",
                        "wrangler/data/ota/scraper/hospitality/bookings/rates/2022-10-4-13-0")
read_kwargs = {
    "HEADER":"true",
    "INFERSCHEMA":"true",
    "RECURSIVELOOKUP":"true",
    "TO_PANDAS":True,
}
_data = clsRW.read_csv_to_sdf(
    filesPath = __fpath__,
    **read_kwargs,
)
_data.head(3)

                                                                                

Unnamed: 0,ota_name,search_dt,checkin_date,destination_id,property_name,room_type,room_rate,review_score,location_desc,other_info
0,booking.com,2022-10-04 21:00:00,2022-10-04,20023181,"Chic and Modern, Brickell / Miami + FREE Parking",One-Bedroom Apartment,US$151,8.2,"Brickell, MiamiShow on map0.7 miles from centre",One-Bedroom ApartmentEntire apartment • 1 bedr...
1,booking.com,2022-10-04 21:00:00,2022-10-04,20023181,Two Bedroom Oceanview Apartment in Brickell,Two-Bedroom Apartment,US$185,7.7,"Brickell, MiamiShow on map0.7 miles from centre",Two-Bedroom ApartmentEntire apartment • 2 bedr...
2,booking.com,2022-10-04 21:00:00,2022-10-04,20023181,"Huge Room in a House, Near the Airport and Cos...",Double Room,US$51,8.7,"Little Havana, MiamiShow on map3.6 miles from ...",Double Room1 large double bedOnly 1 room left ...


## Get room type embeddings 

In [105]:
_se_props = {
    "NOSTOPWORDS":True,
    "LOWER":True,
}
_sent,_emb = clsNLP.get_sentence_embeddings(
    sentences=_data['property_name'],   # list of word ngrams
    model_name='distilbert-base-nli-mean-tokens',   # https://www.sbert.net/docs/pretrained_models.html
    kwargs=_se_props

)
print(_sent, "\n\n", _emb)

## Label data with kmeans clusters

In [161]:
__category__ = "KMEANS"
__columns__ = ['destination_id','review_score']
clust_props = {
    "NCLUSTERS":10,
    "MAXITERATIONS":200,
    "CENTROIDINIT":5,
    "RANDOMSTATE":0,
}
_cl_data = clsCL.cluster_n_label_data(
    data = _emb,#_data.dropna(),
    category = __category__,
    columns = __columns__,
    **clust_props
)
clsCL.clusters

<class 'numpy.ndarray'>


array([2, 9, 9, ..., 8, 9, 9], dtype=int32)