- If the categorical column doesn't provide a way to determine the yes or no for the variable, it will be considered as binary variable else it will be a variable that has three unique values, which are yes, no, unknown

In [1]:
import json
import os
import pandas as pd
import numpy as np

    
RAW_FOLDER = "raw"
PROCESSED_FOLDER = "processed"

    
def write_dict_into_json(dictionary, filename):
    with open(filename, "w") as write_file:
        json.dump(dictionary, write_file, indent=4)
        
        


## Extract out Reviews, listings, reviewers, that have interactions

In [2]:
reviews = pd.read_parquet("../../data/processed/reviews.parquet")
reviewers = pd.read_parquet("../../data/processed/reviewers.parquet")
listings = pd.read_parquet("../../data/processed/listings.parquet")

# Why doing this filtering?:
# - Because some of the listings do not have reviews so the reviews.parquet does not contain the ids of those listings 
# that are not reviewed before
# - The script did not manage to extract the features for certain listing, that's why that there are some missing listings

# Filter out those reviews that don't have common listing id that is within our available listings list
common_listings_list = set(list(reviews["listing_id"].unique())).intersection(
    list(listings["listing_id"].unique())
)
reviews = reviews[reviews["listing_id"].isin(common_listings_list)]

# Filter the reviewers and items that never appear in the review table.
distinct_listings_in_reviews = reviews["listing_id"].unique()
distinct_reviewers_in_reviews = reviews["reviewer_id"].unique()
reviewers = reviewers[reviewers["reviewer_id"].isin(distinct_reviewers_in_reviews)]
listings = listings[listings["listing_id"].isin(distinct_listings_in_reviews)]

# reviews.to_parquet("../../data/processed/reviews_with_interactions.parquet")
# reviewers.to_parquet("../../data/processed/reviewers_with_interactions.parquet")
# listings.to_parquet("../../data/processed/listings_with_interactions.parquet")

In [3]:
pd.read_parquet('../../data/processed/reviewers_with_interactions.parquet')

Unnamed: 0,reviewer_id,reviewer_picture_url
0,143198342,https://a0.muscache.com/im/pictures/user/99a81...
1,173798324,https://a0.muscache.com/im/pictures/user/9c2a8...
2,69387949,https://a0.muscache.com/im/pictures/user/c4477...
3,214128814,https://a0.muscache.com/im/pictures/user/d519e...
4,57994791,https://a0.muscache.com/im/pictures/user/2cff0...
...,...,...
1378152,479275265,https://a0.muscache.com/defaults/user_pic-225x...
1378153,9904131,https://a0.muscache.com/im/pictures/user/b3acd...
1378154,154013471,https://a0.muscache.com/im/pictures/user/653be...
1378155,144408351,https://a0.muscache.com/im/pictures/user/dd6ff...


In [4]:
reviewers

Unnamed: 0,reviewer_id,reviewer_picture_url
0,143198342,https://a0.muscache.com/im/pictures/user/99a81...
1,173798324,https://a0.muscache.com/im/pictures/user/9c2a8...
2,69387949,https://a0.muscache.com/im/pictures/user/c4477...
3,214128814,https://a0.muscache.com/im/pictures/user/d519e...
4,57994791,https://a0.muscache.com/im/pictures/user/2cff0...
...,...,...
1378152,479275265,https://a0.muscache.com/defaults/user_pic-225x...
1378153,9904131,https://a0.muscache.com/im/pictures/user/b3acd...
1378154,154013471,https://a0.muscache.com/im/pictures/user/653be...
1378155,144408351,https://a0.muscache.com/im/pictures/user/dd6ff...


# Review

In [2]:
RULES_EXTRACTION_REVIEW = {
        "id": lambda x: x['id'],
        "rating": lambda x: x['rating'],
        'comments': lambda x: x['comments'],
        "localized_comments": lambda x: x['localizedReview']['comments'],
        'response': lambda x: x['response'],
        "localized_response": lambda x: x['localizedReview']['response'],
        'language':lambda x: x['language'],
        'created_at':lambda x: x['createdAt'],
        'localized_date': lambda x: x['localizedDate'],
        'reviewee_id': lambda x: x['reviewee']['id'],
        'reviewee_first_name': lambda x: x['reviewee']['firstName'],
        'reviewee_host_name': lambda x: x['reviewee']['hostName'],
        'reviewee_is_superhost': lambda x: x['reviewee']['isSuperhost'],
        'reviewee_picture_url': lambda x: x['reviewee']['pictureUrl'],
        'reviewer_id': lambda x: x['reviewer']['id'],
        'reviewer_first_name': lambda x: x['reviewer']['firstName'],
        'reviewer_host_name': lambda x: x['reviewer']['hostName'],
        'reviewer_is_superhost': lambda x: x['reviewer']['isSuperhost'],
        'reviewer_picture_url': lambda x: x['reviewer']['pictureUrl'],
}


def extract_features_from_review(review, rules):
    features_dict = {}
    for feature in rules:
        try:
            features_dict[feature] = rules[feature](review)
        except Exception as e:
            features_dict[feature] = None

    return features_dict


def convert_review_json_into_df(reviews_dict):
    all_reviews = []
    i = 0
    for listing_id in list(reviews_dict.keys()):
        reviews = reviews_dict[listing_id]
        i += len(reviews)
        try:
            for review in reviews:
                features_from_review = extract_features_from_review(review, RULES_EXTRACTION_REVIEW)
                features_dict = {}
                features_dict['listing_id'] = listing_id
                features_all = {**features_dict, **features_from_review }
                all_reviews.append(features_all)
        except Exception as e:
            print(e)
            pass
        
    all_reviews_df = pd.DataFrame(all_reviews)
    return all_reviews_df

In [None]:
f = open(f"../../data/{RAW_FOLDER}/review/compiled_reviews.json")
compiled_reviews = json.load(f)
# f = open(f"../../data/{RAW_FOLDER}/review/compiled_retry_reviews.json")
# compiled_retry_reviews = json.load(f)
# f = open(f"../../data/{RAW_FOLDER}/review/compiled_retry2_reviews.json")
# compiled_retry2_reviews = json.load(f)

# Merge together
all_reviews_df = convert_review_json_into_df(compiled_reviews)
# retry_reviews_df = convert_review_json_into_df(compiled_retry_reviews)
# retry2_reviews_df = convert_review_json_into_df(compiled_retry2_reviews)
# all_reviews_df = pd.concat([reviews_df, retry_reviews_df, retry2_reviews_df])

In [None]:
# Preprocessing start here
all_reviews_df = all_reviews_df[all_reviews_df['rating'].notnull()]
all_reviews_df = all_reviews_df[all_reviews_df['rating']!=0]
all_reviews_df['created_at'] = pd.to_datetime(all_reviews_df['created_at'])
all_reviews_df['timestamp'] = all_reviews_df['created_at'].values.astype(np.int64) // 10 ** 9


In [None]:
display(all_reviews_df.info())
# display(all_reviews_df['rating'].value_counts())
# all_reviews_df.to_parquet("../../data/processed/reviews.parquet", index=False)

## Ad hoc code for generating comment embeddings

In [28]:
pd.read_parquet('./f/1.parquet')

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,704892038167035212,0.039378,0.041486,0.081815,0.066784,-0.070854,0.04639,0.025475,-0.086752,-0.021896,...,0.044091,0.013315,-0.003753,-0.051588,0.034977,0.041798,0.028239,0.049611,-0.044409,0.018653
1,684522347121271107,0.004326,0.070417,0.043662,0.061249,-0.063514,0.039472,-0.016677,-0.070122,0.011258,...,0.056451,9.6e-05,0.00458,-0.100999,0.012499,0.011946,0.027007,0.014431,-0.059853,-0.016948
2,604895488752053606,0.079359,0.072283,0.09076,0.000576,-0.064849,-0.030039,0.019742,-0.084245,-0.005737,...,0.029959,0.073968,-0.00762,-0.019684,0.018022,0.065056,0.005788,-0.006315,-0.054381,0.056732
3,588172201542935573,0.048126,-0.017004,0.067688,0.04159,-0.117701,0.001766,-0.071279,-0.058311,-0.04577,...,0.006597,-0.001178,0.020077,-0.071327,-0.031507,0.026125,0.09717,0.025888,-0.067283,-0.007639
4,484564536512792441,0.048268,0.058953,-0.023515,0.011881,0.008068,-0.049708,0.099669,-0.058964,-0.070608,...,0.000475,0.024408,-0.023574,0.033157,-0.047368,-0.00611,0.145205,0.054102,0.070958,0.065745
5,475064187764940939,0.068215,-0.016861,0.066701,0.008709,-0.005063,0.002889,-0.021054,-0.035752,-0.055287,...,0.047675,0.019042,-0.039867,-0.010061,-0.003232,0.024573,0.049855,0.061118,0.025196,-0.021155
6,728747825773533337,-0.000163,-0.006723,0.052633,-0.038902,-0.056901,0.028176,-0.06168,-0.083197,-0.038329,...,0.033368,0.031081,0.05748,-0.044544,0.04385,0.093641,0.022155,0.037165,-0.058443,0.033182
7,723659729976188025,0.024073,-0.044684,-0.007038,0.065071,-0.049823,0.056866,0.026048,-0.030846,0.027097,...,0.046138,-0.024488,-0.028492,-0.022632,0.022019,0.005958,-0.017066,0.024776,-0.057318,-0.028762
8,672997075987963400,0.032603,0.012921,0.04129,0.049057,-0.092493,-0.018676,0.044889,-0.064482,-0.047653,...,0.012808,0.051967,0.017969,-0.072397,-0.024877,0.06194,0.034839,0.048368,-0.101976,0.028594
9,666469545770554465,0.150491,0.092861,0.082168,0.093048,0.034991,-0.052362,-0.029784,-0.137123,-0.015209,...,-0.013373,-0.016822,-0.038784,-0.016979,0.031779,-0.012485,-0.003669,-0.029634,-0.145284,0.044047


In [13]:
cols = ['id', 'comments', 'language', 'localized_comments', 'reviewer_host_name', 'reviewer_first_name']
reviews = pd.read_parquet("../../data/processed/reviews_with_interactions.parquet")[cols]
tmp = reviews.copy()
data_list = tmp.to_dict("records")
data_list

[{'id': '704892038167035212',
  'comments': 'A lovely, relaxing stay .  Comfortable beds and everything we needed. This house felt homely and peaceful. <br/>Children enjoyed the tennis courts and the hot tub and the visits from Catherine’s dog .',
  'language': 'en',
  'localized_comments': None,
  'reviewer_host_name': 'Katy',
  'reviewer_first_name': 'Katy'},
 {'id': '684522347121271107',
  'comments': 'wonderful experience to spend the night in the tower. God beds, god kitchen facilities.<br/>   We have only wonderful memories to bring back from our stay.<br/>   We enjoyed the Jacuzzi, tennis, lake fishing and the dogs.<br/>   We would definitely recommend the place.',
  'language': 'en',
  'localized_comments': None,
  'reviewer_host_name': 'Charlotte',
  'reviewer_first_name': 'Charlotte'},
 {'id': '604895488752053606',
  'comments': 'Lovely setting within beautiful landscape. Our children loved the tennis courts, boats and playing with the dogs. The accommodation is quite rustic 

In [7]:
tmp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1442687 entries, 0 to 1459535
Data columns (total 4 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   id                  1442687 non-null  object
 1   comments            1442687 non-null  object
 2   language            1442627 non-null  object
 3   localized_comments  440951 non-null   object
dtypes: object(4)
memory usage: 55.0+ MB


In [14]:
tmp[tmp['id'] == '489715891']

Unnamed: 0,id,comments,language,localized_comments,reviewer_host_name,reviewer_first_name
1715,489715891,.,und,,William,William


In [30]:
from sentence_transformers import SentenceTransformer


def write_dict_into_json(dictionary, filename):
    with open(filename, "w") as write_file:
        json.dump(dictionary, write_file, indent=4)



embed_model = SentenceTransformer("all-MiniLM-L6-v2")
embed_model.max_seq_length = 512


def get_comment_embedding(data_dict):
    try:
        lang = data_dict["language"]
        id = data_dict["id"]
        host_name = data_dict['reviewer_host_name']
        first_name = data_dict['reviewer_first_name']
        print(id)
        if lang == "en":
            cmt = data_dict["comments"]
        else:
            cmt = data_dict["localized_comments"]
        
        if not cmt:
            if host_name or first_name:
                cmt =  host_name + " " + first_name
            else:
                cmt = "Unknown User"
                
        embedding = (embed_model.encode(cmt)).tolist()
        return id, embedding
    except Exception as e:
        print(e)
        return id, []
    
# cols = ["id", "comments", "language", "localized_comments"]
# reviews = pd.read_parquet("../../data/processed/reviews_with_interactions.parquet")[cols]
# tmp = reviews.copy()
# data_list = tmp.to_dict("records")

((get_comment_embedding(data_list[0])))

704892038167035212


('704892038167035212',
 [0.039378050714731216,
  0.041486404836177826,
  0.08181539177894592,
  0.06678380817174911,
  -0.07085411250591278,
  0.04639007896184921,
  0.025474894791841507,
  -0.08675163984298706,
  -0.021896155551075935,
  0.007553863804787397,
  -0.043479643762111664,
  -0.0032101147808134556,
  0.03647883981466293,
  -0.015571055002510548,
  0.012882683426141739,
  0.00743962824344635,
  0.08068014681339264,
  -0.033306777477264404,
  0.06019384786486626,
  0.011717780493199825,
  -0.10092111676931381,
  -0.010472149588167667,
  0.026371581479907036,
  0.028830505907535553,
  0.006107657216489315,
  0.09619241952896118,
  -0.0061335694044828415,
  0.0854024589061737,
  -0.014555937610566616,
  -0.0018731099553406239,
  -0.009050669148564339,
  -0.003520032623782754,
  -0.05277658626437187,
  -0.01094576995819807,
  0.02231508493423462,
  0.046502646058797836,
  0.011446964927017689,
  -0.07391511648893356,
  -0.030183004215359688,
  0.028155431151390076,
  0.002054428

In [15]:
data_list[1715]

{'id': '489715891',
 'comments': '.',
 'language': 'und',
 'localized_comments': None,
 'reviewer_host_name': 'William',
 'reviewer_first_name': 'William'}

In [27]:
list(embedding)

[0.009733857,
 0.029614307,
 -0.03303649,
 0.030185029,
 -0.014571813,
 -0.054161347,
 0.049502008,
 0.023411404,
 0.035355,
 0.0063719144,
 0.06429423,
 -0.090765946,
 -0.013925307,
 -0.006107349,
 0.03869533,
 0.0064654937,
 -0.056304082,
 -0.02503414,
 -0.03821416,
 0.016444197,
 -0.07505159,
 0.039411545,
 -0.0015657103,
 0.06380282,
 -0.03916088,
 0.02934386,
 -0.013437062,
 0.051197905,
 -0.011633531,
 -0.009254718,
 0.0165236,
 0.049796283,
 0.05287555,
 0.011832487,
 -0.019250281,
 0.06434128,
 -0.10128075,
 -0.050535757,
 -0.03524283,
 0.050284512,
 -0.07774871,
 -0.078567415,
 -0.021618212,
 -0.051412515,
 -0.08049528,
 -0.07477217,
 -0.029869253,
 0.06231763,
 0.10864398,
 -0.0120752575,
 -0.06307487,
 0.015753545,
 -0.060917057,
 0.02761116,
 0.06175219,
 -0.0022959143,
 0.06709759,
 -0.034017757,
 0.019438101,
 0.005664658,
 -0.00046447432,
 -0.011265913,
 -0.018202744,
 -0.056710683,
 0.027931873,
 0.0018455776,
 0.0021535284,
 -0.026849518,
 0.010865155,
 0.051846236,
 -

## Reviewers

In [26]:
# listings = pd.read_parquet("../../data/processed/listings.parquet")
# distinct_host_user = listings['host_user_id'].unique()

# # all_reviews_df['is_reviewer_host_user'] = 
# reviewers['reviewer_id'].isin(distinct_host_user).value_counts()

In [2]:
reviewers = pd.read_parquet("../../data/processed/reviewers.parquet")


In [3]:
reviewers

Unnamed: 0,reviewer_id,reviewer_picture_url
0,143198342,https://a0.muscache.com/im/pictures/user/99a81...
1,173798324,https://a0.muscache.com/im/pictures/user/9c2a8...
2,69387949,https://a0.muscache.com/im/pictures/user/c4477...
3,214128814,https://a0.muscache.com/im/pictures/user/d519e...
4,57994791,https://a0.muscache.com/im/pictures/user/2cff0...
...,...,...
1378152,479275265,https://a0.muscache.com/defaults/user_pic-225x...
1378153,9904131,https://a0.muscache.com/im/pictures/user/b3acd...
1378154,154013471,https://a0.muscache.com/im/pictures/user/653be...
1378155,144408351,https://a0.muscache.com/im/pictures/user/dd6ff...


In [6]:
reviewers.iloc[0]['reviewer_picture_url']

'https://a0.muscache.com/im/pictures/user/99a81a07-57db-40ec-94cf-2a0312092219.jpg?aki_policy=profile_x_medium'

In [11]:
url = 'https://a0.muscache.com/im/pictures/user/99a81a07-57db-40ec-94cf-2a0312092219.jpg?aki_policy=profile_x_medium'
filename = wget.download(url)

  0% [                                                                              ]     0 / 12451 65% [...................................................                           ]  8192 / 12451100% [..............................................................................] 12451 / 12451

In [13]:
from multiprocessing import Pool
import numpy as np
import wget
import os
import tensorflow as tf
from tensorflow.keras.applications.resnet_v2 import preprocess_input
from tensorflow.keras.preprocessing import image
import pandas as pd
from pandarallel import pandarallel


def resnet(x: np.ndarray) -> np.ndarray:
    maps = model.predict(x)
    if np.prod(maps.shape) == maps.shape[-1] * len(x):
        return np.squeeze(maps)
    else:
        return maps.mean(axis=1).mean(axis=1)


def get_image_embeddings_from_url(url):
#     filename = wget.download(url)
    img = image.load_img('./images/99a81a07-57db-40ec-94cf-2a0312092219.png', target_size=(224, 224))
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = preprocess_input(x)
    embeddings = resnet(x)
#     os.remove(filename)
    return embeddings


def get_embeddings_by_row(row):
    try:
        if row['reviewer_id'] == '143198342':
            raise Exception('break')
        url = row["reviewer_picture_url"]
        embeddings = get_image_embeddings_from_url(url)
        curr = pd.Series(embeddings[0])
        resulting_row = pd.concat([row, curr])
        return resulting_row  
    except:
        return row



model = tf.keras.applications.ResNet50V2(include_top=False)
get_image_embeddings_from_url('a')
# reviewers = pd.read_parquet("../../data/processed/reviewers.parquet")
# copy = reviewers.iloc[:2].apply(get_embeddings_by_row, axis=1)
# copy

2023-02-20 00:34:39.910031: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.




array([[3.1392019 , 0.45692506, 0.05753694, ..., 0.631175  , 0.05783671,
        0.        ]], dtype=float32)

In [19]:
tmp = reviewers.copy()
data_list = tmp.to_dict('records')
urls

[{'reviewer_id': '143198342',
  'reviewer_picture_url': 'https://a0.muscache.com/im/pictures/user/99a81a07-57db-40ec-94cf-2a0312092219.jpg?aki_policy=profile_x_medium'},
 {'reviewer_id': '173798324',
  'reviewer_picture_url': 'https://a0.muscache.com/im/pictures/user/9c2a8bf1-fa48-4f89-a865-eb91ddf03914.jpg?aki_policy=profile_x_medium'},
 {'reviewer_id': '69387949',
  'reviewer_picture_url': 'https://a0.muscache.com/im/pictures/user/c4477486-9d61-4c79-89bd-c93b87e3cb0b.jpg?aki_policy=profile_x_medium'},
 {'reviewer_id': '214128814',
  'reviewer_picture_url': 'https://a0.muscache.com/im/pictures/user/d519e56c-7fc6-4c0a-bf16-98e5a95a4a5b.jpg?aki_policy=profile_x_medium'},
 {'reviewer_id': '57994791',
  'reviewer_picture_url': 'https://a0.muscache.com/im/pictures/user/2cff016b-3c4a-43e2-accd-4dd5c8acf647.jpg?aki_policy=profile_x_medium'},
 {'reviewer_id': '26876394',
  'reviewer_picture_url': 'https://a0.muscache.com/im/users/26876394/profile_pic/1422501960/original.jpg?aki_policy=profile

In [14]:
tmp = reviewers.iloc[:3]
embedding_dimension = 2048
result = pd.DataFrame([[0,1, 1], [12,1],[]])
result.columns = [f"f{i}" for i in range(embedding_dimension)]

pd.concat([tmp, result], axis=1)

ValueError: Length mismatch: Expected axis has 3 elements, new values have 2048 elements

In [62]:
pd.Series(emb[0])

0       2.662371
1       0.700146
2       0.109135
3       0.118933
4       0.285432
          ...   
2043    0.000000
2044    3.773804
2045    0.470698
2046    0.059108
2047    0.074295
Length: 2048, dtype: float32

In [21]:
a = pd.read_parquet('reviewers_updated.parquet')
a.head()

Unnamed: 0,reviewer_id,reviewer_picture_url,f0,f1,f2,f3,f4,f5,f6,f7,...,f2038,f2039,f2040,f2041,f2042,f2043,f2044,f2045,f2046,f2047
0,143198342,https://a0.muscache.com/im/pictures/user/99a81...,3.139202,0.456925,0.057537,0.183351,0.604626,0.0,0.073998,0.627451,...,0.333214,0.0,0.549147,0.000144,0.0,0.0,2.079071,0.631175,0.057837,0.0
1,173798324,https://a0.muscache.com/im/pictures/user/9c2a8...,0.0,0.017391,0.308177,0.459374,0.063172,0.0,0.0,0.0,...,0.175553,0.023251,0.0772,0.0,0.050934,0.529226,2.042562,0.0,0.140455,0.0
2,69387949,https://a0.muscache.com/im/pictures/user/c4477...,0.401716,2.417225,0.382474,0.104182,0.0,0.0,0.084567,0.0,...,0.043901,0.1193,0.043874,0.0,0.073897,0.343616,2.054035,0.101928,0.069139,0.0
3,214128814,https://a0.muscache.com/im/pictures/user/d519e...,0.025787,0.013773,0.007992,0.197614,0.04751,0.0,0.091397,0.0,...,0.000306,0.186578,0.4642,0.0,0.0,0.971479,0.508529,0.0,0.130409,0.0
4,57994791,https://a0.muscache.com/im/pictures/user/2cff0...,0.0,0.844734,0.046809,0.184545,0.006723,0.0,0.0,0.430184,...,0.048112,0.044366,0.275913,0.0,0.213993,0.072746,8.862124,0.287724,0.002855,0.0


In [24]:
f = open('./embeddings/143198342.json')
json.load(f)['embed']

[3.1392018795013428,
 0.45692506432533264,
 0.05753694102168083,
 0.1833513081073761,
 0.6046263575553894,
 0.0,
 0.07399831712245941,
 0.6274509429931641,
 0.14066441357135773,
 0.0,
 0.003458946943283081,
 0.0,
 0.012456062249839306,
 0.0,
 0.8055237531661987,
 0.0,
 0.0,
 0.4547027051448822,
 0.8217483162879944,
 2.842183828353882,
 0.0001373205886920914,
 0.001193956471979618,
 0.4974694550037384,
 0.25893181562423706,
 0.5113561153411865,
 0.7396478056907654,
 1.9847999811172485,
 0.4448423981666565,
 0.013942639343440533,
 0.0,
 0.14919695258140564,
 0.5194391012191772,
 0.13404369354248047,
 0.564998984336853,
 0.05657849460840225,
 0.6619640588760376,
 0.015570377930998802,
 0.30792921781539917,
 1.6364431381225586,
 0.7794991731643677,
 0.0,
 0.03625382110476494,
 0.6868680715560913,
 0.0,
 0.0,
 0.41060084104537964,
 2.3071091175079346,
 0.12152387201786041,
 0.0,
 0.7443828582763672,
 0.04301657900214195,
 0.021998697891831398,
 0.0,
 0.23812802135944366,
 0.1826192885637283

In [42]:
pd.DataFrame(emb[0]).T


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,2.662371,0.700146,0.109135,0.118933,0.285432,0.0,0.056807,1.094137,0.299815,0.0,...,0.341651,0.0,0.301559,0.0,0.094908,0.0,3.773804,0.470698,0.059108,0.074295




In [49]:
copy.info()

<class 'pandas.core.series.Series'>
RangeIndex: 2 entries, 0 to 1
Series name: None
Non-Null Count  Dtype 
--------------  ----- 
2 non-null      object
dtypes: object(1)
memory usage: 144.0+ bytes


In [41]:
emb.tolist()[0]

[2.6623706817626953,
 0.7001462578773499,
 0.10913549363613129,
 0.11893250048160553,
 0.285431832075119,
 0.0,
 0.05680680647492409,
 1.0941365957260132,
 0.2998146116733551,
 0.0,
 0.0,
 0.0,
 0.02555103227496147,
 0.0,
 0.75201815366745,
 0.0,
 0.0,
 0.33973702788352966,
 1.0540727376937866,
 3.1909658908843994,
 0.0051797847263514996,
 0.0,
 0.7133926749229431,
 0.4141094386577606,
 0.6783243417739868,
 0.5688267350196838,
 2.396399974822998,
 0.48838505148887634,
 0.01870453916490078,
 0.0,
 0.08318854868412018,
 0.42252710461616516,
 0.3742983341217041,
 0.6053541898727417,
 0.11314796656370163,
 0.43995043635368347,
 0.018108755350112915,
 0.5070152282714844,
 0.8644654154777527,
 1.0168756246566772,
 0.0,
 0.04175630584359169,
 0.7205200791358948,
 0.07794129848480225,
 0.0,
 0.22979235649108887,
 1.7563141584396362,
 0.0,
 0.0,
 1.3723140954971313,
 0.08732569217681885,
 0.031054770573973656,
 0.0,
 0.5552941560745239,
 0.12013604491949081,
 0.004642259329557419,
 0.0561370030

In [34]:
emb.shape

(1, 2048)

# Listing

In [2]:
# Utility functions for rules extraction
CONSTANT_BOOL_DICT = {
    "YES": 'YES',
    "NO": 'NO',
    "UNKNOWN": 'UNKNOWN',
    "NOT_REQUIRED": 'NOT_REQUIRED'
}

def process_location(location):
    country = location.split(', ')[-1]
    return {"location":location, "country": country}

def process_details_items(details_items):
    details_items_dict = {}
    title_without_num_dict = {
        'Studio': {"num_of_studio": 1},
        
        'Half-bath':{"num_of_bath": 0.5},
        'Private half-bath':{"num_of_private_bath": 0.5},
        'Shared half-bath':{"num_of_shared_bath": 0.5},
    }
    title_without_dict = {
        "guest":"num_of_guest_capacity",
        'bedroom':"num_of_bedroom",
        'private bedroom':"num_of_private_bedroom",
        'bed':"num_of_bed",
        "room": "num_of_room",
        'bath':"num_of_bath",
        'private bath':"num_of_private_bath",
        'shared bath':"num_of_shared_bath",
    }
    for item in details_items:
        title = item['title']
        if title in title_without_num_dict:
            temp_dict = title_without_num_dict[title]
        else:
            item_split = item['title'].split(" ")
            num_of_item =  float(item_split[0])
            if len(item_split) > 2:
                item_name = " ".join(item_split[1:])
            else:
                item_name = item_split[1]
            if item_name[-1] == 's':
                item_name = item_name[:-1]
                
            temp_dict = {
                title_without_dict[item_name]: num_of_item
            }
            
        
        added_key = list(temp_dict.keys())[0]
        if added_key in details_items_dict:
            temp_num = float(temp_dict[added_key] + details_items_dict[added_key])
            details_items_dict[added_key] = temp_num
        else:
            temp = {**details_items_dict, **temp_dict}
            details_items_dict = temp
            
    return details_items_dict

def process_description(description):
    if type(description) == list: 
        desc = " ".join([item['text'] for item in description if 'text' in item])
    else:
        desc = description 
    
    return {"description": desc}

def process_category_rating(category_rating):
    category_rating_dict = {
        'Cleanliness': "cleanliness",
        'Accuracy': "accuracy",
        'Communication': 'communication',
        'Location': 'location',
        'Check-in': 'checkIn',
        'Value': 'value'
    }
    data_dict = {}
    for item in category_rating:
        label = item['label']
        if label in category_rating_dict:
            data_dict[f"{category_rating_dict[label]}_rating"] = float(item['localizedRating'])
            
    return data_dict

def process_host_features(host_features):
    data_dict = {}
    for item in host_features:
        title = item['title']
        subtitle = item['subtitle']
        if title == 'Languages' or title == 'Language':   
            languges_split = subtitle.split(", ")
            num_of_host_languages = len(languges_split)
            data_dict["host_languages"] = subtitle
            data_dict["num_of_host_languages"] = num_of_host_languages
        elif title == "Response rate":
             data_dict["host_response_rate"] = int(subtitle[:-1]) / 100
        elif title == "Response time":
            data_dict["host_response_time"] = subtitle
        elif title == "Policy number":
            data_dict["host_policy_number"] = subtitle
        elif title == "National Tourism Registry number":
            data_dict["host_national_tourism_registry_number"] = subtitle
        else:
            print("Unknown title: ", title)
    
    return data_dict

def process_host_tags(host_tags):
    data_dict = {
        "host_received_reviews": 0,
        "host_is_verified": False,
        "is_superhost_from_host_tags": False,
        "is_airbnb_org_supporter": False
    }
    for item in host_tags:
        icon = item['icon']
        title = item['title']
        if icon == 'COMPACT_STAR':
            data_dict["host_received_reviews"] = int(title.split(" ")[0].replace(",", ""))
        elif icon == 'COMPACT_VERIFIED':
            data_dict["host_is_verified"] = True
        elif icon == 'SUPERHOST':
            data_dict["is_superhost_from_host_tags"] = True
        elif icon == 'AIRMOJI_PROGRAM_ORG':
            data_dict["is_airbnb_org_supporter"] = True
        else:
            print("Unknown icon: ", icon)
    
    return data_dict

def process_house_rules(house_rules):    
    data_dict = {
        "house_rule_checkIn_time": None,
        "house_rule_check_out_time": None,
        "house_rule_is_self_checkIn": False,
        "house_rule_self_checkIn_type": None,
        "house_rule_is_smoking_allowed": CONSTANT_BOOL_DICT['UNKNOWN'],
        "house_rule_is_pet_allowed": CONSTANT_BOOL_DICT['UNKNOWN'],
        "house_rule_no_child_allowed": False,
        "house_rule_type_of_not_suitable_child": None,
        "house_rule_no_party_allowed": False,
        "house_rule_has_quiet_hours": False,
        "house_rule_quiet_hours": None,
        "house_rule_is_commercial_photography_allowed": CONSTANT_BOOL_DICT['UNKNOWN'],
        "house_rule_max_guest_capacity": None,
        "house_rule_is_flexible_checkIn": False
    }
    
    def check_none_icon(title):
        if 'Check-in' in title:
            data_dict['house_rule_checkIn_time'] = title
        elif 'Checkout' in title:
            data_dict['house_rule_check_out_time'] = title
        elif 'maximum' in title:
            data_dict["house_rule_max_guest_capacity"] = int(title.split(" ")[0])
        elif title == 'No pets':
            data_dict["house_rule_is_pet_allowed"] = CONSTANT_BOOL_DICT['NO']
        elif title == 'Pets allowed':
            data_dict["house_rule_is_pet_allowed"] = CONSTANT_BOOL_DICT['YES']
        elif title == 'No parties or events':
            data_dict["house_rule_no_party_allowed"] = True
        elif title == 'Smoking is allowed':
            data_dict["house_rule_is_smoking_allowed"] = CONSTANT_BOOL_DICT['YES']
        elif title == "No smoking":
            data_dict["house_rule_is_smoking_allowed"] = CONSTANT_BOOL_DICT['NO']
        elif 'Self check-in' in title:
            data_dict["house_rule_is_self_checkIn"] = True
            data_dict["house_rule_self_checkIn_type"] = title
        elif 'Flexible check-in' in title:
             data_dict['house_rule_is_flexible_checkIn'] = True
        else:
            print('Unexpected none icon title: ', title)
    
    for item in house_rules:
        icon = item['icon']
        title = item['title']
        if icon == 'COMPACT_CLOCK':
            if 'Check-in' in title:
                data_dict['house_rule_checkIn_time'] = title
            elif 'Checkout' in title:
                data_dict['house_rule_check_out_time'] = title
            else:
                print('Unexpected clock')
        elif icon == 'COMPACT_CHECK_IN':
            data_dict["house_rule_is_self_checkIn"] = True
            data_dict["house_rule_self_checkIn_type"] = title
        elif icon == 'COMPACT_SMOKING_NOT_ALLOWED':
            data_dict["house_rule_is_smoking_allowed"] = CONSTANT_BOOL_DICT['NO']
        elif icon == 'AIRMOJI_HOUSE_RULES_YES_SMOKING':
            data_dict["house_rule_is_smoking_allowed"] = CONSTANT_BOOL_DICT['YES']
        elif icon == 'COMPACT_NO_PETS':
            data_dict["house_rule_is_pet_allowed"] = CONSTANT_BOOL_DICT['NO']
        elif icon == 'COMPACT_PETS':
            data_dict["house_rule_is_pet_allowed"] = CONSTANT_BOOL_DICT['YES']
        elif icon == 'COMPACT_NO_CHILD':
            data_dict["house_rule_no_child_allowed"] = True
            data_dict["house_rule_type_of_not_suitable_child"] = title.split("Not suitable for ")[1]
        elif icon == 'COMPACT_NO_EVENTS':
            data_dict["house_rule_no_party_allowed"] = True
        elif icon == 'COMPACT_EVENING':
            data_dict["house_rule_has_quiet_hours"] = True
            data_dict["house_rule_quiet_hours"] = title.split("Quiet hours: ")[1]
        elif icon == 'COMPACT_NO_CAMERA':
            data_dict["house_rule_is_commercial_photography_allowed"] = CONSTANT_BOOL_DICT['NO']
        elif icon == 'COMPACT_CAMERA':
            data_dict["house_rule_is_commercial_photography_allowed"] = CONSTANT_BOOL_DICT['YES']
        elif icon == None:
            check_none_icon(title)
        else:
            print("Unknown icon: ", icon)
    
    return data_dict
    
def process_listing_expectations(listing_expectations): 
    icon_dict = {
        'COMPACT_STAIRS': "listing_expectation_must_climb_stair",
        'AIRMOJI_HOUSE_RULES_SHARED_SPACE': "listing_expectation_has_shared_space",
        'AIRMOJI_HOUSE_RULES_PROPERTY_PET': "listing_expectation_has_pet_live_on_property",
        'AIRMOJI_HOUSE_RULES_NOISE': "listing_expectation_has_potential_for_noise",
        'AIRMOJI_STATUS_CANCELLED': "listing_expectation_has_amenity_limitations",
        'AIRMOJI_HOUSE_RULES_NO_PARKING': "listing_expectation_no_parking_on_property",
        'AIRMOJI_HOUSE_RULES_WEAPONS': "listing_expectation_has_weapons_on_property"
    }
    data_dict = {
        "listing_expectation_must_climb_stair": False,
        "listing_expectation_has_shared_space": False,
        "listing_expectation_has_pet_live_on_property":False,
        "listing_expectation_has_potential_for_noise": False,
        "listing_expectation_has_amenity_limitations": False,
        "listing_expectation_no_parking_on_property": False,
        "listing_expectation_has_weapons_on_property": False,
    }
    
    for item in listing_expectations:
        icon = item['icon']
        subtitle = item['subtitle']
        if icon in icon_dict:
            listing_expectation_key = icon_dict[icon]
            data_dict[listing_expectation_key] = True
            data_dict[f"subtitle_{listing_expectation_key}"] = subtitle
        else:
            print("Unknown icon: ", icon)
        
    return data_dict
        
def process_safety_expectations_and_amenities(safety_expectations_and_amenities): 

    carbon_monoxide_dict = {
        'No carbon monoxide alarm': CONSTANT_BOOL_DICT['NO'],
        'Carbon monoxide alarm not reported': CONSTANT_BOOL_DICT['UNKNOWN'],
        'Carbon monoxide alarm': CONSTANT_BOOL_DICT['YES'],
        'Carbon monoxide detector not required': CONSTANT_BOOL_DICT['NOT_REQUIRED'],
    }
    smoke_dict = {
        'Smoke alarm': CONSTANT_BOOL_DICT['YES'],
        'No smoke alarm': CONSTANT_BOOL_DICT['NO'],
        'Smoke alarm not reported': CONSTANT_BOOL_DICT['UNKNOWN'],
    }
    
    data_dict = {
        "safety_expectation_is_covid-19_safety_practices_applied": False,
        "safety_expectation_has_carbon_monoxide_alarm": CONSTANT_BOOL_DICT['UNKNOWN'],
        "safety_expectation_has_smoke_alarm": CONSTANT_BOOL_DICT['UNKNOWN'],
        "safety_expectation_has_security_camera": False,
        "safety_expectation_may_encounter_potentially_dangerous_animal":False,
        "safety_expectation_is_nearby_body_of_water":False,
        "safety_expectation_has_climbing_or_play_structure": False,
        "safety_expectation_heights_without_protection": False,
        "safety_expectation_pool_or_hottub_without_a_gate_or_lock": False,
        "safety_expectation_no_child_allowed": False,
        "safety_expectation_type_of_not_suitable_child": None,
    }
    
    for item in safety_expectations_and_amenities:
        title = item['title']
        lowercased_title = title.lower()
        subtitle = item['subtitle']
        
        if title == "Airbnb's COVID-19 safety practices apply":
            safety_expectation_key = "safety_expectation_is_covid-19_safety_practices_applied"
            val = True
        elif 'carbon monoxide' in lowercased_title:
            safety_expectation_key = "safety_expectation_has_carbon_monoxide_alarm"
            val = carbon_monoxide_dict[title]
        elif 'smoke' in lowercased_title:
            safety_expectation_key = "safety_expectation_has_smoke_alarm"
            val = smoke_dict[title]
        elif title == "Security camera/recording device":
            safety_expectation_key = "safety_expectation_has_security_camera"
            val = True
        elif title == "May encounter potentially dangerous animal":
            safety_expectation_key = "safety_expectation_may_encounter_potentially_dangerous_animal"
            val = True
        elif title == "Nearby lake, river, other body of water":
            safety_expectation_key = "safety_expectation_is_nearby_body_of_water"
            val = True
        elif title == "Climbing or play structure":
            safety_expectation_key = "safety_expectation_has_climbing_or_play_structure"
            val = True
        elif title == "Heights without rails or protection":
            safety_expectation_key = "safety_expectation_heights_without_protection"
            val = True
        elif title == "Pool/hot tub without a gate or lock":
            safety_expectation_key = "safety_expectation_pool_or_hottub_without_a_gate_or_lock"
            val = True
        elif 'Not suitable for' in title:
            safety_expectation_key = "safety_expectation_no_child_allowed"
            val = True
            data_dict["safety_expectation_type_of_not_suitable_child"] = title.split("Not suitable for ")[1]
        else:
            print("Unknown icon: ", icon)
        data_dict[safety_expectation_key] = val
        if safety_expectation_key != "is_covid-19_safety_practices_applied":
            data_dict[f"subtitle_{safety_expectation_key}"] = subtitle
        
    return data_dict

def process_sleeping_arrangement(sleeping_arrangement):
    sleeping_arrangement_dict = {
        'queen bed': 'queen_bed',
        'single bed': 'single_bed',
        'king bed': 'king_bed',
        'sofa bed': 'sofa_bed',
        'double bed': 'double_bed',
        'couch': 'couch',
        'mattress': 'mattress',
        'crib': 'crib',
        'bunk bed': 'bunk_bed',
        'toddler bed': 'toddler_bed',
        'hammock': 'hammock',
        'day bed': 'day_bed',
        'futon bed': 'futon_bed',
        'murphy bed': 'murphy_bed',
        'water bed': 'water_bed',
    }
    data_dict = {}
    def transform_sleeping_arrangement(string):
        num = ''.join([i for i in string if i.isdigit()])
        if num:
            num = int(num)
        else:
            num = 1
        amenity = ''.join([i for i in string if not i.isdigit()]).strip().lower()

        return (num, amenity)
    

    for item in sleeping_arrangement:
        subtitle = item['subtitle']
        sleeping_list = [transform_sleeping_arrangement(amenity) for amenity in subtitle.split(', ')]
        for sleeping in sleeping_list:
            num = sleeping[0]
            sleeping_amenity = sleeping[1]
            for key in sleeping_arrangement_dict.keys():
                if key in sleeping_amenity:
                    data_dict_key = f"sleeping_arrangement_num_of_{sleeping_arrangement_dict[key]}"
                    if data_dict_key in data_dict:
                        temp = data_dict[data_dict_key] + num
                        data_dict[data_dict_key] = temp
                    else:
                        data_dict[data_dict_key] = num
    
    return data_dict

amenties_icon_dict_f = open("./amenties_icon_dict.json")
amenties_icon_dict = json.load(amenties_icon_dict_f)
print("YES: ", len(amenties_icon_dict['YES']), "NO: ", len(amenties_icon_dict['NO']))
import copy
def clean_amenties_icon_dict(amenties_icon_dict):
                            # Not sure what does the amenities mean based on its included titles
    yes_dict_unwanted_icon = ['SYSTEM_ROOFTOP_DECK', 'SYSTEM_MAPS_BAR', 'SYSTEM_BOAT_SAIL', "SYSTEM_BAKING_SHEET",
                              "SYSTEM_BUZZER","SYSTEM_GOLF","SYSTEM_CHILD",
                              # Repeated and tallied
                              "SYSTEM_DETECTOR_CO", "SYSTEM_DETECTOR_SMOKE",
#                               # Repeated and not tallied
#                               'SYSTEM_SURVEILLANCE', 'SYSTEM_NO_STAIRS', 'SYSTEM_PETS', 'SYSTEM_SMOKING_ALLOWED'
    ]
    #Repeated with yes group amenity
    no_dict_unwanted_icon = ['SYSTEM_NO_ESSENTIALS', 'SYSTEM_OFFLINE', 'SYSTEM_NO_TV', 'SYSTEM_NO_WASHER',
                             'SYSTEM_NO_DETECTOR_CO2', 'SYSTEM_NO_HEATER', 'SYSTEM_NO_SURVEILLANCE',
                             'SYSTEM_NO_DETECTOR_SMOKE', 'SYSTEM_NO_AIR_CONDITIONING', 'SYSTEM_NO_PRIVATE_ENTRANCE',
                             'SYSTEM_NO_SHAMPOO'
    ]
    temp = copy.deepcopy(amenties_icon_dict)
    yes_dict = temp['YES']
    no_dict = temp['NO']
    for icon in yes_dict_unwanted_icon:
        del yes_dict[icon]
    for icon in no_dict_unwanted_icon:
        del no_dict[icon]
            
    return {
        "YES":yes_dict,
        "NO":no_dict
    }
            
cleaned_amenties_icon_dict = clean_amenties_icon_dict(amenties_icon_dict)
print("YES: ", len(cleaned_amenties_icon_dict['YES']), "NO: ", len(cleaned_amenties_icon_dict['NO']))


def process_amenities(amenities):
    data_dict = {}
    special_icon = {
        "SYSTEM_TOWEL": lambda title: "SYSTEM_BABY_GATE" if title == 'Baby safety gates' else None 
    }
    icons_that_needs_change_name = {
        "SYSTEM_CALENDAR":"SYSTEM_LONG_TERM_STAYS_ALLOWED",
        "SYSTEM_KEY": "SYSTEM_SELF_CHECKIN",
        "SYSTEM_SHOWER":"SYSTEM_OUTDOOR_SHOWER",
        'SYSTEM_SNOWFLAKE':'SYSTEM_AIR_CONDITIONING',
        "SYSTEM_MAPS_CAR_RENTAL":"SYSTEM_PARKING_AREA",
        "SYSTEM_FLOWER": "SYSTEM_GARDEN",
        "SYSTEM_MAPS_RESORT": "SYSTEM_RESORT",
        "SYSTEM_MINI_BAR":"SYSTEM_MINI_FRIDGE",
        "SYSTEM_DOOR": "SYSTEM_PRIVATE_ENTRANCE",
        "SYSTEM_HOST_OWNERS": "SYSTEM_HOST_GREETING",
        "SYSTEM_DINING_TABLE": "SYSTEM_DINING_AREA",
        'SYSTEM_CLEAN':'SYSTEM_CLEAN_BEFORE_CHECKOUT',
        "SYSTEM_DESERT_CACTUS":"SYSTEM_DESERT_VIEW",
        "SYSTEM_CABLE":"SYSTEM_ETHERNET",
        "SYSTEM_DIAPER": "SYSTEM_CHANGING_DIAPER_TABLE",
        "SYSTEM_TRASH": "SYSTEM_TRASH_COMPACTOR",
        "SYSTEM_THERMOMETER": "SYSTEM_HEATER",
    }
    def find_icon(ori_icon, title, available):
        icon = None
        right_dict = cleaned_amenties_icon_dict['YES'] if available else cleaned_amenties_icon_dict['NO']
        if ori_icon:
            if ori_icon in right_dict:
                icon = ori_icon
        else:
            for icon_key in right_dict:
                title_list = right_dict[icon_key]
                if title in title_list:
                    icon = icon_key
                    break

        if icon in special_icon:
            icon = special_icon[icon](title)
        elif icon in icons_that_needs_change_name:
            icon = icons_that_needs_change_name[icon]
                
        return icon
    ### Main script here
    for am_grp in amenities:
        am_ = am_grp['amenities']
        for am in am_:
            icon = find_icon(am['icon'], am['title'], am['available'])
            if icon:
                amenity = icon.split("SYSTEM_")[1]
                data_dict[f"amenity_has_{amenity}"] = True
            
    return data_dict
            

YES:  113 NO:  13
YES:  104 NO:  2


In [3]:

RULES_EXTRACTION_LISTING = {
    "location": lambda x: process_location(x),
    "price_per_night": lambda x: {"price_per_night": int(''.join(c for c in x if c.isdigit()))},
    "detail_items": lambda x: process_details_items(x),
    "description": lambda x: process_description(x),
    "category_rating": lambda x: process_category_rating(x),
    "host_name": lambda x: {"host_name": x.split("Hosted by ")[1]},
    "joined_date_of_host": lambda x: {"host_joined_date": x.split("Joined in ")[1]},
    "host_features": lambda x: process_host_features(x),
    "host_tags": lambda x: process_host_tags(x),
    "house_rules": lambda x: process_house_rules(x),
    "listing_expectations": lambda x: process_listing_expectations(x),
    "safety_expectations_and_amenities": lambda x: process_safety_expectations_and_amenities(x),
    "sleeping_arrangement": lambda x: process_sleeping_arrangement(x),
    "amenities": lambda x: process_amenities(x),
}
def extract_features_from_listing(listing, rules):
    features_dict = {}
    for feature in listing:
            if feature in RULES_EXTRACTION_LISTING:
                customized_features = rules[feature](listing[feature]) if listing[feature] else {}
                res = {**customized_features, **features_dict}
                features_dict = res
            else:
                features_dict[feature] = listing[feature]

    return features_dict


def convert_listing_json_into_df():
    all_listings = []
    directory = f"../../data/{RAW_FOLDER}/listing/temp_detail"
    i = 0
    for filename in os.listdir(directory):
        f = os.path.join(directory, filename)
        if os.path.isfile(f) and "listing_detail" in filename:
            i += 1
            listing_id= filename.split("listing_detail_")[1].split('.')[0]
            listing_f = open(f"{directory}/{filename}")
            listing_json = json.load(listing_f)
            features_from_listing = extract_features_from_listing(listing_json, RULES_EXTRACTION_LISTING)
            features_from_listing['listing_id'] = listing_id
            features_all = {**features_from_listing }
            all_listings.append(features_all)
    all_listings_df = pd.DataFrame(all_listings)
    return all_listings_df

# Preprocessing operations start here
all_listings_df = convert_listing_json_into_df()
all_listings_df["location_disclaimer"] =  all_listings_df["location_disclaimer"].apply(lambda x: True if x else False)
all_listings_df['host_joined_date'] = pd.to_datetime(all_listings_df['host_joined_date'])
values = {
    #detail_items
    "num_of_guest_capacity": 0,
    "num_of_bedroom": 0,
    "num_of_private_bedroom": 0,
    "num_of_bed": 0,
    "num_of_room": 0,
    "num_of_bath": 0,
    "num_of_private_bath": 0,
    "num_of_shared_bath": 0,
    "num_of_studio": 0, 
    #host_tags
    "host_received_reviews": 0,
    "host_is_verified": False,
    "is_superhost_from_host_tags": False,
    "is_airbnb_org_supporter": False,
    #house_rules
    "house_rule_is_self_checkIn": False,
    "house_rule_is_smoking_allowed": CONSTANT_BOOL_DICT['UNKNOWN'],
    "house_rule_is_pet_allowed": CONSTANT_BOOL_DICT['UNKNOWN'],
    "house_rule_no_child_allowed": False,
    "house_rule_no_party_allowed": False,
    "house_rule_has_quiet_hours": False,
    "house_rule_is_commercial_photography_allowed": CONSTANT_BOOL_DICT['UNKNOWN'],
    "house_rule_is_flexible_checkIn": False,
    #listing_expectations
    "listing_expectation_must_climb_stair": False,
    "listing_expectation_has_shared_space": False,
    "listing_expectation_has_pet_live_on_property":False,
    "listing_expectation_has_potential_for_noise": False,
    "listing_expectation_has_amenity_limitations": False,
    "listing_expectation_no_parking_on_property": False,
    "listing_expectation_has_weapons_on_property": False,
    #safety_expectation
    "safety_expectation_is_covid-19_safety_practices_applied": False,
    "safety_expectation_has_carbon_monoxide_alarm": CONSTANT_BOOL_DICT['UNKNOWN'],
    "safety_expectation_has_smoke_alarm": CONSTANT_BOOL_DICT['UNKNOWN'],
    "safety_expectation_has_security_camera": False,
    "safety_expectation_may_encounter_potentially_dangerous_animal":False,
    "safety_expectation_is_nearby_body_of_water":False,
    "safety_expectation_has_climbing_or_play_structure": False,
    "safety_expectation_heights_without_protection": False,
    "safety_expectation_pool_or_hottub_without_a_gate_or_lock": False,
    "safety_expectation_no_child_allowed": False,
    #sleeping arrangement
    'sleeping_arrangement_num_of_queen_bed': 0,
    'sleeping_arrangement_num_of_single_bed': 0,
    'sleeping_arrangement_num_of_king_bed': 0,
    'sleeping_arrangement_num_of_sofa_bed': 0,
    'sleeping_arrangement_num_of_double_bed': 0,
    'sleeping_arrangement_num_of_couch': 0,
    'sleeping_arrangement_num_of_mattress': 0,
    'sleeping_arrangement_num_of_crib': 0,
    'sleeping_arrangement_num_of_bunk_bed': 0,
    'sleeping_arrangement_num_of_toddler_bed': 0,
    'sleeping_arrangement_num_of_hammock': 0,
    'sleeping_arrangement_num_of_day_bed': 0,
    'sleeping_arrangement_num_of_futon_bed': 0,
    'sleeping_arrangement_num_of_murphy_bed': 0,
    'sleeping_arrangement_num_of_water_bed': 0,
}
amenities_columns_dict = {}
for col in all_listings_df.columns:
    if col.startswith("amenity_has"):
        amenities_columns_dict[col] = False
        
final_values_dict = {**values, **amenities_columns_dict}
all_listings_df.fillna(value=final_values_dict, inplace=True)

def preprocess(df):
    temp = df.copy()
    removed_listing = ['47790035' ]
    temp = temp[~temp['listing_id'].isin(removed_listing)]
    temp = temp[temp['host_user_id'].notnull()]
#     temp.at[5202,'num_of_guest_capacity'] = 16
#     temp.at[8323,'num_of_guest_capacity'] = 16
    return temp
all_listings_df = preprocess(all_listings_df)
display(all_listings_df.head())
display(all_listings_df.info())

24698
24698


Unnamed: 0,amenity_has_SHAMPOO,amenity_has_TOILETRIES,amenity_has_HANGERS,amenity_has_CRIB,amenity_has_TOYS,amenity_has_HIGH_CHAIR,amenity_has_AIR_CONDITIONING,amenity_has_FIRE_EXTINGUISHER,amenity_has_BEACH,amenity_has_LUGGAGE_DROP,...,amenity_has_PIANO,amenity_has_VIDEO_GAME,host_national_tourism_registry_number,num_of_private_bedroom,sleeping_arrangement_num_of_hammock,num_of_room,sleeping_arrangement_num_of_day_bed,sleeping_arrangement_num_of_futon_bed,sleeping_arrangement_num_of_murphy_bed,sleeping_arrangement_num_of_water_bed
0,True,True,True,True,True,True,True,True,True,True,...,False,False,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,False,False,False,False,False,False,True,True,False,False,...,False,False,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,True,True,True,False,False,False,True,True,True,True,...,False,False,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,False,True,False,False,False,False,False,False,False,False,...,False,False,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,True,True,True,False,False,False,True,True,True,True,...,False,False,,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 24554 entries, 0 to 24697
Columns: 218 entries, amenity_has_SHAMPOO to sleeping_arrangement_num_of_water_bed
dtypes: bool(130), datetime64[ns](1), float64(39), int64(1), object(47)
memory usage: 19.7+ MB


None

In [4]:
# def display_full(d): 
#     with pd.option_context('display.max_rows', None,
#                            'display.max_columns', None,
#                            ):
#         display(d)
        
# display_full(all_listings_df.select_dtypes(include=['bool']).columns)


with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    display(all_listings_df.isnull().sum())

amenity_has_SHAMPOO                                                           0
amenity_has_TOILETRIES                                                        0
amenity_has_HANGERS                                                           0
amenity_has_CRIB                                                              0
amenity_has_TOYS                                                              0
amenity_has_HIGH_CHAIR                                                        0
amenity_has_AIR_CONDITIONING                                                  0
amenity_has_FIRE_EXTINGUISHER                                                 0
amenity_has_BEACH                                                             0
amenity_has_LUGGAGE_DROP                                                      0
amenity_has_LONG_TERM_STAYS_ALLOWED                                           0
amenity_has_SELF_CHECKIN                                                      0
amenity_has_DOORMAN                     

In [20]:
all_cols = [
    #detail_items
    "num_of_guest_capacity",
    "num_of_bedroom",
    "num_of_private_bedroom",
    "num_of_bed",
    "num_of_room",
    "num_of_private_bath",
    "num_of_shared_bath",
    "num_of_studio", 
    "num_of_bath",  
    #house_rules
    "house_rule_is_self_checkIn",
    "house_rule_is_smoking_allowed",
    "house_rule_is_pet_allowed",
    "house_rule_no_child_allowed",
    "house_rule_no_party_allowed",
    "house_rule_has_quiet_hours",
    "house_rule_is_commercial_photography_allowed",
    #listing_expectations
    "listing_expectation_must_climb_stair",
    "listing_expectation_has_shared_space",
    "listing_expectation_has_pet_live_on_property",
    "listing_expectation_has_potential_for_noise",
    "listing_expectation_has_amenity_limitations",
    "listing_expectation_no_parking_on_property",
    "listing_expectation_has_weapons_on_property",
    #safety_expectation
    "safety_expectation_has_carbon_monoxide_alarm",
    "safety_expectation_has_smoke_alarm",
    "safety_expectation_has_security_camera",
    "safety_expectation_may_encounter_potentially_dangerous_animal",
    "safety_expectation_is_nearby_body_of_water",
    "safety_expectation_has_climbing_or_play_structure",
    "safety_expectation_heights_without_protection",
    "safety_expectation_pool_or_hottub_without_a_gate_or_lock",
    "safety_expectation_no_child_allowed",
    #sleeping arrangement
    'sleeping_arrangement_num_of_queen_bed',
    'sleeping_arrangement_num_of_single_bed',
    'sleeping_arrangement_num_of_king_bed',
    'sleeping_arrangement_num_of_sofa_bed',
    'sleeping_arrangement_num_of_double_bed',
    'sleeping_arrangement_num_of_couch',
    'sleeping_arrangement_num_of_mattress',
    'sleeping_arrangement_num_of_crib',
    'sleeping_arrangement_num_of_bunk_bed',
    'sleeping_arrangement_num_of_toddler_bed',
    'sleeping_arrangement_num_of_hammock',
    'sleeping_arrangement_num_of_day_bed',
    'sleeping_arrangement_num_of_futon_bed',
    'sleeping_arrangement_num_of_murphy_bed',
    'sleeping_arrangement_num_of_water_bed',
    "price_per_night", #null
    "country", #null
    "listing_type", 
    'lat',                                                                           
    'lng',                                                                           
    'location_disclaimer',
    "listing_detail_type", 
]
not_feature_cols = [
    'host_user_id',
    "host_joined_date",
    "listing_id",
]
amenities_col = [col for col in list(all_listings_df.columns) if col.startswith('amenity_has_')]
all_cols.extend(amenities_col)
all_cols.extend(not_feature_cols )
features_cols = [col for col in all_cols if col not in not_feature_cols]
features_col_dict = {
    "all_cols": all_cols,
    "features_cols": features_cols
}

print(len(features_col_dict["all_cols"]), len(features_col_dict["features_cols"]))
write_dict_into_json(features_col_dict, "features_col_dict.json")
temp_ = all_listings_df[all_cols].copy().dropna().reset_index(drop=True)
need_to_
for col in temp_[features_cols].select_dtypes("object"):
    temp_[col] = temp_[col].astype('category')

display(temp_.isnull().sum().value_counts())
temp_.to_parquet("../../data/processed/listings.parquet", index=False)

162 159


0    162
dtype: int64

Unnamed: 0,house_rule_is_smoking_allowed,house_rule_is_pet_allowed,house_rule_is_commercial_photography_allowed,safety_expectation_has_carbon_monoxide_alarm,safety_expectation_has_smoke_alarm,country,listing_type,listing_detail_type
0,NO,NO,UNKNOWN,NO,YES,Malaysia,Entire villa,NORMAL
1,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,Turkey,Entire villa,NORMAL
2,YES,YES,UNKNOWN,NOT_REQUIRED,YES,Australia,Camper/RV,NORMAL
3,UNKNOWN,NO,UNKNOWN,UNKNOWN,UNKNOWN,France,Treehouse,NORMAL
4,NO,YES,UNKNOWN,UNKNOWN,YES,South Korea,Private room in cottage,NORMAL
...,...,...,...,...,...,...,...,...
24282,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,UNKNOWN,Colombia,Private room in home,NORMAL
24283,NO,NO,UNKNOWN,YES,YES,United Kingdom,Entire home,NORMAL
24284,NO,NO,UNKNOWN,UNKNOWN,UNKNOWN,Taiwan,Private room in home,NORMAL
24285,NO,NO,UNKNOWN,UNKNOWN,UNKNOWN,Malaysia,Entire home,NORMAL


In [18]:
## Havent fully cleaned: sleeping_arrangement, amenities

count = 0
test_dict = {}
directory = f"../../data/{RAW_FOLDER}/listing/temp_detail"
i = 0

def find_icon_test(ori_icon, title, available):
    special_icon = {
        "SYSTEM_TOWEL": lambda title: "SYSTEM_BABY_GATE" if title == 'Baby safety gates' else None 
    }
    icons_that_needs_change_name = {
        "SYSTEM_CALENDAR":"SYSTEM_LONG_TERM_STAYS_ALLOWED",
        "SYSTEM_KEY": "SYSTEM_SELF_CHECKIN",
        "SYSTEM_SHOWER":"SYSTEM_OUTDOOR_SHOWER",
        'SYSTEM_SNOWFLAKE':'SYSTEM_AIR_CONDITIONING',
        "SYSTEM_MAPS_CAR_RENTAL":"SYSTEM_PARKING_AREA",
        "SYSTEM_FLOWER": "SYSTEM_GARDEN",
        "SYSTEM_MAPS_RESORT": "SYSTEM_RESORT",
        "SYSTEM_MINI_BAR":"SYSTEM_MINI_FRIDGE",
        "SYSTEM_DOOR": "SYSTEM_PRIVATE_ENTRANCE",
        "SYSTEM_HOST_OWNERS": "SYSTEM_HOST_GREETING",
        "SYSTEM_DINING_TABLE": "SYSTEM_DINING_AREA",
        'SYSTEM_CLEAN':'SYSTEM_CLEAN_BEFORE_CHECKOUT',
        "SYSTEM_DESERT_CACTUS":"SYSTEM_DESERT_VIEW",
        "SYSTEM_CABLE":"SYSTEM_ETHERNET",
        "SYSTEM_DIAPER": "SYSTEM_CHANGING_DIAPER_TABLE",
        "SYSTEM_TRASH": "SYSTEM_TRASH_COMPACTOR",
        "SYSTEM_THERMOMETER": "SYSTEM_HEATER",
        "SYSTEM_NO_DETECTOR_CO2": "SYSTEM_NO_DETECTOR_CO",
        "SYSTEM_OFFLINE": "SYSTEM_WI_FI"
    }
    
    icon = None
    right_dict = cleaned_amenties_icon_dict['YES'] if available else cleaned_amenties_icon_dict['NO']
    if ori_icon:
        if ori_icon in right_dict:
            icon = ori_icon
    else:
        for icon_key in right_dict:
            title_list = right_dict[icon_key]
            if title in title_list:
                icon = icon_key
                break

    if icon in special_icon:
        icon = special_icon[icon](title)
    elif icon in icons_that_needs_change_name:
        icon = icons_that_needs_change_name[icon]

    return icon    

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if os.path.isfile(f) and "listing_detail" in filename:
        is_current_got = False
        i += 1
        listing_id= filename.split("listing_detail_")[1].split('.')[0]
        listing_f = open(f"{directory}/{filename}")
        listing_json = json.load(listing_f)
        key = 'amenities'
        if listing_json[key]:
            for item in listing_json[key]:
                #                 val = json.dumps(list(item.keys()))
                    amenities = item['amenities']
                    if amenities:
                        for am in amenities:
                            val = find_icon_test(am['icon'], am['title'], am['available'])
#                             if icon == 'SYSTEM_PARKING_AREA' and not is_current_got:
#                                 is_current_got = True
#                                 count += 1
                            if val in test_dict:
                                temp = test_dict[val] + 1
                                test_dict[val] = temp
                            else:
                                test_dict[val] = 1


# if  and not is_current_got:
#     count +=1
#     is_current_got = True
print("Count: ", count)
print(len(test_dict))
test_dict

Count:  0
119


{'SYSTEM_SHAMPOO': 19625,
 'SYSTEM_TOILETRIES': 21332,
 'SYSTEM_HANGERS': 22653,
 'SYSTEM_CRIB': 4570,
 'SYSTEM_TOYS': 2865,
 'SYSTEM_HIGH_CHAIR': 4121,
 'SYSTEM_AIR_CONDITIONING': 15885,
 'SYSTEM_DETECTOR_SMOKE': 12668,
 'SYSTEM_FIRE_EXTINGUISHER': 14443,
 'SYSTEM_BEACH': 4809,
 'SYSTEM_LUGGAGE_DROP': 8876,
 'SYSTEM_LONG_TERM_STAYS_ALLOWED': 21297,
 'SYSTEM_SELF_CHECKIN': 8882,
 'SYSTEM_DOORMAN': 2613,
 'SYSTEM_NO_KITCHEN': 5326,
 'SYSTEM_WI_FI': 24691,
 'SYSTEM_NO_TV': 7344,
 'SYSTEM_NO_WASHER': 11804,
 'SYSTEM_NO_HAIR_DRYER': 7162,
 'SYSTEM_NO_DETECTOR_CO': 17323,
 'SYSTEM_NO_PRIVATE_ENTRANCE': 11419,
 'SYSTEM_NO_HEATER': 11093,
 'SYSTEM_VIEW_MOUNTAIN': 4526,
 'SYSTEM_HAIRDRYER': 17436,
 'SYSTEM_OUTDOOR_SHOWER': 2461,
 'SYSTEM_WASHER': 12702,
 'SYSTEM_IRON': 13627,
 'SYSTEM_TV': 17442,
 'SYSTEM_POOL_TABLE': 484,
 'SYSTEM_HEATER': 13746,
 'SYSTEM_SURVEILLANCE': 4248,
 'SYSTEM_FIRST_AID_KIT': 12071,
 'SYSTEM_WORKSPACE': 8818,
 'SYSTEM_COOKING_BASICS': 33148,
 'SYSTEM_REFRIGERATOR': 22

In [26]:
data = {
    "date":["September 2016"]
}
df = pd.DataFrame(data)
df['date'] = pd.to_datetime(df['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    1 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 136.0 bytes
