In [1]:
%%capture
!pip install sentence-transformers

In [2]:
import ast
import locale

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
%matplotlib inline

locale.setlocale(locale.LC_ALL, '')

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the raw listings export from the mounted Google Drive
df = pd.read_csv('/content/drive/My Drive/listings.csv')

# Drop images, columns with scraping info and other columns which are
# considered unnecessary for the analysis
df = df.drop(columns=['scrape_id', 'last_scraped', 'source',
                      'host_url', 'picture_url', 'host_thumbnail_url',
                      'host_picture_url', 'first_review', 'last_review', 'license',
                      'calendar_last_scraped', 'id', 'listing_url',
                      'host_id', 'host_neighbourhood',
                      'host_name'])

# `infer_datetime_format` is deprecated in pandas 2.x, so it is not passed
df['host_since'] = pd.to_datetime(df['host_since'])

# Prices: drop the leading currency character and the thousands separators,
# then convert to float. The string accessor passes NaN through unchanged and
# the result does not depend on the locale of the machine running the notebook.
df['price'] = (df['price'].str[1:]
               .str.replace(',', '', regex=False)
               .astype(float))

# Rates: drop the trailing character (the percent sign) in the same way;
# missing rates stay NaN
for rate_col in ['host_response_rate', 'host_acceptance_rate']:
    df[rate_col] = (df[rate_col].str[:-1]
                    .str.replace(',', '', regex=False)
                    .astype(float))
df

Unnamed: 0,name,description,neighborhood_overview,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,...,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,Explore the heart of old Paris,"Cozy, well-appointed and graciously designed s...","You are within walking distance to the Louvre,...",2009-02-14,"İstanbul, Turkey",The flat is owned by journalists who spend a l...,within an hour,100.0,99.0,f,...,4.79,4.82,4.95,4.55,f,1,1,0,0,1.98
1,MARAIS - 2ROOMS APT - 2/4 PEOPLE,"VERY CONVENIENT, WITH THE BEST LOCATION !<br /...",,2008-08-30,"Paris, France","I am a writer,54, author of novels, books of l...",within an hour,100.0,72.0,t,...,4.91,4.88,4.93,4.72,f,2,2,0,0,2.26
2,Large & sunny flat with balcony !,Very large & nice apartment all for you! <br /...,,2009-06-18,"Paris, France",Hello ! \r\nOur apartment is great and I am su...,,,0.0,f,...,5.00,5.00,5.00,5.00,f,1,1,0,0,0.04
3,"Cozy, Central Paris: WALK or VELIB EVERYWHERE !",Location! Location! Location! Just bring your ...,,2009-07-29,"New York, NY","I am a Native New Yorker (yes, I was born and ...",,,0.0,f,...,,,,,f,1,1,0,0,
4,Saint Germain Musee d'orsay,<b>The space</b><br />This beautiful apartment...,,2010-06-25,"Paris, France","Bonjour,\r\nMy name is delphine with my family...",within a day,59.0,30.0,f,...,5.00,5.00,5.00,4.00,f,78,78,0,0,0.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55099,Aborigine,The flat is decorated in a pure and design way...,Very quiet area.<br />Four times a week there ...,2022-12-03,,,,,,f,...,,,,,f,1,1,0,0,
55100,Gratte-ciel - bien située - proche Tour Eiffel,"In the heart of the 15th arrondissement, this ...",The 15th arrondissement is a pleasant and very...,2022-12-08,"Paris, France",Bonjour !\nJe m’appelle Osmane et suis un gran...,,,100.0,f,...,,,,,f,1,1,0,0,
55101,Charming apartment 2 P - Malesherbe,This pretty studio specially designed for love...,This studio is located in the 8th arrondisseme...,2020-11-05,"Paris, France",,,,,f,...,,,,,f,91,91,0,0,
55102,Appartement Guisarde,Profitez d'un logement élégant et central. Le ...,,2020-01-09,"Paris, France",,,,,f,...,,,,,f,4,4,0,0,


In [4]:
# Binary classification target built from the review columns:
#   class 0 (bad reviews)  - at least one review_* value is below the threshold
#   class 1 (good reviews) - all other listings
# NOTE(review): a NaN score never compares as "< threshold", so listings
# without review scores end up in class 1 - confirm this is intended.
threshold = 4
review_columns = df.filter(regex="review_").columns
has_low_score = df[review_columns].lt(threshold).any(axis=1)
df['target'] = 1 - has_low_score.astype(int)
df = df.drop(columns=review_columns)

In [5]:
# Share of class 1 in the data: the classes are highly unbalanced, only about
# 10% of all samples belong to class 0
df['target'].mean()

0.8950892857142857

Next, I'll analyze the usefulness of each column and discard the unnecessary ones. I will also preprocess and clean the data.

In [6]:
# Of these two columns 'neighbourhood_cleansed' is preferable:
# it does not contain NaN values and has more detailed info
df.loc[:, ['neighbourhood', 'neighbourhood_cleansed']]

Unnamed: 0,neighbourhood,neighbourhood_cleansed
0,"Paris, Ile-de-France, France",Hôtel-de-Ville
1,,Hôtel-de-Ville
2,,Opéra
3,,Louvre
4,,Luxembourg
...,...,...
55099,"Paris, Île-de-France, France",Popincourt
55100,"Paris, Île-de-France, France",Vaugirard
55101,"Paris, Île-de-France, France",Batignolles-Monceau
55102,,Luxembourg


In [7]:
# The 'neighbourhood' column is therefore removed
df = df.drop(columns='neighbourhood')

In [8]:
# Fraction of missing values per column, listed as (column, fraction) pairs
nan_frac = df.isna().sum() / len(df)
list(zip(nan_frac.index, nan_frac.to_list()))

[('name', 0.0006896051103368176),
 ('description', 0.010307781649245064),
 ('neighborhood_overview', 0.4250508130081301),
 ('host_since', 0.00038109756097560977),
 ('host_location', 0.16860844947735193),
 ('host_about', 0.5232469512195121),
 ('host_response_time', 0.45475827526132406),
 ('host_response_rate', 0.45475827526132406),
 ('host_acceptance_rate', 0.3878665795586527),
 ('host_is_superhost', 0.0007621951219512195),
 ('host_listings_count', 0.00038109756097560977),
 ('host_total_listings_count', 0.00038109756097560977),
 ('host_verifications', 0.0),
 ('host_has_profile_pic', 0.00038109756097560977),
 ('host_identity_verified', 0.00038109756097560977),
 ('neighbourhood_cleansed', 0.0),
 ('neighbourhood_group_cleansed', 1.0),
 ('latitude', 0.0),
 ('longitude', 0.0),
 ('property_type', 0.0),
 ('room_type', 0.0),
 ('accommodates', 0.0),
 ('bathrooms', 1.0),
 ('bathrooms_text', 0.0021777003484320556),
 ('bedrooms', 0.16361788617886178),
 ('beds', 0.013955429732868757),
 ('amenities',

In [9]:
# Remove the columns with a large number of NaN values, then refresh the
# per-column NaN fractions for the columns that remain
nan_heavy_columns = ['bathrooms', 'calendar_updated', 'neighbourhood_group_cleansed']
df = df.drop(columns=nan_heavy_columns)
nan_frac = df.isna().sum() / len(df)

In [10]:
# Columns in which more than a quarter of the values are missing get a
# companion '<column>_isna' column (1 = the value was missing), so that this
# information survives the imputation below:
#   * text columns    - NaN is replaced by ''
#   * numeric columns - NaN is replaced by the column mean
big_nan_columns = nan_frac[nan_frac > 0.25]
display(big_nan_columns)

for colname in big_nan_columns.index:
    df[colname + '_isna'] = df[colname].isna().astype(int)

for colname in ['host_response_time', 'neighborhood_overview', 'host_about']:
    df[colname] = df[colname].fillna('')

for colname in ['host_response_rate', 'host_acceptance_rate']:
    column_mean = df[colname].mean()
    df[colname] = df[colname].fillna(column_mean)

neighborhood_overview    0.425051
host_about               0.523247
host_response_time       0.454758
host_response_rate       0.454758
host_acceptance_rate     0.387867
dtype: float64

In [11]:
# For the 'bedrooms' column:
# values of 33 and 50 look like outliers, so the rows holding them are removed,
# and NaN values are replaced with 0
print(np.unique(df['bedrooms'].to_numpy(), return_counts=True))

# Keep every row whose bedroom count is not one of the outliers. `isin` is
# False for NaN, so rows with a missing bedroom count pass the filter and are
# zero-filled on the next line. `.copy()` makes the result an independent
# frame, so the assignment below is not applied to a slice of the old one.
bedroom_outliers = [33, 50]
df = df[~df['bedrooms'].isin(bedroom_outliers)].copy()
df['bedrooms'] = df['bedrooms'].fillna(0)

(array([ 1.,  2.,  3.,  4.,  5.,  6.,  7., 13., 26., 33., 38., 50., nan]), array([34164,  8482,  2610,   655,   122,    32,    12,     1,     1,
           2,     1,     6,  9016]))


In [12]:
# Impute missing categoricals: an unknown host location gets an explicit
# label, and a 'None' verification entry becomes the text of an empty list
df['host_location'] = df['host_location'].fillna('not specified')
df['host_verifications'] = df['host_verifications'].replace('None', '[]')

In [13]:
# 'host_verifications' contains the text of a list of categorical values for
# each sample; collect the distinct values over the whole column.
# ast.literal_eval only accepts literals, so unlike eval() it cannot execute
# code that might be embedded in the CSV.
verification_lists = df.host_verifications.apply(ast.literal_eval)
# Flattening in Python (instead of np.hstack) keeps empty lists from adding
# float arrays to an otherwise string-typed result
host_ver = np.unique([item for items in verification_lists for item in items])
host_ver

array(['email', 'phone', 'photographer', 'work_email'], dtype='<U32')

In [14]:
# Let's one-hot encode them
from sklearn.preprocessing import MultiLabelBinarizer

# One sparse indicator column per verification method.
# The cell text is parsed with ast.literal_eval instead of eval(), so content
# coming from the CSV is only ever read as a literal and never executed.
mlb = MultiLabelBinarizer(sparse_output=True)

verification_lists = df.pop('host_verifications').apply(ast.literal_eval)
df = df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(verification_lists),
                index=df.index,
                columns=mlb.classes_))


In [15]:
import nltk
# Fetch the NLTK 'stopwords' corpus
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [16]:
# Combined stop-word list: all English entries followed by all French entries
stopwords = [
    word
    for language in ('english', 'french')
    for word in nltk.corpus.stopwords.words(language)
]

In [17]:
def amenity_helper(amenity):
    """Normalise one amenity label.

    Removes ASCII punctuation plus the '–' and '’' characters, collapses
    whitespace runs into single spaces, lower-cases the text and drops every
    word that appears in the notebook-level ``stopwords`` list.

    Returns the remaining words joined by single spaces.
    """
    removal_table = str.maketrans('', '', string.punctuation + '–’')
    cleaned = re.sub(r'\s+', ' ', amenity.translate(removal_table).strip())
    kept = [
        token
        for token in (piece.strip() for piece in cleaned.lower().split(' '))
        if token not in stopwords
    ]
    return ' '.join(kept)

In [18]:
# Number of amenities per listing. Each cell holds the text of a list, which
# is parsed with ast.literal_eval; unlike eval() it never executes code.
df['num_amenities'] = df.amenities.apply(lambda x: len(ast.literal_eval(x)))

In [19]:
from sklearn.preprocessing import MultiLabelBinarizer
import re
import string

def helper(x):
    """Parse one raw 'amenities' cell into a list of normalised labels.

    ``x`` is the text of a list literal. It is parsed with ast.literal_eval
    rather than eval(), so content coming from the CSV is never executed.
    Every entry is then passed through ``amenity_helper``.
    """
    return [amenity_helper(amenity) for amenity in ast.literal_eval(x)]
    
mlb = MultiLabelBinarizer(sparse_output=True)

# Take 'amenities' out of the frame and expand it into one sparse indicator
# column per normalised amenity label
ams = df.pop('amenities')
amenity_matrix = mlb.fit_transform(ams.apply(helper))
cats = pd.DataFrame.sparse.from_spmatrix(
    amenity_matrix, index=df.index, columns=mlb.classes_)

# Only the 250 most frequent amenities are kept; the remaining ones do not
# occur in a meaningful number of listings
amenity_counts = cats.sum(axis=0).sort_values(ascending=False)
most_common = amenity_counts.iloc[:250].index
cats = cats[most_common]
df = df.join(cats)

In [20]:
# A missing 'reviews_per_month' value is replaced with 0
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

In [21]:
# After the steps above only ~2% of the rows still contain a NaN value,
# which is few enough to remove them safely
df.isna().any(axis=1).mean()

0.025682445186583417

In [22]:
# Remove every row that still holds at least one NaN value
df = df.dropna(axis=0)

In [23]:
# Encode the boolean flag columns as integers: 't' becomes 1, anything else 0
# NOTE(review): 'host_identity_verified' also holds 't'/'f' values but is not
# in this list, so it stays as text - confirm whether that is intended.
for colname in ['instant_bookable', 'host_has_profile_pic', 'host_is_superhost']:
    df[colname] = df[colname].eq('t').astype(int)


In [24]:
# Preprocess categorical variables
from sklearn.preprocessing import OneHotEncoder

for colname in ['host_response_time', 'property_type', 'room_type',
                'neighbourhood_cleansed', 'bathrooms_text']:
    # drop='first' leaves out the indicator of the first category, which is
    # why the column names below start at categories_[0][1]
    mlb = OneHotEncoder(drop='first')

    # The encoder expects one row per sample, each holding a single feature
    single_feature_rows = [[value] for value in df.pop(colname).tolist()]
    encoded = mlb.fit_transform(single_feature_rows)
    indicator_names = [colname + '_' + str(category)
                       for category in mlb.categories_[0][1:]]
    df = df.join(
        pd.DataFrame.sparse.from_spmatrix(
            encoded, index=df.index, columns=indicator_names))


In [25]:
def host_for_n_months(x):
    """Flag how long ago ``x`` lies before 2022-12-31.

    The elapsed days are converted to whole 31-day "months". Returns a list
    of five booleans: more than 3, 6, 12, 24 and 60 such months have passed.
    """
    reference_date = pd.Timestamp(year=2022, month=12, day=31)
    months = (reference_date - x).days // 31
    return [months > limit for limit in (3, 6, 12, 24, 60)]

# Turn the per-row flag lists into one boolean column per month threshold and
# swap them in for the original 'host_since' column
host_for = df['host_since'].apply(host_for_n_months)
host_for = pd.DataFrame(host_for.tolist(), index=host_for.index)
host_for.columns = [f'host_for_{i}_months' for i in [3,6,12,24,60]]
df = df.drop(columns=['host_since']).join(host_for)

In [26]:
### Create sentence embeddings out of sentence transformers
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# Every free-text column is replaced by the columns of its sentence
# embedding, named '<column>_<position>'
model = SentenceTransformer('all-MiniLM-L6-v2')
text_columns = ['name', 'description',
                'neighborhood_overview',
                'host_location', 'host_about']
for colname in tqdm(text_columns):
    sentences = df[colname].tolist()
    embeddings = model.encode(sentences, show_progress_bar=True)
    print(f"Embedding size: {embeddings.shape[1]}")
    embedding_columns = [colname + '_' + str(i) for i in range(embeddings.shape[1])]
    embedding_frame = pd.DataFrame(embeddings,
                                   columns=embedding_columns,
                                   index=df.index)
    df = df.drop(columns=[colname]).join(embedding_frame)

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/1678 [00:00<?, ?it/s]

Embedding size: 384


Batches:   0%|          | 0/1678 [00:00<?, ?it/s]

Embedding size: 384


Batches:   0%|          | 0/1678 [00:00<?, ?it/s]

Embedding size: 384


Batches:   0%|          | 0/1678 [00:00<?, ?it/s]

Embedding size: 384


Batches:   0%|          | 0/1678 [00:00<?, ?it/s]

Embedding size: 384


In [27]:
# Sanity check: after all manipulations with the dataset the share of class 1
# should be approximately the same as before
df['target'].mean()

0.8953819787261788

In [28]:
# The final version of the dataset, displayed for a visual check
df

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,latitude,longitude,accommodates,...,host_about_374,host_about_375,host_about_376,host_about_377,host_about_378,host_about_379,host_about_380,host_about_381,host_about_382,host_about_383
0,100.000000,99.000000,0,1.0,1.0,1,t,48.852470,2.358350,2,...,-0.036702,-0.006426,-0.026741,0.034306,0.019295,0.030644,-0.085336,0.010104,0.035925,-0.036641
1,100.000000,72.000000,1,2.0,8.0,1,t,48.859090,2.353150,4,...,-0.017486,0.055021,-0.028222,0.071166,-0.076620,-0.075099,0.113459,-0.023742,-0.068883,-0.053559
2,92.824863,0.000000,0,1.0,1.0,1,t,48.874170,2.342450,2,...,0.050156,0.007319,-0.067589,-0.005215,0.081819,0.037558,-0.004160,0.077768,-0.127035,0.009763
3,92.824863,0.000000,0,2.0,4.0,1,t,48.860060,2.348630,1,...,0.060607,-0.007697,-0.132673,-0.009741,0.057311,0.022254,-0.006917,-0.003019,-0.127199,-0.003113
4,59.000000,30.000000,0,86.0,225.0,1,t,48.855580,2.331190,2,...,0.000394,0.020882,-0.101436,0.044868,0.094903,-0.045515,0.015559,0.039693,-0.078218,-0.040584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55099,92.824863,83.094275,0,1.0,1.0,1,t,48.859871,2.370187,4,...,0.107304,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717
55100,92.824863,100.000000,0,1.0,2.0,1,t,48.851021,2.287371,6,...,0.026508,-0.096726,-0.092025,-0.009683,0.101337,-0.046755,0.002040,0.143603,-0.091213,-0.040202
55101,92.824863,83.094275,0,93.0,108.0,1,f,48.881870,2.317000,2,...,0.107304,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717
55102,92.824863,83.094275,0,5.0,6.0,1,t,48.851777,2.333944,2,...,0.107304,0.011428,0.013367,-0.012747,0.061454,0.035641,0.158746,0.126409,0.046549,-0.015717


In [29]:
# Save the cleaned dataset (without the index) for the modeling step
clean_csv_path = '/content/drive/My Drive/listings_clean_new.csv'
df.to_csv(clean_csv_path, index=False)

  df.to_csv('/content/drive/My Drive/listings_clean_new.csv', index=False)
