# Analysis of the Airbnb data of London

In this project, we analyze the Airbnb data of London and try to use the data to predict the review scores of each listing. The data sets are available [here](http://insideairbnb.com/get-the-data.html).

## Reading data

There are three files, covering the calendar, the listings, and the reviews. Among the three, the listings data provide detailed information about the amenities, prices, descriptions, etc. of all the listings. Thus, we focus on the listings data for the moment.

In [1]:
import pandas as pd


# Read the gzip-compressed listings table and rename its `id` column to
# `listing_id` in one chained expression.
listings = (
    pd.read_csv('../Airbnb/src/data_sets/listings.csv.gz')
    .rename(columns={'id': 'listing_id'})
)

As we want to predict the review scores, we remove all the listings whose review score is missing and bin the scores into five groups. Then we clean the data and transform the format of some of the variables that have a (relatively) strong relationship with the review scores.

Notice that special care has to be taken regarding the host verifications and amenities: Each item contains a list of features of each listing in a string format; thus, we first extract all the features in that string and then convert them into multiple variables of binary values where `1` indicates the presence of this feature and `0` indicates absence. Also, there are two amenities regarding missing translations; these are removed from the amenities.

In [2]:
import numpy as np


# Name of the response column that the rest of the notebook predicts.
resp = 'review_scores_rating'

# Keep only the rows that have a review score, then renumber them 0..n-1.
# `drop` is passed by keyword: recent pandas releases no longer accept it
# positionally in `reset_index`.
listings = listings[listings[resp].notnull()].reset_index(drop=True)

# Bin edges are 0, 20, ..., 100, i.e. five intervals labelled 0-4;
# `include_lowest=True` puts a score of exactly 0 into the first bin.
# NOTE(review): assumes scores lie in [0, 100]; anything outside becomes NaN.
listings[resp] = pd.cut(listings[resp], np.arange(0, 120, 20), labels=np.arange(5),
                        include_lowest=True)

In [3]:
def transform_zipcode(df):
    """Normalise the ``zipcode`` column of *df* in place and return *df*.

    Steps, applied in this order:

    1. Missing values become the empty string.
    2. Values are upper-cased and every character other than ``0-9`` / ``A-Z``
       is removed.
    3. Values that are still 8 or more characters long are blanked.
    4. Values that are 5 or more characters long lose their last three
       characters (presumably the inward part of a UK postcode, leaving the
       outward code -- confirm against the data).

    The pattern in step 2 is a regular expression, so ``regex=True`` is passed
    explicitly; newer pandas versions default ``Series.str.replace`` to a
    literal match, which would silently leave the values uncleaned.
    """
    variable = 'zipcode'
    cleaned = (df[variable]
               .fillna('')
               .str.upper()
               .str.replace('[^0-9A-Z]+', '', regex=True))

    # Blank the over-long values first so they are not truncated below.
    cleaned = cleaned.mask(cleaned.str.len() >= 8, '')

    # The length test is evaluated after blanking, as a single mask.
    is_long = cleaned.str.len() >= 5
    cleaned = cleaned.mask(is_long, cleaned.str.slice(0, -3))

    df[variable] = cleaned
    return df


# Clean the raw `zipcode` strings; the column is label-encoded in a later cell.
listings = transform_zipcode(listings)

In [4]:
from sklearn.preprocessing import LabelEncoder


def transform_label(df, variables):
    """Integer-encode the given columns of *df* in place and return *df*.

    For every column name in *variables*, missing values are first replaced
    by the empty string (so they form a category of their own) and the column
    is then overwritten with the codes produced by ``LabelEncoder``.
    """
    label_encoder = LabelEncoder()
    for column in variables:
        df[column] = df[column].fillna('')
        # fit_transform refits the encoder, so reusing one instance is safe.
        codes = label_encoder.fit_transform(df[column])
        df[column] = codes
    return df


# Integer-encode the categorical text columns, including the zipcode that was
# cleaned by transform_zipcode in the previous cell.
listings = transform_label(listings, ['experiences_offered',
                                      'host_response_time',
                                      'zipcode',
                                      'property_type',
                                      'room_type',
                                      'bed_type',
                                      'cancellation_policy'])

In [5]:
def transform_host_since(df):
    """Parse ``host_since`` and add the host's tenure in days as ``host_for``.

    ``host_since`` is overwritten with its datetime form (parsed year-first).
    ``host_for`` is the whole number of days between that date and the moment
    the function runs, so its values depend on the current clock.
    *df* is modified in place and returned.
    """
    since = pd.to_datetime(df['host_since'], yearfirst=True)
    df['host_since'] = since
    tenure = pd.Timestamp.now() - df['host_since']
    df['host_for'] = tenure.dt.days
    return df


# Add `host_for` (days since the host joined); the raw `host_since` column is
# dropped in a later cell.
listings = transform_host_since(listings)

In [6]:
def transform_boolean(df, variables):
    """Recode ``'t'`` / ``'f'`` flag columns of *df* as ``1`` / ``0`` in place.

    Every column named in *variables* is replaced by its mapped version;
    values that are neither ``'t'`` nor ``'f'`` (including missing ones) end
    up as missing. Returns *df*.
    """
    flag_codes = {'f': 0, 't': 1}
    for column in variables:
        recoded = df[column].map(flag_codes, na_action='ignore')
        df[column] = recoded
    return df


# Recode the 't'/'f' flag columns listed below as 1/0.
listings = transform_boolean(listings, ['host_is_superhost',
                                        'host_has_profile_pic',
                                        'host_identity_verified',
                                        'is_location_exact',
                                        'requires_license',
                                        'instant_bookable',
                                        'require_guest_profile_picture',
                                        'require_guest_phone_verification'])

In [7]:
from sklearn.feature_extraction.text import CountVectorizer


def transform_text(df, variables):
    """Append bag-of-words count columns for each text column in *variables*.

    Each named column has its missing values replaced by ``''`` (in place on
    *df*) and is vectorised with a ``CountVectorizer`` limited to 100 features
    with English stop words removed. The resulting term counts are added as
    columns named ``'<variable>.<term>'``.

    Returns a new DataFrame made of *df*'s columns followed by the new count
    columns; the source text columns are kept.
    """
    vectorizer = CountVectorizer(stop_words='english', max_features=100)
    encoded = []
    for variable in variables:
        df[variable] = df[variable].fillna('')
        counts = vectorizer.fit_transform(df[variable])
        # vocabulary_ maps term -> column position; ordering the terms by that
        # position guarantees the labels line up with the matrix columns.
        terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
        # Reuse df's index so the column-wise concat below aligns row by row
        # even when df does not carry a default 0..n-1 index.
        frame = pd.DataFrame(counts.toarray(), columns=terms, index=df.index)
        encoded.append(frame.add_prefix(variable + '.'))
    # `axis` must be given by keyword; recent pandas rejects it positionally.
    return pd.concat([df] + encoded, axis=1)


# Expand `host_verifications` into `host_verifications.<term>` count columns.
listings = transform_text(listings, ['host_verifications'])

In [8]:
def transform_percent(df, variables):
    """Convert percentage strings such as ``'95%'`` into fractions in place.

    For each column in *variables* the ``%`` characters are stripped from both
    ends, the remainder is cast to ``float64`` and divided by 100.
    Returns *df*.
    """
    for column in variables:
        df[column] = df[column].str.strip('%')
        as_number = df[column].astype(np.float64)
        df[column] = as_number / 100
    return df


# Turn the `host_response_rate` percentage strings into fractions.
listings = transform_percent(listings, ['host_response_rate'])

In [9]:
def transform_price(df, variables):
    """Convert price strings such as ``'$1,200.00'`` into floats in place.

    For each column in *variables* the ``$`` characters are stripped from both
    ends, every comma is removed, and the result is cast to ``float64``.
    Returns *df*.
    """
    for column in variables:
        without_symbol = df[column].str.strip('$')
        df[column] = without_symbol.str.replace(',', '')
        df[column] = df[column].astype(np.float64)
    return df


# Parse the `price` strings into floats.
listings = transform_price(listings, ['price'])

In [10]:
def transform_amenities(df):
    """Expand the ``amenities`` string column into per-amenity count columns.

    The raw strings are normalised first: ``:``, ``-``, ``.``, ``/`` and
    spaces become underscores, parentheses are removed, runs of underscores
    collapse to one and the text is lower-cased (presumably so that each
    multi-word amenity is kept as a single token by the vectorizer). The
    column is then passed through ``transform_text``, and the resulting
    ``amenities.*`` columns whose name contains ``'missing'`` are dropped.

    Returns a new DataFrame; the ``amenities`` column of *df* itself is
    overwritten with the normalised strings.
    """
    variable = 'amenities'
    # All three patterns are regular expressions; `regex=True` is explicit
    # because newer pandas versions treat the pattern literally by default.
    df[variable] = (df[variable]
                    .str.replace(r'[:\-\./ ]', '_', regex=True)
                    .str.replace(r'[\(\)]', '', regex=True)
                    .str.replace(r'_+', '_', regex=True)
                    .str.lower())
    df = transform_text(df, [variable])
    # Only look at the generated amenity columns, so an unrelated column that
    # happens to contain 'missing' in its name is never removed.
    prefix = variable + '.'
    columns_to_remove = [column for column in df.columns
                         if column.startswith(prefix) and 'missing' in column]
    # `columns=` replaces the positional axis argument that recent pandas
    # versions no longer accept.
    return df.drop(columns=columns_to_remove)


# Expand `amenities` into `amenities.<term>` count columns.
listings = transform_amenities(listings)

After transforming the format of the variables, we remove those that have a high percentage of missing values and/or have little or no relationship with the review scores.

In [11]:
# Remove the columns that are not used as model inputs: URLs, free text,
# location details, raw versions of already-expanded columns, availability
# data and the per-aspect review scores.
# `columns=` is used instead of a positional axis, which recent pandas
# versions no longer accept in `DataFrame.drop`.
listings = listings.drop(columns=['listing_url',
                                  'scrape_id',
                                  'last_scraped',
                                  'name',
                                  'summary',
                                  'space',
                                  'description',
                                  'neighborhood_overview',
                                  'notes',
                                  'transit',
                                  'access',
                                  'interaction',
                                  'house_rules',
                                  'thumbnail_url',
                                  'medium_url',
                                  'picture_url',
                                  'xl_picture_url',
                                  'host_url',
                                  'host_name',
                                  'host_since',
                                  'host_location',
                                  'host_about',
                                  'host_acceptance_rate',
                                  'host_thumbnail_url',
                                  'host_picture_url',
                                  'host_neighbourhood',
                                  'host_listings_count',
                                  'host_total_listings_count',
                                  'host_verifications',
                                  'host_has_profile_pic',
                                  'host_identity_verified',
                                  'street',
                                  'neighbourhood',
                                  'neighbourhood_cleansed',
                                  'neighbourhood_group_cleansed',
                                  'city',
                                  'state',
                                  'market',
                                  'smart_location',
                                  'country_code',
                                  'country',
                                  'latitude',
                                  'longitude',
                                  'is_location_exact',
                                  'amenities',
                                  'square_feet',
                                  'weekly_price',
                                  'monthly_price',
                                  'security_deposit',
                                  'cleaning_fee',
                                  'guests_included',
                                  'extra_people',
                                  'minimum_nights',
                                  'maximum_nights',
                                  'calendar_updated',
                                  'has_availability',
                                  'availability_30',
                                  'availability_60',
                                  'availability_90',
                                  'availability_365',
                                  'calendar_last_scraped',
                                  'first_review',
                                  'last_review',
                                  'review_scores_accuracy',
                                  'review_scores_cleanliness',
                                  'review_scores_checkin',
                                  'review_scores_communication',
                                  'review_scores_location',
                                  'review_scores_value',
                                  'requires_license',
                                  'license',
                                  'jurisdiction_names',
                                  'instant_bookable',
                                  'cancellation_policy',
                                  'require_guest_profile_picture',
                                  'require_guest_phone_verification',
                                  'reviews_per_month'])

Notice that there are too many variables regarding host verifications and amenities, and some of them may be redundant for predicting the review scores. For example, almost every host is verified by phone, so this variable is of little significance. Therefore, we perform a $\chi^{2}$ test to shrink the feature space, keeping only the features with a significantly small $p$-value.

In [12]:
from itertools import compress
from sklearn.feature_selection import chi2


def remove_redundant_features(df, variables):
    """Drop expanded feature columns that fail a chi-squared test.

    For each name in *variables*, every column of *df* whose name contains
    that name is tested against the response column ``resp`` with
    ``sklearn.feature_selection.chi2`` (rows with missing values in any of
    the involved columns are excluded from the test). Columns whose p-value
    is greater than 0.05 are removed.

    Returns the reduced DataFrame; *df* itself is not modified.
    """
    for variable in variables:
        # Look the columns up on the argument, not on the module-level
        # `listings`, so the function works for whatever frame it is given.
        variables_list = [item for item in df.columns if variable in item]
        if not variables_list:
            # Nothing to test for this group; chi2 would fail on no features.
            continue
        tmp = df[variables_list + [resp]].dropna()
        _, p_val = chi2(tmp[variables_list], tmp[resp])
        insignificant = list(compress(variables_list, p_val > 0.05))
        # `columns=` replaces the positional axis argument that recent pandas
        # versions no longer accept.
        df = df.drop(columns=insignificant)
    return df


# Prune the expanded host-verification and amenity columns by chi-squared test.
listings = remove_redundant_features(listings, ['host_verifications', 'amenities'])

After removing the redundant variables, we split the data into training and test sets.

In [13]:
from sklearn.model_selection import train_test_split


# Split with train_test_split's defaults; no `random_state` is passed, so the
# partition changes from run to run.
train, test = train_test_split(listings)
# Renumber both parts 0..n-1. `drop` is passed by keyword because recent
# pandas versions no longer accept it positionally in `reset_index`.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

Notice that there are two variables of high cardinality, namely the host ID and the zipcode. We transform them by blending the prior and the posterior of each class of the variables using an S-shaped function parameterized by the number of samples in that class: when we have more samples in that class, we place more trust in the posterior; when we have few samples, we rely more on the prior, as the posterior estimate would be highly unreliable.

In [14]:
# Sample count at which the blending weight equals exactly 0.5.
K = 5
# Scale of the S-shaped curve: larger values make the transition flatter.
F = 5
# Relative amplitude of the multiplicative uniform noise applied to the
# encoded probabilities.
R = 0.01


def compute_lambda(count):
    """Return the logistic blending weight for a sample count.

    The weight is ``1 / (1 + exp(-(count - K) / F))``: it is 0.5 at
    ``count == K`` and approaches 1 as ``count`` grows. Works element-wise on
    NumPy arrays as well as on scalars.
    """
    exponent = -(count - K) / F
    denominator = 1 + np.exp(exponent)
    return 1 / denominator


def compute_prob(row):
    """Blend a prior and a posterior with a given weight.

    *row* must unpack into exactly three items, in the order
    ``(prior, posterior, lambda_val)``. The result is
    ``lambda_val * posterior + (1 - lambda_val) * prior``.
    """
    base_rate, observed_rate, weight = row
    from_prior = (1 - weight) * base_rate
    from_posterior = weight * observed_rate
    return from_posterior + from_prior


def transform_train_test(train, test, variables):
    """Replace high-cardinality columns by smoothed per-class target rates.

    For every column in *variables* and every response class ``i`` in 0..4,
    a column ``'<variable>.prob.<i>'`` is added holding::

        lambda * posterior + (1 - lambda) * prior

    where, using the training data only,

    * ``prior`` is the overall share of class ``i``,
    * ``posterior`` is the share of class ``i`` among the rows having the
      same value of the column,
    * ``lambda`` is ``compute_lambda`` of the number of such rows that are in
      class ``i``. Because the weight is computed per class, the five
      blended values of a row are not guaranteed to sum to one.

    Each encoded value is then multiplied by ``1 + u * R`` with ``u`` drawn
    uniformly from [-0.5, 0.5), independently for train and test. The
    original column is dropped. Values of the column that occur only in
    *test* (or are missing) receive missing encodings.

    The statistics are built from one count table per column instead of
    chained ``reset_index`` / ``pivot`` / positional ``drop`` calls, whose
    column naming and signatures differ between pandas versions.

    Parameters
    ----------
    train, test : DataFrame
        Both must contain the columns in *variables*; *train* must also
        contain the response column ``resp`` with values 0..4 and no missing
        entries.
    variables : list of str
        Columns to encode.

    Returns
    -------
    (train, test) : tuple of DataFrame
        New frames with the encoded columns appended.
    """
    classes = np.arange(5)
    # The response may be categorical; plain integers keep the count table's
    # column labels simple.
    target = train[resp].astype(np.int64)
    # Overall class shares, with an explicit 0 for any class absent from train.
    prior = (target.value_counts(normalize=True)
             .reindex(classes, fill_value=0.0)
             .to_numpy())

    for variable in variables:
        prob_columns = [variable + '.prob.' + str(i) for i in classes]

        # Rows: distinct values of the column; columns: response classes.
        counts = (pd.crosstab(train[variable], target)
                  .reindex(columns=classes, fill_value=0))
        posterior = counts.div(counts.sum(axis=1), axis=0)
        lambda_val = compute_lambda(counts)

        # compute_prob is pure arithmetic, so it applies to whole tables at
        # once; `prior` broadcasts across the rows.
        blended = compute_prob((prior, posterior, lambda_val))
        blended.columns = prob_columns
        tmp = blended.rename_axis(index=variable).reset_index()

        # Random draws happen in the same order as before (train, then test).
        train = train.merge(tmp, how='left', on=variable).drop(columns=variable)
        train[prob_columns] \
            *= (1 + np.random.uniform(-0.5, 0.5, (len(train), len(classes))) * R)
        test = test.merge(tmp, how='left', on=variable).drop(columns=variable)
        test[prob_columns] \
            *= (1 + np.random.uniform(-0.5, 0.5, (len(test), len(classes))) * R)

    return train, test

In [15]:
# Encode the two high-cardinality columns using statistics from `train` only.
train, test = transform_train_test(train, test, ['host_id', 'zipcode'])

Finally, we use the transformed data to train an XGBoost classifier to predict the review scores.

In [16]:
# Separate the model inputs from the response; `listing_id` is an identifier,
# not a feature. `columns=` replaces the positional axis argument that recent
# pandas versions no longer accept in `DataFrame.drop`.
X_train = train.drop(columns=[resp, 'listing_id'])
y_train = train[resp]
X_test = test.drop(columns=[resp, 'listing_id'])
y_test = test[resp]

In [17]:
import xgboost as xgb

# Booster configuration: a five-class softmax objective evaluated with
# multi-class log loss.
# NOTE(review): `silent` looks deprecated in newer XGBoost releases in favour
# of `verbosity` -- confirm against the installed version.
params = {
    'silent': 1,
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'objective': 'multi:softmax',
    'num_class': 5,
    'eval_metric': 'mlogloss',
}
num_boost_round = 1000

# Fit on the training matrix.
d_train = xgb.DMatrix(X_train, label=y_train)
bst = xgb.train(params, d_train, num_boost_round=num_boost_round)

# Predict on the held-out features; `pred` is not compared with `y_test`
# in this cell.
d_test = xgb.DMatrix(X_test)
pred = bst.predict(d_test)