In [1]:
import numpy as np
import pandas as pd

# Cleaned Data

In [2]:
airbnb = pd.read_csv('data/airbnb.csv', low_memory=False)

In [3]:
relevantCols = ['amenities', 'first_review', 'host_neighbourhood', 
            'last_review', 'neighbourhood_cleansed', 'nightly_price', 
            'price_per_stay', 'property_type', 'review_scores_rating', 
            'room_type', 'number_of_reviews']

airbnb = airbnb[relevantCols]

In [4]:
airbnb.amenities = airbnb.amenities.str.slice(1,-1).str.replace('"', '').str.split(',')

In [5]:
airbnb.first_review = pd.to_datetime(airbnb.first_review.str.replace('/','-'))
airbnb.last_review = pd.to_datetime(airbnb.last_review.str.replace('/','-'))

  airbnb.first_review = pd.to_datetime(airbnb.first_review.str.replace('/','-'))
  airbnb.last_review = pd.to_datetime(airbnb.last_review.str.replace('/','-'))


In [6]:
airbnb.nightly_price = (airbnb.nightly_price.str.slice(1)
                    .str.strip().str.replace(',', ''))
airbnb.nightly_price = airbnb.nightly_price.replace('', np.NaN).astype(float)
airbnb.price_per_stay = (airbnb.price_per_stay.str.slice(1)
                    .str.strip().str.replace(',', ''))
airbnb.price_per_stay = airbnb.price_per_stay.replace('', np.NaN).astype(float)

In [7]:
# number of days of reviews since first review
airbnb['days_of_reviews'] = (airbnb.last_review - airbnb.first_review).dt.days
# number of amenities
airbnb['num_amenities'] = airbnb.amenities.str.len()

In [8]:
airbnb = airbnb.dropna()
airbnb.head()

Unnamed: 0,amenities,first_review,host_neighbourhood,last_review,neighbourhood_cleansed,nightly_price,price_per_stay,property_type,review_scores_rating,room_type,number_of_reviews,days_of_reviews,num_amenities
1,"[TV, Wifi, Air conditioning, Kitchen, Free par...",2017-04-22,La Jolla,2017-08-31,La Jolla,1400.0,1400.0,Apartment,100.0,Entire home/apt,2.0,131.0,21.0
4,"[TV, Wifi, Kitchen, Free parking on premises, ...",2015-02-22,La Jolla,2018-09-23,La Jolla,110.0,110.0,Condominium,90.0,Private room,2.0,1309.0,11.0
10,"[TV, Cable TV, Internet, Wifi, Air conditionin...",2016-10-10,Black Mountain Ranch,2017-05-27,Rancho Penasquitos,75.0,75.0,Bungalow,93.0,Entire home/apt,3.0,229.0,34.0
12,"[TV, Internet, Wifi, Kitchen, Free parking on ...",2015-11-19,Pacific Beach,2018-07-02,Mission Bay,75.0,4500.0,House,100.0,Private room,27.0,956.0,37.0
13,"[TV, Wifi, Pool, Kitchen, Free parking on prem...",2016-02-21,Pacific Beach,2019-05-12,Pacific Beach,65.0,3900.0,Condominium,97.0,Entire home/apt,12.0,1176.0,32.0


# T-tests, Linear Regression

In [9]:
def decision(pvalue, alpha):
    if pvalue < alpha:
        print(f'reject H0: pvalue={pvalue} ≥ {alpha}')  
    else: 
        print(f'fail to reject H0: pvalue={pvalue} < {alpha}')

alpha = 0.01

In [10]:
import scipy.stats as stats
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind_from_stats

Null: There is no difference in review ratings for housing and apartment Airbnbs

Alternative: There **is** a difference in review ratings for housing and apartment Airbnbs

In [11]:
house = airbnb[airbnb['property_type'] == 'House']['review_scores_rating']
house_mean = house.mean()
house_std = house.std(ddof=1)
house_no = house.size

apt = airbnb[airbnb['property_type'] == 'Apartment']['review_scores_rating']
apt_mean = apt.mean()
apt_std = apt.std(ddof=1)
apt_no = apt.size

tstat, pval = ttest_ind_from_stats(house_mean, house_std, house_no, 
                                   apt_mean, apt_std, apt_no, alternative='two-sided')

decision(pval, alpha)

reject H0: pvalue=2.277684792467291e-14 ≥ 0.01


Now let's see if there is a correlation between number of reviews per day and review rating.

Null: the \# of reviews per day and the review rating are independent

Alternative: the \# of reviews per day and the review rating are **NOT** independent

In [12]:
T_stat, pvalue, _, _ = stats.chi2_contingency(
    pd.crosstab(airbnb['number_of_reviews'] / airbnb['days_of_reviews'], 
                airbnb['review_scores_rating']))

print(f'T_stat = {T_stat}, pvalue = {pvalue}')
decision(pvalue, alpha)

T_stat = 247984.1477994755, pvalue = 1.0
fail to reject H0: pvalue=1.0 < 0.01


Let's also see if there is a correlation between the number of amenities and the review rating.

Null: the # of amenities and the review rating are independent

Alternative: the # of amenities are the review rating are **NOT** Independent

In [40]:
T_stat, pvalue, _, _ = stats.chi2_contingency(
    pd.crosstab(airbnb['num_amenities'], 
                airbnb['review_scores_rating']))

print(f'T_stat = {T_stat}, pvalue = {pvalue}')
decision(pvalue, alpha)

T_stat = 3480.7272609591078, pvalue = 0.019734051576013403
fail to reject H0: pvalue=0.019734051576013403 ≥ 0.01


Seems like we fail to reject the null, which means the two distribution are independent

Now let's make a linear regression model:

In [36]:
import statsmodels.formula.api as smf
full_model_formula = f' review_scores_rating ~ {"+".join(airbnb.columns.drop(["review_scores_rating", "amenities"]))}'

elastic_fit = smf.ols(full_model_formula, data=airbnb).\
                fit_regularized(method='elastic_net', alpha=0.3, L1_wt=1.0)

print(elastic_fit.params)

Intercept                                           90.739091
first_review[T.Timestamp('2010-07-20 00:00:00')]     0.000000
first_review[T.Timestamp('2010-09-09 00:00:00')]     0.000000
first_review[T.Timestamp('2010-12-05 00:00:00')]     0.000000
first_review[T.Timestamp('2011-01-04 00:00:00')]     0.000000
                                                      ...    
nightly_price                                       -0.000150
price_per_stay                                       0.000368
number_of_reviews                                    0.006782
days_of_reviews                                      0.000000
num_amenities                                        0.126631
Length: 3200, dtype: float64


In [39]:
final_params = elastic_fit.params[elastic_fit.params != 0].index[1:]
print(final_params)

print(smf.ols( f'review_scores_rating ~ {"+".join(final_params)}', airbnb).fit().summary())

Index(['nightly_price', 'price_per_stay', 'number_of_reviews',
       'num_amenities'],
      dtype='object')
                             OLS Regression Results                             
Dep. Variable:     review_scores_rating   R-squared:                       0.019
Model:                              OLS   Adj. R-squared:                  0.019
Method:                   Least Squares   F-statistic:                     45.59
Date:                  Tue, 04 Jun 2024   Prob (F-statistic):           5.43e-38
Time:                          17:57:07   Log-Likelihood:                -31427.
No. Observations:                  9372   AIC:                         6.286e+04
Df Residuals:                      9367   BIC:                         6.290e+04
Df Model:                             4                                         
Covariance Type:              nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0

Is there a difference in distribution of airbnbs across neighborhoods?