In [3]:
import pandas as pd
import csv
import re
import statsmodels.api as sm
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [4]:
file_url = '/Users/xzhou/github/project_archives/files_airbnb/{}/calendar.csv'

sf_file = file_url.format('sf_airbnb')
ok_file = file_url.format('ok_airbnb')
sc_file = file_url.format('sc_airbnb')
scz_file = file_url.format('scz_airbnb')

In [5]:
pricing_sf = pd.read_csv(sf_file, parse_dates=['date'])
pricing_ok = pd.read_csv(ok_file, parse_dates=['date'])
pricing_sc = pd.read_csv(sc_file, parse_dates=['date'])
pricing_scz = pd.read_csv(scz_file, parse_dates=['date'])

In [6]:
print(pricing_sf.shape, pricing_ok.shape, pricing_sc.shape, pricing_scz.shape)

(2420680, 4) (1057770, 4) (2068820, 4) (573050, 4)


In [7]:
frames = [pricing_sf, pricing_ok, pricing_sc, pricing_scz]
keys = ['pricing_sf', 'pricing_ok', 'pricing_sc', 'pricing_scz']

In [8]:
df_pricing = pd.concat(frames, keys=keys)

In [9]:
df_pricing.listing_id.nunique()

16768

In [10]:
df_pricing.head(20)

Unnamed: 0,Unnamed: 1,listing_id,date,available,price
pricing_sf,0,958,2019-08-05,f,
pricing_sf,1,958,2019-08-04,f,
pricing_sf,2,958,2019-08-03,f,
pricing_sf,3,958,2019-08-02,f,
pricing_sf,4,958,2019-08-01,f,
pricing_sf,5,958,2019-07-31,f,
pricing_sf,6,958,2019-07-30,f,
pricing_sf,7,958,2019-07-29,f,
pricing_sf,8,958,2019-07-28,f,
pricing_sf,9,958,2019-07-27,f,


In [11]:
# Filter down to houses that are available and have listing prices

df_pricing_smaller = df_pricing[df_pricing['available']=='t']

In [12]:
df_pricing_smaller.shape

(2853357, 4)

In [13]:
df_pricing_smaller.listing_id.nunique()

14770

In [14]:
df_pricing_smaller.date.describe(include='all')

count                 2853357
unique                    389
top       2018-11-03 00:00:00
freq                    10393
first     2018-08-06 00:00:00
last      2019-08-29 00:00:00
Name: date, dtype: object

In [15]:
df_pricing_smaller.columns

Index(['listing_id', 'date', 'available', 'price'], dtype='object')

In [16]:
def currency_to_float(string):
    """
    Convert currency to float
    """
    float_ = float(str(string).strip('$').replace(',',''))
    return float_

In [17]:
# Convert price to float format

df_pricing_smaller['price'] = df_pricing_smaller['price'].apply(lambda x: currency_to_float(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [18]:
# Add a month column based on date

df_pricing_smaller['month'] = df_pricing_smaller['date'].dt.month

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [19]:
# Add a day_of_week column based on Date. Use default format: Monday=0, Sunday=6

df_pricing_smaller['day_of_week'] = df_pricing_smaller['date'].dt.weekday

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [20]:
df_pricing_smaller.head()

Unnamed: 0,Unnamed: 1,listing_id,date,available,price,month,day_of_week
pricing_sf,63,958,2019-05-02,t,181.0,5,3
pricing_sf,64,958,2019-05-01,t,181.0,5,2
pricing_sf,65,958,2019-04-30,t,181.0,4,1
pricing_sf,66,958,2019-04-29,t,181.0,4,0
pricing_sf,67,958,2019-04-28,t,181.0,4,6


In [21]:
df_pricing_smaller.shape

(2853357, 6)

In [22]:
df_pricing_smaller.describe(include ='all')

Unnamed: 0,listing_id,date,available,price,month,day_of_week
count,2853357.0,2853357,2853357,2853357.0,2853357.0,2853357.0
unique,,389,1,,,
top,,2018-11-03 00:00:00,t,,,
freq,,10393,2853357,,,
first,,2018-08-06 00:00:00,,,,
last,,2019-08-29 00:00:00,,,,
mean,14055230.0,,,213.2333,6.571592,2.988646
std,8563144.0,,,703.5042,3.629179,2.003061
min,958.0,,,10.0,1.0,0.0
25%,6453774.0,,,80.0,3.0,1.0


In [23]:
# Count how many months each house was listed in the last year

df_month_count = df_pricing_smaller.groupby(['listing_id']).month.nunique().reset_index()
df_month_count.rename(columns={'month':'month_count'}, inplace=True)
df_month_count.head()

Unnamed: 0,listing_id,month_count
0,958,10
1,3083,5
2,3264,3
3,5739,2
4,5858,12


In [24]:
# Count how many days of week each house was listed in the last year

df_day_of_week_count = df_pricing_smaller.groupby(['listing_id']).day_of_week.nunique().reset_index()
df_day_of_week_count.rename(columns={'day_of_week':'day_of_week_count'}, inplace=True)
df_day_of_week_count.head()

Unnamed: 0,listing_id,day_of_week_count
0,958,7
1,3083,7
2,3264,7
3,5739,7
4,5858,7


In [25]:
df_pricing_with_counts = pd.merge(left=df_pricing_smaller, 
                                  right=df_month_count,
                                  how='left', 
                                  left_on='listing_id', 
                                  right_on='listing_id')
df_pricing_with_counts.head()

Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count
0,958,2019-05-02,t,181.0,5,3,10
1,958,2019-05-01,t,181.0,5,2,10
2,958,2019-04-30,t,181.0,4,1,10
3,958,2019-04-29,t,181.0,4,0,10
4,958,2019-04-28,t,181.0,4,6,10


In [27]:
df_pricing_with_counts = pd.merge(left=df_pricing_with_counts, 
                                  right=df_day_of_week_count,
                                  how='left', 
                                  left_on='listing_id', 
                                  right_on='listing_id')
df_pricing_with_counts.head()

Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count,day_of_week_count
0,958,2019-05-02,t,181.0,5,3,10,7
1,958,2019-05-01,t,181.0,5,2,10,7
2,958,2019-04-30,t,181.0,4,1,10,7
3,958,2019-04-29,t,181.0,4,0,10,7
4,958,2019-04-28,t,181.0,4,6,10,7


In [29]:
df_pricing_with_counts.describe(include='all')

Unnamed: 0,listing_id,date,available,price,month,day_of_week,month_count,day_of_week_count
count,2853357.0,2853357,2853357,2853357.0,2853357.0,2853357.0,2853357.0,2853357.0
unique,,389,1,,,,,
top,,2018-11-03 00:00:00,t,,,,,
freq,,10393,2853357,,,,,
first,,2018-08-06 00:00:00,,,,,,
last,,2019-08-29 00:00:00,,,,,,
mean,14055230.0,,,213.2333,6.571592,2.988646,10.20866,6.995392
std,8563144.0,,,703.5042,3.629179,2.003061,2.696574,0.1206497
min,958.0,,,10.0,1.0,0.0,1.0,1.0
25%,6453774.0,,,80.0,3.0,1.0,9.0,7.0


In [30]:
df_pricing_with_counts.price.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

count    2.853357e+06
mean     2.132333e+02
std      7.035042e+02
min      1.000000e+01
5%       4.100000e+01
10%      5.400000e+01
25%      8.000000e+01
50%      1.350000e+02
75%      2.250000e+02
90%      3.850000e+02
95%      5.500000e+02
max      1.024520e+05
Name: price, dtype: float64

In [31]:
# Remove outliers for pricing. Use 5% and 95% percentiles as cutoff

df_pricing_with_counts = df_pricing_with_counts[(df_pricing_with_counts.price>=40)
                                                & (df_pricing_with_counts.price<=550)]

In [32]:
df_pricing_with_counts.shape

(2599154, 8)

In [33]:
# Drill down to houses that price available in 12 months and 7 days a week

df_pricing_available = df_pricing_with_counts[
                                              (df_pricing_with_counts['month_count']==12)
                                              & (df_pricing_with_counts['day_of_week_count']==7)
                                             ]

In [34]:
df_pricing_available.shape

(1497091, 8)

In [35]:
df_pricing_available.listing_id.nunique()

4582

In [36]:
df_pricing_available.isnull().sum()

listing_id           0
date                 0
available            0
price                0
month                0
day_of_week          0
month_count          0
day_of_week_count    0
dtype: int64

In [37]:
# Group by month by weekday, and calculate an average pricing

df_pricing_by_month_weekday = df_pricing_available.groupby(['listing_id', 'month', 'day_of_week']) \
                              .price.mean()  \
                              .reset_index()
df_pricing_by_month_weekday.head()

Unnamed: 0,listing_id,month,day_of_week,price
0,5858,1,0,235.0
1,5858,1,1,235.0
2,5858,1,2,235.0
3,5858,1,3,235.0
4,5858,1,4,235.0


In [38]:
df_pricing_by_month_weekday.describe(include='all')

Unnamed: 0,listing_id,month,day_of_week,price
count,370857.0,370857.0,370857.0,370857.0
mean,13507310.0,6.455588,3.004762,165.770394
std,8696651.0,3.462933,2.003016,107.889496
min,5858.0,1.0,0.0,40.0
25%,5632672.0,3.0,1.0,85.0
50%,13782240.0,6.0,3.0,135.0
75%,21261870.0,10.0,5.0,215.0
max,28152470.0,12.0,6.0,550.0


In [39]:
# Check for null values if any

df_pricing_by_month_weekday.isnull().sum()

listing_id     0
month          0
day_of_week    0
price          0
dtype: int64

In [40]:
# Rename the average price column to avoid confusion

df_pricing_by_month_weekday.rename(columns={'price':'price_by_month_week'}, inplace=True)

In [41]:
df_pricing_by_month_weekday.shape

(370857, 4)

In [99]:
def get_month(number):
    """
    Given a number, return the name of month.
    
    """
    month_lookup = {1:'Jan.', 2:'Feb.', 3:'Mar.', 4:'Apr.', 5:'May.', 6:'Jun.',
                    7:'Jul.', 8:'Aug.', 9:'Sep.', 10:'Oct.', 11:'Nov.', 12:'Dec.'}
    
    return month_lookup[number]

In [100]:
def get_weekdays(number):
    """
    Given a number, return the name of month.
    
    """
    weekday_lookup = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 
                      4:'Friday', 5:'Saturday', 6:'Sunday'}
    
    return weekday_lookup[number]

In [102]:
df_pricing_by_month_weekday['month'] = df_pricing_by_month_weekday['month']  \
                                       .apply(lambda x: get_month(x))

In [103]:
df_pricing_by_month_weekday['day_of_week'] = df_pricing_by_month_weekday['day_of_week']  \
                                             .apply(lambda x: get_weekdays(x))

In [112]:
df_pricing_final = pd.get_dummies(df_pricing_by_month_weekday, 
                                  prefix='',
                                  prefix_sep='',
                                  columns=['month', 'day_of_week'])

In [113]:
df_pricing_final.columns

Index(['listing_id', 'price_by_month_week', 'Apr', 'Aug', 'Dec', 'Feb', 'Jan',
       'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep', 'Friday', 'Monday',
       'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'],
      dtype='object')

In [44]:
df_pricing_final.shape

(370857, 21)

In [46]:
pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.pkl'

pkl_path = pkl_file.format('listings_with_selected_features')

df_listings_reviews = pd.read_pickle(pkl_path)
df_listings_reviews.head()

Unnamed: 0,id,bedrooms,cleaning_fee,room_type_Entire home/apt,reviews_per_month,city_San Francisco,city_Oakland,bathrooms,accommodates,number_of_reviews,...,neighbourhood_cleansed_Palo Alto,host_is_superhost_f,amenities_Lock on bedroom door,amenities_Private entrance,amenities_Familykid friendly,district_Santa Clara,neighbourhood_cleansed_San Jose,host_identity_verified_f,ref_district,ref_jurisdiction
0,958,1.0,2.0,1,1.38,1,0,1.0,3,152,...,0,0,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
1,5858,2.0,2.0,1,0.99,1,0,1.0,5,112,...,0,1,0,1,1,0,0,0,San Francisco,SAN FRANCISCO
2,7918,1.0,1.69897,0,0.16,1,0,4.0,2,17,...,0,1,1,1,0,0,0,0,San Francisco,SAN FRANCISCO
3,8142,1.0,1.69897,0,0.15,1,0,4.0,2,7,...,0,1,1,1,1,0,0,0,San Francisco,SAN FRANCISCO
4,8567,2.0,2.09691,1,0.27,1,0,1.0,6,30,...,0,1,0,0,1,0,0,1,San Francisco,SAN FRANCISCO


In [47]:
df_listings_reviews.shape

(10352, 33)

In [60]:
df_final = pd.merge(left=df_pricing_final, 
                    right=df_listings_reviews,
                    how='inner',
                    left_on='listing_id',
                    right_on='id')

In [61]:
df_final.shape

(245629, 54)

In [62]:
df_final.isnull().sum()

listing_id                                  0
price_by_month_week                         0
month_1                                     0
month_2                                     0
month_3                                     0
month_4                                     0
month_5                                     0
month_6                                     0
month_7                                     0
month_8                                     0
month_9                                     0
month_10                                    0
month_11                                    0
month_12                                    0
day_of_week_0                               0
day_of_week_1                               0
day_of_week_2                               0
day_of_week_3                               0
day_of_week_4                               0
day_of_week_5                               0
day_of_week_6                               0
id                                

In [63]:
df_final.shape

(245629, 54)

In [64]:
df_final.columns

Index(['listing_id', 'price_by_month_week', 'month_1', 'month_2', 'month_3',
       'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
       'month_10', 'month_11', 'month_12', 'day_of_week_0', 'day_of_week_1',
       'day_of_week_2', 'day_of_week_3', 'day_of_week_4', 'day_of_week_5',
       'day_of_week_6', 'id', 'bedrooms', 'cleaning_fee',
       'room_type_Entire home/apt', 'reviews_per_month', 'city_San Francisco',
       'city_Oakland', 'bathrooms', 'accommodates', 'number_of_reviews',
       'host_listings_count', 'review_scores_rating_log10', 'host_yrs',
       'guests_included', 'calculated_host_listings_count',
       'review_scores_location_log10', 'beds', 'yrs_since_first_review',
       'host_response_rate', 'amenities_Air conditioning',
       'review_scores_value_log10', 'jurisdiction_names_Santa Cruz County, CA',
       'amenities_Cable TV', 'neighbourhood_cleansed_Palo Alto',
       'host_is_superhost_f', 'amenities_Lock on bedroom door',
       'ameniti

In [65]:
# Drop duplicate and reference columns

df_final.drop(['id','ref_district', 'ref_jurisdiction' ], axis=1, inplace=True)

In [66]:
df_final.shape

(245629, 51)

In [67]:
# Create correlation matrix
corr_matrix = df_final.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.90
to_drop = [column for column in upper.columns if any(upper[column] > 0.90)]

to_drop

[]

In [68]:
# Drop features that have high correlation

df_final.drop(to_drop, axis=1, inplace=True)

In [69]:
df_final.shape

(245629, 51)

In [70]:
pkl_file = '/Users/xzhou/github/project_archives/files_airbnb/{}.pkl'
pkl_path = pkl_file.format('listings_seasonality')

try: 
    with open(pkl_path, 'rb') as file:
        df_final = pd.read_pickle(file)
except:
    with open(pkl_path, 'wb') as file:
        pd.to_pickle(df_final, file)

In [72]:
# Perform multiple modeling for price prediction

X = df_final.drop(['listing_id', 'price_by_month_week'], axis=1)
y = df_final['price_by_month_week']

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, random_state=42)

In [74]:
ss = StandardScaler()
X_train_trans = ss.fit_transform(X_train)
X_test_trans = ss.transform (X_test)

In [77]:
lr = linear_model.LinearRegression()
lr.fit(X_train_trans, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [78]:
lr.score(X_test_trans, y_test)

0.6211990071126822

In [79]:
y_predict = lr.predict(X_test_trans)

In [80]:
r2_score(y_predict, y_test)

0.38097753560682224

In [81]:
mean_squared_error(y_predict, y_test)

4152.883413095962

In [82]:
model=sm.OLS(y_train, sm.add_constant(X_train_trans))
fit=model.fit()
fit.summary()

0,1,2,3
Dep. Variable:,price_by_month_week,R-squared:,0.615
Model:,OLS,Adj. R-squared:,0.615
Method:,Least Squares,F-statistic:,5726.0
Date:,"Mon, 17 Sep 2018",Prob (F-statistic):,0.0
Time:,01:35:10,Log-Likelihood:,-960380.0
No. Observations:,171940,AIC:,1921000.0
Df Residuals:,171891,BIC:,1921000.0
Df Model:,48,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,160.9735,0.156,1032.138,0.000,160.668,161.279
x1,-6.923e+11,8.62e+11,-0.803,0.422,-2.38e+12,9.97e+11
x2,-6.947e+11,8.65e+11,-0.803,0.422,-2.39e+12,1e+12
x3,-6.921e+11,8.61e+11,-0.803,0.422,-2.38e+12,9.96e+11
x4,-6.906e+11,8.6e+11,-0.803,0.422,-2.38e+12,9.94e+11
x5,-6.938e+11,8.64e+11,-0.803,0.422,-2.39e+12,9.99e+11
x6,-6.947e+11,8.65e+11,-0.803,0.422,-2.39e+12,1e+12
x7,-6.935e+11,8.63e+11,-0.803,0.422,-2.39e+12,9.98e+11
x8,-6.747e+11,8.4e+11,-0.803,0.422,-2.32e+12,9.71e+11

0,1,2,3
Omnibus:,35245.586,Durbin-Watson:,2.003
Prob(Omnibus):,0.0,Jarque-Bera (JB):,107792.697
Skew:,1.061,Prob(JB):,0.0
Kurtosis:,6.247,Cond. No.,481000000000000.0


In [84]:
rf = RandomForestRegressor(n_estimators=500, 
                           criterion='mse', 
                           random_state=3, 
                           n_jobs=-1)
rf.fit(X_train_trans, y_train)

y_train_pred = rf.predict(X_train_trans)
y_test_pred = rf.predict(X_test_trans)

print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

MSE train: 6.275, test: 46.334
R^2 train: 0.999, test: 0.996


In [85]:
df_final.listing_id.nunique()

3006

In [86]:
# Double check on price range. The range is as expected

y.describe(include='all')

count    245629.000000
mean        161.184664
std         104.196466
min          40.000000
25%          85.000000
50%         129.000000
75%         210.000000
max         550.000000
Name: price_by_month_week, dtype: float64

In [93]:
# Map feature importance with labels

feature_labels = X.columns
feature_tuples = zip (feature_labels, rf.feature_importances_)
feature_lists = [list(feature_tuple) for feature_tuple in feature_tuples]

In [94]:
# Order features based on importance

feature_lists.sort(key=lambda x: abs(x[1]), reverse=True)
feature_lists

[['bedrooms', 0.41539271239871695],
 ['cleaning_fee', 0.07056349520807106],
 ['room_type_Entire home/apt', 0.06710217614155958],
 ['accommodates', 0.06152756207677519],
 ['reviews_per_month', 0.04820969602586177],
 ['host_listings_count', 0.04268704680364848],
 ['number_of_reviews', 0.030449019423465874],
 ['bathrooms', 0.026738801666754766],
 ['review_scores_rating_log10', 0.025195363282753944],
 ['calculated_host_listings_count', 0.022218468730701723],
 ['host_yrs', 0.017645781474501483],
 ['city_San Francisco', 0.01761570948821343],
 ['host_response_rate', 0.015216314152970763],
 ['yrs_since_first_review', 0.011592553554916004],
 ['guests_included', 0.0108933618394976],
 ['jurisdiction_names_Santa Cruz County, CA', 0.010732405026194698],
 ['beds', 0.00936549182893976],
 ['city_Oakland', 0.009356596426260801],
 ['host_is_superhost_f', 0.008670821525023869],
 ['host_identity_verified_f', 0.00781472157957092],
 ['review_scores_value_log10', 0.007519074342685424],
 ['review_scores_locat

In [96]:
features = [list_[0] for list_ in feature_lists]
features

['bedrooms',
 'cleaning_fee',
 'room_type_Entire home/apt',
 'accommodates',
 'reviews_per_month',
 'host_listings_count',
 'number_of_reviews',
 'bathrooms',
 'review_scores_rating_log10',
 'calculated_host_listings_count',
 'host_yrs',
 'city_San Francisco',
 'host_response_rate',
 'yrs_since_first_review',
 'guests_included',
 'jurisdiction_names_Santa Cruz County, CA',
 'beds',
 'city_Oakland',
 'host_is_superhost_f',
 'host_identity_verified_f',
 'review_scores_value_log10',
 'review_scores_location_log10',
 'amenities_Cable TV',
 'amenities_Lock on bedroom door',
 'amenities_Familykid friendly',
 'amenities_Air conditioning',
 'neighbourhood_cleansed_San Jose',
 'day_of_week_5',
 'day_of_week_4',
 'amenities_Private entrance',
 'month_9',
 'neighbourhood_cleansed_Palo Alto',
 'month_11',
 'month_10',
 'month_8',
 'month_12',
 'month_7',
 'month_6',
 'district_Santa Clara',
 'month_1',
 'month_5',
 'month_2',
 'month_3',
 'month_4',
 'day_of_week_6',
 'day_of_week_3',
 'day_of_wee

In [95]:
columns =['listing_id']
columns.extend