In the previous Jupyter Notebook, we saw that there is a strong causal relationship between Zillow's housing index and the number of Airbnb properties in an area. We also saw there was a weaker, but still signicant causal relationship between Zillow's housing index and the price of Airbnb properties in an area. Finally, we saw that the quantity of Airbnbs in an area and the price of Airbnb rentals had a weak causal relationship. 

# Using Features to Predict Airbnb Daily Rental Prices

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import datasets, neighbors
from sklearn.tree import export_graphviz
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures

In [4]:
df = pd.read_csv('listings2.csv').replace(np.nan,0)
features = ['bathrooms','bedrooms','beds','review_scores_checkin','review_scores_cleanliness','review_scores_communication','review_scores_location','review_scores_rating','review_scores_value','rate']
df.head()
df['price'] = df['price'].astype(float)
df = df.drop(['Unnamed: 0'],axis =1)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
df

Unnamed: 0,accommodates,amenities,availability_30,bathrooms,bed_type,bedrooms,beds,cancellation_policy,city,has_availability,...,review_scores_location,review_scores_rating,review_scores_value,room_type,state,weekly_price,zipcode,f,t,rate
0,2.0,"{""Wireless Internet"",""Air Conditioning"",Kitche...",0.0,1.0,Real Bed,1.0,1.0,moderate,Asheville,0.0,...,9.0,96.0,10.0,Entire home/apt,NC,$650.00,28806,39.0,326.0,0.106849
1,4.0,"{TV,""Cable TV"",Internet,""Wireless Internet"",""A...",17.0,1.0,Real Bed,1.0,2.0,strict,Asheville,0.0,...,9.0,96.0,9.0,Entire home/apt,NC,0,28806,137.0,228.0,0.375342
2,1.0,"{TV,""Wireless Internet"",""Air Conditioning"",Kit...",29.0,1.0,Real Bed,1.0,1.0,flexible,Asheville,0.0,...,0.0,0.0,0.0,Private room,NC,0,28806,1.0,364.0,0.002740
3,6.0,"{Internet,""Wireless Internet"",""Air Conditionin...",28.0,2.5,Real Bed,1.0,6.0,moderate,Asheville,0.0,...,0.0,0.0,0.0,Shared room,NC,0,28806,4.0,361.0,0.010959
4,6.0,"{Internet,""Wireless Internet"",""Free Parking on...",25.0,2.5,Real Bed,1.0,6.0,moderate,Asheville,0.0,...,9.0,93.0,10.0,Shared room,NC,0,28806,12.0,353.0,0.032877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50317,2.0,"{""Wireless Internet"",Kitchen,""Free parking on ...",29.0,1.0,Real Bed,1.0,1.0,strict,malibu,0.0,...,0.0,0.0,0.0,Private room,CA,0,90265,186.0,179.0,0.509589
50318,10.0,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",29.0,5.0,Real Bed,5.0,5.0,flexible,malibu,0.0,...,0.0,0.0,0.0,Entire home/apt,CA,0,90265,1.0,364.0,0.002740
50319,4.0,"{TV,Internet,""Wireless Internet"",""Air conditio...",28.0,1.0,Real Bed,2.0,2.0,moderate,la habra,0.0,...,0.0,0.0,0.0,Private room,CA,0,90631,194.0,171.0,0.531507
50320,8.0,"{TV,""Wireless Internet"",""Air conditioning"",Kit...",24.0,4.0,Real Bed,3.0,6.0,flexible,malibu,0.0,...,0.0,0.0,0.0,Entire home/apt,CA,0,90265,6.0,359.0,0.016438


## Linear Regression, Ridge Regression, Ridge Regression (Degree 2 Transform)

In [6]:
X = df[features]
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=66)
lm = LinearRegression()
linreg = lm.fit(X_train,y_train)
linreg_score = cross_val_score(linreg,X_train,y_train)

RidgeModel = Ridge(alpha=0.1) 
ridgereg = RidgeModel.fit(X_train, y_train)

pr=PolynomialFeatures(degree=2)
X_train_pr = pr.fit_transform(X_train)
X_test_pr = pr.fit_transform(X_test)
ridgereg_pr = RidgeModel.fit(X_train_pr, y_train)

print("===Linear Regression ()===")
print ("Weights:")
print(linreg.coef_)
print('Intercept:')
print(linreg.intercept_)
print('\n')
print("Cross Validation Score")
print(cross_val_score(linreg,X_test,y_test))
print('\n')

print("===Ridge Regression ()===")
print ("Weights:")
print(ridgereg.coef_)
print('Intercept:')
print(ridgereg.intercept_)
print('\n')
print("Cross Validation Score")
print(cross_val_score(ridgereg,X_test,y_test))

print('\n')
print("===Ridge Regression After Degree 2 Polynomial Transform ()===")
print ("Weights:")
print(ridgereg_pr.coef_)
print('Intercept:')
print(ridgereg_pr.intercept_)
print('\n')
print("Cross Validation Score")
print(cross_val_score(ridgereg_pr,X_test_pr,y_test))


===Linear Regression ()===
Weights:
[172.23732461  79.62837671  -2.9154619  -20.19064992  10.04548173
  -7.65541021  21.65756366   0.32439547 -17.60299709 -23.64215924]
Intercept:
-60.702634898108016


Cross Validation Score
[0.22815604 0.37331713 0.29714146 0.35170695 0.31642972]


===Ridge Regression ()===
Weights:
[ 0.00000000e+00  2.38582471e+01  6.48574511e+00  4.86601777e+01
  2.45803795e+01 -1.82167783e+01  2.92800271e+01 -5.63638157e+01
 -1.07504274e+01  9.22889330e+01 -1.45511198e+02  1.48924485e+01
  6.45204937e+01 -7.66951046e+00 -2.63107450e+01  1.69623856e+01
  4.33584799e+00  1.05837169e+01 -1.48699107e+00 -4.63791718e+00
 -2.78350310e+01  1.50921585e+01 -2.58026983e+01 -1.29680136e+01
 -2.26564851e+00  7.09088279e+00  4.02415124e+00 -5.06071176e-01
  7.58060474e+00 -3.14109919e+01  1.44524783e+00  6.64788069e+00
  5.95728973e-01 -8.63297641e+00  3.92223667e+00  1.20153904e+00
 -1.34358071e+01  1.67876824e+01 -6.12902991e-02 -7.42575610e-01
  4.72443931e-01 -2.27012002e+0

We can see that Linear Regression, Ridge Regression, and Ridge Regression with a degree 2 polynomial transform yields poor predictive power. This indiciates that the price of Airbnb rental is not dependant on the features in a significant manner. 

# Predicting Zillow Housing Index from Airbnb Rentals

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNet
from sklearn import preprocessing


In [14]:
df2 = pd.read_csv('ts_master.csv').drop(['Unnamed: 0'],axis =1)

In [15]:
#normalize data
#df2['value']=(df2['value']-df2['value'].mean())/df2['value'].std()
#df2['num_airbnb']=(df2['num_airbnb']-df2['num_airbnb'].mean())/df2['num_airbnb'].std()
#df2['ave_airbnb_price']=(df2['ave_airbnb_price']-df2['ave_airbnb_price'].mean())/df2['ave_airbnb_price'].std()

In [16]:
df2

Unnamed: 0,date,value,num_airbnb,ave_airbnb_price
0,2010-01,250926.2063,90.745424,118.615277
1,2010-02,250249.7925,90.912863,118.615277
2,2010-03,245685.2327,90.717496,118.615277
3,2010-04,245018.8845,90.800000,118.615277
4,2010-05,245832.8560,90.875099,118.615277
...,...,...,...,...
85,2017-02,335652.6278,93.605824,128.120915
86,2017-03,337094.3182,93.604403,142.199012
87,2017-04,338668.5369,92.694602,139.073489
88,2017-05,340450.8523,92.700284,135.348575


## Linear Regression, Ridge Regression, Ridge Regression (Degree 2 Transform)

In [20]:
features = ['num_airbnb','ave_airbnb_price']
X = df2[features]
y = df2['value']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=66)
lm = LinearRegression()
linreg = lm.fit(X_train,y_train)
linreg_score = cross_val_score(linreg,X_train,y_train)

RidgeModel = Ridge(alpha=0.1) 
ridgereg = RidgeModel.fit(X_train, y_train)

pr=PolynomialFeatures(degree=2)
X_train_pr = pr.fit_transform(X_train)
X_test_pr = pr.fit_transform(X_test)
ridgereg_pr = RidgeModel.fit(X_train_pr, y_train)

print("===Linear Regression ()===")
print ("Weights:")
print(linreg.coef_)
print('Intercept:')
print(linreg.intercept_)
print('\n')
print("Cross Validation Score")
print(cross_val_score(linreg,X_test,y_test))
print('\n')

print("===Ridge Regression ()===")
print ("Weights:")
print(ridgereg.coef_)
print('Intercept:')
print(ridgereg.intercept_)
print('\n')
print("Cross Validation Score")
print(cross_val_score(ridgereg,X_test,y_test))

print('\n')
print("===Ridge Regression After Degree 2 Polynomial Transform ()===")
print ("Weights:")
print(ridgereg_pr.coef_)
print('Intercept:')
print(ridgereg_pr.intercept_)
print('\n')
print("Cross Validation Score")
print(cross_val_score(ridgereg_pr,X_test_pr,y_test))

===Linear Regression ()===
Weights:
[12422.95303683  3158.72519083]
Intercept:
-1250132.366897277


Cross Validation Score
[-1.61163509  0.72076436  0.79751999  0.76561112  0.65704083]


===Ridge Regression ()===
Weights:
[     0.           2734.11017277 -16121.50115027   -878.77861841
   1312.17602964   -394.60943093]
Intercept:
581626.2270865735


Cross Validation Score
[-1.61529469  0.7222711   0.79610593  0.76595246  0.65707104]


===Ridge Regression After Degree 2 Polynomial Transform ()===
Weights:
[     0.           2734.11017277 -16121.50115027   -878.77861841
   1312.17602964   -394.60943093]
Intercept:
581626.2270865735


Cross Validation Score
[-6.17849445  0.6853684   0.1911773   0.75245677  0.67619853]


## Elastic Net Regression

In [17]:
enr = ElasticNet(alpha=1.0, l1_ratio=0.5)
cross_val = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(enr, X, y, scoring='neg_mean_absolute_error', cv=cross_val, n_jobs=-1)
# force scores to be positive
scores = np.absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

Mean MAE: 20265.228 (3376.736)


## MLP Regression

In [18]:
from sklearn.neural_network import MLPRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=1)
mlp_reg = MLPRegressor(hidden_layer_sizes=35, activation='relu',random_state=72, max_iter=500).fit(X_train, y_train)
mlp_reg.predict(X_test)
mlp_reg.score(X_test,y_test) #rquared value



-61.782540359957544