In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, LassoCV, Lasso, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV, KFold , cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

import statsmodels.api as sm
import statsmodels.formula.api as smf

import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import datetime

plt.style.use('seaborn')
sns.set_palette("husl")
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

In [2]:
abnb_df = pd.read_csv('data/listing_final.csv')
abnb_df = abnb_df.drop(columns= ['Unnamed: 0'])

In [3]:
abnb_df.rename(index = str, columns ={'f': 'not_superhost', 't': 'is_superhost', 'zvi' :'rent_index', 'strict_14_with_grace_period' :'strict_14',\
                                     'super_strict_30': 'strict_30', 'super_strict_60': 'strict_60'}, inplace = True)

abnb_df.rename(str.lower , axis ='columns', inplace=True)

In [19]:
abnb_df.columns
abnb_df.price.mean()
abnb_df.price.median()

99.0

In [5]:
X = abnb_df[['accommodates', 'bathrooms', 'bedrooms', 'beds',
       'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people','number_of_reviews', 'rent_index', 'number_of_days_as_host',
       'number_of_amenities']]
y = abnb_df['price']

In [25]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

X = add_constant(abnb_df)
pd.Series([np.sqrt(variance_inflation_factor(X.values, i)) 
               for i in range(X.shape[1])], 
              index=X.columns)

const                          88.508184
host_response_rate              1.174971
accommodates                    1.933762
bathrooms                       1.244250
bedrooms                        1.845592
beds                            1.771975
price                           1.292523
security_deposit                1.113562
cleaning_fee                    1.411137
guests_included                 1.397018
extra_people                    1.082490
availability_60                 3.800357
availability_90                 3.845226
number_of_reviews               1.406811
review_scores_rating            8.637532
review_scores_accuracy          7.910332
review_scores_cleanliness       5.927721
review_scores_checkin           8.071427
review_scores_communication     8.530952
review_scores_location          5.384086
review_scores_value             6.586822
rent_index                      1.098257
number_of_days_as_host          1.154668
number_of_amenities             1.147925
review_period   

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.3, random_state = 93)

In [9]:
degree = 3
model_poly1 = make_pipeline(PolynomialFeatures(degree), LinearRegression())
model_poly1.fit(X_train, y_train)
s = model_poly1.score(X_train, y_train)
r = np.sqrt(mean_squared_error(y_train, model_poly1.predict(X_train)))
print('R^2:', s, 'RMSE:' , r)

R^2: 0.5890874894573377 RMSE: 75.17502793381944


In [10]:
s = model_poly1.score(X_test, y_test)
r = np.sqrt(mean_squared_error(y_test, model_poly1.predict(X_test)))
print('R^2:', s, 'RMSE:' , r)

R^2: -0.12289971425589852 RMSE: 123.29753557960213


In [12]:
ssX = StandardScaler()
X_train_scaled = ssX.fit_transform(X_train)

In [21]:
degree = 4
alphas = [100]
cv = 5

model_poly2 = make_pipeline(PolynomialFeatures(degree), RidgeCV(alphas = alphas, cv = cv))
model_poly2.fit(X_train_scaled, y_train)
s = model_poly2.score(X_train_scaled, y_train) 
r = np.sqrt(mean_squared_error(y_train, model_poly1.predict(X_train_scaled)))
print('For alpha =', alphas, 'R^2:', s, 'RMSE:' , r)

For alpha = [100] R^2: 0.7324857879297131 RMSE: 1444.8700149510753


In [26]:
X_test_scaled = ssX.fit_transform(X_test)

In [27]:
s = model_poly2.score(X_test_scaled, y_test) 
r = np.sqrt(mean_squared_error(y_test, model_poly1.predict(X_test_scaled)))
print('For alpha =', alphas, 'R^2:', s, 'RMSE:' , r)

For alpha = [100] R^2: -5.936403324333513 RMSE: 361.1277615234676


In [None]:
X = abnb_df[['bathrooms', 'bedrooms', 'beds',
       'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people','number_of_reviews', 'rent_index', 'number_of_days_as_host',
       'number_of_amenities']]
y = abnb_df['price']