# Importing Packages

In [21]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,cross_val_score,KFold,RandomizedSearchCV
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.preprocessing import LabelEncoder
import time
import re
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
df=pd.read_csv('newyork_airbnb.csv')

now we will convert amenities column ie. we will extract every single values and then join them to our dataset

In [3]:
# We create an array with the features we want to keep.
selected_features = ['name', 'neighbourhood_cleansed', 'room_type', 'guests_included', 'minimum_nights',
                     'number_of_reviews', 'review_scores_rating', 'amenities', 'property_type',
                     'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type', 'price']
selected_df= df.copy()[selected_features]
selected_df.rename(columns = {'neighbourhood_cleansed':'neighbourhood'}, inplace = True)

In [4]:
selected_df['price']=selected_df['price'].apply(lambda x: x.replace('$',''))
selected_df['price']=selected_df['price'].apply(lambda x: float(x.replace(',','')))

In [5]:

##%%time
regex = r"{([^}]*)}"
regex2 = r"translation.\w\D+.."
listings_cp = selected_df.copy()
listings_cp['amenities'] = selected_df['amenities'].map(lambda amns: re.search(regex, amns).group(1))
listings_cp['amenities'] = selected_df['amenities'].map(lambda amns: re.sub(regex2, '', amns))
listings_cp['amenities'] = selected_df['amenities'].map(lambda amns: amns.replace("\"", ""))

In [6]:

# The code for adding the amenities colums is currently commented for practicality
#amenity_ohe = listings_cp.amenities.str.get_dummies(sep = ",")
#amenities_cols = amenity_ohe.columns.values
# dataset = pd.concat([listings_cp, amenity_ohe], axis=1)
dataset = selected_df.query('price <= 500')
dataset = dataset.drop('amenities', axis=1)
dataset = dataset.drop('name', axis=1)

In [7]:
dataset.isnull().sum()

neighbourhood               0
room_type                   0
guests_included             0
minimum_nights              0
number_of_reviews           0
review_scores_rating    21944
property_type               0
accommodates                0
bathrooms                 129
bedrooms                  148
beds                      928
bed_type                    3
price                       0
dtype: int64

In [8]:
dataset.dropna(inplace=True)


# Handling Null Values  

Since there was many empty columns in our dataset so we have to treat them before fitting to the model

In [9]:
# The numeric features that present null values are: review_scores_rating, bathrooms, bedrooms
# and beds from previous notebook https://github.com/Summi-bhai/Airbnb_dataset/blob/master/Analysis_Part_1.ipynb

dataset['review_scores_rating'] = dataset['review_scores_rating'].fillna(dataset['review_scores_rating'].median())
dataset['bathrooms'] = dataset['bathrooms'].fillna(dataset['bathrooms'].median())
dataset['bedrooms'] = dataset['bedrooms'].fillna(dataset['bedrooms'].median())
dataset['beds'] = dataset['beds'].fillna(dataset['beds'].median())

# Converting categorical features to float


In [10]:
dataset.dtypes

neighbourhood            object
room_type                object
guests_included           int64
minimum_nights            int64
number_of_reviews         int64
review_scores_rating    float64
property_type            object
accommodates              int64
bathrooms               float64
bedrooms                float64
beds                    float64
bed_type                 object
price                   float64
dtype: object

In [11]:

# it was showing error with bed_type that's why we will be handling it by our hands
cat_columns = ['neighbourhood', 'room_type', 'property_type','bed_type']
encoder=LabelEncoder()
for col in cat_columns:
    dataset[col]=encoder.fit_transform(dataset[col])

# converting features to float
dataset = dataset.astype(float)
dataset.dtypes

neighbourhood           float64
room_type               float64
guests_included         float64
minimum_nights          float64
number_of_reviews       float64
review_scores_rating    float64
property_type           float64
accommodates            float64
bathrooms               float64
bedrooms                float64
beds                    float64
bed_type                float64
price                   float64
dtype: object

# Splitting the data 


In [12]:
y=dataset['price']
dataset.drop('price',axis=1,inplace=True)

X=dataset

# now we have training,validation and test data
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.1)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.2)



Now we are going to define .fit() and cross-validation methods then we will fit different different models of regression from sklearn library and then finally use randomized search cv for better hyper-parameter choice 

In [31]:

def fit_model(model,X_train, y_train, X_val, y_val, cross_val=False, cv_folds=5):
    
    model.fit(X_train,y_train)
    #Predict values:
    training_predictions = model.predict(X_train)
    validation_predictions = model.predict(X_val)
    model_report(X_train,y_train, X_val,y_val, training_predictions, validation_predictions)

    if cross_val:
        evaluate_cross_validation(model, X_train, y_train, cv_folds)
        
def model_report(X_train,y_train, X_val,y_val, training_predictions, validation_predictions):
    #Print model report:
    print("\nModel Report")
    print("Training")
    print("Mean Absolute Error : {}".format(mean_absolute_error(y_train, training_predictions)))
    print("Root Mean Squared Error : {}".format(np.sqrt(mean_squared_error(y_train, training_predictions))))
    print("R2 Score: {}".format(r2_score(y_train, training_predictions)))
    print("\n")
    print("Validation")
    print("Mean Absolute Error : {}".format(mean_absolute_error(y_val, validation_predictions)))
    print("Root Mean Squared Error : {}".format(np.sqrt(mean_squared_error(y_val, validation_predictions))))
    print("R2 Score: {}".format(r2_score(y_val, validation_predictions)))
    
def evaluate_cross_validation(model, X_train, y_train, K):
    
    cv = KFold(n_splits=K, shuffle=True, random_state=2)
    scores = cross_val_score(model, X_train, y_train, cv=cv)
    print()
    print(scores)
    print("Mean score: {} max is {} and min {}".format(scores.mean(), max(scores), min(scores)))   
    
def random_search(model, param_grid, X_train, y_train, X_val, y_val, cv):
    
    random_search = RandomizedSearchCV(model, param_grid, scoring="neg_mean_squared_error", n_jobs=2, cv=cv)
    random_result = random_search.fit(X_train, y_train)

    # summarize results
    print("Best: %f using %s" % (random_result.best_score_, random_result.best_params_))
    means = random_result.cv_results_['mean_test_score']
    stds = random_result.cv_results_['std_test_score']
    params = random_result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))

    validation_predictions = random_result.predict(X_val)

    print("Validation")
    print("Mean Absolute Error : {}".format(mean_absolute_error(y_val, validation_predictions)))
    print("Root Mean Squared Error : {}".format(np.sqrt(mean_squared_error(y_val, validation_predictions))))
    print("R2 Score: {}".format(r2_score(y_val, validation_predictions)))


# Model selection and training

 For now we will fit are dataset on Decision tree and Random forest.

In [32]:
model=DecisionTreeRegressor(random_state=8)
fit_model(model,X_train,y_train,X_val,y_val,cross_val=True)


Model Report
Training
Mean Absolute Error : 1.6937544060822054
Root Mean Squared Error : 9.581057727365469
R2 Score: 0.9849578951020708


Validation
Mean Absolute Error : 41.120209817936725
Root Mean Squared Error : 68.36990010850042
R2 Score: 0.1951965059466796

[0.23128123 0.17649688 0.19420428 0.20902082 0.24359383]
Mean score: 0.21091941202938597 max is 0.2435938348100085 and min 0.17649688354975582


In [33]:
model=RandomForestRegressor()
fit_model(model,X_train,y_train,X_val,y_val,cross_val=True)


Model Report
Training
Mean Absolute Error : 13.56358851626989
Root Mean Squared Error : 23.385464970966563
R2 Score: 0.9103864425891506


Validation
Mean Absolute Error : 32.35069227134165
Root Mean Squared Error : 51.88226528137649
R2 Score: 0.5365554040316143

[0.54479998 0.52522746 0.54135664 0.54315833 0.54033323]
Mean score: 0.5389751289772429 max is 0.5447999844928277 and min 0.5252274642306107


# Hyperparameter Tuning

Now we had dry runned our model over default parameters but however unfortunately our model overfitted the data so now we will tune our parameters with the help of random search cv we will only using random search for decision tree.

In [34]:
# Tunning Parameters
max_depth = [5, 10, 15, 20]
max_features = ['log2', 'sqrt', 'auto']
min_samples_leaf = [10, 50, 70, 100]
param_grid = dict(max_depth=max_depth, max_features=max_features, 
                  min_samples_leaf=min_samples_leaf)
kfold = KFold(n_splits=5, shuffle=True, random_state=2)

random_search(DecisionTreeRegressor(random_state=8), param_grid, X_train, y_train, X_val, y_val, kfold)

Best: -2688.065789 using {'min_samples_leaf': 70, 'max_features': 'auto', 'max_depth': 10}
-2698.475869 (104.469392) with: {'min_samples_leaf': 100, 'max_features': 'auto', 'max_depth': 10}
-3008.774149 (145.895530) with: {'min_samples_leaf': 70, 'max_features': 'sqrt', 'max_depth': 10}
-3001.631794 (153.043338) with: {'min_samples_leaf': 100, 'max_features': 'log2', 'max_depth': 15}
-2688.065789 (114.498584) with: {'min_samples_leaf': 70, 'max_features': 'auto', 'max_depth': 10}
-2926.320075 (133.482540) with: {'min_samples_leaf': 50, 'max_features': 'sqrt', 'max_depth': 20}
-3001.631794 (153.043338) with: {'min_samples_leaf': 100, 'max_features': 'sqrt', 'max_depth': 15}
-2908.662077 (129.264471) with: {'min_samples_leaf': 70, 'max_features': 'auto', 'max_depth': 5}
-2913.229312 (126.435332) with: {'min_samples_leaf': 10, 'max_features': 'log2', 'max_depth': 20}
-3429.431163 (145.564581) with: {'min_samples_leaf': 70, 'max_features': 'sqrt', 'max_depth': 5}
-2994.623288 (120.670659) 

In [36]:
model = DecisionTreeRegressor(max_depth=20, min_samples_leaf=50, random_state=8)
fit_model(model,X_train,y_train,X_val,y_val,cross_val=True)


Model Report
Training
Mean Absolute Error : 30.2612396445677
Root Mean Squared Error : 49.04630615408939
R2 Score: 0.6058202509274686


Validation
Mean Absolute Error : 31.47499523679866
Root Mean Squared Error : 50.748549752650256
R2 Score: 0.5565882123513648

[0.57549068 0.56771866 0.56552532 0.56252498 0.56295446]
Mean score: 0.5668428178208071 max is 0.5754906781072666 and min 0.5625249779581796


#  Conclusion

We have a model which have maximum accuracy of 57% in and min accuracy of 56% terms of R2_score.

