In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# 1. Read the CSV into a DataFrame and preview the first rows:
csv_path = "AB_NYC_2019.csv"
data = pd.read_csv(csv_path)
data.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [4]:
# 2. Apply label encoding for categorical data

from sklearn.preprocessing import LabelEncoder

# Encode 'neighbourhood_group' into a new integer column. It gets its own
# encoder so the fitted classes_ (needed to map codes back to names) are not
# overwritten by the 'room_type' fit below.
group_le = LabelEncoder()
data['group_encoded'] = group_le.fit_transform(data['neighbourhood_group'])

# Encode 'room_type' in place. `le` ends up fitted on 'room_type', exactly as
# it did when a single encoder was reused for both columns.
le = LabelEncoder()
data['room_type'] = le.fit_transform(data['room_type'])

# Mean price for each (neighbourhood_group, neighbourhood) combination.
# NOTE(review): these means are computed before the train/test split further
# down; if 'neighbourhood_encoded' is used as a feature, prices of test rows
# leak into it -- confirm, and consider fitting the means on training rows only.
group_keys = ['neighbourhood_group', 'neighbourhood']
grouped = data.groupby(group_keys)['price'].mean()
mapping_dict = grouped.to_dict()

# Attach each row's group mean as a new column. groupby().transform('mean')
# broadcasts the same per-group means back onto the rows in one vectorised
# pass instead of a Python-level dictionary lookup per row; rows whose key
# contains NaN get NaN, matching the previous `.get(..., np.nan)` fallback.
data['neighbourhood_encoded'] = data.groupby(group_keys)['price'].transform('mean')


In [None]:
# 6. Standardize the numerical columns

from sklearn.preprocessing import StandardScaler

# Candidate columns: every int64/float64 column except 'room_type', 'price'
# and 'group_encoded', which are left unscaled.
excluded = ['room_type', 'price', 'group_encoded']
numeric_frame = new_data.select_dtypes(include=['int64', 'float64'])
numerical_cols = numeric_frame.drop(columns=excluded).columns.tolist()

# Fit the scaler on the selected columns and write the scaled values back.
scaler = StandardScaler()
scaled_values = scaler.fit(new_data[numerical_cols]).transform(new_data[numerical_cols])
new_data[numerical_cols] = scaled_values

In [None]:
# 7. Log-transform the target and drop rows with NA values:
# np.log maps a price of 0 to -inf, which dropna() does not treat as missing
# by default; convert infinities to NaN first so those rows are dropped too
# instead of ending up in the target.
# Note: 'price' is overwritten with its log, so re-running this cell on the
# same kernel applies the log a second time.
new_data['price'] = np.log(new_data['price']).replace([np.inf, -np.inf], np.nan)
new_data = new_data.dropna()

# Section 2: Modelling & Results

In [None]:
# 13. Train / test split:
from sklearn.model_selection import train_test_split

# Keep only the features whose entry in `coef` is non-zero.
feature = coef[coef != 0].index.tolist()
dt = new_data[feature]

# Design matrix and target.
X, y = dt, new_data['price']

# Hold out 30% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=666,
)

Model 1: Plain Vanilla Decision Tree

In [None]:
# 1. Import the model and fit a baseline decision tree
from sklearn.tree import DecisionTreeRegressor

# Hand-picked hyperparameters: depth capped at 6; min_samples_leaf given as a
# float, which scikit-learn interprets as a fraction of the training samples.
# random_state is pinned because scikit-learn permutes the features at every
# split, so ties between equally good splits could otherwise resolve
# differently from run to run.
decision_tree = DecisionTreeRegressor(max_depth=6, min_samples_leaf=0.12, random_state=0)
decision_tree.fit(X_train, y_train)

In [None]:
# 2. Apply grid search to find optimal parameters
from sklearn.model_selection import GridSearchCV

# Candidate depths and leaf sizes (leaf size as a fraction of the samples).
pgrid = {"max_depth": [3, 4, 5, 6, 7, 8, 9, 10],
         "min_samples_leaf": [0.12, 0.14, 0.16, 0.18, 0.20]}

# 10-fold CV, selecting on (negated) mean squared error. The estimator is
# seeded so the search gives the same result on every run.
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=0), param_grid=pgrid,
                           scoring='neg_mean_squared_error', cv=10)
grid_search.fit(X_train, y_train)

# Predict with the tuned tree (refit on the full training set by GridSearchCV).
y_predict = grid_search.best_estimator_.predict(X_test)

# Previously y_pred was taken from the untuned `decision_tree`, so the R^2
# computed from y_pred ignored the search entirely. Score the tuned model
# instead; `decision_tree` remains available as the untuned baseline.
y_pred = y_predict

In [None]:
# 3. R^2 of the predictions in y_pred against the test targets:

from sklearn.metrics import r2_score

r2_score(y_true=y_test, y_pred=y_pred)

Model 3: Extra Trees Regressor

In [None]:
#  1. Import the model and fit an Extra Trees regressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesRegressor

# 500 bootstrapped trees with a fixed seed.
extra_trees_params = dict(n_estimators=500, bootstrap=True, random_state=0)
regressor = ExtraTreesRegressor(**extra_trees_params)
regressor.fit(X_train, y_train)

In [None]:
# 2. Apply grid search to find the optimal max_depth of ExtraTreesRegressor

from sklearn.model_selection import GridSearchCV

pgrid = {"max_depth": [4, 5, 6, 7, 8, 9, 10, 11, 12]}

# Tune the same configuration that `regressor` uses (500 bootstrapped trees,
# seed 0) rather than an unseeded default forest, so the selected depth
# applies to the model being reported and the search is reproducible.
# n_jobs=-1 spreads the 10-fold fits over all cores to offset the larger forest.
grid_search = GridSearchCV(
    ExtraTreesRegressor(n_estimators=500, bootstrap=True, random_state=0),
    param_grid=pgrid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Predict with the tuned forest (refit on the full training set by GridSearchCV).
y_predict = grid_search.best_estimator_.predict(X_test)

# Previously y_pred was taken from the untuned `regressor`, so the R^2
# computed from y_pred ignored the search entirely. Score the tuned model
# instead; `regressor` remains available as the depth-unlimited baseline.
y_pred = y_predict

In [None]:
# 3. R^2 of the predictions in y_pred against the test targets:

from sklearn.metrics import r2_score

r2_score(y_true=y_test, y_pred=y_pred)