In [21]:
import pandas as pd
import numpy as np
from sklearn import tree, metrics
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from io import StringIO  
from IPython.display import Image  
import pydotplus

import graphviz

from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import OrdinalEncoder

In [2]:
#import data
features = pd.read_csv('../data/features_data_cleaned.csv')
# Create an instance of the OrdinalEncoder
ordinal_encoder = OrdinalEncoder()

In [3]:
#decide how to split categories based off value counts of continuous variables and ordinal encode them 
features['minimum_nights'].value_counts()

2      3325
1      2766
30      879
3       850
31       86
4        60
5        29
28       26
7        23
90       16
29       10
60        9
14        8
20        5
45        4
32        3
25        3
10        3
15        3
999       3
6         3
180       2
120       2
27        2
93        1
270       1
150       1
300       1
21        1
26        1
Name: minimum_nights, dtype: int64

In [4]:
# categories: 1, 2, 3, 4, 5-13, 14-19, 20-29, 30-40, >40

conditions = [
    (features['minimum_nights'] == 1), (features['minimum_nights'] == 2), (features['minimum_nights'] == 3),
    (features['minimum_nights'] == 4), (features['minimum_nights'] >= 5) & (features['minimum_nights'] < 14),
    (features['minimum_nights'] >= 14) & (features['minimum_nights'] < 20),
    (features['minimum_nights'] >= 20) & (features['minimum_nights'] < 30), 
    (features['minimum_nights'] >= 30) & (features['minimum_nights'] <= 40), (features['minimum_nights'] > 40)
]
groups = ['1', '2', '3', '4', '5-13', '14-19', '20-29', '30-40', '>40']

# Create a new column 'group' based on the conditions
features['g_min_night'] = np.select(conditions, groups, default='unknown')

#ordinal encode the row
features['e_min_night'] = ordinal_encoder.fit_transform(features[['g_min_night']])

In [5]:
features['price'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 8126 entries, 0 to 8125
Series name: price
Non-Null Count  Dtype  
--------------  -----  
8126 non-null   float64
dtypes: float64(1)
memory usage: 63.6 KB


In [6]:
features['bathroom_count'].value_counts()

 1.0     3668
 2.0     1376
 2.5      705
 3.5      639
 3.0      560
 4.0      472
 1.5      299
 4.5      120
 6.0       53
 8.0       51
 5.0       49
 7.0       34
 9.0       19
 5.5       18
 6.5       13
 12.0       7
 0.0        6
 16.0       6
 7.5        5
 8.5        5
 13.5       3
 12.5       3
 9.5        3
-1.0        2
 10.5       2
 0.5        2
 17.5       2
 17.0       1
 19.0       1
 18.0       1
 14.0       1
Name: bathroom_count, dtype: int64

In [7]:
# categories: <=0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0, 5.5,6, 6.5, 7-10, >10

conditions = [
    (features['bathroom_count'] <= 0), (features['bathroom_count'] == 0.5), (features['bathroom_count'] == 1.0),
    (features['bathroom_count'] == 1.5), (features['bathroom_count'] == 2.0), (features['bathroom_count'] == 2.5),
    (features['bathroom_count'] == 3.0), (features['bathroom_count'] == 3.5), (features['bathroom_count'] == 4.0),
    (features['bathroom_count'] == 4.5), (features['bathroom_count'] == 5.0), (features['bathroom_count'] == 5.5),
    (features['bathroom_count'] == 6.0), (features['bathroom_count'] == 6.5), 
    (features['bathroom_count'] >= 7.0) & (features['bathroom_count'] <= 10), (features['bathroom_count'] > 10)
     ]
groups = ['<=0', '0.5', '1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0', '5.5','6', '6.5', '7-10', '>10']

# Create a new column 'group' based on the conditions
features['g_bath_count'] = np.select(conditions, groups, default='unknown')

#ordinal encode the row
features['e_bath_count'] = ordinal_encoder.fit_transform(features[['g_bath_count']])

In [8]:
features['bedrooms'].value_counts()

1.0     3014
2.0     1820
3.0     1613
4.0     1359
8.0      108
6.0       78
5.0       74
7.0       13
12.0      13
16.0      11
10.0       8
9.0        6
14.0       3
15.0       2
22.0       1
11.0       1
18.0       1
13.0       1
Name: bedrooms, dtype: int64

In [9]:
# categories: 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, >9.0

conditions = [
    (features['bedrooms'] == 1.0), (features['bedrooms'] == 2.0), (features['bedrooms'] == 3.0), 
    (features['bedrooms'] == 4.0), (features['bedrooms'] == 5.0), (features['bedrooms'] == 6.0),
    (features['bedrooms'] == 7.0), (features['bedrooms'] == 8.0), (features['bedrooms'] > 9.0)
     ]
groups = ['1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '>9.0']

# Create a new column 'group' based on the conditions
features['g_bedroom_count'] = np.select(conditions, groups, default='unknown')

#ordinal encode the row
features['e_bedroom_count'] = ordinal_encoder.fit_transform(features[['g_bedroom_count']])

In [10]:
features['beds'].value_counts()

 2.0      1900
 1.0      1536
 3.0      1182
 4.0       975
 5.0       624
 6.0       551
 7.0       316
 8.0       262
 9.0       150
 10.0      150
 11.0      102
 12.0       78
 13.0       46
-1.0        46
 14.0       44
 16.0       37
 15.0       32
 20.0       16
 22.0       12
 18.0       12
 24.0       11
 17.0        9
 21.0        4
 28.0        4
 23.0        4
 30.0        3
 26.0        3
 27.0        2
 25.0        2
 32.0        2
 40.0        2
 29.0        2
 19.0        2
 36.0        2
 31.0        1
 50.0        1
 111.0       1
Name: beds, dtype: int64

In [11]:
# categories: -1, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10-15, 15-20, >20

conditions = [
    (features['beds'] == -1.0), (features['beds'] == 1.0), (features['beds'] == 2.0),
    (features['beds'] == 3.0), (features['beds'] == 4.0), (features['beds'] == 5.0), 
    (features['beds'] == 6.0), (features['beds'] == 7.0), (features['beds'] == 8.0), 
    (features['beds'] == 9.0), (features['beds'] >= 10.0) & (features['beds'] < 15.0),
    (features['beds'] >= 15.0) & (features['beds'] < 20), (features['beds'] >= 20)
     ]

groups= ['-1','1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0', '10-15', '15-20', '>20']

# Create a new column 'group' based on the conditions
features['g_bedscount'] = np.select(conditions, groups, default='unknown')

#ordinal encode the row
features['e_bedscount'] = ordinal_encoder.fit_transform(features[['g_bedscount']])

In [12]:
#lets try and run the model linear decision tree 
features.head()

Unnamed: 0,name,price,neighbourhood_cleansed,room_type,minimum_nights,accommodates,bedrooms,beds,bathroom_count,bathroom_type,g_min_night,e_min_night,g_bath_count,e_bath_count,g_bedroom_count,e_bedroom_count,g_bedscount,e_bedscount
0,Nashville Charm,40.0,District 6,Private room,30,2,2.0,3.0,1.0,private bath,30-40,5.0,1.0,1.0,2.0,1.0,3.0,5.0
1,Large Main Suite near Lake *ladies only NS plz,45.0,District 12,Private room,30,1,1.0,1.0,1.0,private bath,30-40,5.0,1.0,1.0,1.0,0.0,1.0,1.0
2,Vandy/Belmont/10 mins to Broadway - Sunny 800 ...,90.0,District 18,Entire home/apt,2,4,2.0,2.0,1.0,bath,2,2.0,1.0,1.0,2.0,1.0,2.0,4.0
3,"SuperSweetSTUDIO, jacuzzi, open Nov 23, 6 mo",39.0,District 12,Private room,30,3,1.0,5.0,1.0,private bath,30-40,5.0,1.0,1.0,1.0,0.0,5.0,7.0
4,"MorningstarHouse, monthly room- open Aug 19",33.0,District 12,Private room,30,1,1.0,3.0,1.0,shared bath,30-40,5.0,1.0,1.0,1.0,0.0,3.0,5.0


In [13]:
#check if accomodates is categorical or numeric
features['accommodates'].dtypes

dtype('int64')

In [14]:
# divide the variables into X and y 

X= features[['neighbourhood_cleansed', 'room_type', 'e_min_night','accommodates',
             'e_bedroom_count','e_bedscount','e_bath_count', 'bathroom_type']]
y= features.price

In [15]:
#lets dummy and see what happens to the other variables
X = pd.get_dummies(X)
X.head()

Unnamed: 0,e_min_night,accommodates,e_bedroom_count,e_bedscount,e_bath_count,neighbourhood_cleansed_District 1,neighbourhood_cleansed_District 10,neighbourhood_cleansed_District 11,neighbourhood_cleansed_District 12,neighbourhood_cleansed_District 13,...,neighbourhood_cleansed_District 7,neighbourhood_cleansed_District 8,neighbourhood_cleansed_District 9,room_type_Entire home/apt,room_type_Hotel room,room_type_Private room,room_type_Shared room,bathroom_type_bath,bathroom_type_private bath,bathroom_type_shared bath
0,5.0,2,1.0,5.0,1.0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,5.0,1,0.0,1.0,1.0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
2,2.0,4,1.0,4.0,1.0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,5.0,3,0.0,7.0,1.0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
4,5.0,1,0.0,5.0,1.0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1


In [16]:
#model babayyy ... linear decision tree

In [17]:
# Call train_test_split on X, y. Make the test_size = 0.25, and random_state = 246
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 246)

In [18]:
# Declare a variable called entr_model and use tree.DecisionTreeClassifier. 
regressor = DecisionTreeRegressor()

# Call fit() on entr_model
regressor = regressor.fit(X_train, y_train) 

# Call predict() on entr_model with X_test passed to it, and assign the result to a variable y_pred 
y_pred = regressor.predict(X_test) 

# Call Series on our y_pred variable with the following: pd.Series(y_pred)
y_pred = pd.Series(y_pred)

# Check out entr_model
regressor

In [19]:
# Now we want to visualize the tree
#_ = tree.plot_tree(regressor, filled=True)

In [23]:
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [24]:
print('Mean squared error: ', mse)
print('Mean absolute error: ', mae)
print('r-squared score: ', r2)

Mean squared error:  3256537.3686720235
Mean absolute error:  149.1067080871837
r-squared score:  -24.03659712641245
