In [1]:
import pandas as pd # data processing
import numpy as np # working with arrays
import matplotlib.pyplot as plt # visualization
from matplotlib import rcParams # figure size
from termcolor import colored as cl # text customization

from sklearn.tree import DecisionTreeClassifier as dtc # tree algorithm
from sklearn.model_selection import train_test_split # splitting the data
from sklearn.metrics import accuracy_score # model precision
from sklearn.tree import plot_tree # tree diagram



In [2]:
# Default size (width, height in inches) for figures created from here on;
# kept large so the decision-tree diagram drawn at the end has room.
rcParams.update({'figure.figsize': (25, 30)})

In [3]:
# Load the housing listings from the CSV that sits next to the notebook.
df = pd.read_csv('housePrice.csv')

# Show the first five rows in bold as a quick sanity check of the columns.
preview_rows = df.head()
print(cl(preview_rows, attrs = ['bold']))

   Area  Room  Parking  Warehouse  Elevator         Address         Price  \
0    63     1     True       True      True         Shahran  1.850000e+09   
1    60     1     True       True      True         Shahran  1.850000e+09   
2    79     2     True       True      True          Pardis  5.500000e+08   
3    95     2     True       True      True   Shahrake Qods  9.025000e+08   
4   123     2     True       True      True  Shahrake Gharb  7.000000e+09   

   Price(USD)  
0    61666.67  
1    61666.67  
2    18333.33  
3    30083.33  
4   233333.33  


In [4]:
# Print column dtypes and non-null counts to spot missing values before encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3465 entries, 0 to 3464
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Area        3465 non-null   int64  
 1   Room        3465 non-null   int64  
 2   Parking     3465 non-null   bool   
 3   Warehouse   3465 non-null   bool   
 4   Elevator    3465 non-null   bool   
 5   Address     3442 non-null   object 
 6   Price       3465 non-null   float64
 7   Price(USD)  3465 non-null   float64
dtypes: bool(3), float64(2), int64(2), object(1)
memory usage: 145.6+ KB


In [5]:
# Swap each address string for an integer code, numbered in order of first
# appearance. pd.factorize encodes missing addresses as -1 by default, so any
# row without an address ends up sharing that sentinel code.
address_codes, _address_labels = pd.factorize(df['Address'])
df['Address'] = address_codes
print (df)

      Area  Room  Parking  Warehouse  Elevator  Address         Price  \
0       63     1     True       True      True        0  1.850000e+09   
1       60     1     True       True      True        0  1.850000e+09   
2       79     2     True       True      True        1  5.500000e+08   
3       95     2     True       True      True        2  9.025000e+08   
4      123     2     True       True      True        3  7.000000e+09   
...    ...   ...      ...        ...       ...      ...           ...   
3460    86     2     True       True      True       16  3.500000e+09   
3461    83     2     True       True      True       78  6.800000e+09   
3462    75     2    False      False     False       27  3.650000e+08   
3463   105     2     True       True      True       46  5.600000e+09   
3464    82     2    False       True      True       27  3.600000e+08   

      Price(USD)  
0       61666.67  
1       61666.67  
2       18333.33  
3       30083.33  
4      233333.33  
...      

In [6]:
# Discretise Price into equal-width bins so it can be used as a class label.
# Bin edges run 0, 1e8, 2e8, ... (1500 edges), giving 1499 bins labelled 1..1499;
# label k covers the right-closed interval ((k-1)*1e8, k*1e8].
edge_count = 1500
bin_width = 100000000.0
priceGps = [step * bin_width for step in range(edge_count)]
priceIndex = list(range(1, edge_count))

# Prices that fall outside the outermost edges receive NaN from pd.cut.
df['PriceCat'] = pd.cut(x=df['Price'], bins=priceGps, labels=priceIndex)
print(df.PriceCat)

0       19
1       19
2        6
3       10
4       70
        ..
3460    35
3461    68
3462     4
3463    56
3464     4
Name: PriceCat, Length: 3465, dtype: category
Categories (1499, int64): [1 < 2 < 3 < 4 ... 1496 < 1497 < 1498 < 1499]


In [7]:
# Predictor columns fed to the model.
# NOTE(review): 'Elevator' is not in this list - confirm that is intentional.
feature_columns = ['Area', 'Room', 'Parking', 'Warehouse', 'Address']

X_var = df[feature_columns].values  # independent variables
y_var = df['PriceCat'].values       # dependent variable (price-bin label)
print(y_var)

# Echo the first five samples of each array in bold.
sample_reports = (
    ('X variable samples : {}', X_var[:5]),
    ('Y variable samples : {}', y_var[:5]),
)
for template, sample in sample_reports:
    print(cl(template.format(sample), attrs = ['bold']))

[19, 19, 6, 10, 70, ..., 35, 68, 4, 56, 4]
Length: 3465
Categories (1499, int64): [1 < 2 < 3 < 4 ... 1496 < 1497 < 1498 < 1499]
X variable samples : [[63 1 True True 0]
 [60 1 True True 0]
 [79 2 True True 1]
 [95 2 True True 2]
 [123 2 True True 3]]
Y variable samples : [19, 19, 6, 10, 70]
Categories (1499, int64): [1 < 2 < 3 < 4 ... 1496 < 1497 < 1498 < 1499]


In [8]:

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_var, y_var, train_size=0.75, random_state = 0
)

# Report the shape of every partition in bold black text.
shape_reports = (
    ('X_train shape : {}', X_train),
    ('X_test shape : {}', X_test),
    ('y_train shape : {}', y_train),
    ('y_test shape : {}', y_test),
)
for template, partition in shape_reports:
    print(cl(template.format(partition.shape), attrs = ['bold'], color = 'black'))

X_train shape : (2598, 5)
X_test shape : (867, 5)
y_train shape : (2598,)
y_test shape : (867,)


In [9]:
# Fit an entropy-criterion decision tree, capped at depth 15, on the training split.
# random_state is pinned (to the same seed as the train/test split) because the
# tree permutes candidate features at each split; without a seed, ties can be
# broken differently on every run and the reported accuracy is not reproducible.
model = dtc(criterion = 'entropy', max_depth = 15, random_state = 0)
model.fit(X_train, y_train)

pred_model = model.predict(X_test)

# accuracy_score is the fraction of test rows whose predicted price-bin label
# exactly equals the true label; a near miss into a neighbouring bin counts as wrong.
print(cl('Accuracy of the model is {:.0%}'.format(accuracy_score(y_test, pred_model)), attrs = ['bold']))


Accuracy of the model is 15%


In [None]:
# Feature labels must follow the column order the model was trained on. The
# fifth training column is 'Address'; df.columns[:5] would label it 'Elevator'.
feature_names = ['Area', 'Room', 'Parking', 'Warehouse', 'Address']

# Distinct price-bin labels in order of first appearance in the data
# (still defined here in case later cells read it - it is not used for plotting).
target_names = df['PriceCat'].unique().tolist()

# plot_tree looks class names up by class position, so it needs one string per
# fitted class in ascending order. Passing str(list) would hand it a single
# string and make it pick individual characters such as '[' or ',' as names.
class_names = [str(label) for label in model.classes_]

plot_tree(model,
          feature_names = feature_names,
          class_names = class_names,
          filled = True,
          rounded = True)

# Very high dpi so the many small nodes stay legible when zooming into the PNG.
plt.savefig('tree_visualization.png',dpi=1200)
print('success')

success
