In [40]:
import pandas as pd
import numpy as np
import IPython
import graphviz
import re
from IPython.display import display
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz

In [41]:
data_path = '/Users/timlee/data/iowa-housing/train.csv'
df = pd.read_csv(data_path)

## To make things simple we will only choose a few columns

In [42]:
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [43]:
selected_cols = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
         'LotShape', 'LandContour', 'Utilities', 'LotConfig',
        'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt','GrLivArea','FullBath',
       'HalfBath', 'BedroomAbvGr', 'TotRmsAbvGrd', 'YrSold', 'SaleType',
       'SaleCondition', 'SalePrice']

In [55]:
test_cols = ['GrLivArea', 'TotRmsAbvGrd', 'LotArea' ]
labels = df['SalePrice']

df_trn = df[test_cols].copy()
df_trn.fillna(0, inplace=True)
df_trn.shape, labels.shape

((1460, 3), (1460,))

In [46]:
sample = df_trn['GrLivArea']

In [47]:
n = len(sample)
score = 100000

for i in range(n):
    left = sample < sample[i]
    right = sample >= sample[i]
    

### Drawing Tree

In [48]:
def draw_tree(t, df, size=10, ratio=0.6, precision=0):
    """ Draws a representation of a random forest in IPython.
    Parameters:
    -----------
    t: The tree you wish to draw
    df: The data used to train the tree. This is used to get the names of the features.
    """
    s=export_graphviz(t, out_file=None, feature_names=df.columns, filled=True,
                      special_characters=True, rotate=True, precision=precision)
    IPython.display.display(graphviz.Source(re.sub('Tree {',
       f'Tree {{ size={size}; ratio={ratio}', s)))

### First Prototype  a Decision Tree

In [188]:
class Dtree():
    def __init__(self,x,y, filter_indices = None, print_offset=' ', max_depth=None):
        self.print_offset = print_offset
            
        if filter_indices is None:
            self.indices = list(x.index)
        else:
            self.indices = filter_indices
            
        self.x = x
        self.y = y
        # self.feature_idx =1
        self.stored_score = float('inf')
        self.stored_split_feature = 0
        self.stored_split_value = 0
        self.feature_list = x.columns
        self.left_idx = None
        self.right_idx = None

    def spawn(self):
        self.split_data()
        if (len(self.left_idx) > 50) & (len(self.right_idx) > 50 ) :
            print(self.print_offset + 'left: ' + str(len(self.left_idx)))
            self.left_tree = Dtree(self.x, self.y, self.left_idx, self.print_offset + ' left -> ')
            self.left_tree.spawn()
            print(self.print_offset + 'right: ' + str(len(self.right_idx)))
            self.right_tree= Dtree(self.x, self.y, self.right_idx, self.print_offset + ' right-> ')
            self.right_tree.spawn()
    
    def split_data(self):
        self.check_all_features()
        feat_name = self.feature_list[self.stored_split_feature]
        x_col = self.x[[feat_name]]
        x_col = x_col.iloc[self.indices,:]
        self.left_idx  = list(x_col[x_col[feat_name] < self.stored_split_value].index)
        self.right_idx  = list(x_col[x_col[feat_name] >= self.stored_split_value].index)
        
    
    
    def check_all_features(self):
        for i in range(len(self.feature_list)-1):
            self.find_split_in_single_feature(i)  
            
        print(self.stored_score, self.feature_list[self.stored_split_feature], self.stored_split_value)
    
    
    def find_split_in_single_feature(self, feature_idx):
        X_train = self.x
        y_train = self.y

        x,y = X_train.iloc[self.indices, feature_idx], y_train.values[self.indices]
        
        for split_index in x.index:
            lhs_x = x<=x[split_index]
            rhs_x = x>x[split_index]

            if rhs_x.sum()==0:
                continue
            else:
                lhs_y_std = y[lhs_x].std()
                rhs_x_std = y[rhs_x].std()

                curr_score = lhs_y_std*lhs_x.sum() + rhs_x_std*rhs_x.sum()
                if curr_score<self.stored_score: 
                    self.stored_split_feature = feature_idx
                    self.stored_score = curr_score
                    self.stored_split_value = x[split_index]

In [189]:
myTree = Dtree(df_trn,labels)
myTree.spawn()

88277501.8016 GrLivArea 1487
 left: 766
24657809.9223 GrLivArea 1198
  left -> left: 424
10725526.9811 GrLivArea 803
  left -> right: 342
12549252.8085 GrLivArea 1416
  left ->  right-> left: 250
8614923.77704 TotRmsAbvGrd 6
  left ->  right->  left -> left: 67
2284532.15607 TotRmsAbvGrd 4
  left ->  right->  left -> right: 183
6234563.26991 TotRmsAbvGrd 6
  left ->  right-> right: 92
3418397.09112 GrLivArea 1419
 right: 694
51572035.0487 GrLivArea 2295
  right-> left: 586
36319311.4231 GrLivArea 1824
  right->  left -> left: 365
19511407.2925 TotRmsAbvGrd 7
  right->  left ->  left -> left: 116
6019641.18904 GrLivArea 1641
  right->  left ->  left -> right: 249
12786660.4719 TotRmsAbvGrd 7
  right->  left -> right: 221
15633333.0209 TotRmsAbvGrd 10
  right-> right: 108
11868918.8559 GrLivArea 3493
