In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import max_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate, cross_val_score
import numpy as np
from timeit import default_timer as timer

In [2]:
# Read the raw car-import table and separate the regression target from the
# feature columns.
dataFile = "../data/car_import_original.csv"
df = pd.read_csv(dataFile)

# pop() returns the "price" column and removes it from df in one step, so gt
# holds the target values and df keeps only the features.
gt = df.pop("price").values
df.head()

Unnamed: 0,symboling,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,length,...,num_of_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg
0,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
1,3,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,...,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27
2,1,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,...,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26
3,2,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30
4,2,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22


In [3]:
# Columns to expand into one indicator column per distinct value.
categorical_columns = [
    "make",
    "fuel_type",
    "aspiration",
    "num_of_doors",
    "body_style",
    "drive_wheels",
    "engine_location",
    "engine_type",
    "num_of_cylinders",
    "fuel_system",
]

# One-hot encode the listed columns; all other columns pass through unchanged.
df = pd.get_dummies(df, columns=categorical_columns)
df.head()

Unnamed: 0,symboling,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,compression_ratio,...,num_of_cylinders_six,num_of_cylinders_three,num_of_cylinders_twelve,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0,0,0,0,0,0,0,1,0,0
1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,...,0,0,0,0,0,0,0,1,0,0
2,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,...,1,0,0,0,0,0,0,1,0,0
3,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,...,0,0,0,0,0,0,0,1,0,0
4,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Rescale every feature column to the [0, 1] range.
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test split
# performed later in the notebook -- confirm that is acceptable here.
scaler = MinMaxScaler(feature_range=(0.0, 1.0))
scaled_values = scaler.fit_transform(df)

# Wrapping the bare array in a new DataFrame replaces the column names with
# integer positions (0, 1, 2, ...).
df = pd.DataFrame(scaled_values)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59,60,61,62,63,64,65,66,67,68
0,1.0,0.058309,0.413433,0.324786,0.083333,0.411171,0.260377,0.664286,0.290476,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.058309,0.413433,0.324786,0.083333,0.411171,0.260377,0.664286,0.290476,0.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.6,0.230321,0.449254,0.444444,0.383333,0.517843,0.343396,0.1,0.666667,0.125,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.8,0.38484,0.529851,0.504274,0.541667,0.329325,0.181132,0.464286,0.633333,0.1875,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.8,0.373178,0.529851,0.521368,0.541667,0.518231,0.283019,0.464286,0.633333,0.0625,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [5]:
# Sequential 80/20 hold-out split: the leading rows train the model and the
# remaining rows are kept for evaluation.
# NOTE(review): rows are taken in file order with no shuffling; if the CSV is
# grouped (the preview starts with alfa-romero, then audi), the held-out rows
# may not be representative -- confirm this is intended.
trainCount = len(df) * 0.8  # fractional threshold, kept as a float

# A row index i belongs to the training set when i < trainCount, so the number
# of training rows is the ceiling of that threshold.
split_at = int(np.ceil(trainCount))

# Materialise the feature matrix once rather than on every loop iteration.
feature_matrix = df.values

# Keep plain lists of per-row arrays / scalars, as the element-wise loop built.
train_x = list(feature_matrix[:split_at])
train_y = list(gt[:split_at])
test_x = list(feature_matrix[split_at:])
test_y = list(gt[split_at:])

assert (len(train_x) == len(train_y))
assert (len(test_x) == len(test_y))
# Every row must land in exactly one of the two partitions.
assert (len(train_x) + len(test_x) == len(df))

print(f"Train size: {len(train_x)}, Test size: {len(test_x)}")

Train size: 155, Test size: 38


In [6]:
# Fit a regression tree whose leaves must each hold at least 10 training
# samples, then report its node count and its score on the held-out rows.
decisionTree = DecisionTreeRegressor(min_samples_leaf=10)
decisionTree.fit(train_x, train_y)
tree = decisionTree.tree_
r2 = decisionTree.score(test_x, test_y)
# Read the label from the estimator so it always matches the fitted setting
# (a hard-coded "2" previously mislabelled this run).
print(f"Tree MSL = {decisionTree.min_samples_leaf} with {tree.node_count} nodes, R2 = {r2}")

Tree MSL = 2 with 25 nodes, R2 = 0.45459306341709305


In [7]:
# Fit a regression tree whose leaves must each hold at least 5 training
# samples, then report its node count and its score on the held-out rows.
decisionTree = DecisionTreeRegressor(min_samples_leaf=5)
decisionTree.fit(train_x, train_y)
tree = decisionTree.tree_
r2 = decisionTree.score(test_x, test_y)
# Read the label from the estimator so it always matches the fitted setting
# (a hard-coded "2" previously mislabelled this run).
print(f"Tree MSL = {decisionTree.min_samples_leaf} with {tree.node_count} nodes, R2 = {r2}")

Tree MSL = 2 with 51 nodes, R2 = 0.5418145067264788


In [9]:
# Fit a regression tree whose leaves must each hold at least 2 training
# samples, then report its node count and its score on the held-out rows.
decisionTree = DecisionTreeRegressor(min_samples_leaf=2)
decisionTree.fit(train_x, train_y)
tree = decisionTree.tree_
r2 = decisionTree.score(test_x, test_y)
# Take the label from the estimator instead of a literal so it cannot drift
# from the configured value if this cell is copied or edited.
print(f"Tree MSL = {decisionTree.min_samples_leaf} with {tree.node_count} nodes, R2 = {r2}")

Tree MSL = 2 with 133 nodes, R2 = 0.5080953647376514
