In [153]:
import numpy as np
import pandas as pd

In [154]:
data = pd.read_csv("/content/AirfoilSelfNoise.csv")
data.head(5)

Unnamed: 0,f,alpha,c,U_infinity,delta,SSPL
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


In [155]:
data.info

<bound method DataFrame.info of          f  alpha       c  U_infinity     delta     SSPL
0      800    0.0  0.3048        71.3  0.002663  126.201
1     1000    0.0  0.3048        71.3  0.002663  125.201
2     1250    0.0  0.3048        71.3  0.002663  125.951
3     1600    0.0  0.3048        71.3  0.002663  127.591
4     2000    0.0  0.3048        71.3  0.002663  127.461
...    ...    ...     ...         ...       ...      ...
1498  2500   15.6  0.1016        39.6  0.052849  110.264
1499  3150   15.6  0.1016        39.6  0.052849  109.254
1500  4000   15.6  0.1016        39.6  0.052849  106.604
1501  5000   15.6  0.1016        39.6  0.052849  106.224
1502  6300   15.6  0.1016        39.6  0.052849  104.204

[1503 rows x 6 columns]>

In [156]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        ''' constructor ''' 
        
        # for decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.var_red = var_red
        
        # for leaf node
        self.value = value

In [175]:
class DecisionTreeRegressor():
    def __init__(self, min_samples_split=5, max_depth=3):
        ''' constructor '''
        
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''
        
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        best_split = {}
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["var_red"]>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["var_red"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_var_red = -float("inf")
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_var_red = self.variance_reduction(y, left_y, right_y)
                    # update the best split if needed
                    if curr_var_red>max_var_red:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["var_red"] = curr_var_red
                        max_var_red = curr_var_red
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def variance_reduction(self, parent, l_child, r_child):
        ''' function to compute variance reduction '''
        
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        return reduction
    
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        
        val = np.mean(Y)
        return val
                
    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
        
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.var_red)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
        
    def make_prediction(self, x, tree):
        ''' function to predict new dataset '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
    
    def predict(self, X):
        ''' function to predict a single data point '''
        
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions
    

In [158]:
#X = data.iloc[:, :-1].values
#Y = data.iloc[:, -1].values.reshape(-1,1)
#from sklearn.model_selection import train_test_split
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=31)

In [176]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1,1)

In [177]:
# Select ratio
ratio = 0.75

n_train = int(X.shape[0] *ratio)
n_test = X.shape[0] - n_train
indices = np.random.permutation(X.shape[0])
# Split data into test and train
X_train = X[indices[:n_train], :]
Y_train = Y[indices[:n_train], :]
X_test = X[indices[n_train:], :]
Y_test = Y[indices[n_train:], :]


In [178]:
X_train.shape, X_test.shape


((1127, 5), (376, 5))

In [186]:
regressor = DecisionTreeRegressor(min_samples_split=2, max_depth=4)
regressor.fit(X_train,Y_train)
#regressor.print_tree()

In [180]:
import sklearn.metrics as metrics

In [181]:
Y_pred = regressor.predict(X_test) 
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(Y_test, Y_pred))



4.260825899993658

In [182]:
from sklearn.metrics import mean_absolute_error
(mean_absolute_error(Y_test, Y_pred))


3.3517013548943

In [183]:
from sklearn.metrics import mean_poisson_deviance
mean_poisson_deviance(Y_test, Y_pred)

0.14566465540165424

In [194]:
from itertools import product
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Define the hyperparameters to search over
max_depths = [2, 4, 6, 8]
min_samples_splits = [2, 5, 10, 15]

# Define the data and target
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1,1)

# Define a function to perform the grid search
def grid_search_decision_tree(max_depths, min_samples_splits, X, Y):
    best_params = None
    best_mse = float('inf')
    for max_depth, min_samples_split in product(max_depths, min_samples_splits):
        # Create a decision tree regressor with the current hyperparameters
        model = DecisionTreeRegressor(max_depth=max_depth, min_samples_split=min_samples_split)
        # Fit the model on the data
        model.fit(X_train, Y_train)
        # Predict the target
        y_pred = model.predict(X_test)
        # Calculate the mean squared error
        mse = mean_squared_error(Y_test, Y_pred)
        # If the mean squared error is better than the previous best,
        # update the best hyperparameters and mean squared error
        if mse < best_mse:
            best_mse = mse
            best_params = {'max_depth': max_depth, 'min_samples_split': min_samples_split}
    return best_params

# Perform the grid search and print the best hyperparameters
best_params = grid_search_decision_tree(max_depths, min_samples_splits, X, Y)
print(f'Best hyperparameters: {best_params}')


Best hyperparameters: {'max_depth': 2, 'min_samples_split': 2}
