<h3><b>Import libraries</b></h3>

In [143]:
from typing import Optional, Tuple

import numpy as np
import pandas as pd
from IPython.display import display

<h3><b>Get the data</b></h3>

In [144]:
# The file is whitespace-delimited and has no header row, so columns arrive as
# the integers 0..5 and are renamed here in a single chained expression
# (no in-place mutation, so re-running the cell always yields the same frame).
dataset = pd.read_csv(
    'airfoil_self_noise.dat',
    sep=r'\s+',  # raw string: '\s' in a plain literal is an invalid escape sequence
    header=None,
).rename(columns={
    0: 'Frequency (Hz)',
    1: 'Angle (deg)',
    2: 'Chord length (m)',
    3: 'Free-stream velocity (m/s)',
    4: 'Suction side displacement thickness (m)',
    5: 'Scaled sound pressure level (dB)',
})
display(dataset)

Unnamed: 0,Frequency (Hz),Angle (deg),Chord length (m),Free-stream velocity (m/s),Suction side displacement thickness (m),Scaled sound pressure level (dB)
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461
...,...,...,...,...,...,...
1498,2500,15.6,0.1016,39.6,0.052849,110.264
1499,3150,15.6,0.1016,39.6,0.052849,109.254
1500,4000,15.6,0.1016,39.6,0.052849,106.604
1501,5000,15.6,0.1016,39.6,0.052849,106.224


<h3><b>Node class</b></h3>

In [145]:
class Node:
    """A single vertex of the regression tree.

    Decision nodes are created with ``featureIndex``, ``threshold``, ``left``,
    ``right`` and ``varRed``; leaf nodes are created with ``value`` only.
    Every field defaults to ``None``.
    """

    def __init__(self, featureIndex=None, threshold=None, left=None, right=None, varRed=None, value=None):
        # Fields describing a decision (internal) node.
        self.featureIndex, self.threshold = featureIndex, threshold
        self.left, self.right = left, right
        self.varRed = varRed

        # Field describing a leaf node.
        self.value = value

<h3><b>Split class</b></h3>

In [146]:
class Split:
    """Record of one candidate partition of a dataset.

    Attributes
    ----------
    featureIndex : int
        Column index of the feature the rows were partitioned on.
    threshold : float
        Rows with ``feature <= threshold`` went left, the rest went right.
    datasetLeft, datasetRight : np.ndarray
        The two row subsets produced by the partition (arrays of rows, not
        tree nodes).
    varRed : float
        Variance reduction achieved by this partition.
    """

    def __init__(self, featureIndex: int, threshold: float, datasetLeft: np.ndarray, datasetRight: np.ndarray, varRed: float):
        self.featureIndex = featureIndex
        self.threshold = threshold
        self.datasetLeft = datasetLeft
        self.datasetRight = datasetRight
        self.varRed = varRed

<h3><b>Tree class</b></h3>

In [147]:
class DecisionTreeRegressor:
    """Regression tree grown greedily by maximising variance reduction.

    Every decision node tests ``row[featureIndex] <= threshold``; every leaf
    stores the mean target of the training rows that reached it.

    Parameters
    ----------
    minSamplesSplit : int
        A node holding fewer rows than this is turned into a leaf.
    maxDepth : int
        Nodes at this depth are always leaves (the root is at depth 0).
    """

    def __init__(self, minSamplesSplit=2, maxDepth=2) -> None:
        self.root = None  # set by fit()
        self.minSamplesSplit = minSamplesSplit
        self.maxDepth = maxDepth

    def buildTree(self, dataset, currDepth=0) -> "Node":
        """Recursively build the subtree for ``dataset``.

        ``dataset`` is a 2-D array whose last column is the target and whose
        remaining columns are the features. Returns the subtree's root node.
        """
        x, y = dataset[:, :-1], dataset[:, -1]
        numSamples, numFeatures = np.shape(x)

        if numSamples >= self.minSamplesSplit and currDepth < self.maxDepth:
            bestSplit = self.getBestSplit(dataset, numFeatures)
            # getBestSplit returns None when no threshold separates the rows
            # (e.g. every feature is constant); such a node must become a leaf.
            if bestSplit is not None and bestSplit.varRed > 0:
                leftSubtree = self.buildTree(bestSplit.datasetLeft, currDepth + 1)
                rightSubtree = self.buildTree(bestSplit.datasetRight, currDepth + 1)
                return Node(bestSplit.featureIndex, bestSplit.threshold, leftSubtree, rightSubtree, bestSplit.varRed)

        return Node(value=self.calcLeafValue(y))

    def getBestSplit(self, dataset, numFeatures) -> Optional["Split"]:
        """Search every (feature, unique value) pair for the best partition.

        Returns the Split with the largest variance reduction, or ``None``
        when no candidate leaves rows on both sides.
        """
        bestSplit = None
        maxVarRed = -float('inf')
        y = dataset[:, -1]  # parent targets are the same for every candidate

        for featureIndex in range(numFeatures):
            for threshold in np.unique(dataset[:, featureIndex]):
                datasetLeft, datasetRight = self.split(dataset, featureIndex, threshold)
                # A partition with an empty side is not a split at all.
                if len(datasetLeft) == 0 or len(datasetRight) == 0:
                    continue
                currVarRed = self.varRed(y, datasetLeft[:, -1], datasetRight[:, -1])
                if currVarRed > maxVarRed:
                    bestSplit = Split(featureIndex, threshold, datasetLeft, datasetRight, currVarRed)
                    maxVarRed = currVarRed

        return bestSplit

    def split(self, dataset, featureIndex, threshold) -> Tuple[np.ndarray, np.ndarray]:
        """Partition rows into ``feature <= threshold`` and ``feature > threshold``.

        Uses boolean masks instead of a Python-level row loop; an empty side
        keeps the 2-D shape ``(0, n_columns)``.
        """
        column = dataset[:, featureIndex]
        # Two explicit masks (rather than ``~mask``) so that rows whose feature
        # is NaN fall into neither side, exactly as with the two comparisons.
        return dataset[column <= threshold], dataset[column > threshold]

    def varRed(self, parent, lChild, rChild) -> float:
        """Variance of ``parent`` minus the size-weighted variance of the children."""
        weightL = len(lChild) / len(parent)
        weightR = len(rChild) / len(parent)

        return np.var(parent) - (weightL * np.var(lChild) + weightR * np.var(rChild))

    def calcLeafValue(self, y):
        """Value stored in a leaf: the mean of the targets that reached it."""
        return np.mean(y)

    def printTree(self, node: "Node" = None, indent=" ", columns=None) -> None:
        """Print the tree rooted at ``node`` (default: the fitted root).

        ``columns`` may be any sequence of feature names indexed by feature
        position; when it is ``None`` or empty the feature index is printed.
        """
        if not node: node = self.root

        if node.value is not None:
            print(node.value)
        else:
            hasNames = columns is not None and len(columns) > 0
            print(f' {columns[node.featureIndex] if hasNames else node.featureIndex}', "<=", node.threshold, "?", node.varRed)
            print("%sLeft:" % (indent), end="")
            self.printTree(node.left, indent + indent, columns)
            print("%sRight:" % (indent), end="")
            self.printTree(node.right, indent + indent, columns)

    def fit(self, x, y) -> None:
        """Grow the tree from features ``x`` and targets ``y``.

        ``x`` is 2-D (n_samples, n_features). ``y`` may be 2-D (n_samples, 1)
        or 1-D (n_samples,); a 1-D target is reshaped into a column.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if y.ndim == 1:
            y = y.reshape(-1, 1)
        dataset = np.concatenate((x, y), axis=1)
        self.root = self.buildTree(dataset)

    def make_prediction(self, x, node: "Node") -> float:
        ''' function to predict a single data point '''

        # Identity test: a leaf value of 0.0 is still a value.
        if node.value is not None: return node.value

        featureVal = x[node.featureIndex]
        if featureVal <= node.threshold:
            return self.make_prediction(x, node.left)
        return self.make_prediction(x, node.right)

    def predict(self, X):
        ''' function to predict new dataset; returns one value per row of X as a list '''

        return [self.make_prediction(x, self.root) for x in X]

<h3><b>Train-Test Split</b></h3>

In [148]:
from sklearn.model_selection import train_test_split

# Every column but the last is a feature; the last column is the target,
# kept as a single-column 2-D array.
X = dataset.iloc[:, :-1].to_numpy()
Y = dataset.iloc[:, -1].to_numpy().reshape(-1, 1)

# Hold out 20% of the rows; the fixed seed keeps the partition reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=41
)

<h3><b>Fit the model</b></h3>

In [149]:
# Grow a depth-3 tree on the training split, then print it using the
# DataFrame's column names as feature labels.
regressor = DecisionTreeRegressor(maxDepth=3, minSamplesSplit=3)
regressor.fit(X_train, Y_train)
regressor.printTree(columns=dataset.columns.to_numpy())

 Frequency (Hz) <= 3150.0 ? 7.132048702017748
 Left: Suction side displacement thickness (m) <= 0.0337792 ? 3.590330569067664
  Left: Free-stream velocity (m/s) <= 55.5 ? 1.17898999813184
    Left:126.57842133815551
    Right:128.95809913793101
  Right: Frequency (Hz) <= 1250.0 ? 9.970884020498868
    Left:123.0391029411765
    Right:115.91672
 Right: Suction side displacement thickness (m) <= 0.00146332 ? 29.08299210506528
  Left: Frequency (Hz) <= 8000.0 ? 11.886497073996964
    Left:132.59594117647057
    Right:125.45646666666667
  Right: Suction side displacement thickness (m) <= 0.0229028 ? 5.638575922510643
    Left:118.47146634615386
    Right:111.70868571428572


<h3><b>Test the model</b></h3>

In [150]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the tree's predictions on the held-out rows;
# the bare last expression is what the cell displays.
Y_pred = regressor.predict(X_test)
np.sqrt(mean_squared_error(y_true=Y_test, y_pred=Y_pred))

5.376979589924688