<a href="https://colab.research.google.com/github/taimoorsardar/Machine-Learning-Projects/blob/main/DecisionTreeClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Remember to upload the dataset to your Google Drive before running the cells, and change the file path in the `read_csv` call accordingly.

# Importing Dataset

In [None]:
# Mount Google Drive so files stored there (the dataset CSV is read from
# /content/drive/MyDrive/...) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

#### I first converted the dataset to a CSV file and then loaded it

In [None]:
# The first CSV column has no header and holds the row numbers (it shows up as
# "Unnamed: 0"). Load it as the index: otherwise it stays in the feature matrix and the
# tree is allowed to split on row ids, which carry no information about new samples.
dataset = pd.read_csv('/content/drive/MyDrive/Semester 6/ML/data.csv', index_col=0)

In [None]:
# Overwrite the 'Est' value of rows 3 and 9 (row labels) with '10-30'.
# A single .loc call assigns on the frame itself. The chained form
# dataset['Est'][i] = ... writes through an intermediate Series, which triggers
# SettingWithCopyWarning and silently does nothing under pandas Copy-on-Write.
dataset.loc[[3, 9], 'Est'] = '10-30'

In [None]:
# Display the frame (after the 'Est' edits to rows 3 and 9) to eyeball the values.
dataset

Unnamed: 0,Alt,Bar,Fri,Hun,Pat,Price,Rain,Res,Type,Est,Will Wait
0,Yes,No,No,Yes,Some,$$$,No,Yes,French,0-10,Yes
1,Yes,No,No,Yes,Full,$,No,No,Thai,30-60,No
2,No,Yes,No,No,Some,$,No,No,Burger,0-10,Yes
3,Yes,No,Yes,Yes,Full,$,Yes,No,Thai,10-30,Yes
4,Yes,No,Yes,No,Full,$$$,No,Yes,French,>60,No
5,No,Yes,No,Yes,Some,$$,Yes,Yes,Italian,0-10,Yes
6,No,Yes,No,No,,$,Yes,No,Burger,0-10,No
7,No,No,No,Yes,Some,$$,Yes,Yes,Thai,0-10,Yes
8,No,Yes,Yes,No,Full,$,Yes,No,Burger,>60,No
9,Yes,Yes,Yes,Yes,Full,$$$,No,Yes,Italian,10-30,No


#### Data preprocessing

In [None]:
# Separate the label from the predictors: 'Will Wait' is the target column,
# every remaining column is a feature.
y = dataset['Will Wait']
X = dataset.drop('Will Wait', axis=1)

In [None]:
# One-hot encode the features, keeping every level (no reference level is dropped)
X_encoded = pd.get_dummies(X, drop_first=False)

# Label-encode the target variable: 'No' -> 0, 'Yes' -> 1
y_encoded = y.map({'No': 0, 'Yes': 1})

# Series.map turns every label that is missing from the mapping into NaN without
# complaint (e.g. 'yes', ' Yes', or an already-encoded 0/1 if `y` was rebound and this
# cell is re-run). Fail here instead of training on NaN targets.
if y_encoded.isna().any():
    raise ValueError(
        "Unexpected 'Will Wait' labels: "
        f"{sorted(y[y_encoded.isna()].astype(str).unique())}"
    )

# Combine the encoded features and target variable into a new DataFrame
encoded_dataset = pd.concat([X_encoded, y_encoded], axis=1)

# Display the updated DataFrame
print(encoded_dataset)

    Alt_No  Alt_Yes  Bar_No  Bar_Yes  Fri_No  Fri_Yes  Hun_No  Hun_Yes  \
0        0        1       1        0       1        0       0        1   
1        0        1       1        0       1        0       0        1   
2        1        0       0        1       1        0       1        0   
3        0        1       1        0       0        1       0        1   
4        0        1       1        0       0        1       1        0   
5        1        0       0        1       1        0       0        1   
6        1        0       0        1       1        0       1        0   
7        1        0       1        0       1        0       0        1   
8        1        0       0        1       0        1       1        0   
9        0        1       0        1       0        1       0        1   
10       1        0       1        0       1        0       1        0   
11       0        1       0        1       0        1       0        1   

    Pat_Full  Pat_None  ...  Res_Yes 

In [None]:
def train_validation_split(X, y, validation_size=0.2, random_state=None):
    """Randomly split features and labels into training and validation sets.

    Args:
        X: pandas DataFrame (or Series) of samples; rows are selected with ``.iloc``.
        y: pandas Series (or DataFrame) of labels with the same number of rows as ``X``.
        validation_size: fraction of rows, between 0 and 1 inclusive, placed in the
            validation set. The count is ``int(validation_size * len(X))``, i.e. it is
            rounded down.
        random_state: optional seed. With a seed the split is reproducible; with
            ``None`` the permutation is drawn from NumPy's global random state.

    Returns:
        Tuple ``(X_train, X_validation, y_train, y_validation)``.

    Raises:
        ValueError: if ``X`` and ``y`` differ in length or ``validation_size`` is
            outside ``[0, 1]``.
    """
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
        )
    if not 0 <= validation_size <= 1:
        raise ValueError(
            f"validation_size must be between 0 and 1, got {validation_size}"
        )

    # Determine the number of samples for the validation set
    num_validation = int(validation_size * len(X))

    # Shuffle the row positions. A seeded split uses its own RandomState so the
    # caller's global NumPy RNG is not reseeded as a side effect; RandomState(seed)
    # yields the same permutation that np.random.seed(seed) followed by
    # np.random.permutation would.
    if random_state is None:
        indices = np.random.permutation(len(X))
    else:
        indices = np.random.RandomState(random_state).permutation(len(X))

    # The first num_validation shuffled positions form the validation set
    validation_indices = indices[:num_validation]
    train_indices = indices[num_validation:]

    # Split the data
    X_train, X_validation = X.iloc[train_indices], X.iloc[validation_indices]
    y_train, y_validation = y.iloc[train_indices], y.iloc[validation_indices]

    return X_train, X_validation, y_train, y_validation


In [None]:
# Features are every encoded column except the target; the target is 'Will Wait'
X = encoded_dataset.drop(columns=['Will Wait'])
y = encoded_dataset['Will Wait']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_validation, y_train, y_validation = train_validation_split(X, y, validation_size=0.2, random_state=42)

# Display the shapes of the resulting datasets. The labels name the validation split
# so the printed report matches the variables (it previously said "test").
print("X_train shape:", X_train.shape)
print("X_validation shape:", X_validation.shape)
print("y_train shape:", y_train.shape)
print("y_validation shape:", y_validation.shape)


X_train shape: (10, 28)
X_test shape: (2, 28)
y_train shape: (10,)
y_test shape: (2,)


# Building a decision tree

In [None]:
class DecisionTree:
    """Binary decision-tree classifier grown greedily by information gain.

    The fitted tree is stored in ``self.tree`` as nested dicts. An internal node is
    ``{'feature': column_index, 'threshold': value, 'left': node, 'right': node}``;
    samples with ``x[feature] <= threshold`` follow ``left``, all others follow
    ``right``. A leaf is ``{'prediction': label}``.
    """

    def __init__(self):
        # Root node of the fitted tree; stays None until fit() has been called.
        self.tree = None

    def fit(self, X_train, y_train):
        """Build the tree from training data.

        Args:
            X_train: 2-D array-like of shape (n_samples, n_features).
            y_train: 1-D array-like holding one class label per sample.

        Raises:
            ValueError: if ``X_train`` is not 2-D, the inputs are empty, or the
                number of samples and labels differ.
        """
        X = np.asarray(X_train)
        y = np.asarray(y_train)
        if X.ndim != 2:
            raise ValueError(f"X_train must be 2-dimensional, got {X.ndim} dimension(s)")
        if len(X) != len(y):
            raise ValueError(
                f"X_train and y_train must have the same length, got {len(X)} and {len(y)}"
            )
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        self.tree = self._build_tree(X, y)

    def _calculate_entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels in ``y``.

        An empty ``y`` sums over nothing and therefore yields zero.
        """
        classes, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy = -np.sum(probabilities * np.log2(probabilities))
        return entropy

    def _calculate_information_gain(self, X, y, feature, threshold):
        """Return the entropy reduction from splitting on ``X[:, feature] <= threshold``.

        The gain is the parent entropy minus the size-weighted entropies of the
        two sides of the split.
        """
        left_indices = X[:, feature] <= threshold
        right_indices = X[:, feature] > threshold

        left_entropy = self._calculate_entropy(y[left_indices])
        right_entropy = self._calculate_entropy(y[right_indices])

        parent_entropy = self._calculate_entropy(y)
        left_weight = np.sum(left_indices) / len(y)
        right_weight = np.sum(right_indices) / len(y)

        information_gain = parent_entropy - (left_weight * left_entropy + right_weight * right_entropy)
        return information_gain

    def _find_best_split(self, X, y):
        """Return ``(feature, threshold)`` of the split with the highest information gain.

        Only thresholds that leave samples on both sides are considered. Returns
        ``(None, None)`` when no feature has more than one distinct value, i.e. no
        threshold can separate the samples.
        """
        best_information_gain = -float('inf')
        best_feature = None
        best_threshold = None

        for feature in range(X.shape[1]):
            # np.unique returns sorted values. Thresholding at the largest one sends
            # every sample left (a split that separates nothing), so it is skipped.
            candidate_thresholds = np.unique(X[:, feature])[:-1]
            for value in candidate_thresholds:
                information_gain = self._calculate_information_gain(X, y, feature, value)
                if information_gain > best_information_gain:
                    best_information_gain = information_gain
                    best_feature = feature
                    best_threshold = value

        return best_feature, best_threshold

    def _majority_class(self, y):
        """Return the most frequent label in ``y`` (ties go to the smallest label)."""
        classes, counts = np.unique(y, return_counts=True)
        return classes[np.argmax(counts)]

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X`` with labels ``y``."""
        if depth == 0:
            print("Building decision tree...")
        if len(np.unique(y)) == 1:
            return {'prediction': y[0]}

        best_feature, best_threshold = self._find_best_split(X, y)
        if best_feature is None:
            # Mixed labels but no threshold separates the samples (identical feature
            # rows with different labels): stop with the majority label instead of
            # recursing on the same data forever.
            return {'prediction': self._majority_class(y)}

        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold
        if not left_indices.any() or not right_indices.any():
            # Defensive: never recurse into an empty side.
            return {'prediction': self._majority_class(y)}

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth+1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth+1)

        return {'feature': best_feature, 'threshold': best_threshold, 'left': left_subtree, 'right': right_subtree}

    def print_tree(self):
        """Print the fitted tree, one node per line, indented by depth."""
        self._print_tree(self.tree)

    def _print_tree(self, node, depth=0):
        """Print ``node`` and its descendants; ``depth`` sets the tab indentation."""
        # Leaves are dicts too ({'prediction': ...}), so test for the key rather than
        # the type before reading 'feature' / 'threshold'.
        if isinstance(node, dict) and 'prediction' not in node:
            print('\t' * depth + f"[X{node['feature']} <= {node['threshold']}]")
            self._print_tree(node['left'], depth + 1)
            self._print_tree(node['right'], depth + 1)
        else:
            label = node['prediction'] if isinstance(node, dict) else node
            print('\t' * depth + f"Predicted class: {label}")

    def predict(self, validation_set):
        """Predict a label for every row of ``validation_set``.

        Args:
            validation_set: 2-D array-like (pandas DataFrame or NumPy array) whose
                columns are in the same order as the training matrix; features are
                looked up by column position.

        Returns:
            List with one predicted label per row.

        Raises:
            RuntimeError: if the tree has not been fitted.
            ValueError: if ``validation_set`` is not 2-dimensional.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")

        samples = np.asarray(validation_set)
        if samples.ndim != 2:
            raise ValueError(
                f"validation_set must be 2-dimensional, got {samples.ndim} dimension(s)"
            )

        predictions = []
        for row in samples:
            node = self.tree
            # Walk down from the root until a leaf is reached.
            while 'prediction' not in node:
                if row[node['feature']] <= node['threshold']:
                    node = node['left']
                else:
                    node = node['right']
            predictions.append(node['prediction'])
        return predictions

In [None]:
# Train the tree on the training split (as NumPy arrays), then predict a label
# for every row of the validation split.
decision_tree = DecisionTree()
decision_tree.fit(X_train.values, y_train.values)
predictions = decision_tree.predict(X_validation)

Building decision tree...


In [None]:
# Show the predicted labels and the size of the validation target they are compared to
print(predictions)
print(y_validation.shape)

[0, 1]
(2,)


# Evaluation

In [None]:
def calculate_accuracy(y_true, y_pred):
    """Return the fraction of positions where ``y_pred`` equals ``y_true``.

    Args:
        y_true: array-like of true labels.
        y_pred: array-like of predicted labels, same shape as ``y_true``.

    Returns:
        Mean of the element-wise matches, a value between 0 and 1.

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    # Convert first: comparing two plain lists with == gives one bool for the whole
    # list, which would make the "accuracy" either 0.0 or 1.0.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    return np.mean(y_true == y_pred)

# Score the validation predictions against the true validation labels
accuracy = calculate_accuracy(y_validation.values, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.5
