<a href="https://colab.research.google.com/github/suryathotapalli/Machine-Learning-/blob/main/Lab_08___Decision_Trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## A1

Import the necessary libraries

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

Define the class labels (the poem forms to be classified)

In [None]:
# Names of the poem forms used as class labels, one per line for readability
class_labels = [
    "acrostic",
    "ballad",
    "epigram",
    "haiku",
    "limerick",
    "sestina",
    "sonnet",
    "villanelle",
]

Load embeddings from .csv file

In [None]:
# Read the dataset and discard every row that contains a missing value
data_df = pd.read_csv("poems_data.csv").dropna()

# Separate the target column from the remaining (feature) columns
y = data_df['label'].values
X = data_df.drop(columns=['label']).values

Splitting data into Train and Test datasets

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Define the Decision Tree class

In [None]:
import numpy as np
from collections import Counter

class DecisionTree:
    """ID3-style decision tree classifier built on information gain.

    Every split is multiway: each distinct value of the chosen feature becomes
    its own branch. Features are therefore treated as categorical; a continuous
    feature yields one branch per distinct value, so such features should be
    discretised (binned) before fitting if the tree is expected to generalise.

    The fitted tree is stored in ``self.tree`` as nested dicts:

    * leaf:          ``{'label': <class label>}``
    * internal node: ``{'feature': <column index>,
                        'majority': <most frequent training label at this node>,
                        'children': {<feature value>: <subtree>, ...}}``
    """

    def __init__(self):
        # Root of the fitted tree; stays None until fit() is called.
        self.tree = None

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: if the training set is empty or X and y differ in length.
        """
        # Boolean-mask indexing below needs real arrays, so accept lists too.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        self.tree = self._build_tree(X, y)

    @staticmethod
    def _majority_label(y):
        """Return the most frequent label in ``y`` (first seen wins a tie)."""
        return Counter(y).most_common(1)[0][0]

    def _entropy(self, y):
        """Return the Shannon entropy (base 2) of the labels in ``y``."""
        total = len(y)
        entropy = 0
        for count in Counter(y).values():
            prob = count / total
            entropy -= prob * np.log2(prob)
        return entropy

    def _information_gain(self, X, y, feature_index):
        """Return the entropy reduction from splitting on one feature.

        The split is multiway: one partition per distinct value found in
        column ``feature_index`` of ``X``.
        """
        column = X[:, feature_index]
        values, counts = np.unique(column, return_counts=True)
        weighted_entropy = 0
        for value, count in zip(values, counts):
            weighted_entropy += (count / len(y)) * self._entropy(y[column == value])
        return self._entropy(y) - weighted_entropy

    def _find_best_split(self, X, y):
        """Return the index of the feature with the highest information gain.

        Returns None when no feature has a strictly positive gain, i.e. no
        split can separate the labels any further.
        """
        best_gain = 0
        best_feature = None
        for feature_index in range(X.shape[1]):
            gain = self._information_gain(X, y, feature_index)
            if gain > best_gain:
                best_gain = gain
                best_feature = feature_index
        return best_feature

    def _build_tree(self, X, y):
        """Recursively build and return the (sub)tree for the samples given."""
        if len(set(y)) == 1:
            return {'label': y[0]}

        majority = self._majority_label(y)
        best_feature = self._find_best_split(X, y)
        if best_feature is None:
            # No feature separates the remaining labels (e.g. identical rows
            # with different labels). Indexing with None would recurse forever,
            # so stop here with the most frequent label.
            return {'label': majority}

        column = X[:, best_feature]
        # 'majority' is kept so prediction has a fallback for unseen values.
        node = {'feature': best_feature, 'majority': majority, 'children': {}}
        for value in np.unique(column):
            mask = column == value
            if not mask.any():
                # NaN never compares equal to itself, so it selects no rows;
                # such samples fall back to the node majority at predict time.
                continue
            node['children'][value] = self._build_tree(X[mask], y[mask])
        return node

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``.

        Raises:
            RuntimeError: if called before fit().
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        return [self._predict_sample(sample, self.tree) for sample in X]

    def _predict_sample(self, sample, tree):
        """Walk ``tree`` for a single sample and return the predicted label.

        If the sample holds a feature value that was not seen at a node during
        training, the most frequent training label of that node is returned.
        """
        while 'label' not in tree:
            child = tree['children'].get(sample[tree['feature']])
            if child is None:
                return tree['majority']
            tree = child
        return tree['label']

In [None]:
# Train the hand-written decision tree on the training split
dt = DecisionTree()
dt.fit(X_train, y_train)

# Show the learned tree (nested dicts) under its heading
print("Decision Tree Structure:", dt.tree, sep="\n")

# Classify the held-out samples, then show the predicted labels
predictions = dt.predict(X_test)
print("Predictions:", predictions, sep="\n")

Decision Tree Structure:
{'feature': 0, 'children': {-0.5675428: {'label': 2.0}, -0.5177372: {'label': 2.0}, -0.42911008: {'label': 4.0}, -0.35730004: {'label': 2.0}, -0.34059384: {'label': 4.0}, -0.3391202: {'label': 4.0}, -0.33297166: {'label': 4.0}, -0.3243278: {'label': 2.0}, -0.32309258: {'label': 4.0}, -0.32296684: {'label': 4.0}, -0.3074702: {'label': 4.0}, -0.30156597: {'label': 3.0}, -0.29620895: {'label': 4.0}, -0.288056: {'label': 2.0}, -0.28178996: {'label': 4.0}, -0.28171024: {'label': 2.0}, -0.2816199: {'label': 3.0}, -0.27063403: {'label': 4.0}, -0.26343188: {'label': 4.0}, -0.2613473: {'label': 4.0}, -0.26095122: {'label': 1.0}, -0.25508666: {'label': 3.0}, -0.25124165: {'label': 4.0}, -0.24777532: {'label': 4.0}, -0.24596791: {'label': 2.0}, -0.23899138: {'label': 2.0}, -0.23716187: {'label': 5.0}, -0.23574047: {'label': 4.0}, -0.23568916: {'label': 4.0}, -0.22403383: {'label': 4.0}, -0.22245936: {'label': 4.0}, -0.21741644: {'label': 3.0}, -0.21725368: {'label': 3.0},

TypeError: '>' not supported between instances of 'dict' and 'dict'