# **ML Lab - Decision Trees**
Urlana Suresh Kumar - 22071A6662

# Decision Trees and Bag of Words in Python

In this notebook, we will explore two different concepts:
1. **Decision Trees**: Using Gini index and entropy for training decision tree classifiers.
2. **Bag of Words**: Using `CountVectorizer` to convert text data into a matrix of token counts.

## Step 1: Importing Required Libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer

## Step 2: Importing Dataset


In [None]:
# Default location of the UCI balance-scale data (comma-separated, no header row)
BALANCE_SCALE_URL = (
    'https://archive.ics.uci.edu/ml/machine-learning-'
    'databases/balance-scale/balance-scale.data'
)


# Function importing Dataset
def importdata(source=BALANCE_SCALE_URL):
    """Load the balance-scale dataset and print a short summary of it.

    Parameters
    ----------
    source : str, path or file-like, optional
        Anything ``pd.read_csv`` accepts. Defaults to ``BALANCE_SCALE_URL``,
        so calling ``importdata()`` with no arguments behaves as before.
        Pass a local path or an in-memory buffer to work offline.

    Returns
    -------
    pandas.DataFrame
        The parsed table. Because the file is read with ``header=None``,
        the columns are labelled with integers starting at 0.
    """
    balance_data = pd.read_csv(source, sep=',', header=None)

    # Printing the dataset size, shape and first few rows
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)
    print("Dataset: ", balance_data.head())
    return balance_data

## Step 3: Splitting Dataset into Training and Testing

In [None]:
# Function to split the dataset
def splitdataset(balance_data):
    """Separate features from labels and carve out a held-out test set.

    Column 0 of ``balance_data`` is taken as the target and columns 1-4 as
    the features. 30% of the rows go to the test set; the split is made
    repeatable by the fixed ``random_state``.

    Returns
    -------
    tuple
        ``(X, Y, X_train, X_test, y_train, y_test)``.
    """
    table = balance_data.values

    # Target lives in the first column, the predictors in the next four
    labels = table[:, 0]
    features = table[:, 1:5]

    partitions = train_test_split(
        features, labels, test_size=0.3, random_state=100)

    return (features, labels, *partitions)

## Step 4: Training Decision Tree using Gini Index

In [None]:
# Function to perform training with Gini Index
def train_using_gini(X_train, X_test, y_train):
    """Fit a shallow decision tree that splits on Gini impurity.

    Parameters
    ----------
    X_train : array-like
        Training features, passed straight to ``fit``.
    X_test : array-like
        Not used here; kept so the signature matches existing callers.
    y_train : array-like
        Training labels, passed straight to ``fit``.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    # Only the Gini tree is built here; the entropy variant has its own function.
    clf_gini = DecisionTreeClassifier(
        criterion="gini",
        random_state=100,
        max_depth=3,
        min_samples_leaf=5,
        class_weight='balanced',
    )

    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini

## Step 5: Training Decision Tree using Entropy

In [None]:
# Function to perform training with Entropy
def train_using_entropy(X_train, X_test, y_train):
    """Fit a shallow decision tree that splits on entropy (information gain).

    Parameters
    ----------
    X_train : array-like
        Training features, passed straight to ``fit``.
    X_test : array-like
        Not used here; kept so the signature matches existing callers.
    y_train : array-like
        Training labels, passed straight to ``fit``.

    Returns
    -------
    DecisionTreeClassifier
        The fitted classifier.
    """
    # NOTE(review): train_using_gini passes class_weight='balanced' but this
    # tree does not, so the two results are not a like-for-like comparison
    # of the split criteria. Confirm whether that difference is intended.
    clf_entropy = DecisionTreeClassifier(
        criterion="entropy", random_state=100, max_depth=3, min_samples_leaf=5)

    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy


# Backward-compatible alias: the function was originally defined under this
# misspelled name and existing callers (e.g. main()) still use it.
tarin_using_entropy = train_using_entropy

## Step 6: Making Predictions

In [None]:
# Function to make predictions
def prediction(X_test, clf_object):
    """Predict labels for ``X_test`` with ``clf_object`` and echo them.

    Parameters
    ----------
    X_test : array-like
        Samples to classify.
    clf_object : object
        Any object exposing ``predict(X)``.

    Returns
    -------
    The value returned by ``clf_object.predict(X_test)``.
    """
    y_pred = clf_object.predict(X_test)
    # Heading and predictions go out on separate lines
    print("Predicted values:", y_pred, sep="\n")
    return y_pred

## Step 7: Calculating Accuracy

In [None]:
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
    """Print the confusion matrix, accuracy (in percent) and per-class report.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    None
        The metrics are only printed.
    """
    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix: ", matrix)

    # accuracy_score's result is scaled by 100 to show a percentage
    percent_correct = accuracy_score(y_test, y_pred) * 100
    print("Accuracy : ", percent_correct)

    # zero_division=0 reports 0 for undefined metrics (e.g. a class that is
    # never predicted) instead of emitting a warning.
    summary = classification_report(y_test, y_pred, zero_division=0)
    print("Report : ", summary)

# Main Execution Code


In [None]:
# Driver code
def main():
    """Run the whole experiment: load, split, train both trees, report.

    Both classifiers are trained first; afterwards each one is evaluated on
    the same test split, Gini first and entropy second.
    """
    # Building phase
    dataset = importdata()
    _, _, X_train, X_test, y_train, y_test = splitdataset(dataset)
    gini_tree = train_using_gini(X_train, X_test, y_train)
    entropy_tree = tarin_using_entropy(X_train, X_test, y_train)

    # Operational phase: same predict-then-score routine for each model
    reports = (
        ("Results Using Gini Index:", gini_tree),
        ("Results Using Entropy:", entropy_tree),
    )
    for heading, tree in reports:
        print(heading)
        predicted = prediction(X_test, tree)
        cal_accuracy(y_test, predicted)


# Run the experiment when this cell/script is executed directly
if __name__ == "__main__":
    main()

Dataset Length:  625
Dataset Shape:  (625, 5)
Dataset:     0  1  2  3  4
0  B  1  1  1  1
1  R  1  1  1  2
2  R  1  1  1  3
3  R  1  1  1  4
4  R  1  1  1  5
Results Using Gini Index:
Predicted values:
['R' 'B' 'R' 'B' 'R' 'B' 'R' 'L' 'B' 'R' 'R' 'B' 'B' 'B' 'R' 'B' 'R' 'L'
 'R' 'R' 'B' 'R' 'B' 'L' 'R' 'L' 'B' 'L' 'R' 'L' 'L' 'L' 'R' 'L' 'L' 'L'
 'B' 'L' 'B' 'B' 'L' 'L' 'L' 'L' 'R' 'R' 'L' 'L' 'R' 'L' 'B' 'R' 'B' 'B'
 'R' 'L' 'R' 'R' 'L' 'B' 'B' 'L' 'L' 'B' 'L' 'B' 'B' 'R' 'L' 'L' 'B' 'L'
 'R' 'L' 'R' 'L' 'L' 'R' 'R' 'B' 'R' 'B' 'L' 'B' 'R' 'R' 'R' 'L' 'R' 'L'
 'B' 'R' 'L' 'L' 'L' 'R' 'R' 'L' 'B' 'B' 'L' 'L' 'L' 'R' 'R' 'R' 'B' 'R'
 'R' 'B' 'L' 'L' 'R' 'R' 'L' 'B' 'R' 'L' 'B' 'R' 'B' 'R' 'R' 'L' 'B' 'L'
 'L' 'L' 'L' 'R' 'R' 'R' 'R' 'L' 'R' 'R' 'R' 'L' 'L' 'R' 'L' 'R' 'L' 'R'
 'L' 'B' 'R' 'L' 'B' 'R' 'L' 'R' 'B' 'L' 'R' 'R' 'L' 'R' 'R' 'R' 'L' 'L'
 'B' 'L' 'R' 'L' 'R' 'R' 'B' 'R' 'B' 'L' 'B' 'R' 'B' 'B' 'R' 'L' 'L' 'R'
 'B' 'R' 'L' 'L' 'L' 'L' 'R' 'R']
Confusion Matrix:  [[ 2  5  6]
 [2

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Bag of Words Example using CountVectorizer

In [None]:
# Toy corpus: three short reviews that share most of their words
sentences = [
    "This pasta is very tasty and affordable.",
    "This pasta is not tasty and is affordable.",
    "This pasta is very very delicious.",
]

# Vectorizer that turns each sentence into a row of token counts
countvectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and count tokens in a single pass
X = countvectorizer.fit_transform(sentences)

# fit_transform's result is turned into a plain array so the counts print in full
result = X.toarray()
print(f"Result of bag of words: {result}")


Result of bag of words: [[1 1 0 1 0 1 1 1 1]
 [1 1 0 2 1 1 1 1 0]
 [0 0 1 1 0 1 0 1 2]]
