<a href='https://www.darshan.ac.in/'> <img src='https://www.darshan.ac.in/Content/media/DU_Logo.svg' width="250" height="300"/></a>
<pre>
<center><h1><b>Data Mining</b></h1></center>
</pre>



# Implement Decision Tree (ID3) in Python
ID3 uses Information Gain to choose the best feature to split on.

It recursively builds the tree until one of the stopping conditions is met.

1) Calculate Entropy for the dataset.<BR>
2) Calculate Information Gain for each feature. <BR>
3) Choose the feature with maximum Information Gain. <BR>
4) Split dataset into subsets for that feature. <BR>
5) Repeat recursively until one of the following holds: <BR>

- All samples in a node have the same label.
- No features are left.
- No data is left.

### Dataset: this notebook uses the local file `Heart Disease.csv`, loaded below from the working directory.

##  import Pandas, Numpy

In [1]:
import pandas as pd
import numpy as np

## Load the Following Data

In [None]:
# Load the heart-disease table from a CSV file in the working directory.
# on_bad_lines='skip' drops malformed rows instead of raising, so the number
# of rows loaded may be smaller than the number of lines in the file.
data = pd.read_csv("Heart Disease.csv", encoding='latin1', on_bad_lines='skip')     

In [3]:
# Preview the loaded frame; pandas elides the middle rows of a long frame.
data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3070,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
3071,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
3072,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
3073,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


## Define a Function to Calculate Entropy

In [4]:
def entropy(y):
    """Return the Shannon entropy, in bits, of the labels in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels (anything ``np.unique`` accepts, e.g. a list or a
        pandas Series).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label; zero when every label is the same.
    """
    # Only the class frequencies matter; the distinct labels are not needed.
    _, counts = np.unique(y, return_counts=True)
    proportions = counts / counts.sum()
    return -np.sum(proportions * np.log2(proportions))
  

## Define function to Calculate Information Gain

In [5]:
def information_gain(data, split_attribute, target):
    """Return the information gain from splitting ``data`` on one attribute.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain ``split_attribute`` and ``target``.
    split_attribute : column label
        Column whose distinct values define the partitions.
    target : column label
        Column holding the class labels.

    Returns
    -------
    float
        Entropy of ``target`` over all rows minus the size-weighted average
        entropy of ``target`` within each partition.
    """
    column = data[split_attribute]
    values, counts = np.unique(column, return_counts=True)
    n_rows = counts.sum()

    # Expected entropy after the split: each partition's entropy weighted by
    # the share of rows that fall into it.
    remainder = 0.0
    for value, count in zip(values, counts):
        branch_labels = data.loc[column == value, target]
        remainder += (count / n_rows) * entropy(branch_labels)

    return entropy(data[target]) - remainder
   

## Implement ID3 Algo

In [6]:
def id3(data, features, target):
    """Recursively build an ID3 decision tree as nested dictionaries.

    At each node the feature with the highest information gain is chosen and
    one branch is created per distinct value of that feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows; must contain ``target`` and every column in
        ``features``.
    features : sequence of column labels
        Candidate attributes to split on (a list or a pandas ``Index``).
    target : column label
        Name of the class-label column.

    Returns
    -------
    object
        A class label for a leaf, otherwise a dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
        ``None`` when ``data`` has no rows.
    """
    # If no data is left → there is nothing to learn from
    if len(data) == 0:
        return None

    # If all labels are same → return the label
    if len(np.unique(data[target])) == 1:
        return data[target].iloc[0]

    # If no features left → return majority label
    features = list(features)
    if not features:
        return data[target].mode()[0]

    # Choose best feature (first one wins on ties, as np.argmax does)
    gains = [information_gain(data, feature, target) for feature in features]
    best_feature = features[int(np.argmax(gains))]

    # The candidate list for every child is the same, so build it once.
    remaining_features = [f for f in features if f != best_feature]

    tree = {best_feature: {}}

    # For each value of best feature → branch
    for value in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == value].drop(columns=[best_feature])
        if len(sub_data) == 0:
            # A value that matches no row (e.g. NaN, which never compares
            # equal to itself) leaves nothing to recurse on; fall back to the
            # majority label of the current node instead of failing deeper
            # in the recursion.
            tree[best_feature][value] = data[target].mode()[0]
        else:
            tree[best_feature][value] = id3(sub_data, remaining_features, target)

    # Return the tree
    return tree

## Use ID3

In [7]:
# Every column except the last is offered as a candidate feature; this relies
# on 'target' being the final column of `data`.
id3_tree = id3(data, data.columns[:-1], 'target')

## Print Tree

In [8]:
# The tree is a nested dict: {feature: {feature value: subtree or class label}}.
print("ID3 Decision Tree: ", id3_tree)


ID3 Decision Tree:  {'chol': {np.int64(126): np.int64(1), np.int64(131): np.int64(0), np.int64(141): np.int64(1), np.int64(149): {'age': {np.int64(49): np.int64(0), np.int64(71): np.int64(1)}}, np.int64(157): np.int64(1), np.int64(160): np.int64(1), np.int64(164): np.int64(0), np.int64(166): np.int64(0), np.int64(167): np.int64(0), np.int64(168): np.int64(1), np.int64(169): np.int64(0), np.int64(172): np.int64(0), np.int64(174): np.int64(0), np.int64(175): np.int64(1), np.int64(176): np.int64(0), np.int64(177): {'age': {np.int64(43): np.int64(0), np.int64(46): np.int64(1), np.int64(59): np.int64(0), np.int64(65): np.int64(1)}}, np.int64(178): np.int64(1), np.int64(180): np.int64(1), np.int64(182): np.int64(1), np.int64(183): np.int64(1), np.int64(184): np.int64(0), np.int64(185): np.int64(0), np.int64(186): np.int64(1), np.int64(187): np.int64(0), np.int64(188): np.int64(0), np.int64(192): np.int64(1), np.int64(193): {'age': {np.int64(56): np.int64(1), np.int64(68): np.int64(0)}}, np.i