## Decision Tree (DT): implementation from scratch

#### Loading data

In [None]:
# Download the sample CSV from Google Drive by file id (the output below shows it saved as sample_data.csv).
!gdown  1l53Fgkg1G1ekCxxgaDQ00EXrnSMTeJj-

Downloading...
From: https://drive.google.com/uc?id=1l53Fgkg1G1ekCxxgaDQ00EXrnSMTeJj-
To: /content/sample_data.csv
  0% 0.00/32.5k [00:00<?, ?B/s]100% 32.5k/32.5k [00:00<00:00, 62.2MB/s]


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the downloaded CSV into a DataFrame; later cells use its Attrition column as the label.
sample_data = pd.read_csv('sample_data.csv')

In [None]:
# Display the loaded frame to inspect its columns and a sample of rows.
sample_data

Unnamed: 0,Gender,Age_less_35,JobRole,Attrition
0,Male,True,Laboratory Technician,0
1,Male,False,Sales Executive,1
2,Male,True,Sales Representative,1
3,Female,False,Healthcare Representative,0
4,Male,True,Sales Executive,0
...,...,...,...,...
995,Male,False,Laboratory Technician,1
996,Female,False,Manufacturing Director,0
997,Female,True,Sales Executive,0
998,Male,False,Manager,0


In [None]:
# Class balance of the label column: number of rows per Attrition value.
sample_data.Attrition.value_counts()

0    831
1    169
Name: Attrition, dtype: int64

#### Entropy

In [None]:
def entropy(y):
    '''
        Shannon entropy (in bits) of the value distribution of y.

        y -> series (class labels)
        returns -> sum over the distinct values of -p*log2(p), where p is
                   the fraction of rows holding that value
        raises -> TypeError if y is not a Pandas Series
    '''
    if not isinstance(y, pd.Series):
        # A bare string cannot be raised: Python would replace it with a
        # generic "exceptions must derive from BaseException" TypeError and
        # the message below would never reach the caller.
        raise TypeError('Object must be a Pandas Series.')

    # calculating probability of each distinct value
    p = y.value_counts()/y.shape[0]

    # adding delta 1e-9 in case p = 0 as log(0) is not defined
    return np.sum(-p*np.log2(p+1e-9))



#### Weighted entropy for child nodes

In [None]:
def calculate_weighted_entropy(feature,y):
    '''
        Entropy of y inside each group defined by a distinct value of
        feature, averaged with each group weighted by its share of the rows.

        feature -> series (Gender/ Age < 35)
        y -> series
    '''
    total_rows = y.shape[0]
    result = 0

    for value in feature.unique():
        # rows of y whose feature equals the current value
        subset = y[feature == value]
        subset_entropy = entropy(subset)
        result += subset.shape[0]/total_rows*subset_entropy

    return result



#### Information gain

In [None]:
def information_gain(feature,y):
    '''
        Drop in the entropy of y achieved by splitting on feature:
        entropy of y minus the weighted entropy of the resulting groups.

        feature -> series
        y -> series
    '''
    before_split = entropy(y)
    after_split = calculate_weighted_entropy(feature,y)
    return before_split - after_split

#### Entropy at root node

In [None]:
# Entropy of the full Attrition column, i.e. before any split; information_gain() uses this as the parent entropy.
entropy(sample_data.Attrition)

0.6554120789588516

#### Weighted entropy of child nodes for Gender

In [None]:
# Entropy of Attrition within each Gender value, averaged with weights proportional to group size.
calculate_weighted_entropy(sample_data.Gender, sample_data.Attrition)

0.6554087812684072

#### Information Gain for Gender

In [None]:
# Entropy of Attrition minus its Gender-weighted child entropy.
information_gain(sample_data.Gender,sample_data.Attrition)

3.2976904443815513e-06

#### Comparing Information gain for features

In [None]:
# Score every column except the last one (Attrition, the label) so the features can be compared by information gain.
for feature in sample_data.columns[:-1]:
    print(f'Information Gain for feature {feature} is {information_gain(sample_data[feature],sample_data.Attrition)}')

Information Gain for feature Gender is 3.2976904443815513e-06
Information Gain for feature Age_less_35 is 0.021463064919737374
Information Gain for feature JobRole is 0.05429906805101925
