#### CODE : Entropy 

In [151]:
import numpy as np

In [152]:
def entropy(var):
    """Return the Shannon entropy, in bits, of the values in ``var``.

    Parameters
    ----------
    var : array-like
        Observed class labels (NumPy array, pandas Series, list, ...).
        Multi-dimensional input is flattened, because ``np.unique`` counts
        over all elements.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct values of ``var``, where ``p``
        is the relative frequency of each value.  ``0.0`` is returned for an
        empty or single-valued input.
    """
    _, counts = np.unique(np.asarray(var), return_counts=True)

    total = counts.sum()
    if total == 0:
        # No observations: there is no uncertainty to measure.
        return 0.0

    # Normalise by the number of counted elements rather than var.shape[0],
    # so the probabilities sum to 1 even when the input is not 1-D.
    probs = counts / total

    # np.unique only reports values that occur, so every p is > 0 and log2 is
    # finite.  Adding 0.0 turns the IEEE "-0.0" of a pure sample into "0.0".
    return float(-np.sum(probs * np.log2(probs))) + 0.0

In [153]:
# Balanced binary sample (four 1s, four 0s); passed to entropy() below.
Y = np.array([1,0,0,1,0,1,0,1])

In [154]:
# Single-valued sample (all 1s); passed to entropy() below.
X = np.array([1,1,1,1,1,1,1])

In [155]:
entropy(Y)

1.0

In [156]:
entropy(X)

-0.0

### CODE : Split Data

In [157]:
import pandas as pd

In [158]:
# Load the employee table from CSV.
df = pd.read_csv('file.csv')
# Name of the label column.  This is a module-level name that
# information_gain() and DecisionTree.train() read directly.
output_feature='job_satisfaction'

In [159]:
df.head()

Unnamed: 0,emp_id,department,job_role,over_time,performance,attrition,job_satisfaction
0,1,hr,hr,yes,excellent,yes,very_satisfied
1,2,marketing,marketing,yes,excellent,no,very_satisfied
2,3,finance,finance,no,good,no,satisfied
3,4,it,operation,yes,bad,yes,dissatisfied
4,5,hr,hr,no,excellent,yes,neutral


In [160]:
# NOTE(review): the commented-out lines below are dead code; they look like an
# earlier attempt to strip the first column by round-tripping through a list.
# test=df.values.tolist();
# print(test)
# for i in range(len(test)):
#     test[i].pop(0)
# print(test)

# df=pd.DataFrame(test);
# df.head(5)
# `df` is rebound here to a second CSV, replacing the frame read from
# 'file.csv' in the earlier cell; every later cell uses this frame.
df=pd.read_csv('../temp.csv')
print(df)

   department   job_role over_time performance attrition job_satisfaction
0          hr         hr       yes   excellent       yes   very_satisfied
1   marketing  marketing       yes   excellent        no   very_satisfied
2     finance    finance        no        good        no        satisfied
3          it  operation       yes         bad       yes     dissatisfied
4          hr         hr        no   excellent       yes          neutral
5     finance    finance       yes        good        no     dissatisfied
6          hr         hr        no   excellent       yes        satisfied
7   marketing  marketing       yes         bad        no   very_satisfied
8          hr         hr        no   excellent       yes   very_satisfied
9          it  operation       yes         bad        no        satisfied
10         hr         hr        no   excellent       yes     dissatisfied
11         it  operation       yes        good        no        satisfied
12  marketing  marketing        no    

In [161]:
def divide_data(data, feature):
    """Partition ``data`` by the distinct values of column ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to split.  It is not modified.
    feature : column label
        Column whose values define the partitions.

    Returns
    -------
    dict
        Maps each distinct value of ``feature`` to
        ``{'data': <DataFrame>, 'len': <int>}`` where ``'data'`` holds the
        rows having that value (original index, column order and dtypes are
        kept) and ``'len'`` is the number of such rows.  Keys appear in
        ``value_counts()`` order, i.e. most frequent value first.  Rows whose
        ``feature`` is missing are not placed in any partition, because
        ``value_counts()`` skips missing values.
    """
    column = data[feature]

    DATA = {}
    # One boolean mask per distinct value replaces growing each partition a
    # row at a time, which copied the whole partition on every append and
    # relied on the private DataFrame._append.
    for val, n_rows in column.value_counts().items():
        DATA[val] = {'data': data[column == val], 'len': int(n_rows)}

    return DATA

In [162]:
divide_data(df,'department')

{'hr': {'data':    department job_role over_time performance attrition job_satisfaction
  0          hr       hr       yes   excellent       yes   very_satisfied
  4          hr       hr        no   excellent       yes          neutral
  6          hr       hr        no   excellent       yes        satisfied
  8          hr       hr        no   excellent       yes   very_satisfied
  10         hr       hr        no   excellent       yes     dissatisfied,
  'len': 5},
 'marketing': {'data':    department   job_role over_time performance attrition job_satisfaction
  1   marketing  marketing       yes   excellent        no   very_satisfied
  7   marketing  marketing       yes         bad        no   very_satisfied
  12  marketing  marketing        no         bad        no        satisfied
  14  marketing  marketing        no         bad        no        satisfied,
  'len': 4},
 'finance': {'data':    department job_role over_time performance attrition job_satisfaction
  2     finance  fin

### CODE : Information Gain

In [163]:
def information_gain(data, feature, target=None):
    """Return the information gain of splitting ``data`` on ``feature``.

    The gain is the entropy of the label column minus the size-weighted
    average entropy of the label column inside each partition produced by
    ``divide_data(data, feature)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing both ``feature`` and the label column.
    feature : column label
        Candidate column to split on.
    target : column label, optional
        Label column.  When omitted, the module-level ``output_feature`` is
        used, which is what existing two-argument callers rely on.

    Returns
    -------
    float
        Entropy of the labels before the split minus the weighted entropy of
        the labels after it.
    """
    if target is None:
        # Fall back to the notebook-wide label column for existing callers.
        target = output_feature

    examples = data.shape[0]
    partitions = divide_data(data, feature)

    # Weight each partition's entropy by the share of rows it holds.
    ent_of_children = sum(
        (part['len'] / examples) * entropy(part['data'][target])
        for part in partitions.values()
    )

    return entropy(data[target]) - ent_of_children

In [164]:
information_gain(df, 'department')

0.5316112300691394

In [165]:
information_gain(df, 'job_role')

0.5316112300691394

In [166]:
information_gain(df, 'over_time')

0.19679163198163652

In [167]:
information_gain(df, 'performance')

0.3655962303576019

In [168]:
information_gain(df, 'attrition')

0.2537751634665262

### Constructing a Decision Tree


In [169]:
class DecisionTree:
    """One node of a decision tree over categorical features.

    Each node stores the majority label of the rows it was trained on
    (``target``), the feature with the highest information gain on those rows
    (``fkey``) and, for internal nodes, one child node per observed value of
    that feature (``children``).  The label column is the module-level
    ``output_feature``.
    """

    # constructor
    def __init__(self, depth=0, max_depth=5):
        """Create an untrained node.

        Parameters
        ----------
        depth : int
            Distance of this node from the root; also used to indent the
            progress messages printed by ``train`` and ``predict``.
        max_depth : int
            Nodes at this depth (or deeper) are not split any further.
        """
        self.children = {}          # feature value -> child DecisionTree
        self.fkey = None            # feature this node splits on
        self.max_depth = max_depth
        self.depth = depth
        self.target = None          # majority label at this node

    def _announce_leaf(self):
        """Print the leaf message for this node, indented by its depth."""
        # str() keeps this working when labels are not strings.
        print("\t"*self.depth + "Leaf Node : " + str(self.target))

    def train(self, data):
        """Fit this node, and recursively its children, on ``data``.

        Parameters
        ----------
        data : pandas.DataFrame
            Training rows.  Must contain the ``output_feature`` column; every
            other column is treated as a candidate feature.

        Raises
        ------
        ValueError
            If ``data`` contains no labels.
        """
        # Training again must not keep branches from a previous fit.
        self.children = {}

        # Giving a target label to the Node: the most frequent label.
        label_counts = data[output_feature].value_counts()
        if label_counts.empty:
            raise ValueError("cannot train a DecisionTree node on data without labels")
        self.target = label_counts.index[int(np.argmax(label_counts.values))]

        # Every column except the label is a candidate, wherever the label
        # column happens to sit in the frame.
        features = [col for col in data.columns if col != output_feature]
        if not features:
            # Nothing to split on.
            self._announce_leaf()
            return

        # finding the best feature
        info_gains = [information_gain(data, f) for f in features]
        self.fkey = features[int(np.argmax(info_gains))]

        # Splitting the Data
        DATA = divide_data(data, self.fkey)

        ###### STOPPING CONDITIONS ######

        # 1. Pure node: only one label is left, so no split can help.
        is_pure = len(label_counts) < 2

        # 2. The best feature has a single value here, so splitting on it
        #    would reproduce the same rows in one child.
        branches_with_data = sum(1 for key in DATA if DATA[key]['len'] > 0)

        # 3. Early stop if the depth limit has been reached.
        depth_exhausted = self.depth >= self.max_depth

        if is_pure or branches_with_data < 2 or depth_exhausted:
            self._announce_leaf()
            return

        print("\t"*self.depth + "Node : ", self.fkey)

        # Recursively train child Nodes.  max_depth is passed down so a
        # user-supplied limit applies to the whole tree, not only the root.
        for key in DATA:
            child = DecisionTree(depth=self.depth + 1, max_depth=self.max_depth)
            self.children[key] = child
            child.train(DATA[key]['data'])

        return

    def predict(self, test):
        """Return the predicted label for one sample.

        Parameters
        ----------
        test : pandas.DataFrame or mapping-like
            Either a DataFrame, in which case its first row is classified
            regardless of the frame's index labels, or an object such as a
            Series or dict where ``test[feature]`` is the sample's value.

        Returns
        -------
        The ``target`` of the leaf reached, or the ``target`` of the deepest
        node reached when the sample carries a feature value that was not
        seen while training that node.
        """
        if not self.children:
            return self.target
        print("\t"*self.depth + "Testing Node for - ", self.fkey)

        value = test[self.fkey]
        # A DataFrame yields a column here; take its first row by position so
        # frames whose index does not start at 0 work too.
        positional = getattr(value, "iloc", None)
        if positional is not None:
            value = positional[0]

        child = self.children.get(value)
        if child is None:
            # Unseen category: fall back to this node's majority label.
            return self.target
        return child.predict(test)

# Model

In [170]:
model = DecisionTree()

In [171]:
model.train(df)

Node :  department
	Node :  over_time
		Leaf Node : neutral
		Leaf Node : very_satisfied
	Node :  over_time
		Leaf Node : very_satisfied
		Leaf Node : satisfied
	Node :  over_time
		Leaf Node : dissatisfied
		Leaf Node : satisfied
	Node :  attrition
		Leaf Node : satisfied
		Leaf Node : dissatisfied


In [172]:
model

<__main__.DecisionTree at 0x1472f0850>

In [173]:
model.target

'satisfied'

In [174]:
model.fkey

'department'

In [175]:
model.children

{'hr': <__main__.DecisionTree at 0x1470bc4c0>,
 'marketing': <__main__.DecisionTree at 0x144edcd30>,
 'finance': <__main__.DecisionTree at 0x1466f1e10>,
 'it': <__main__.DecisionTree at 0x1449bf550>}

In [176]:
# NOTE(review): the keys displayed for model.children in the previous cell are
# 'hr', 'marketing', 'finance' and 'it'; '2' is not among them, which is why
# this lookup raises the KeyError recorded below.
model.children['2']

KeyError: '2'

In [None]:
model.children['1'].fkey

'credit_rating'

In [None]:
model.children['2'].children

{'fair': <__main__.DecisionTree at 0x1446d77f0>,
 'excellent': <__main__.DecisionTree at 0x1446d57e0>}

In [None]:
# model.children['2'].children['_'].children

{}

In [None]:
model.children['_'].target

'yes'

In [None]:
model.children['_'].children

{}

# Prediction 

In [None]:
# NOTE(review): test_data is not referenced by any later cell shown here.
test_data=[]
# One-row frame holding the sample to classify, labelled with every column of
# `df` except the last one.
# NOTE(review): four values are supplied, while the `df` printed earlier has
# five columns before the label — confirm this cell matches the current data.
x_test = pd.DataFrame([['<=30', 'low', 'no', 'fair']], columns=list(df.columns.values[:-1]))

In [None]:
x_test

Unnamed: 0,age,income,student,credit_rating
0,<=30,low,no,fair


In [None]:
model.predict(x_test)

Testing Node for -  age
	Testing Node for -  student


'no'