In [1]:
import pandas as pd
import numpy as np
eps = np.finfo(float).eps
from numpy import log2 as log
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
#reading only the columns containing categorical data from the input file along with the target column
df2=pd.read_csv('train.csv',usecols=['sales','salary','Work_accident','promotion_last_5years','left'])
#appending the 'left' column as the last column
left=df2['left']
df2=df2.drop(columns=['left'])
df2=df2.join(left)
#splitting the data for training and validation
msk = np.random.rand(len(df2)) < 0.8
train = df2[msk]
test = df2[~msk]

In [3]:
def find_entropy(df):
    target = df.keys()[-1]   
    entropy = 0
    values = df[target].unique()
    for value in values:
        fraction = df[target].value_counts()[value]/(len(df[target])+eps)
        entropy += -fraction*log(fraction+eps)
    return entropy  

In [6]:
def find_entropy_attribute(df,attribute):
    target = df.keys()[-1]   
    #Will return the unique values present in the target column 'left'
    target_variables = df[target].unique() 
    variables = df[attribute].unique()
    entropy2 = 0
    for variable in variables:
        entropy = 0
        for target_variable in target_variables:
            num = len(df[attribute][df[attribute]==variable][df[target] ==target_variable])
            den = len(df[attribute][df[attribute]==variable])
            fraction = num/(den+eps)
            entropy += -fraction*log(fraction+eps)
        fraction2 =(den)/len(df)
        entropy2 += -fraction2*entropy
    return abs(entropy2)

In [5]:
def find_winner(df):
    Entropy_att = []
    IG = []
    for key in df.keys()[:-1]:       
        IG.append(find_entropy(df)-find_entropy_attribute(df,key))
    return df.keys()[:-1][np.argmax(IG)]

In [7]:
def get_subtable(df, node,value):
    return df[df[node] == value].reset_index(drop=True)

In [8]:
def buildTree(df,tree=None): 
    
    if len(df.columns)==1:
        return df['left'].median()
    Class = df.keys()[-1]   

    #Get attribute with maximum information gain
    node = find_winner(df)
    
    #Get distinct value of that attribute
    attValue = np.unique(df[node])
    
    #Create an empty dictionary to create tree    
    if tree is None:                    
        tree={}
        tree[node] = {}

    for value in attValue:
        
        subtable = get_subtable(df,node,value)
        subtable=subtable.drop(columns=[node])
        clValue,counts = np.unique(subtable['left'],return_counts=True)                        
        
        if len(counts)==1:#Checking purity of subset
            tree[node][value] = clValue[0]                                                    
        else:        
            tree[node][value] = buildTree(subtable) #Calling the function recursively 
                   
    return tree


In [11]:
#This function is used to predict for any input variable 
def predict(inst,tree):
    for nodes in tree.keys():        
        
        value = inst[nodes]
        tree = tree[nodes][value]
        prediction = 0
            
        if type(tree) is dict:
            prediction = predict(inst, tree)
        else:
            return tree
            break;                               
    return prediction



In [12]:
def testing(data,tree):
    queries = data.iloc[:,:-1].to_dict(orient = "records")
    predicted = pd.DataFrame(columns=['predicted'])
    for i in range(len(data)):
        predicted.loc[i,"predicted"]=predict(queries[i],tree) 
    print accuracy_score(data['left'],predicted)
    print confusion_matrix(data['left'],predicted)
    print classification_report(data['left'],predicted)
    

In [13]:
#building tree and printing the tree formed. Testing the model over the 20% test data for validation.

tree=buildTree(train)
# import pprint
# pprint.pprint(tree)

#for validation, predicting the value of the test data(20% of the original data)
testing(test,tree)

0.7507802050824788
[[1683    0]
 [ 559    1]]
              precision    recall  f1-score   support

           0       0.75      1.00      0.86      1683
           1       1.00      0.00      0.00       560

   micro avg       0.75      0.75      0.75      2243
   macro avg       0.88      0.50      0.43      2243
weighted avg       0.81      0.75      0.64      2243



In [14]:
# for testing against test_sample.csv file
def predictleft(tree,filename):
    test_sample=pd.read_csv(filename)
    left=test_sample['left']
    test_sample=test_sample.drop(columns=['left'])
    test_sample=test_sample.join(left)
    test_sample
    queries = test_sample.iloc[:,:-1].to_dict(orient = "records")
    for i in range(len(test_sample)):
        predict(queries[i],tree) 
    testing(test_sample,tree)
    
# predictleft(tree,'sample_test.csv')

In [15]:
#Comparing result with in-built(scikit-learn) decision tree function to check correctness of algorithm used
df=df2
df
from sklearn import tree
model = tree.DecisionTreeClassifier()
le_salary = LabelEncoder()
le_sales=LabelEncoder()
df['sales_n'] = le_salary.fit_transform(df['sales'])
df['salary_n'] = le_sales.fit_transform(df['salary'])
df=df.drop(['sales','salary'],axis='columns')

#dividing the data into training and testing data(for validation)
msk = np.random.rand(len(df2)) < 0.8
train2 = df[msk]
test2 = df[~msk]
trainy=train2['left']
trainx=train2.drop(['left'],axis='columns')

#training the model
model.fit(trainx,trainy)
testy=test2['left']
testx=test2.drop(['left'],axis='columns')

#predicting over the test data
pred=model.predict(testx)
print accuracy_score(testy,pred)
print confusion_matrix(testy,pred)
print classification_report(testy,pred)

0.7728265618173875
[[1768    0]
 [ 520    1]]
              precision    recall  f1-score   support

           0       0.77      1.00      0.87      1768
           1       1.00      0.00      0.00       521

   micro avg       0.77      0.77      0.77      2289
   macro avg       0.89      0.50      0.44      2289
weighted avg       0.82      0.77      0.67      2289

