In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from pprint import pprint

# Train/test splits from data/data; the functions below read the class from a 'label' column.
x = pd.read_csv('data/data/train.csv')
x_test = pd.read_csv('data/data/test.csv')

# Train/test splits from data/data_missing.
# NOTE(review): not referenced by any cell visible here — confirm they are used further down.
x_miss_train_data = pd.read_csv('data/data_missing/train.csv')
x_miss_test_data = pd.read_csv('data/data_missing/test.csv')

In [2]:
def get_accuracy(pred_label, true_label):
    """Return the percentage (0-100) of predictions that equal the true labels.

    Parameters
    ----------
    pred_label : sized iterable
        Predicted labels.
    true_label : sized iterable
        Ground-truth labels, in the same order and of the same length as
        ``pred_label``.

    Returns
    -------
    float
        ``100 * matches / len(pred_label)``, or ``0.0`` when there are no
        predictions at all.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    total = len(pred_label)
    if total != len(true_label):
        raise ValueError(
            "pred_label and true_label must have the same length "
            "({} != {})".format(total, len(true_label))
        )
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # zip pairs the labels by position, so inputs that are not 0..n-1 indexable
    # (e.g. a pandas Series with a custom index) are compared correctly too.
    matches = sum(1 for pred, true in zip(pred_label, true_label) if pred == true)
    return matches * 100 / total

In [3]:
def entropy(x):
    """Shannon entropy (base 2) of the ``'label'`` column of ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing a ``'label'`` column.

    Returns
    -------
    float or int
        ``sum(-p * log2(p))`` over the distinct labels, where ``p`` is the
        fraction of rows carrying that label. Returns ``0`` when the frame has
        no rows.
    """
    labels = x['label']
    total = len(labels)
    # Tally every class once up front instead of once per distinct label.
    counts = labels.value_counts()

    result = 0
    # Iterate in np.unique's sorted order so the floating-point summation
    # order (and therefore the exact result) is deterministic.
    for label in np.unique(labels):
        p = counts.loc[label] / total
        result += -p * np.log2(p)

    return result

In [4]:
# Entropy of the 'label' column over the whole training frame.
print("entropy of the train data is {}".format(entropy(x)))

entropy of the train data is 1.1854716840497384


In [5]:
def info_gain(x, f):
    """Information gain obtained by splitting ``x`` on column ``f``.

    Computed as the label entropy of ``x`` minus the size-weighted average of
    the label entropies of the sub-frames produced by each distinct value of
    ``f``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame holding the feature column ``f`` and a ``'label'`` column.
    f : column label
        Name of the feature to split on.

    Returns
    -------
    float or int
        ``entropy(x) - sum(|x_v| / |x| * entropy(x_v))`` over the values ``v``
        of ``f``.
    """
    parent_entropy = entropy(x)
    n_rows = len(x)
    feature_values = x[f]

    weighted_child_entropy = 0
    for value in np.unique(feature_values):
        child = x[feature_values == value]
        child_entropy = entropy(child)
        # Weight each child's entropy by the share of rows that fall into it.
        weighted_child_entropy += len(child) / n_rows * child_entropy

    return parent_entropy - weighted_child_entropy

In [6]:
def get_best_feature(x):
    """Return the feature column with the highest strictly positive information gain.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame with feature columns and a ``'label'`` target column.

    Returns
    -------
    column label or None
        The best column to split on, or ``None`` when no feature yields a
        gain greater than zero (including when there are no feature columns).
        On ties the first column in frame order wins.
    """
    best_feature = None
    best_info_gain = 0

    # Candidates are selected by name rather than by position, so the result
    # does not depend on 'label' being the last column of the frame.
    candidates = [col for col in x.columns if col != 'label']

    for f in candidates:
        info_gain_f = info_gain(x, f)

        # Strict comparison: zero-gain features are never chosen.
        if best_info_gain < info_gain_f:
            best_info_gain = info_gain_f
            best_feature = f

    return best_feature

In [7]:
def decision_tree(x, depth, max_depth):
    """Recursively build an ID3-style decision tree from ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Training rows with feature columns and a ``'label'`` column.
    depth : int
        Depth of the node being built (``0`` for the root).
    max_depth : int
        Depth at which a leaf is forced.

    Returns
    -------
    label or dict
        A leaf is the most frequent label in ``x`` (first one on ties, in the
        order ``Series.mode()`` returns them). An internal node is
        ``{feature: {value: subtree, ...}}`` with one entry per distinct value
        of the chosen feature.

    Notes
    -----
    At ``depth == 0`` the chosen root feature and its information gain are
    printed.
    """
    # Test the depth limit first so the feature search is skipped for forced
    # leaves, and run that search only once per node.
    split_f = None if depth == max_depth else get_best_feature(x)

    if split_f is None:
        return list(x['label'].mode())[0]

    if depth == 0:
        print("Best feature is", split_f, "with information gain:", info_gain(x, split_f))

    branches = {}
    for v in np.unique(x[split_f]):
        branches[v] = decision_tree(x[x[split_f] == v], depth+1, max_depth)

    return {split_f: branches}
    

In [8]:
def prediction(x_test, tree, default_label='unacc'):
    """Predict a label for every row of ``x_test`` by walking ``tree``.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Rows to classify; must contain every feature column the tree tests.
    tree : label or dict
        Either a bare label (every row gets it) or a nested
        ``{feature: {value: subtree}}`` mapping.
    default_label : label, optional
        Prediction used when a row carries a feature value that has no branch
        in the tree. Defaults to ``'unacc'``.

    Returns
    -------
    list
        One predicted label per row, in row order.
    """
    predictions = []

    for _, row in x_test.iterrows():
        node = tree
        # Descend until a non-dict (leaf label) is reached.
        while isinstance(node, dict):
            # Each internal node has exactly one key: the feature it splits on.
            f = next(iter(node))
            branches = node[f]
            row_f_val = row.loc[f]

            if row_f_val in branches:
                node = branches[row_f_val]
            else:
                # Value was never seen at this node while building the tree.
                node = default_label

        predictions.append(node)

    return predictions

In [9]:
print("----------------------------Baseline--------------------------")
# With depth == max_depth == 0, decision_tree returns the most frequent training
# label instead of a dict, so every row below receives that single label.
depth_of_tree = 0
tree = decision_tree(x, depth_of_tree, 0)

# Score the baseline on the frame it was built from.
preds = prediction(x, tree)
# print(preds)
print("Train accuracy") 
print(get_accuracy(preds, list(x['label'])))

# Score the same baseline on the test frame.
preds = prediction(x_test, tree)
# print(preds)
print("Test accuracy")
print(get_accuracy(preds, list(x_test['label'])))

----------------------------Baseline--------------------------
Train accuracy
70.98408104196817
Test accuracy
66.18497109826589


In [10]:
print("----------------------------Full Trees--------------------------")
# Build the tree with a depth cap of 7.
# NOTE(review): 7 is presumably chosen to exceed the number of feature columns so
# the cap never triggers — confirm against the data.
depth_of_tree = 0
tree = decision_tree(x, depth_of_tree, 7)
# pprint(tree)


# Score the tree on the frame it was built from.
preds = prediction(x, tree)
# print(preds)
print("Train accuracy") 
print(get_accuracy(preds, list(x['label'])))

# Score the same tree on the test frame.
preds = prediction(x_test, tree)
# print(preds)
print("Test accuracy")
print(get_accuracy(preds, list(x_test['label'])))


----------------------------Full Trees--------------------------
Best feature is safety with information gain: 0.25557618418966443
Train accuracy
100.0
Test accuracy
86.41618497109826


In [11]:
# NOTE(review): the depth is a hard-coded literal, not computed from `tree` —
# confirm it still holds whenever the data or the tree-building code changes.
print("Depth of the tree: 6")

Depth of the tree: 6


In [12]:
def testing_with_depths(d, x = x, x_test = x_test):
    """Build a tree of maximum depth ``d`` on ``x`` and return its accuracy on ``x_test``.

    Parameters
    ----------
    d : int
        Depth limit passed to ``decision_tree``.
    x : pandas.DataFrame, optional
        Frame the tree is built from. The default is the notebook-level ``x``
        as it was bound when this cell was executed.
    x_test : pandas.DataFrame, optional
        Frame the tree is scored on. The default is the notebook-level
        ``x_test`` as it was bound when this cell was executed.

    Returns
    -------
    float
        Accuracy (0-100) of the tree's predictions on ``x_test``.
    """
    tree = decision_tree(x, 0, d)
    # Only the evaluation-set accuracy is returned, so predictions on the
    # frame the tree was built from are not computed.
    preds = prediction(x_test, tree)
    return get_accuracy(preds, list(x_test['label']))

In [13]:
# files = ['data/data/CVfolds/fold1.csv']
def k_fold_cross_validation(depths=range(1, 6), n_folds=5,
                            data_path='data/data/CVfolds/fold{}.csv'):
    """Pick the tree depth with the best mean k-fold cross-validation accuracy.

    For every candidate depth, each fold is held out once while a tree is
    built on the remaining folds; the mean and standard deviation of the
    held-out accuracies are printed per depth.

    Parameters
    ----------
    depths : iterable of int, optional
        Candidate depth limits. Defaults to ``1..5``.
    n_folds : int, optional
        Number of fold files, numbered ``1..n_folds``. Defaults to ``5``.
    data_path : str, optional
        Format string for the fold CSV paths; ``{}`` receives the fold number.

    Returns
    -------
    int
        The depth with the highest mean accuracy (the smallest such depth on
        ties), or ``0`` if no depth scores above zero.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 2.
    """
    if n_folds < 2:
        raise ValueError("n_folds must be at least 2, got {}".format(n_folds))

    # Read each fold once up front rather than once per (depth, fold) pair.
    folds = {j: pd.read_csv(data_path.format(j)) for j in range(1, n_folds + 1)}

    ans = 0
    curr_max = 0
    for i in depths:
        summ = []
        for j in folds:
            # Train on every fold except j, evaluate on fold j.
            temp = pd.concat([folds[t] for t in folds if t != j]).reset_index(drop = True)
            temp_test = folds[j]
            summ.append(testing_with_depths(i, temp, temp_test))

        mean_acc = sum(summ)/len(summ)
        # Keep the running best so `ans` is the arg-max over depths, not just
        # the last depth tried.
        if curr_max < mean_acc:
            curr_max = mean_acc
            ans = i
        print("-------for depth {}, cross_validation accuracy for testing data is {}, std is {}----------------".format(i,mean_acc,np.std(summ)))

    return ans

In [14]:
print("----------------------------Limiting Depth---------------------------------------------")
# Run cross-validation over the candidate depths; the per-depth scores are printed
# by k_fold_cross_validation itself and the depth it returns is kept for later use.
best_depth = k_fold_cross_validation()
print("Best Depth after k_fold_cross_validation is {}".format(best_depth))

----------------------------Limiting Depth---------------------------------------------
Best feature is safety with information gain: 0.26292587914015986
Best feature is safety with information gain: 0.23577255515791706
Best feature is safety with information gain: 0.256967525268085
Best feature is safety with information gain: 0.25989034992447035
Best feature is safety with information gain: 0.26420752987933704
-------for depth 1, cross_validation accuracy is 70.98060681889271, std is 2.7953934138603644----------------
Best feature is safety with information gain: 0.26292587914015986
Best feature is safety with information gain: 0.23577255515791706
Best feature is safety with information gain: 0.256967525268085
Best feature is safety with information gain: 0.25989034992447035
Best feature is safety with information gain: 0.26420752987933704
-------for depth 2, cross_validation accuracy is 77.20467104577207, std is 1.567126928158673----------------
Best feature is safety with informati

In [15]:
# Rebuild the tree on the full training frame using the depth selected by
# cross-validation (`best_depth`), instead of a hand-copied literal, so the
# reported numbers always match what the messages below claim.
depth_of_tree = 0
tree = decision_tree(x, depth_of_tree, best_depth)

preds = prediction(x, tree)
print("Train accuracy with best depth after k_fold_cross_validation: ",get_accuracy(preds, list(x['label'])))

preds = prediction(x_test, tree)
print("Test accuracy with best depth after k_fold_cross_validation: ",get_accuracy(preds, list(x_test['label'])))


Best feature is safety with information gain: 0.25557618418966443
Train accuracy with best depth after k_fold_cross_validation:  96.81620839363242
Test accuracy with best depth after k_fold_cross_validation:  92.48554913294798
