In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from pprint import pprint

# Directory that holds the CSV splits; change it here instead of editing each
# read_csv call below.
DATA_DIR = 'data'

# `x` is the training frame and `x_test` the evaluation frame. The rest of the
# notebook reads the target from a column named 'label' in both frames.
x = pd.read_csv(DATA_DIR + '/roberta.train.csv')
x_test = pd.read_csv(DATA_DIR + '/roberta.test.csv')

# x_miss_train_data = pd.read_csv('data/data_missing/train.csv')
# x_miss_test_data = pd.read_csv('data/data_missing/test.csv')

In [2]:
def get_accuracy(pred_label, true_label):
    """Return the percentage (0-100) of predictions that equal the true label.

    Parameters
    ----------
    pred_label : sequence
        Predicted labels, one per example.
    true_label : sequence
        Ground-truth labels, aligned position by position with ``pred_label``.

    Returns
    -------
    float
        ``100 * matches / len(pred_label)``. Returns ``0.0`` when there are no
        predictions at all, rather than dividing by zero.

    Raises
    ------
    ValueError
        If the two sequences have different lengths; comparing only a prefix
        would report a misleading accuracy.
    """
    total = len(pred_label)
    if total != len(true_label):
        raise ValueError(
            "pred_label and true_label must have the same length "
            "({} != {})".format(total, len(true_label))
        )
    if total == 0:
        return 0.0

    matches = sum(1 for pred, true in zip(pred_label, true_label) if pred == true)
    return matches * 100 / total

In [3]:
def entropy(x):
    """Shannon entropy, in bits, of the 'label' column of ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing a 'label' column.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the distinct labels, where ``p`` is the
        fraction of rows carrying that label. ``0.0`` when there are no rows.
    """
    # np.unique yields the distinct labels in sorted order together with their
    # frequencies, so the column is counted once rather than once per label.
    _, counts = np.unique(x['label'], return_counts=True)
    total = len(x['label'])

    result = 0.0
    for cnt in counts:
        p = cnt / total
        result -= p * np.log2(p)

    return result

In [4]:
# Label entropy of the whole training frame, reported in bits.
train_entropy = entropy(x)
print("entropy of the train data is {}".format(train_entropy))

entropy of the train data is 0.9999870776669013


In [5]:
def info_gain(x, f):
    """Information gain obtained by partitioning ``x`` on column ``f``.

    The gain is the label entropy of ``x`` minus the size-weighted average of
    the label entropies of the partitions, one partition per distinct value
    of ``f``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing column ``f`` and a 'label' column.
    f : column name
        Feature to split on.

    Returns
    -------
    float
        Entropy reduction achieved by the split.
    """
    n_rows = x.shape[0]
    column = x[f]

    # Size-weighted entropy that remains after splitting on f.
    remaining = 0
    for value in np.unique(column):
        part = x[column == value]
        weight = part.shape[0] / n_rows
        remaining += weight * entropy(part)

    return entropy(x) - remaining

In [6]:
def get_best_feature(x):
    """Return the feature column with the largest positive information gain.

    Every column of ``x`` except the target column 'label' is a candidate.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame containing the feature columns and a 'label' column.

    Returns
    -------
    column name or None
        The best candidate, or ``None`` when no candidate has a gain strictly
        greater than zero (ties keep the earliest column).
    """
    best_feature = None
    best_info_gain = 0

    for f in x.columns:
        # Skip the target by name instead of assuming it is the last column;
        # otherwise a differently ordered frame would split on the label
        # itself and never consider its real last feature.
        if f == 'label':
            continue

        info_gain_f = info_gain(x, f)
        if best_info_gain < info_gain_f:
            best_info_gain = info_gain_f
            best_feature = f

    return best_feature

In [7]:
def decision_tree(x, depth, max_depth):
    """Recursively build a decision tree from ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Training rows for this node; must contain a 'label' column.
    depth : int
        Depth of the node being built (callers pass 0 for the root).
    max_depth : int
        Depth at which recursion stops and a leaf is emitted.

    Returns
    -------
    dict or label
        A leaf is the most frequent label of ``x`` (the first value returned
        by ``Series.mode()``). An internal node is
        ``{feature: {value: subtree, ...}}`` with one entry per distinct value
        of the chosen feature in ``x``.

    Notes
    -----
    At the root (``depth == 0``) the chosen feature and its information gain
    are printed.
    """
    # Search for a split only when this node is allowed to split, and only
    # once: the search scans every feature, so repeating it doubles the cost
    # of every internal node.
    split_f = None if depth == max_depth else get_best_feature(x)

    if split_f is None:
        return list(x['label'].mode())[0]

    if depth == 0:
        print("Best feature is", split_f, "with information gain:", info_gain(x, split_f))

    branches = {
        v: decision_tree(x[x[split_f] == v], depth + 1, max_depth)
        for v in np.unique(x[split_f])
    }
    return {split_f: branches}
    

In [8]:
def _majority_leaf(subtree):
    """Return the most frequent leaf label found anywhere beneath ``subtree``."""
    counts = {}
    stack = [subtree]
    while stack:
        node = stack.pop()
        if isinstance(node, dict):
            # A node is {feature: {value: child}}; descend into every child.
            for branches in node.values():
                stack.extend(branches.values())
        else:
            counts[node] = counts.get(node, 0) + 1
    return max(counts, key=counts.get)


def prediction(x_test, tree, default=None):
    """Predict a label for every row of ``x_test`` by walking ``tree``.

    Parameters
    ----------
    x_test : pandas.DataFrame
        Rows to classify; must contain every feature column the tree splits on.
    tree : dict or label
        Tree in the ``{feature: {value: subtree}}`` form; a bare label is a
        tree consisting of a single leaf.
    default : label, optional
        Label to emit when a row carries a feature value the tree has no
        branch for. When ``None`` (the default) the most frequent leaf label
        beneath the node where the walk got stuck is used, so the result is
        always a label the tree can actually produce.

    Returns
    -------
    list
        One predicted label per row, in row order.
    """
    predictions = []
    # id(node) -> majority leaf label, so each subtree is summarized at most
    # once per call even when many rows get stuck at the same node.
    fallback_cache = {}

    for _, row in x_test.iterrows():
        node = tree
        while isinstance(node, dict):
            f = next(iter(node))
            branches = node[f]
            row_f_val = row.loc[f]

            if row_f_val in branches:
                node = branches[row_f_val]
            elif default is not None:
                node = default
            else:
                key = id(node)
                if key not in fallback_cache:
                    fallback_cache[key] = _majority_leaf(node)
                node = fallback_cache[key]

        predictions.append(node)

    return predictions

In [9]:
# Baseline: with max_depth 0 the tree is a single leaf holding the most
# frequent training label, so every row receives that same prediction.
print("----------------------------Baseline--------------------------")
depth_of_tree = 0
tree = decision_tree(x, depth_of_tree, 0)

for heading, frame in (("Train accuracy", x), ("Test accuracy", x_test)):
    preds = prediction(frame, tree)
    print(heading)
    print(get_accuracy(preds, list(frame['label'])))

----------------------------Baseline--------------------------
Train accuracy
50.21162528216704
Test accuracy
50.36208031599737


In [None]:
# Grow the tree with a depth cap of 7 and score it on both splits.
print("----------------------------Full Trees--------------------------")
depth_of_tree = 0
tree = decision_tree(x, depth_of_tree, 7)

for heading, frame in (("Train accuracy", x), ("Test accuracy", x_test)):
    preds = prediction(frame, tree)
    print(heading)
    print(get_accuracy(preds, list(frame['label'])))


----------------------------Full Trees--------------------------


In [None]:
def tree_depth(node):
    """Number of feature splits on the longest root-to-leaf path of ``node``.

    A bare label (leaf) has depth 0; every ``{feature: {value: subtree}}``
    level adds one.
    """
    if not isinstance(node, dict):
        return 0
    return 1 + max(
        (tree_depth(child) for branches in node.values() for child in branches.values()),
        default=0,
    )


# Measure the depth of the current `tree` instead of printing a fixed number
# that goes stale whenever the data or the depth cap changes.
print("Depth of the tree: {}".format(tree_depth(tree)))

In [None]:
def testing_with_depths(d, x = x, x_test = x_test):
    """Train a tree capped at depth ``d`` and return its accuracy on ``x_test``.

    Parameters
    ----------
    d : int
        Maximum depth passed to ``decision_tree``.
    x : pandas.DataFrame, optional
        Training frame. Defaults to the global ``x`` as it was when this
        definition was executed.
    x_test : pandas.DataFrame, optional
        Evaluation frame. Defaults to the global ``x_test`` as it was when
        this definition was executed.

    Returns
    -------
    float
        Accuracy (0-100) of the tree's predictions on ``x_test``.

    Notes
    -----
    ``decision_tree`` prints the root feature and its information gain on
    every call.
    """
    tree = decision_tree(x, 0, d)
    # Only the evaluation-frame score is returned, so no pass over the
    # training rows is made here.
    test_preds = prediction(x_test, tree)
    return get_accuracy(test_preds, list(x_test['label']))

In [None]:
# NOTE(review): k_fold_cross_validation() is not defined in any cell visible in
# this part of the notebook — confirm the cell defining it runs before this
# one, otherwise a fresh-kernel "Run All" stops here with a NameError.
print("----------------------------Limiting Depth---------------------------------------------")
# best_depth is used by the following cell as the depth cap of the final tree.
best_depth = k_fold_cross_validation()
print("Best Depth after k_fold_cross_validation is {}".format(best_depth))

In [None]:
# Rebuild the tree on the whole training frame with the selected depth cap,
# then report accuracy on both splits.
depth_of_tree = 0
tree = decision_tree(x, depth_of_tree, best_depth)

for heading, frame in (
    ("Train accuracy with best depth after k_fold_cross_validation: ", x),
    ("Test accuracy with best depth after k_fold_cross_validation: ", x_test),
):
    preds = prediction(frame, tree)
    print(heading, get_accuracy(preds, list(frame['label'])))
