In [1]:
import pickle
import time
from collections import deque
from math import log2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from xclib.data import data_utils

In [2]:
# Read the train / test / validation splits.
# Features come from a sparse text file; labels are stored one per line.
DATA_DIR = 'ass3_parta_data'


def load_split(split_name):
    """
    Load one data split from DATA_DIR.

    Parameters
    ----------
    split_name : str
        File-name prefix of the split: 'train', 'test' or 'valid'.

    Returns
    -------
    (x, y) : tuple of numpy arrays
        x is the densified feature matrix cast to int.
        y is the label file read as a single-column array of shape (n, 1).
    """
    sparse_x = data_utils.read_sparse_file(
        '{}/{}_x.txt'.format(DATA_DIR, split_name), force_header=True)
    x = np.array(sparse_x.toarray(), dtype=int)

    # The label file has one value per line, so the default separator already
    # yields one column. sep="\n" is avoided because newer pandas releases
    # raise ValueError for it.
    # NOTE(review): assumes label lines contain no commas - confirm.
    y = pd.read_csv('{}/{}_y.txt'.format(DATA_DIR, split_name), header=None).to_numpy()
    return x, y


train_x, train_y = load_split('train')
test_x, test_y = load_split('test')
val_x, val_y = load_split('valid')



In [3]:
def entropy(data):
    """
    Binary entropy (in bits) of the labels held in the last column of data.

    Only labels equal to 1 and 0 are counted. A set containing a single
    class (or no rows at all) is pure and has entropy 0; an evenly mixed
    set has entropy 1.
    """
    labels = data[:, -1]
    n_rows = data.shape[0]
    n_pos = np.sum(labels == 1)
    n_neg = np.sum(labels == 0)

    # Both classes must be present for there to be any uncertainty.
    if n_pos != 0 and n_neg != 0:
        p_pos = n_pos/n_rows
        p_neg = n_neg/n_rows
        pos_term = -p_pos * log2(p_pos)
        neg_term = -p_neg * log2(p_neg)
        return pos_term + neg_term

    return 0

In [4]:
def info_gain(data,true_data_index,false_data_index,H):
    """
    Information gain obtained by splitting data into two branches.

    data             : rows to split (label in the last column)
    true_data_index  : index selecting the rows of the "true" branch
    false_data_index : index selecting the rows of the "false" branch
    H                : entropy of data before the split

    Returns H minus the size-weighted entropy of the two branches,
    or 0 when either branch is empty.
    """
    n_total = data.shape[0]
    left_rows = data[true_data_index]
    right_rows = data[false_data_index]

    left_weight = left_rows.shape[0]/n_total
    right_weight = right_rows.shape[0]/n_total

    # A split that sends every row to one side separates nothing.
    if left_weight == 0 or right_weight == 0:
        return 0

    weighted_child_entropy = (left_weight * entropy(left_rows)) + (right_weight * entropy(right_rows))
    return H - weighted_child_entropy

In [5]:
def best_split(data,H):
    """
    Find the feature whose median split gives the highest information gain.

    Every column except the last one (the label) is a candidate. For a
    candidate column, rows with value <= the column median form the "true"
    branch and rows with value > the median form the "false" branch.

    Parameters
    ----------
    data : 2-D numpy array with the label in the last column.
    H    : entropy of data, forwarded to info_gain.

    Returns
    -------
    (best_attr, split_median, trueIndex, falseIndex, best_info_gain)
        best_attr      : column index of the chosen feature
        split_median   : median of that column
        trueIndex      : np.where result for the rows with value <= median
        falseIndex     : np.where result for the rows with value > median
        best_info_gain : information gain of the chosen split
        Ties keep the first (lowest-index) column. When there is nothing to
        split on (no rows or no feature columns) the sentinel
        (-1, None, None, None, -1) is returned.
    """
    best_info_gain = -1
    best_attr = -1
    # Initialised up front so the return statement is valid even when the
    # loop below never runs.
    split_median = None
    trueIndex = None
    falseIndex = None

    # consider all attributes except the last one i.e label *y*
    features = data[:, :-1]
    if features.shape[0] == 0 or features.shape[1] == 0:
        return best_attr, split_median, trueIndex, falseIndex, best_info_gain

    medianList = np.median(features, axis=0)

    for colIndex, median in enumerate(medianList):
        column = features[:, colIndex]
        true_data_index = np.where(column <= median)
        false_data_index = np.where(column > median)

        IG = info_gain(data, true_data_index, false_data_index, H)

        if IG > best_info_gain:
            best_info_gain = IG
            best_attr = colIndex
            split_median = median
            trueIndex = true_data_index
            falseIndex = false_data_index

    return best_attr, split_median, trueIndex, falseIndex, best_info_gain

In [6]:
class Decision_nodes:
    """
    One node of the binary decision tree.

    Class attributes
    ----------------
    total_decision_nodes : number of nodes constructed since the last
                           reset_node_count() call.
    depth                : largest depth passed to the constructor so far.
                           Each instance also stores its own depth in an
                           instance attribute of the same name.

    Instance attributes
    -------------------
    depth        : depth given to the constructor.
    left_child   : child node, None until attached.
    right_chid   : child node, None until attached. The spelling is kept
                   as is because other code in this file reads the
                   attribute under this name.
    column       : split column, None until set by the caller.
    split_median : split threshold, None until set by the caller.
    label        : majority label, None until assignLabel is called.
    """

    total_decision_nodes = 0
    depth = 0

    def __init__(self,depth):
        self.depth = depth
        self.left_child = None
        self.right_chid = None
        self.column = None
        self.split_median = None
        self.label = None

        #----------Class Variable ---------------
        # counts number of nodes created each time a new node is created
        Decision_nodes.total_decision_nodes += 1

        # remember the deepest level seen so far
        if depth > Decision_nodes.depth:
            Decision_nodes.depth = depth
        #----------------------------------------

    def assignLabel(self,data):
        """
        Set self.label to the majority label found in the last column of data.

        Ties (including an empty data array) resolve to 0.
        """
        labels = data[:, -1]
        # Vectorised counts; the builtin sum() would walk the array element
        # by element in Python.
        zero_count = np.count_nonzero(labels == 0)
        one_count = np.count_nonzero(labels == 1)
        self.label = 0 if zero_count >= one_count else 1

    @classmethod
    def get_node_count(cls):
        """Return the number of nodes created since the last reset."""
        return cls.total_decision_nodes

    @classmethod
    def reset_node_count(cls):
        """Reset the node counter to 0 (the class-level depth is left untouched)."""
        cls.total_decision_nodes = 0

In [7]:
class Decision_tree:
    """
    Binary decision tree grown greedily on median splits, recording accuracy
    checkpoints while it grows.

    Class attributes
    ----------------
    train_accuracy, test_accuracy, val_accuracy : list of (node_count, accuracy)
        Checkpoints appended by Build_tree. The lists live on the class, so
        they are shared by all instances and keep growing until cleared.
    CHECKPOINT_INTERVAL : int
        Accuracy is measured when the node count is 1, or leaves remainder 1
        when divided by this value.
    """

    train_accuracy = []
    test_accuracy = []
    val_accuracy = []

    CHECKPOINT_INTERVAL = 88

    def __init__(self,node,train_y,test_y,val_y,Train_data,Test_data,Val_data):
        """
        node                            : root node of the tree.
        train_y / test_y / val_y        : label arrays; shape[0] sizes the
                                          prediction buffers.
        Train_data / Test_data / Val_data : rows to classify in find_acc. The
                                          last column of every row is used as
                                          that row's index into the matching
                                          prediction buffer.
        """
        self.root = node

        # Prediction buffers; -1 marks a slot that has not been written yet.
        self.train_y_pred = [-1]*train_y.shape[0]
        self.test_y_pred = [-1]*test_y.shape[0]
        self.val_y_pred = [-1]*val_y.shape[0]

        self.train_y = train_y
        self.test_y = test_y
        self.val_y = val_y

        self.Train_data = Train_data
        self.Test_data = Test_data
        self.Val_data = Val_data

    def Build_tree(self,node, data, last_node,
                   Train_data, Test_data,
                   Val_data):
        """
        Recursively grow the subtree rooted at node from the rows in data.

        node      : node to grow; it always receives the majority label of data.
        data      : rows reaching this node (label in the last column).
        last_node : node at which the most recent accuracy checkpoint was
                    taken; only its depth is printed.
        Train_data, Test_data, Val_data :
                    not read by the algorithm (checkpoints always evaluate the
                    full datasets stored on the instance). They stay in the
                    signature for backward compatibility and are forwarded
                    unchanged to the recursive calls.

        The node stays a leaf when data is pure, when no split is found, when
        the best split has no positive information gain, or when one side of
        the split is empty.
        """
        # Every node is labelled, internal ones too: classify() falls back on
        # an internal node's label while its right child is not attached.
        node.assignLabel(data)

        H = entropy(data)
        if H == 0:
            # Pure node: no split can help, so skip the split search.
            return

        best_attr, split_median, trueIndex, falseIndex, best_info_gain = best_split(data,H)
        if best_attr == -1 or best_info_gain <= 0:
            return

        true_branch = data[trueIndex]
        false_branch = data[falseIndex]
        if true_branch.shape[0] == 0 or false_branch.shape[0] == 0:
            return

        node.column = best_attr
        node.split_median = split_median

        # Periodic accuracy checkpoint, keyed on the global node count.
        nc = Decision_nodes.get_node_count()
        if nc == 1 or nc % self.CHECKPOINT_INTERVAL == 1:
            (train_acc, test_acc, val_acc) = self.find_acc()
            print("last node depth ",last_node.depth)
            print("node count ",nc)
            print("train_acc ",train_acc,"test_acc ",test_acc,"val_acc ",val_acc)
            Decision_tree.train_accuracy.append((nc,train_acc))
            Decision_tree.test_accuracy.append((nc,test_acc))
            Decision_tree.val_accuracy.append((nc,val_acc))
            last_node = node

        left_node = Decision_nodes(node.depth+1)
        right_node = Decision_nodes(node.depth+1)

        # The left subtree is attached and fully grown before the right child
        # is attached; classify() handles the intermediate state.
        node.left_child = left_node
        self.Build_tree(left_node, true_branch, last_node,
                        Train_data, Test_data, Val_data)
        node.right_chid = right_node
        self.Build_tree(right_node, false_branch, last_node,
                        Train_data, Test_data, Val_data)

        return

    def classify(self,row,node):
        """
        Return the label predicted for row by the subtree rooted at node.

        Descends left when row[column] <= split_median, otherwise right. If
        the right child is missing, the current node's label is returned.
        """
        while True:
            # Base case: we've reached a leaf
            if node.left_child is None and node.right_chid is None:
                return node.label

            if row[node.column] <= node.split_median:
                node = node.left_child
            elif node.right_chid is None:
                # Right subtree not attached: use this node's own label.
                return node.label
            else:
                node = node.right_chid

    def _split_accuracy(self, rows, y_true, y_pred):
        """Classify rows into y_pred and return the fraction of y_pred matching y_true."""
        for row in rows:
            # row[-1] is the row's position in the prediction buffer.
            y_pred[row[-1]] = self.classify(row, self.root)

        match = 0
        for i, pred in enumerate(y_pred):
            if pred == -1:
                # This slot was never written by the loop above.
                print("something is wrong")
            if pred == y_true[i]:
                match += 1
        return match/len(y_pred)

    def find_acc(self):
        """
        Classify the stored train, test and validation rows with the current
        tree and return (train_acc, test_acc, val_acc) as fractions.

        Predictions are also stored in self.train_y_pred, self.test_y_pred
        and self.val_y_pred.
        """
        train_acc = self._split_accuracy(self.Train_data, self.train_y, self.train_y_pred)
        test_acc = self._split_accuracy(self.Test_data, self.test_y, self.test_y_pred)
        val_acc = self._split_accuracy(self.Val_data, self.val_y, self.val_y_pred)
        return train_acc, test_acc, val_acc
            
        
    

          

In [8]:
# Glue the label column onto each feature matrix: columns are [features..., label].
Train_data = np.hstack((train_x, train_y.reshape((-1, 1))))
Test_data = np.hstack((test_x, test_y.reshape((-1, 1))))
Val_data = np.hstack((val_x, val_y.reshape((-1, 1))))

# Snapshot taken before the index column is added, so its last column is the label.
data = Train_data.copy()

# index col: append each row's position, giving [features..., label, row number].
Train_data = np.column_stack((Train_data, np.arange(Train_data.shape[0])))
Test_data = np.column_stack((Test_data, np.arange(Test_data.shape[0])))
Val_data = np.column_stack((Val_data, np.arange(Val_data.shape[0])))

In [9]:
# Reset the class-level bookkeeping first so this cell can be re-run safely:
# otherwise every run keeps incrementing the node counter (which drives the
# accuracy checkpoints in Build_tree) and keeps old checkpoint entries.
Decision_nodes.reset_node_count()
Decision_nodes.depth = 0
for checkpoints in (Decision_tree.train_accuracy,
                    Decision_tree.test_accuracy,
                    Decision_tree.val_accuracy):
    checkpoints.clear()

s= time.time()  # start time, read by the commented-out timing print below
root = Decision_nodes(0)
tree = Decision_tree(root,train_y,test_y,val_y,Train_data,Test_data,Val_data)

# Training is disabled here; uncomment to grow the tree from scratch.
# tree.Build_tree(root, data, root,
#                    Train_data, Test_data,
#                    Val_data)

# print(time.time()-s)

In [10]:
def predict(X,node):
    """
    Classify every row of X with the module-level tree, starting the
    descent at node, and return the predicted labels as a list.
    """
    return [tree.classify(sample, node) for sample in X]

def score(prediction,y):
    """
    Print the percentage of positions at which y and prediction agree.

    Nothing is returned; the percentage is only printed.
    """
    hits = sum(1 for idx in range(len(prediction)) if y[idx] == prediction[idx])
    print((hits*100)/len(prediction))
        

In [11]:
# prediction = predict(train_x,tree.root)
# score(prediction,train_y)

In [12]:
# Load a previously saved tree instead of rebuilding it; this replaces the
# `tree` object created earlier in the notebook.
# SECURITY: pickle.load can execute arbitrary code while deserialising, so
# only load "treeModel" if it comes from a trusted source.
# NOTE(review): presumably the file holds a Decision_tree saved by an earlier
# run of this notebook - confirm how it was produced.
with open("treeModel",'rb') as f:
    tree=pickle.load(f)

In [13]:
# %matplotlib qt
# plt.plot([i[0] for i in Decision_tree.train_accuracy] ,[i[1] for i in Decision_tree.train_accuracy],label="train")
# plt.plot([i[0] for i in Decision_tree.test_accuracy] ,[i[1] for i in Decision_tree.test_accuracy],label="test")
# plt.plot([i[0] for i in Decision_tree.val_accuracy] ,[i[1] for i in Decision_tree.val_accuracy],label="val")
# plt.xlabel("Number of nodes")
# plt.ylabel("Accuracies")
# plt.title("Train, Test and Val Accuracy while growing Decision Tree")
# plt.legend()
# plt.show()

In [14]:
# Breadth-first walk over the tree: count the nodes reached and tally how
# many of them sit at each depth.
queue = deque([tree.root])  # popleft() is O(1); list.pop(0) shifts the whole list
nodeDepth = dict()          # depth -> number of nodes visited at that depth
nodesInDepth =[]            # not used in this cell; kept in case other code reads it
count = 0
while queue:
    count+=1
    node = queue.popleft()

    nodeDepth[node.depth] = nodeDepth.get(node.depth, 0) + 1

    # Children are followed only when both are present.
    if node.left_child is not None and node.right_chid is not None:
        queue.append(node.left_child)
        queue.append(node.right_chid)

count

19977

In [15]:
def classify(row,node,depth):
    """
    Predict the label of row while treating the tree as if it were cut off
    at the given depth.

    The descent starts at node and stops at the first node whose depth
    equals `depth`, at a leaf, or at a node whose right child is missing
    when the row would go right; that node's label is returned.
    """
    current = node
    while True:
        reached_cutoff = current.depth == depth
        if reached_cutoff or (current.left_child is None and current.right_chid is None):
            return current.label

        goes_left = row[current.column] <= current.split_median
        if goes_left:
            current = current.left_child
            continue

        if current.right_chid is None:
            return current.label
        current = current.right_chid

def depthWiseAcc(x,y,max_depth=55):
    """
    Accuracy of the module-level tree when it is cut off at each depth.

    Parameters
    ----------
    x         : rows to classify.
    y         : true labels, indexed in the same order as x.
    max_depth : number of depth cutoffs to evaluate (0 .. max_depth-1).
                Defaults to 55, the value previously hard-coded here.

    Returns
    -------
    list of float : acc[d] is the fraction of rows classified correctly
                    when the descent stops at depth d.
    """
    root = tree.root
    acc = []
    for depth in range(max_depth):
        prediction = [classify(row, root, depth) for row in x]

        # A separate index name keeps the depth variable from being clobbered.
        match = 0
        for idx in range(len(prediction)):
            if y[idx] == prediction[idx]:
                match += 1
        acc.append(match/len(prediction))

    return acc

In [16]:
# Training-set accuracy at every depth cutoff (one value per depth).
tr=depthWiseAcc(train_x,train_y)

In [17]:
# Test-set accuracy at every depth cutoff (one value per depth).
te=depthWiseAcc(test_x,test_y)

In [18]:
# Validation-set accuracy at every depth cutoff (one value per depth).
vl = depthWiseAcc(val_x,val_y)

In [19]:
# Node counts per depth, in the order the depths were first recorded in nodeDepth.
nodes = list(nodeDepth.values())

In [20]:
# Sanity check: number of depth cutoffs evaluated for the validation set.
len(vl)

55

In [21]:
%matplotlib qt
plt.plot(np.arange(55),tr,label="train")
plt.plot(np.arange(55),te,label="test")
plt.plot(np.arange(55),vl,label="val")
plt.xlabel("Levels in Decision Tree")
plt.ylabel("Accuracies")
plt.title("Train, Test and Val Accuracy while growing Decision Tree Level Wise")
plt.legend()
plt.show()