# Project 550-01-Dtree
## Team members
### 1) Sean Pereira - sean.pereira@student.csulb.edu
### 2) Sushmitha Pasala - sushmitha.pasala@student.csulb.edu
### 3) Vatsal Patel - vatsal.patel01@student.csulb.edu
##### This file builds two decision trees and checks their accuracy on a selected holdout set.

In [11]:
import pandas as pd
import numpy as np
import random

class DecisionTree:
    """Binary decision-tree node trained on one-hot encoded chess positions.

    Each instance is one node.  The root node (default construction) reads
    the CSV file, names its seven columns, one-hot encodes the six position
    columns and converts the textual ``Output`` label to an integer code.
    Child nodes created during training do not load any data.

    Attributes:
        df: Encoded data set (48 one-hot columns followed by ``Output``), or
            ``None`` on nodes constructed with ``csv_path=None``.
        left, right: Child nodes, or ``None`` on a leaf / untrained node.
        fkey, fval: Column name and threshold of the split tested at this node.
        depth, max_depth: Depth of this node and the depth at which splitting stops.
        target: Label (text form) of the most frequent class seen by this node.
        d: Mapping from label text to integer code.
        d1: Mapping from integer code to label text.
    """

    # CSV file that is loaded when no explicit path is given.
    DEFAULT_CSV = '550-p1-cset-krk-1.csv'

    # Names given to the seven unnamed CSV columns, in file order.
    RAW_COLUMNS = (
        'White King file (column)',
        'White King rank (row)',
        'White Rook file',
        'White Rook rank',
        'Black King file',
        'Black King rank',
        'Output',
    )

    # (raw column, prefix of the one-hot column names, True if the column
    # holds a file letter a-h / False if it holds a rank digit 1-8).
    ENCODING = (
        ('White King file (column)', 'WK', True),
        ('White King rank (row)', 'WK', False),
        ('White Rook file', 'WR', True),
        ('White Rook rank', 'WR', False),
        ('Black King file', 'BK', True),
        ('Black King rank', 'BK', False),
    )

    # Label text -> integer code.
    d = {
        'draw': 17, 'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4,
        'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
        'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
        'fifteen': 15, 'sixteen': 16,
    }

    # Integer code -> label text (inverse of ``d``).
    d1 = {
        17: 'draw', 0: 'zero', 1: 'one', 2: 'two', 3: 'three', 4: 'four',
        5: 'five', 6: 'six', 7: 'seven', 8: 'eight', 9: 'nine', 10: 'ten',
        11: 'eleven', 12: 'twelve', 13: 'thirteen', 14: 'fourteen',
        15: 'fifteen', 16: 'sixteen',
    }

    def __init__(self, depth=0, max_depth=8, csv_path=DEFAULT_CSV):
        """Create a node and, unless ``csv_path`` is None, load the data set.

        Args:
            depth: Depth of this node in the tree (0 for the root).
            max_depth: Nodes at this depth or deeper are never split.
            csv_path: Header-less CSV file to load and encode.  Pass ``None``
                to create a node without data; ``train`` does this for child
                nodes so the file is not re-read once per node.
        """
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.depth = depth
        self.max_depth = max_depth
        self.target = None
        self.df = None
        if csv_path is not None:
            raw = pd.read_csv(csv_path, header=None)
            self.df = raw.rename(columns=dict(enumerate(self.RAW_COLUMNS)))
            encoded = self.processing_data(self.df)
            self.label_output()
            self.df = pd.concat(list(encoded) + [self.df['Output']], axis=1)

    def label_output(self):
        """Replace the textual ``Output`` labels of ``self.df`` by integer codes.

        Raises:
            KeyError: If a label is not a key of ``self.d``.
        """
        codes = [self.d[label] for label in self.df['Output']]
        # Only ``Output`` is passed: every keyword given to ``assign`` becomes
        # a column, so an ``inplace`` keyword would add a column of that name.
        self.df = self.df.assign(Output=codes)

    def processing_data(self, data):
        """One-hot encode the six position columns of ``data``.

        Args:
            data: DataFrame holding the six raw position columns.

        Returns:
            Tuple of six DataFrames with eight 0/1 columns each, in the order
            white king file, white king rank, white rook file, white rook
            rank, black king file, black king rank.

        Raises:
            ValueError: If a file is not one of a-h or a rank is not 1-8.
        """
        frames = []
        for raw_name, prefix, is_file in self.ENCODING:
            suffixes = 'abcdefgh' if is_file else '12345678'
            names = [prefix + suffix for suffix in suffixes]
            frames.append(self._one_hot(data[raw_name], names, is_file))
        return tuple(frames)

    @staticmethod
    def _one_hot(values, names, is_file):
        """Return a 0/1 DataFrame with one row per value and one column per name."""
        positions = []
        for value in values:
            try:
                position = ord(value) - ord('a') if is_file else int(value) - 1
            except (TypeError, ValueError) as err:
                raise ValueError("cannot encode value %r" % (value,)) from err
            # Reject out-of-range values explicitly; a negative position
            # would otherwise silently wrap around to the last columns.
            if not 0 <= position < len(names):
                raise ValueError("value %r is outside the expected range" % (value,))
            positions.append(position)
        matrix = np.zeros((len(positions), len(names)), dtype=np.int64)
        matrix[np.arange(len(positions)), np.array(positions, dtype=np.intp)] = 1
        return pd.DataFrame(matrix, columns=names)

    def entropy(self, col):
        """Return the Shannon entropy (base 2) of the values in ``col``.

        An empty ``col`` has entropy 0.0.
        """
        counts = np.unique(np.asarray(col), return_counts=True)[1]
        if counts.size == 0:
            return 0.0
        probabilities = counts / counts.sum()
        return float(-(probabilities * np.log2(probabilities)).sum())

    def information_gain(self, x_data, fkey, fval):
        """Return the information gain of splitting ``x_data`` on ``fkey >= fval``.

        Returns ``-inf`` when the split leaves one side empty (which includes
        the case of an empty ``x_data``), so such splits are never preferred.
        """
        right, left = self.divide_data(x_data, fkey, fval)
        if left.shape[0] == 0 or right.shape[0] == 0:
            return float("-inf")
        total = float(x_data.shape[0])
        weighted = (left.shape[0] / total * self.entropy(left.Output)
                    + right.shape[0] / total * self.entropy(right.Output))
        return self.entropy(x_data.Output) - weighted

    def divide_data(self, x_data, fkey, fval):
        """Split ``x_data`` on column ``fkey`` at threshold ``fval``.

        Args:
            x_data: DataFrame to split; any index is accepted.
            fkey: Name of the feature column to test.
            fval: Threshold; rows with ``x_data[fkey] >= fval`` go right.

        Returns:
            ``(x_right, x_left)``; both keep the row labels of ``x_data``.
        """
        goes_right = x_data[fkey] >= fval
        # Rows whose comparison is False (including NaN) go left.
        return x_data[goes_right], x_data[~goes_right]

    def frequency_of_Output(self, x_train):
        """Return the most frequent value in ``x_train``.

        Ties are resolved in favour of the value encountered first.  The
        per-value counts are kept in ``self.dict``.

        Raises:
            ValueError: If ``x_train`` is empty.
        """
        self.dict = {}
        for value in x_train:
            self.dict[value] = self.dict.get(value, 0) + 1
        return max(self.dict, key=self.dict.get)

    def train(self, x_train):
        """Grow the subtree rooted at this node from ``x_train``.

        Args:
            x_train: DataFrame of feature columns plus an ``Output`` column of
                integer class codes (keys of ``self.d1``).

        Raises:
            ValueError: If ``x_train`` has no rows.
        """
        if x_train.shape[0] == 0:
            raise ValueError("cannot train a DecisionTree on an empty data set")

        # Drop whatever an earlier call to train() left on this node, so a
        # retrained node that turns into a leaf has no stale children.
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.target = self.d1[self.frequency_of_Output(x_train.Output)]

        # A node with a single class cannot be improved: every descendant
        # would predict the same label, so stop here.
        if len(self.dict) == 1:
            return

        features = [name for name in x_train.columns if name != 'Output']
        if not features:
            return

        # All features are 0/1 indicators, so 0.5 separates the two values.
        threshold = 0.5
        gains = [self.information_gain(x_train, name, threshold) for name in features]
        best = int(np.argmax(gains))
        self.fkey = features[best]
        self.fval = threshold

        # -inf means even the best feature leaves one side empty.
        if gains[best] == float("-inf") or self.depth >= self.max_depth:
            return

        data_right, data_left = self.divide_data(x_train, self.fkey, self.fval)
        # Children are created without data: only the root needs the CSV.
        self.left = DecisionTree(self.depth + 1, self.max_depth, csv_path=None)
        self.left.train(data_left.reset_index(drop=True))
        self.right = DecisionTree(self.depth + 1, self.max_depth, csv_path=None)
        self.right.train(data_right.reset_index(drop=True))

    def predict(self, test):
        """Return the predicted label text for one sample.

        Args:
            test: Mapping or Series giving the value of each feature column.

        Raises:
            ValueError: If the tree has not been trained.
        """
        if self.left is None or self.right is None:
            if self.target is None:
                raise ValueError("predict() called on an untrained DecisionTree")
            return self.target
        # Same rule as divide_data: values >= threshold go right, all else left.
        branch = self.right if test[self.fkey] >= self.fval else self.left
        return branch.predict(test)

    def dataframe(self):
        """Return the encoded data set held by this node (``None`` if none was loaded)."""
        return self.df

    

        
# Create the decision tree object; its constructor loads and encodes the CSV data
d=DecisionTree()



# Shuffle the rows with a fixed seed and split them into training, validation
# and test sets: 60, 20, 20.  Plain positional slicing is used because np.split
# on a DataFrame goes through DataFrame.swapaxes, which recent pandas deprecates.
_shuffled = d.dataframe().sample(frac=1, random_state=42)
_train_end = int(.6*len(_shuffled))
_validate_end = int(.8*len(_shuffled))
train_data = _shuffled.iloc[:_train_end]
validate_data = _shuffled.iloc[_train_end:_validate_end]
test_data = _shuffled.iloc[_validate_end:]

# Reset the row index to 0..n-1
train_data=train_data.reset_index(drop=True)
test_data=test_data.reset_index(drop=True)

# Build the tree from the training set
d.train(train_data)

In [12]:
def bagging_replacement(t_set, holdout_set,d):
    """Draw a bootstrap training sample that favours misclassified holdout rows.

    The sampling pool is made of the index labels of ``t_set`` and
    ``holdout_set`` plus two extra copies of every holdout label that ``d``
    misclassifies, so those rows are three times as likely to be drawn.
    ``len(t_set)`` labels are then drawn from the pool with replacement using
    the global ``random`` generator.

    Args:
        t_set: Training DataFrame; only its index and length are used.
        holdout_set: Holdout DataFrame, evaluated with ``accuracy(d, holdout_set)``.
        d: Trained tree passed through to ``accuracy``.

    Returns:
        ``(final_t_set, final_holdout_set)``: the list of drawn labels (may
        contain repeats) and the list of distinct pool labels that were never
        drawn, in ascending pool order.
    """
    pool = sorted(list(t_set.index) + list(holdout_set.index))

    # Each misclassified holdout label gets two additional entries in the pool.
    for label in accuracy(d,holdout_set)[1]:
        pool.extend((label, label))

    # Sample over the WHOLE pool.  Limiting the random position to
    # len(t_set) would only ever reach the smallest labels of the sorted pool
    # and would never touch the extra misclassified entries appended above.
    final_t_set = [random.choice(pool) for _ in range(len(t_set))]

    # Distinct pool labels, first occurrence first, minus everything drawn.
    drawn = set(final_t_set)
    final_holdout_set = [label for label in dict.fromkeys(pool) if label not in drawn]

    return final_t_set, final_holdout_set

def accuracy(d,test_data):
    """Measure how often ``d`` predicts the ``Output`` label of ``test_data``.

    Args:
        d: Object with a ``predict(row)`` method returning a label and a
            ``d1`` mapping from the integer codes stored in ``Output`` to
            those labels.
        test_data: DataFrame with the feature columns and an ``Output`` column.

    Returns:
        ``(fraction_correct, misclassified)`` where ``misclassified`` lists the
        original index labels of the rows predicted incorrectly.  An empty
        ``test_data`` yields ``(0.0, [])`` instead of dividing by zero.
    """
    # Remember the caller's labels; rows are addressed by position below.
    original_labels=test_data.index
    rows=test_data.reset_index(drop=True)

    if rows.shape[0]==0:
        return 0.0,[]

    hits=0
    misclassified=[]
    for position,code in enumerate(rows['Output']):
        if d.predict(rows.iloc[position])==d.d1[code]:
            hits+=1
        else:
            misclassified.append(original_labels[position])

    return hits/len(rows),misclassified


#print("Accuracy of 1st DTree:",accuracy(d,test_data)[0]*100,"%")
# Recreate the same seeded 60/20/20 split, this time WITHOUT resetting the index:
# bagging_replacement works on the row labels of d.dataframe().  Positional
# slicing replaces np.split, which relies on the deprecated DataFrame.swapaxes.
_shuffled = d.dataframe().sample(frac=1, random_state=42)
_train_end = int(.6*len(_shuffled))
_validate_end = int(.8*len(_shuffled))
train_data = _shuffled.iloc[:_train_end]
validate_data = _shuffled.iloc[_train_end:_validate_end]
test_data = _shuffled.iloc[_validate_end:]
Training_Set, Holdout_Set = bagging_replacement(train_data, test_data,d)

In [13]:
def convert_indices_to_DataFrame(Training_Set,d):
    """Expand a list of row labels into the corresponding rows of ``d.dataframe()``.

    Args:
        Training_Set: List of index labels; a label listed k times produces k
            copies of its row.  The list is not modified.
        d: Object whose ``dataframe()`` method returns the source DataFrame.

    Returns:
        List of rows (each a list of cell values) in the order the labels
        appear in the DataFrame's index, suitable as ``data`` for
        ``pd.DataFrame``.  Labels missing from the DataFrame are ignored.
    """
    frame=d.dataframe()

    # Count every label once instead of scanning the list for each row.
    repeats={}
    for label in Training_Set:
        repeats[label]=repeats.get(label,0)+1

    # Convert to an array once; rows are then picked by position, which is
    # correct for any index, not only for labels that equal positions.
    matrix=frame.values
    rows=[]
    for position,label in enumerate(frame.index):
        for _ in range(repeats.get(label,0)):
            rows.append(list(matrix[position]))
    return rows


#d1=DecisionTree()
# Turn the sampled label lists into DataFrames that use the same columns as the
# tree's data set, then train the tree again on the resampled training set.
_training_rows = convert_indices_to_DataFrame(Training_Set,d)
Training_Set_d2 = pd.DataFrame(_training_rows, columns=d.dataframe().columns)
_holdout_rows = convert_indices_to_DataFrame(Holdout_Set,d)
HoldOut_Set_d2 = pd.DataFrame(_holdout_rows, columns=d.dataframe().columns)
d.train(Training_Set_d2)

In [14]:
#print("Accuracy of 2nd DTree:",accuracy(d,HoldOut_Set_d2 )[0]*100,"%")