In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('mushrooms.csv')
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [3]:
df.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [4]:
column_to_drop = ['gill-attachment','veil-type','veil-color']
df = df.drop(column_to_drop,axis=1)
df.head(n=10)

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,c,n,k,e,e,s,s,w,w,o,p,k,s,u
1,e,x,s,y,t,a,c,b,k,e,c,s,s,w,w,o,p,n,n,g
2,e,b,s,w,t,l,c,b,n,e,c,s,s,w,w,o,p,n,n,m
3,p,x,y,w,t,p,c,n,n,e,e,s,s,w,w,o,p,k,s,u
4,e,x,s,g,f,n,w,b,k,t,e,s,s,w,w,o,e,n,a,g
5,e,x,y,y,t,a,c,b,n,e,c,s,s,w,w,o,p,k,n,g
6,e,b,s,w,t,a,c,b,g,e,c,s,s,w,w,o,p,k,n,m
7,e,b,y,w,t,l,c,b,n,e,c,s,s,w,w,o,p,n,s,m
8,p,x,y,w,t,p,c,n,p,e,e,s,s,w,w,o,p,k,v,g
9,e,b,s,y,t,a,c,b,g,e,c,s,s,w,w,o,p,k,s,m


In [5]:
input_cols = ['cap-shape','class','cap-surface','cap-color','bruises','odor','gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','ring-number','ring-type','spore-print-color','population','habitat']

In [6]:
def entropy(col):
    types = np.unique(col,return_counts = True)
    ent = 0.0
    
    N = col.shape[0]
    for ix in types[1]:
        probability = ix/N
        ent += (-1.0*probability*np.log2(probability))
        
    return ent


In [7]:
def split_data(x_data,fkey):
    types = np.unique(x_data[fkey])
    
    split_nodes = []
    Data = []
    for Class in types:
        data = pd.DataFrame([],columns=x_data.columns)
        for ix in range(x_data.shape[0]):
            if Class == (x_data[fkey].loc[ix]):
                data = data.append(x_data.loc[ix])
        data = data.reset_index(drop=True)
        split_nodes.append(Class)
        Data.append(data)
        
    return split_nodes,Data

In [8]:
def info_gain(x_data,fkey):
    
    #Split
    key,data = split_data(x_data,fkey)
    
    # Calculation
    i_gain = entropy(x_data['class'])
    
    for i in range(len(key)):
        ratio = data[i].shape[0]/x_data.shape[0]
        i_gain -= ratio*entropy(data[i]['class'])
        
    return i_gain

In [9]:
class GenericTree:
    
    def __init__(self,depth=0,max_depth = 8):
        self.node = None
        self.fkey = None
        self.max_depth = max_depth
        self.child = list()
        self.depth = depth
        self.target = None
        
    def train(self,x_train):
        
        features = ['cap-shape','cap-surface','cap-color','bruises','odor','gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','ring-number','ring-type','spore-print-color','population','habitat']
        info_gains = []
        
        for ix in features:
            info_gains.append(info_gain(x_train,ix))
        self.fkey = features[np.argmax(info_gains)]
        print("making tree features : ",self.fkey)
        
        # Split
        self.node,data = split_data(x_train,self.fkey)
        # BASE CASE
        
        # if current node is a leaf node
        if len(data)==1:
            majority_voting = np.unique(data[0]['class'],return_counts=True)
            self.target = majority_voting[0][np.argmax(majority_voting[1])]
            return
        
        # if max_depth is reached
        if self.depth>=self.max_depth:
            majority_voting = np.unique(x_train['class'],return_counts=True)
            self.target = majority_voting[0][np.argmax(majority_voting[1])]
            return
        
        
        # RECURSIVE CASE
        for i in range(len(self.node)):
            NewChild = GenericTree(depth = self.depth+1,max_depth = self.max_depth)
            NewChild.train(data[i])
            self.child.append(NewChild)
        
        # Setting Up Target Value
        if self.depth>=self.max_depth:
            majority_voting = np.unique(x_train['class'],return_counts=True)
            self.target = majority_voting[0][np.argmax(majority_voting[1])]
            return    
        
            
    def predict(self,test):
        if len(self.child)==0:
            return self.target
        if test[self.fkey] in self.node:
            return self.child[self.node.index(test[self.fkey])].predict(test)
        else:
            return self.target
        
        

In [10]:
dataset = df.sample(frac=1).reset_index(drop=True)
cols = dataset.columns
dataset = dataset.values

In [11]:
split = int(0.8*dataset.shape[0])
X_train = dataset[:split, :]
X_test = dataset[split: , :]

X_train = pd.DataFrame(X_train , columns=cols)
X_test = pd.DataFrame(X_test , columns=cols)

X_train = X_train.sample(frac=1).reset_index(drop=True)
X_test = X_test.sample(frac=1).reset_index(drop=True)
print(X_train.head())
print(X_test.head())

  class cap-shape cap-surface cap-color bruises odor gill-spacing gill-size  \
0     p         x           y         y       f    f            c         b   
1     e         x           f         w       f    n            w         b   
2     p         f           f         y       f    f            c         b   
3     p         f           f         g       f    f            c         b   
4     p         k           y         e       f    y            c         n   

  gill-color stalk-shape stalk-root stalk-surface-above-ring  \
0          p           e          b                        k   
1          h           t          e                        s   
2          p           e          b                        k   
3          p           e          b                        k   
4          b           t          ?                        k   

  stalk-surface-below-ring stalk-color-above-ring stalk-color-below-ring  \
0                        k                      n               

In [15]:
class RandomForest:
    
    def __init__(self,n_estimators=5,max_depth=5):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.branch = list()
        self.batch_size_percentage = 0.8
    
    def fit(self,x_train):
        
        for i in range(self.n_estimators):
            
            x = x_train
            features = x_train.columns
            x = x.sample(frac=1).reset_index(drop=True)
            x = x.values
            x = x[:int(x.shape[0]*self.batch_size_percentage),:]
            x = pd.DataFrame(x,columns=features)
            
            obj = GenericTree(max_depth=self.max_depth)
            obj.train(x)
            self.branch.append(obj)
    
    def score(self,X_test):
        pred = []
        for i in range(X_test.shape[0]):
            ans = []
            for j in range(len(self.branch)):
                ans.append(self.branch[j].predict(X_test.loc[i]))
            FinalAns = np.unique(ans,return_counts=True)
            pred.append(FinalAns[0][np.argmax(FinalAns[1])])
            
        return np.sum(pred==X_test['class'])/X_test.shape[0]
            
    

In [16]:
randomforest = RandomForest(max_depth = 8)

In [17]:
randomforest.fit(X_train)

making tree features :  odor
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  spore-print-color
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree

making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  habitat
making tree features :  gill-size
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-color
making tree features :  cap-shape
making tree feat

making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree features :  cap-shape
making tree fe