In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#benign = 1 and malignant = 0
dataset = pd.read_csv('Breast_cancer_data[1].csv')
dataset = dataset.sort_values(by='mean_area')
dataset.head(10)

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
101,6.981,13.43,43.79,143.5,0.117,1
539,7.691,25.44,48.34,170.4,0.08668,1
538,7.729,25.49,47.98,178.8,0.08098,1
568,7.76,24.54,47.92,181.0,0.05263,1
46,8.196,16.84,51.71,201.9,0.086,1
151,8.219,20.7,53.27,203.9,0.09405,1
314,8.597,18.6,54.09,221.2,0.1074,1
525,8.571,13.1,54.53,221.3,0.1036,1
61,8.598,20.98,54.66,221.8,0.1243,1
59,8.618,11.79,54.34,224.5,0.09752,1


In [3]:
dataset.isnull().sum()

mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64

In [4]:
# stores all the column labels in the list except for last column
# which is our target variable.
m= dataset.shape[1]
header = dataset.columns
header = list(header)
print(header)
dataset.shape

['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness', 'diagnosis']


(569, 6)

In [5]:
# dividing the dataset into 75% for training and 25% for testing 
dataset=np.array(dataset)
np.random.shuffle(dataset)
idx = 3*dataset.shape[0]//4
train = dataset[:idx,:]
test = dataset[idx:,:]
print("last 5 rows of training dataset:\n\n",train[-6:-1,:])
print("\nfirst 5 rows of testing dataset:\n\n",test[:5,:])

last 5 rows of training dataset:

 [[1.907e+01 2.481e+01 1.283e+02 1.104e+03 9.081e-02 0.000e+00]
 [1.311e+01 1.556e+01 8.721e+01 5.302e+02 1.398e-01 0.000e+00]
 [1.348e+01 2.082e+01 8.840e+01 5.592e+02 1.016e-01 0.000e+00]
 [1.450e+01 1.089e+01 9.428e+01 6.407e+02 1.101e-01 1.000e+00]
 [1.270e+01 1.217e+01 8.088e+01 4.950e+02 8.785e-02 1.000e+00]]

first 5 rows of testing dataset:

 [[1.980e+01 2.156e+01 1.297e+02 1.230e+03 9.383e-02 0.000e+00]
 [1.377e+01 2.229e+01 9.063e+01 5.889e+02 1.200e-01 0.000e+00]
 [1.959e+01 2.500e+01 1.277e+02 1.191e+03 1.032e-01 0.000e+00]
 [2.092e+01 2.509e+01 1.430e+02 1.347e+03 1.099e-01 0.000e+00]
 [1.635e+01 2.329e+01 1.090e+02 8.404e+02 9.742e-02 0.000e+00]]


In [6]:
# train = np.array(train)
# test = np.array(test)

## Decision Tree Code

In [7]:
def unique_values(dataset,col_index):
    unique = np.unique(dataset[:,col_index])
    return(unique)
    
#x is a portion of the dataset for which we find entropy
def count_label(x):
    cnt = {}
    for row in x:
        if row[-1] not in cnt:
            cnt[row[-1]]=0
        cnt[row[-1]]+=1
    return cnt


def entropy(x):
    cnt = count_label(x)
    total = 0
    for val in cnt.values():
        total += val
        
    ent = 0 #entropy value
    for val in cnt.values():
        ent += (val/total)*np.log(val/total)
    
    return -1*ent


# dataset_ent is the entropy of the entire dataset 
def info_gain(left,right,parent_ent):
    n = len(left) + len(right)
    
    left_ent=entropy(left)
    right_ent=entropy(right)
    
    weighted_avg = (len(left)/n)*left_ent + (len(right)/n)*right_ent
    
    return parent_ent-weighted_avg

    
    
class Node:
    def __init__(self,question=None,split_feature=None,left=None,right=None,label=None):
        self.question = question
        self.split_feature = split_feature
        self.left = left
        self.right = right
        self.label = label
    
    def is_leaf(self):
        if self.label is not None:
            return True
    
        
        
def partition(rows,question,column):
    left, right = [],[]
    for row in rows:
        if row[column]>=question:
            right.append(row)
        else:
            left.append(row)
    return np.array(left), np.array(right)


        
def best_split(rows,column):
    max_gain = 0 
    ques, attr = None,None
    #left,right=[],[]
    parent_entropy = entropy(rows)
    for col in range(rows.shape[1]-1):
        if col in column:
            continue
        for row in range(rows.shape[0]):
            l,r = partition(rows,rows[row,col],col)
            ig = info_gain(l,r,parent_entropy)
            if max_gain < ig:
                max_gain = ig
                ques = rows[row,col]
                attr = col
    return ques,attr


def find_common_label(column):
    types = np.unique(column)
    group = {t:0 for t in types}
    for c in column:
        group[c]+=1
        
    most_common=None
    cnt=0
    for t in types:
        if cnt<group[t]:
            most_common = t
            cnt = group[t]
    return most_common


        
def define_root(rows,max_depth):
    return build_tree(rows,max_depth,column=[])


    
def build_tree(rows,max_depth,column):
    ques, attr = best_split(rows,column)
    groups = None
    if attr is not None:
        column.append(attr)
        groups = np.unique(rows[:,-1])
        
    if max_depth==0 or attr is None or len(groups) == 1:
        common_label = find_common_label(rows[:,-1])
        #print(common_label)
        return Node(label = common_label)
    
    left_part,right_part = partition(rows,ques,attr)
    left = build_tree(left_part,max_depth-1,column)
    right = build_tree(right_part,max_depth-1,column)
    
    return Node(ques,attr,left,right)
        
    
    
    
def traverse_tree(test_row,node):
    if node.is_leaf():
        return node.label
    
    #print(node.split_feature)
    
    if test_row[node.split_feature]<node.question:
        #print('left')
        return traverse_tree(test_row,node.left)
    else:
        #print('right')
        return traverse_tree(test_row,node.right)
    

def predict(test_set,root):
    predictions=[]
    for test_row in test_set:
        predictions.append(traverse_tree(test_row,root))
    return np.array(predictions)
    
    
    
def calAccuracy(pred, y):
    correct=0
    for i in range(pred.shape[0]):
        if pred[i]==y[i]:
            correct+=1
    return correct*100/pred.shape[0]
            

## Random Forest Code

In [16]:
def get_samples(train):
    n = train.shape[0]
    sample=[]
    indices = np.random.choice(n,size=n//3,replace=True)
    #print(indices,end="\n\n")
    for i in indices:
        sample.append(train[i,:])
    return np.array(sample)


def random_forest_classifier(n_trees, train , test , max_depth):
    trees,predictions_per_tree = {},{}
    for i in range(n_trees):
        trees[i]=[]
        predictions_per_tree[i]=[]
        sample = get_samples(train)
        tree = define_root(sample,max_depth)
        trees[i].append(tree)
        predictions_per_tree[i]=(predict(test,tree)) # key is the tree number
    return predictions_per_tree
            
def random_forest_predict(tree_pred,y):
    values = np.unique(y)
    final_pred = []
    for i in range(tree_pred[0].shape[0]):
        labels = {val:0 for val in values}
        for key in tree_pred.keys():
            labels[tree_pred[key][i]]+=1
        final_pred.append(max(labels,key = lambda x:labels[x]))
    return final_pred
        

In [17]:
tree_pred = random_forest_classifier(5,train,test,100)
final_pred = random_forest_predict(tree_pred,test[:,-1])
final_pred = np.array(final_pred)

In [18]:
accuracy = calAccuracy(final_pred,test[:,-1])
print(f"model accuracy on test data: {accuracy}%")

model accuracy on test data: 93.7062937062937%
