In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#benign = 1 and malignant = 0
dataset = pd.read_csv('Breast_cancer_data[1].csv')
dataset = dataset.sort_values(by='mean_area')
dataset.head(10)

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
101,6.981,13.43,43.79,143.5,0.117,1
539,7.691,25.44,48.34,170.4,0.08668,1
538,7.729,25.49,47.98,178.8,0.08098,1
568,7.76,24.54,47.92,181.0,0.05263,1
46,8.196,16.84,51.71,201.9,0.086,1
151,8.219,20.7,53.27,203.9,0.09405,1
314,8.597,18.6,54.09,221.2,0.1074,1
525,8.571,13.1,54.53,221.3,0.1036,1
61,8.598,20.98,54.66,221.8,0.1243,1
59,8.618,11.79,54.34,224.5,0.09752,1


In [3]:
dataset.isnull().sum()

mean_radius        0
mean_texture       0
mean_perimeter     0
mean_area          0
mean_smoothness    0
diagnosis          0
dtype: int64

In [4]:
# stores all the column labels in the list except for last column
# which is our target variable.
m= dataset.shape[1]
header = dataset.columns
header = list(header)
print(header)
dataset.shape

['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness', 'diagnosis']


(569, 6)

In [5]:
# dividing the dataset into 75% for training and 25% for testing 
idx = 3*dataset.shape[0]//4
train = dataset.iloc[:idx,:]
test = dataset.iloc[idx:,:]
print("last 5 rows of training dataset:\n\n",train.tail())
print("\nfirst 5 rows of testing dataset:\n\n",test.head())

last 5 rows of training dataset:

      mean_radius  mean_texture  mean_perimeter  mean_area  mean_smoothness  \
133        15.71         13.93           102.0      761.7          0.09462   
182        15.70         20.31           101.2      766.6          0.09597   
258        15.66         23.20           110.2      773.5          0.11090   
11         15.78         17.89           103.6      781.0          0.09710   
118        15.78         22.91           105.7      782.6          0.11550   

     diagnosis  
133          1  
182          0  
258          0  
11           0  
118          0  

first 5 rows of testing dataset:

      mean_radius  mean_texture  mean_perimeter  mean_area  mean_smoothness  \
13         15.85         23.95           103.7      782.7          0.08401   
375        16.17         16.07           106.3      788.5          0.09880   
330        16.03         15.51           105.8      793.2          0.09491   
10         16.02         23.24           102.7

In [6]:
train = np.array(train)
test = np.array(test)
print(train.shape[0])
# train[:,-1].astype(int)
# test[:,-1].astype(int)
# print(train,end='\n\n')
# print(test)

426


In [52]:
def unique_values(dataset,col_index):
    unique = np.unique(dataset[:,col_index])
    return(unique)
    
#section is a portion of the dataset for which we find entropy
def count_label(section):
    cnt = {}
    for row in section:
        if row[-1] not in cnt:
            cnt[row[-1]]=0
        cnt[row[-1]]+=1
    return cnt


def entropy(section):
    cnt = count_label(section)
    total = 0
    for val in cnt.values():
        total += val
        
    ent = 0 #entropy value
    for val in cnt.values():
        ent += (val/total)*np.log(val/total)
    
    return -1*ent

# dataset_ent is the entropy of the entire dataset 
def info_gain(left,right,parent_ent):
    n = len(left) + len(right)
    
    left_ent=entropy(left)
    right_ent=entropy(right)
    
    weighted_avg = (len(left)/n)*left_ent + (len(right)/n)*right_ent
    
    return parent_ent-weighted_avg
    
    
# partitions the dataset according to the question value.
# it compares each value of the colum at col_index and 
# with the question value and seperates them into two groups 
# left and right.
def partition(dataset,qval,col_index):
    n = dataset.shape[0]
    right=[]
    left=[]
    for i in range(n):
        if dataset[i,col_index]>=qval:
            right.append(dataset[i,:])
        else:
            left.append(dataset[i,:])

    return left,right


# finds the best split by iterating over each and every value
# in each and every column. It finds a suitable value for questioning
# and then finds the information gain of that attribute based on that question.
# best_attr stores the column label used for question
# best_ques stores the value using which we could find the best split.

def best_split(dataset,header,column):
    maxGain=0
    best_ques=None
    best_attr=None
    best_left=[]
    best_right=[]
    rows = dataset.shape[0]
    cols = dataset.shape[1]
    parent_ent = entropy(dataset)
    
    #we will see the best question value, except for the last column
    for c in range(cols-1):
        if c in column:
            continue
        for r in range(rows):
            left,right=partition(dataset,dataset[r,c],c)
            IG = info_gain(left,right,parent_ent)
            if maxGain<IG:
                maxGain = IG
                best_ques = dataset[r,c]
                best_attr = c
                best_left = left
                best_right = right
            
    best_left=np.array(best_left)
    best_right=np.array(best_right)
    
    return best_ques,best_attr,best_left,best_right

def predictions(section,label):
    cnt = count_label(section)
    print(cnt)
    total=0
    for vals in cnt.values():
        total+=vals

    prob={}
    for key,vals in cnt.items():
        prob[key]=vals/total
        prob[key]*=100

    for key in prob.keys():
        print(f"{key} : {prob[key]}\n")
        

def splitting(ds,itr,header,space,col):
    if itr==0 or len(col)==len(header)-1:
        return
    
    ques,attr,left,right = best_split(ds,header,col)
    col.append(attr)
    prob={}
    
    if attr is not None:
        
        print("\n"+'-'*space+"> "+header[attr]+" < "+ques.astype(str),end='')
        cnt = unique_values(left,left.shape[1]-1)
        if len(cnt)>1:
            splitting(left,itr-1,header,space+2,col)
        else:
            pred=None
            if cnt[0]==1:
                pred="benign"
            else:
                pred="malignant"
            print(f" ---> stop , predicted value is {pred}")
            
            
        print("\n"+'-'*space+"> "+header[attr]+" >= "+ques.astype(str),end='')
        cnt = unique_values(right,right.shape[1]-1)
        if len(cnt)>1:
            splitting(right,itr-1,header,space+2,col)
        else:
            pred=None
            if cnt[0]==1:
                pred="benign"
            else:
                pred="malignant"
            print(f" ---> stop , predicted value is {pred}")
            
            
    else:
        print(" ---> stop , predicted value is ",end='')
        predictions(ds,ds.shape[1]-1)
        
        
    

In [55]:
itr = 100
space = 1
col=[]
splitting(train,itr,header,space,col)


-> mean_perimeter < 90.2
---> mean_smoothness < 0.1088
-----> mean_texture < 19.63
-------> mean_radius < 11.8
---------> mean_area < 431.1 ---> stop , predicted value is benign

---------> mean_area >= 431.1 ---> stop , predicted value is malignant

-------> mean_radius >= 11.8 ---> stop , predicted value is benign

-----> mean_texture >= 19.63
---> mean_smoothness >= 0.1088
-> mean_perimeter >= 90.2

In [54]:
itr = 10
space = 1
col=[]
splitting(test,itr,header,space,col)


-> mean_perimeter < 108.8
---> mean_texture < 19.65
-----> mean_radius < 16.14 ---> stop , predicted value is malignant

-----> mean_radius >= 16.14 ---> stop , predicted value is benign

---> mean_texture >= 19.65 ---> stop , predicted value is malignant

-> mean_perimeter >= 108.8
---> mean_smoothness < 0.0802
-----> mean_area < 1024.0 ---> stop , predicted value is benign

-----> mean_area >= 1024.0 ---> stop , predicted value is malignant

---> mean_smoothness >= 0.0802 ---> stop , predicted value is malignant
