In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import random
from pprint import pprint
import ipywidgets as widgets
from IPython.display import display

In [2]:
# NOTE(review): absolute, machine-specific path - the notebook only runs where
# this exact file exists.
HEART_CSV_PATH = '/Users/DELL/Desktop/Sem 6/Python/heart.csv'
df = pd.read_csv(HEART_CSV_PATH)



In [3]:
# Store the class under the name "label" as the LAST column: the tree helpers
# in this notebook read the class from the last column of the data array.
# The guard makes the cell safe to re-run after "target" is already gone.
if "target" in df.columns:
    df = df.assign(label=df["target"]).drop(columns=["target"])



In [4]:
# Preview the first rows of df after the target -> label rename above.
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [5]:
def train_test_split(df, test_size, random_state=None):
    """Randomly split ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split; rows are selected by index label.
    test_size : int or float
        A float is read as the fraction of rows to put in the test set
        (rounded to a whole number of rows); an int is the number of rows.
    random_state : int or None, optional
        Seed for a private random generator, which makes the split
        repeatable. ``None`` (the default) draws from the global ``random``
        module state, exactly as before this parameter existed.

    Returns
    -------
    tuple
        ``(train_df, test_df)``.

    Raises
    ------
    ValueError
        If the requested test size is negative or exceeds ``len(df)``.
    """
    if isinstance(test_size, float):
        test_size = round(test_size * len(df))

    if not 0 <= test_size <= len(df):
        raise ValueError(
            "test_size must select between 0 and {} rows, got {}".format(len(df), test_size)
        )

    # A dedicated generator keeps a seeded split from disturbing (or being
    # disturbed by) other users of the global random state.
    sampler = random if random_state is None else random.Random(random_state)
    test_indices = sampler.sample(df.index.tolist(), k=test_size)

    test_df = df.loc[test_indices]
    train_df = df.drop(test_indices)
    return train_df, test_df


In [6]:
def determine_type_of_feature(df, n_unique_values_threshold=5):
    """Label every column of ``df`` as "categorical" or "continuous".

    A column is categorical when its first distinct value is a string or
    when it has at most ``n_unique_values_threshold`` distinct values;
    otherwise it is continuous.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected (every column, in order).
    n_unique_values_threshold : int, optional
        Largest number of distinct values a non-string column may have and
        still count as categorical. Defaults to 5.

    Returns
    -------
    list of str
        One entry per column, in column order.
    """
    feature_types = []

    for column in df.columns:
        unique_values = df[column].unique()
        # Check the length first: a column without rows has no first value
        # to look at and would otherwise raise IndexError.
        holds_strings = len(unique_values) > 0 and isinstance(unique_values[0], str)

        if holds_strings or len(unique_values) <= n_unique_values_threshold:
            feature_types.append("categorical")
        else:
            feature_types.append("continuous")

    return feature_types

In [8]:
# Infer the type of every column and list the result next to the column name.
feature_types = determine_type_of_feature(df)
for column, feature_type in zip(df.columns, feature_types):
    print(column, "-", feature_type)

age - continuous
sex - categorical
cp - categorical
trestbps - continuous
chol - continuous
fbs - categorical
restecg - categorical
thalach - continuous
exang - categorical
oldpeak - continuous
slope - categorical
ca - categorical
thal - categorical
label - categorical


In [9]:
def check_purity(data):
    """Tell whether all rows of ``data`` share a single label.

    ``data`` is a 2-D array whose last column holds the label. Returns True
    when that column contains exactly one distinct value, False otherwise.
    """
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1

In [10]:
#the class appearing the most
def classify_data(data):
    """Return the label that occurs most often in the last column of ``data``.

    When several labels are equally frequent, the smallest one is returned
    (``np.unique`` sorts the labels and ``argmax`` picks the first maximum).
    """
    labels, frequencies = np.unique(data[:, -1], return_counts=True)
    return labels[frequencies.argmax()]
    

In [11]:
def get_potential_splits(data):
    """Collect the candidate split values for every feature column.

    The column types are read from the module-level ``feature_types`` list
    (one entry per column of ``data``); the last column of ``data`` is the
    label and is never split on.

    * continuous column: the midpoints between consecutive distinct values;
    * categorical column: each distinct value, but only when there are at
      least two of them. A split on the only value present would put every
      row on one side and leave the other side empty.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array, features first and the label in the last column.

    Returns
    -------
    dict
        ``{column_index: list_of_candidate_values}``; the list is empty for
        a column that offers no usable split.
    """
    potential_splits = {}
    n_feature_columns = data.shape[1] - 1  # the last column is the label

    for column_index in range(n_feature_columns):
        unique_values = np.unique(data[:, column_index])  # sorted ascending

        if feature_types[column_index] == "continuous":
            midpoints = (unique_values[1:] + unique_values[:-1]) / 2
            potential_splits[column_index] = list(midpoints)
        elif len(unique_values) > 1:
            # A list (not an ndarray) so both branches return the same type.
            potential_splits[column_index] = list(unique_values)
        else:
            potential_splits[column_index] = []

    return potential_splits

In [12]:
def split_data(data, split_column, split_value):
    """Partition the rows of ``data`` on one column.

    The column type comes from the module-level ``feature_types`` list.
    For a continuous column the first part holds the rows with
    ``value <= split_value`` and the second those with ``value > split_value``;
    for any other column the parts are ``value == split_value`` and
    ``value != split_value``.

    Returns the pair ``(data_below, data_above)``.
    """
    column = data[:, split_column]

    if feature_types[split_column] == "continuous":
        return data[column <= split_value], data[column > split_value]

    return data[column == split_value], data[column != split_value]

In [13]:
def calculate_entropy(data):
    """Entropy, in bits, of the labels in the last column of ``data``.

    Computed as the sum over the distinct labels of ``-p * log2(p)``, where
    ``p`` is the share of rows carrying that label.
    """
    class_counts = np.unique(data[:, -1], return_counts=True)[1]
    class_probabilities = class_counts / class_counts.sum()
    return sum(-class_probabilities * np.log2(class_probabilities))

In [14]:
def calculate_overall_entropy(data_below, data_above):
    """Size-weighted entropy of the two parts produced by a split.

    Each part's entropy (from ``calculate_entropy``) is weighted by the
    share of rows that ended up in that part.
    """
    n_below, n_above = len(data_below), len(data_above)
    n_total = n_below + n_above

    weight_below = n_below / n_total
    weight_above = n_above / n_total

    return (weight_below * calculate_entropy(data_below)
            + weight_above * calculate_entropy(data_above))


In [15]:
def best_fit(data, potential_splits):
    """Pick the split with the lowest size-weighted entropy.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array, label in the last column.
    potential_splits : dict
        ``{column_index: iterable_of_candidate_values}``.

    Returns
    -------
    tuple
        ``(best_split_column, best_split_value)``.

    Raises
    ------
    ValueError
        If ``potential_splits`` contains no candidate value at all (this
        used to surface as an UnboundLocalError).
    """
    lowest_entropy = float("inf")
    best_split_column = None
    best_split_value = None

    for column_index, candidate_values in potential_splits.items():
        for value in candidate_values:
            data_below, data_above = split_data(
                data, split_column=column_index, split_value=value
            )
            current_entropy = calculate_overall_entropy(data_below, data_above)

            # "<=" rather than "<": among equally good splits the last one
            # examined wins, which is the established tie-breaking.
            if current_entropy <= lowest_entropy:
                lowest_entropy = current_entropy
                best_split_column = column_index
                best_split_value = value

    if best_split_column is None:
        raise ValueError("best_fit needs at least one candidate split, got none")

    return best_split_column, best_split_value

In [16]:
def decision_tree_algorithm(df, counter=0, min_samples=20, max_depth=20):
    """Recursively build a decision tree.

    Parameters
    ----------
    df : pandas.DataFrame or numpy.ndarray
        On the initial call (``counter == 0``) a DataFrame whose last column
        is the label. On recursive calls, the NumPy array of the rows that
        reached the current node.
    counter : int, optional
        Current depth; 0 marks the initial call, which also (re)sets the
        module-level ``column_header`` and ``feature_types``.
    min_samples : int, optional
        A node with fewer rows than this becomes a leaf.
    max_depth : int, optional
        A node at this depth becomes a leaf.

    Returns
    -------
    dict or scalar
        ``{question: [yes_subtree, no_subtree]}`` where the question reads
        ``"<feature> <= <value>"`` (continuous) or ``"<feature> = <value>"``
        (categorical), or a bare class label for a leaf.

    Raises
    ------
    ValueError
        If there are no rows to learn from.
    """
    global column_header, feature_types

    if counter == 0:
        data = df.values
        column_header = df.columns
        feature_types = determine_type_of_feature(df)
    else:
        data = df

    if len(data) == 0:
        raise ValueError("cannot build a decision tree from an empty data set")

    if check_purity(data) or (len(data) < min_samples) or (counter == max_depth):
        return classify_data(data)

    counter += 1

    potential_splits = get_potential_splits(data)
    # No feature offers a split value (all rows look alike): make a leaf.
    if not any(len(values) for values in potential_splits.values()):
        return classify_data(data)

    split_column, split_value = best_fit(data, potential_splits)
    data_below, data_above = split_data(data, split_column, split_value)

    # A split that leaves one side empty separates nothing, and recursing
    # into the empty side would fail when the majority class is looked up
    # (argmax of an empty array). Stop here with a leaf instead.
    if len(data_below) == 0 or len(data_above) == 0:
        return classify_data(data)

    feature_name = column_header[split_column]
    if feature_types[split_column] == "continuous":
        question = "{} <= {}".format(feature_name, split_value)
    else:
        question = "{} = {}".format(feature_name, split_value)

    yes_answer = decision_tree_algorithm(data_below, counter, min_samples, max_depth)
    no_answer = decision_tree_algorithm(data_above, counter, min_samples, max_depth)

    # Both branches agree, so the question adds nothing: collapse the node.
    if yes_answer == no_answer:
        return no_answer

    return {question: [yes_answer, no_answer]}

    

In [17]:
# Re-display the first rows of df (not modified since the earlier preview).
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,label
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [18]:
def _answer_matches(actual, expected_text):
    """Compare a feature value with the value text taken from a tree question.

    The texts are compared first. For non-string values a numeric comparison
    is tried as well, so that an integer 1 matches a question that was
    written from a float array as "1.0".
    """
    if str(actual) == expected_text:
        return True
    if isinstance(actual, str):
        return False
    try:
        return float(actual) == float(expected_text)
    except (TypeError, ValueError):
        return False


def classify_example(example, tree):
    """Walk ``tree`` with one example and return the predicted class.

    Parameters
    ----------
    example : mapping (e.g. a pandas Series row)
        Feature values, looked up by feature name.
    tree : dict or scalar
        A tree as built by ``decision_tree_algorithm``: nested
        ``{question: [yes_subtree, no_subtree]}`` dicts, or a bare class
        label when the whole tree is a single leaf.

    Returns
    -------
    The class label stored in the leaf that the example reaches.
    """
    node = tree

    # A non-dict node is a leaf; this also covers a tree that is only a leaf.
    while isinstance(node, dict):
        question = next(iter(node))
        # Split from the right so feature names may contain spaces.
        feature_name, comparison_operator, value = question.rsplit(" ", 2)

        if comparison_operator == "<=":
            passes = example[feature_name] <= float(value)
        else:
            passes = _answer_matches(example[feature_name], value)

        node = node[question][0 if passes else 1]

    return node
    

In [19]:
def calculate_accuracy(df, tree):
    """Return the share of rows in ``df`` that ``tree`` classifies correctly.

    Side effect: two columns are written to ``df`` itself -
    "classification" (the prediction for each row) and
    "classification_correct" (whether it equals the "label" column).
    """
    predictions = df.apply(classify_example, axis=1, args=(tree,))
    df["classification"] = predictions
    df["classification_correct"] = df["classification"] == df["label"]

    return df["classification_correct"].mean()

In [20]:
# Seed the generator that train_test_split draws from, so the split, the tree
# and the reported accuracy are the same on every run of the notebook.
random.seed(42)

train_df, test_df = train_test_split(df, test_size=0.1)
tree = decision_tree_algorithm(train_df, max_depth=10)

# Accuracy on the held-out rows (this also writes the "classification" and
# "classification_correct" columns into test_df).
accuracy = calculate_accuracy(test_df, tree)

# Spot check: classify one row of the full data set.
example = df.iloc[180]
answer = classify_example(example, tree)

print(answer)
print(accuracy)
pprint(tree)

0.0
0.8666666666666667
{'thal = 2.0': [{'ca = 0.0': [{'thalach <= 160.5': [{'oldpeak <= 3.2': [{'age <= 57.0': [1.0,
                                                                                         {'thalach <= 157.5': [{'oldpeak <= 2.8': [1.0,
                                                                                                                                   0.0]},
                                                                                                               0.0]}]},
                                                                        0.0]},
                                                    1.0]},
                              {'cp = 0.0': [0.0, 1.0]}]},
                {'cp = 0.0': [{'oldpeak <= 0.55': [{'chol <= 237.5': [1.0,
                                                                      0.0]},
                                                   0.0]},
                              {'slope = 1.0': [0.0, 1.0]}]}]}


In [22]:
import ipywidgets as widgets
import pandas as pd 

# Three prints of a newline put vertical space above the form.
for _spacer in range(3):
    print("\n")

# One input widget per feature of the data set.
age = widgets.IntSlider(min=1)
gender = widgets.Dropdown(options=['Male', 'Female'], value='Male')
cp = widgets.RadioButtons(
    options=['Typical angina', 'Atypical angina', 'Non-angina pain', 'Asymptomatic'],
)
trestbps = widgets.FloatText(value=125.80)
chol = widgets.FloatText(value=213.60)
fbs = widgets.Select(options=['Yes', 'No'], value='No', rows=2)
thalach = widgets.FloatText(value=150.0)
restecg = widgets.Dropdown(
    options=[
        "Normal",
        "Having ST-T wave abnormality",
        "Showing probable or definite left ventricular hypertrophy",
    ],
    value="Normal",
)
exang = widgets.Select(options=['Yes', 'No'], value='No', rows=2)
oldpeak = widgets.FloatText(value=0.62)
slope = widgets.RadioButtons(options=['Up', 'Flat', 'Down'])
ca = widgets.Dropdown(options=[0, 1, 2, 3], value=0)
thal = widgets.Select(
    options=['Normal', 'Fixed defect', 'Reversible defect'],
    value='Normal',
    rows=3,
)
submit = widgets.Button(
    description='Submit',
    button_style='info',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Submit details',
    icon='check',
)

# Show every caption followed by its widget, in form order, then the button.
form_rows = [
    ("Age:", age),
    ("Gender", gender),
    ('Chest pain type:', cp),
    ('Resting blood pressure (in mm Hg):', trestbps),
    ('Serum cholestoral (in mg/dl):', chol),
    ("Fasting blood sugar > 120 (in mg/dl):", fbs),
    ('Maximum heart rate achieved:', thalach),
    ("Resting ECG results", restecg),
    ("Exercise induced angina :", exang),
    ('ST depression induced by exercise relative to rest(in mm):', oldpeak),
    ('Slope of the peak exercise ST segment:', slope),
    ("Number of major vessels colored by Flourosopy", ca),
    ("Thalassemia:", thal),
]
for caption, field in form_rows:
    display(caption, field)
display(submit)


def on_button_clicked(b):
    """Classify the values currently entered in the form and print the verdict.

    Reads the module-level input widgets, encodes their values as numbers,
    and classifies the resulting example with the module-level ``tree``
    trained earlier in the notebook. Using that one tree (instead of
    training a new one on a fresh random split at every click) means the
    same input always gets the same answer.

    Parameters
    ----------
    b : the clicked button (passed by ``Button.on_click``; unused).
    """
    yes_no = {"Yes": 1, "No": 0}

    # Keys must equal the training column names, because the questions in
    # the tree look features up by exactly those names ("age", "sex", ...).
    features = {
        "age": age.value,
        "sex": {"Male": 1, "Female": 0}[gender.value],
        "cp": {
            "Typical angina": 0,
            "Atypical angina": 1,
            "Non-angina pain": 2,
            "Asymptomatic": 3,
        }[cp.value],
        "trestbps": trestbps.value,
        "chol": chol.value,
        "fbs": yes_no[fbs.value],
        "restecg": {
            "Normal": 0,
            "Having ST-T wave abnormality": 1,
            "Showing probable or definite left ventricular hypertrophy": 2,
        }[restecg.value],
        "thalach": thalach.value,
        "exang": yes_no[exang.value],
        "oldpeak": oldpeak.value,
        "slope": {"Up": 0, "Flat": 1, "Down": 2}[slope.value],
        "ca": ca.value,
        # NOTE(review): codes 1/2/3 are kept as they were - confirm them
        # against the thal encoding used in the data set.
        "thal": {"Normal": 1, "Fixed defect": 2, "Reversible defect": 3}[thal.value],
    }

    example = pd.Series(features)
    answer = classify_example(example, tree)

    if answer == 1:
        print("Risk of heart disease")
    else:
        print("Healthy")
    

# Run on_button_clicked every time the Submit button is pressed.
submit.on_click(on_button_clicked)









'Age:'

IntSlider(value=1, min=1)

'Gender'

Dropdown(options=('Male', 'Female'), value='Male')

'Chest pain type:'

RadioButtons(options=('Typical angina', 'Atypical angina', 'Non-angina pain', 'Asymptomatic'), value='Typical …

'Resting blood pressure (in mm Hg):'

FloatText(value=125.8)

'Serum cholestoral (in mg/dl):'

FloatText(value=213.6)

'Fasting blood sugar > 120 (in mg/dl):'

Select(index=1, options=('Yes', 'No'), rows=2, value='No')

'Maximum heart rate achieved:'

FloatText(value=150.0)

'Resting ECG results'

Dropdown(options=('Normal', 'Having ST-T wave abnormality', 'Showing probable or definite left ventricular hyp…

'Exercise induced angina :'

Select(index=1, options=('Yes', 'No'), rows=2, value='No')

'ST depression induced by exercise relative to rest(in mm):'

FloatText(value=0.62)

'Slope of the peak exercise ST segment:'

RadioButtons(options=('Up', 'Flat', 'Down'), value='Up')

'Number of major vessels colored by Flourosopy'

Dropdown(options=(0, 1, 2, 3), value=0)

'Thalassemia:'

Select(options=('Normal', 'Fixed defect', 'Reversible defect'), rows=3, value='Normal')

Button(button_style='info', description='Submit', icon='check', style=ButtonStyle(), tooltip='Submit details')

Risk of heart disease
