### First Split

In [1]:
import pandas as pd


In [6]:
df = pd.read_csv('dataset.csv')
df.drop(['index'], axis = 1, inplace = True) 
df.head()

Unnamed: 0,gender,fbs,slope,age,prediction
0,male,True,downsloping,63,absence
1,male,False,flat,67,presence
2,male,False,flat,67,presence
3,male,False,downsloping,37,absence
4,female,False,upsloping,41,absence


In [8]:
df['gender'].value_counts()

male      71
female    29
Name: gender, dtype: int64

In [9]:
df['fbs'].value_counts()

False    87
True     13
Name: fbs, dtype: int64

In [10]:
df['slope'].value_counts()

upsloping      50
flat           39
downsloping    11
Name: slope, dtype: int64

In [19]:
age = list(df['age'].sort_values())
print(f'({age[0]}, {age[-1]})')

(37, 71)


In [35]:
print(f'total: {len(df)}')

classes = df['prediction'].unique()
genders = df['gender'].unique()
fbs = df['fbs'].unique()
slopes = df['slope'].unique()

# gender
d = {'absence': [], 'presence': []}

for c in classes:
    for gender in genders:
        d[c].append(len(df[(df.gender == gender) & (df.prediction == c)]))

pd.DataFrame(data=d, index=genders)

total: 100
34


Unnamed: 0,absence,presence
male,34,37
female,23,6


In [37]:
# fbs
d = {'absence': [], 'presence': []}

for c in classes:
    for fb in fbs:
        d[c].append(len(df[(df.fbs == fb) & (df.prediction == c)]))

pd.DataFrame(data=d, index=fbs)

Unnamed: 0,absence,presence
True,8,5
False,49,38


In [38]:
# slope
d = {'absence': [], 'presence': []}

for c in classes:
    for slope in slopes:
        d[c].append(len(df[(df.slope == slope) & (df.prediction == c)]))

pd.DataFrame(data=d, index=slopes)

Unnamed: 0,absence,presence
downsloping,6,5
flat,12,27
upsloping,39,11


In [39]:
# age
d = {'absence': [len(df[(df.age <= 57) & (df.prediction == 'absence')]), len(df[(df.age > 57) & (df.prediction == 'absence')])],
     'presence': [len(df[(df.age <= 57) & (df.prediction == 'presence')]), len(df[(df.age > 57) & (df.prediction == 'presence')])]}


pd.DataFrame(data=d, index=['<=57', '>57'])

Unnamed: 0,absence,presence
<=57,40,17
>57,17,26


In [40]:
# age
d = {'absence': [len(df[(df.age <= 65) & (df.prediction == 'absence')]), len(df[(df.age > 65) & (df.prediction == 'absence')])],
     'presence': [len(df[(df.age <= 65) & (df.prediction == 'presence')]), len(df[(df.age > 65) & (df.prediction == 'presence')])]}


pd.DataFrame(data=d, index=['<=65', '>65'])

Unnamed: 0,absence,presence
<=65,53,39
>65,4,4


In [201]:
import math

def MI(x,y,z,total):
    if x == 0:
        return 0
    
    print(f'{x}/100*math.log2(({x}/{total})/(({y}/{total})*({z}/{total})))')
    return x/100*math.log2((x/total)/((y/total)*(z/total)))

def G(x):
    if x == 0:
        return 0
    
    return 2*math.log(2)*100*x

In [151]:
calc(2,2,48,11)

-0.09169925001442313

In [152]:
calc2(-0.83)

-115.06243197295092

In [134]:
len(df[df.slope == 'flat'])

39

## Second split

In [128]:
# gender
d = {'absence': [], 'presence': []}

for c in classes:
    for gender in genders:
        d[c].append(len(df[(df.gender == gender) & (df.prediction == c) & (df.slope == 'upsloping')]))

pd.DataFrame(data=d, index=genders)

Unnamed: 0,absence,presence
male,22,10
female,17,1


In [135]:
# fbs
d = {'absence': [], 'presence': []}

for c in classes:
    for fb in fbs:
        d[c].append(len(df[(df.fbs == fb) & (df.prediction == c) & (df.slope == 'upsloping')]))

pd.DataFrame(data=d, index=fbs)

Unnamed: 0,absence,presence
True,5,1
False,34,10


In [148]:
# age
d = {'absence': [len(df[(df.age <= 65) & (df.prediction == 'absence')  & (df.slope == 'upsloping')]), len(df[(df.age > 65) & (df.prediction == 'absence')   & (df.slope == 'upsloping')])],
     'presence': [len(df[(df.age <= 65) & (df.prediction == 'presence')  & (df.slope == 'upsloping')]), len(df[(df.age > 65) & (df.prediction == 'presence') & (df.slope == 'upsloping')])]}


pd.DataFrame(data=d, index=['<=65', '>65'])

Unnamed: 0,absence,presence
<=65,37,11
>65,2,0


In [242]:
features = ['gender', 'fbs', 'slope', 'age']
thresholds = [57, 65]
classes = df['prediction'].unique()

def calc_statistics(size, features=features, thresholds=thresholds):
    for feature in features:
        print(f'\n{feature}:')

        total_mi = 0

        if feature != 'age':
            feature_values = df[feature].unique()

            for c in classes:
                for val in feature_values:
                    mi = MI(len(df[(df[feature] == val) & (df['prediction'] == c)]), len(df[(df[feature] == val)]), len(df[(df['prediction'] == c)]), size)
                    print('MI(A,={}, A={}) = {:.4}'.format( val, c, float(mi)))
                    total_mi += mi

            print(f'total_mi = {total_mi}')
            print(f'G = {G(total_mi)}')

        else:
            for t in thresholds:
                total_mi = 0

                print(f'threshold {t}:')
                for c in classes:
                    mi = MI(len(df[(df[feature] <= t) & (df['prediction'] == c)]), len(df[(df[feature] <= t)]), len(df[(df['prediction'] == c)]), size)
                    print('MI(A,={}, A={}) = {:.4}'.format( f'<= {t}', c, float(mi)))
                    total_mi += mi

                    mi = MI(len(df[(df[feature] > t) & (df['prediction'] == c)]), len(df[(df[feature] > t)]), len(df[(df['prediction'] == c)]), size)
                    print('MI(A,={}, A={}) = {:.4}'.format( f'> {t}', c, float(mi)))
                    total_mi += mi

                print(f'total_mi = {total_mi}')
                print(f'G = {G(total_mi)}')

In [188]:
df = pd.read_csv('dataset.csv')
df.drop(['index'], axis = 1, inplace = True) 

calc_statistics(100)


gender:
MI(A,=male, A=absence) = -0.085
MI(A,=female, A=absence) = 0.110
MI(A,=male, A=presence) = 0.103
MI(A,=female, A=presence) = -0.063
total_mi = 0.06343220498780901
G = 8.793570808800062

fbs:
MI(A,=True, A=absence) = 0.009
MI(A,=False, A=absence) = -0.008
MI(A,=True, A=presence) = -0.008
MI(A,=False, A=presence) = 0.009
total_mi = 0.0009137159015669798
G = 0.1266679202007881

slope:
MI(A,=downsloping, A=absence) = -0.004
MI(A,=flat, A=absence) = -0.107
MI(A,=upsloping, A=absence) = 0.176
MI(A,=downsloping, A=presence) = 0.004
MI(A,=flat, A=presence) = 0.186
MI(A,=upsloping, A=presence) = -0.106
total_mi = 0.14909622274987033
G = 20.66912528624204

age:
threshold 57:
MI(A,=<= 57, A=absence) = 0.120
MI(A,=> 57, A=absence) = -0.090
MI(A,=<= 57, A=presence) = -0.090
MI(A,=> 57, A=presence) = 0.128
total_mi = 0.06839681527985947
G = 9.481811934102796
threshold 65:
MI(A,=<= 65, A=absence) = 0.008
MI(A,=> 65, A=absence) = -0.008
MI(A,=<= 65, A=presence) = -0.008
MI(A,=> 65, A=presence

In [214]:
df = pd.read_csv('dataset.csv')
df = df[df.slope=='upsloping']
df.drop(['index', 'slope'], axis = 1, inplace = True) 
print(len(df))
calc_statistics(50, features=['gender', 'fbs', 'age'])

50

gender:
17/100*math.log2((17/50)/((18/50)*(39/50)))
MI(A,=female, A=absence) = 0.04692
22/100*math.log2((22/50)/((32/50)*(39/50)))
MI(A,=male, A=absence) = -0.04007
1/100*math.log2((1/50)/((18/50)*(11/50)))
MI(A,=female, A=presence) = -0.01986
10/100*math.log2((10/50)/((32/50)*(11/50)))
MI(A,=male, A=presence) = 0.05064
total_mi = 0.03763369982286591
G = 5.217138585251764

fbs:
34/100*math.log2((34/50)/((44/50)*(39/50)))
MI(A,=False, A=absence) = -0.004595
5/100*math.log2((5/50)/((6/50)*(39/50)))
MI(A,=True, A=absence) = 0.004771
10/100*math.log2((10/50)/((44/50)*(11/50)))
MI(A,=False, A=presence) = 0.004692
1/100*math.log2((1/50)/((6/50)*(11/50)))
MI(A,=True, A=presence) = -0.004005
total_mi = 0.0008626694955224451
G = 0.11959138571529064

age:
threshold 57:
29/100*math.log2((29/50)/((33/50)*(39/50)))
MI(A,=<= 57, A=absence) = 0.04989
10/100*math.log2((10/50)/((17/50)*(39/50)))
MI(A,=> 57, A=absence) = -0.04071
4/100*math.log2((4/50)/((33/50)*(11/50)))
MI(A,=<= 57, A=presence) = -

In [209]:
G(0.003)

0.4158883083359672

In [215]:
MI(22,32,39,50)

22/100*math.log2((22/50)/((32/50)*(39/50)))


-0.04006517029904982

In [250]:
# gender
df = pd.read_csv('dataset.csv')
d = {'absence': [], 'presence': []}


for c in classes:
    for gender in genders:
        d[c].append(len(df[(df.gender == gender) & (df.prediction == c) & (df.slope == 'upsloping') & (df.age > 57)]))

pd.DataFrame(data=d, index=genders)

Unnamed: 0,absence,presence
male,2,6
female,8,1


In [256]:
# fbs
d = {'absence': [], 'presence': []}
df = pd.read_csv('dataset.csv')

for c in classes:
    for fb in fbs:
        d[c].append(len(df[(df.fbs == fb) & (df.prediction == c) & (df.slope == 'upsloping')  & (df.age > 57)]))

pd.DataFrame(data=d, index=fbs)

Unnamed: 0,absence,presence
True,3,1
False,7,6


In [252]:
df = pd.read_csv('dataset.csv')
df = df[(df.slope=='upsloping') & (df.age>57)]
df.drop(['index', 'slope', 'age'], axis = 1, inplace = True) 
print(len(df))
calc_statistics(17, features=['gender', 'fbs'])

17

gender:
8/100*math.log2((8/17)/((9/17)*(10/17)))
MI(A,=female, A=absence) = 0.04765
2/100*math.log2((2/17)/((8/17)*(10/17)))
MI(A,=male, A=absence) = -0.02469
1/100*math.log2((1/17)/((9/17)*(7/17)))
MI(A,=female, A=presence) = -0.0189
6/100*math.log2((6/17)/((8/17)*(7/17)))
MI(A,=male, A=presence) = 0.0519
total_mi = 0.05596552889325043
G = 7.758469712180539

fbs:
3/100*math.log2((3/17)/((4/17)*(10/17)))
MI(A,=True, A=absence) = 0.01051
7/100*math.log2((7/17)/((13/17)*(10/17)))
MI(A,=False, A=absence) = -0.008929
1/100*math.log2((1/17)/((4/17)*(7/17)))
MI(A,=True, A=presence) = -0.007199
6/100*math.log2((6/17)/((13/17)*(7/17)))
MI(A,=False, A=presence) = 0.009878
total_mi = 0.004265335230383547
G = 0.5913010178166721


In [223]:
MI(len(df[(df['gender'] == 'female') & (df['prediction'] == 'presence')]), len(df[(df['gender'] == 'female')]), len(df[(df['prediction'] == 'presence')]), 14)

0

In [276]:
df = pd.read_csv('dataset.csv')
df.drop(['index'], axis = 1, inplace = True) 
print(f"absence class {len(df[(df.prediction == 'absence')])}")
print(f"presence class {len(df[(df.prediction == 'presence')])}")

print(f"flat {len(df[(df.slope == 'flat')])}")
print(f"flat and absence {len(df[(df.prediction == 'absence') & (df.slope == 'flat')])}")

x = 12 / 100
y = 12 / 39
z = 43/ 100

print(f'weight: {x*math.log2(y/z)}')

absence class 57
presence class 43
flat 39
flat and absence 12
weight: -0.057941793968215846


In [286]:
df = pd.read_csv('dataset.csv')
df.drop(['index'], axis = 1, inplace = True) 
print(f"absence class {len(df[(df.prediction == 'absence')])}")
print(f"presence class {len(df[(df.prediction == 'presence')])}")

print(f"<=57 and upsloping {len(df[(df.age <= 57) & (df.slope == 'upsloping')])}")
print(f"upsloping and <=57 and presence {len(df[(df.prediction == 'presence') & (df.slope == 'upsloping') & (df.age <= 57)])}")

x = 4 / 100
y = 4 / 33
z = 43/ 100

print(f'weight: {x*math.log2(y/z)}')

absence class 57
presence class 43
<=57 and upsloping 33
upsloping and <=57 and presence 4
weight: -0.07307210737143308


In [287]:
print(f"<=57 {len(df[(df.age <= 57)])}")
print(f"upsloping and <=57 and absence {len(df[(df.prediction == 'absence') & (df.slope == 'upsloping') & (df.age <= 57)])}")

x = 29 / 100
y = 29 / 33
z = 57/ 100

print(f'weight: {x*math.log2(y/z)}')

<=57 57
upsloping and <=57 and absence 29
weight: 0.1811203848999395


In [299]:
df = pd.read_csv('dataset.csv')
df.drop(['index'], axis = 1, inplace = True) 
print(f"absence class {len(df[(df.prediction == 'absence')])}")
print(f"presence class {len(df[(df.prediction == 'presence')])}")

print(f">57 and upsloping and female {len(df[(df.age > 57) & (df.slope == 'upsloping') & (df.gender == 'female')])}")
print(f"upsloping and >57 and presence and female {len(df[(df.prediction == 'presence') & (df.gender == 'female') & (df.slope == 'upsloping') & (df.age > 57)])}")

x = 1 / 100
y = 1 / 9
z = 43/ 100

print(f'weight: {x*math.log2(y/z)}')

absence class 57
presence class 43
>57 and upsloping and female 9
upsloping and >57 and presence and female 1
weight: -0.019523335663696857


In [None]:
weight: 0.051283293933413666