In [1]:
#library
import pandas as pd
import numpy as np
import itertools
import seaborn as sns
import math

<h1>Dataset: Pima Indians Diabetes</h1>

In [2]:
# Read the diabetes CSV (path is relative to the notebook's folder) and show it.
df = pd.read_csv(filepath_or_buffer="../dataset/diabetes.csv")
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


<h1>Features</h1>

In [3]:
# Every column except the last one is treated as an input feature.
features = list(df.columns[:-1])
features

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [4]:
# Feature matrix: all rows, only the feature columns.
X = df.loc[:, features]
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [5]:
# Name of the target column in df.
label = "Outcome"
label

'Outcome'

In [6]:
# Target vector: the label column as a Series.
y = df.loc[:, label]
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [7]:
# Number of feature columns; drives the number of selection rounds further down.
num_feats = len(X.columns)
num_feats

8

<h1>Euclidean Distance between Each Feature and the Label</h1>

<h2>Data Normalization</h2>

In [8]:
# Min-max scale every feature column: (value - column min) / (column max - column min).
raw_features = df[features]
col_min = raw_features.min()
col_range = raw_features.max() - col_min
# A constant column has a zero range; dividing by it would give 0/0 = NaN for every
# row and propagate into every mean computed from X. Divide by 1 instead, so such a
# column is mapped to 0.0. Columns with a non-zero range are scaled exactly as before.
col_range = col_range.replace(0, 1)
X = (raw_features - col_min) / col_range
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.352941,0.743719,0.590164,0.353535,0.000000,0.500745,0.234415,0.483333
1,0.058824,0.427136,0.540984,0.292929,0.000000,0.396423,0.116567,0.166667
2,0.470588,0.919598,0.524590,0.000000,0.000000,0.347243,0.253629,0.183333
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.000000
4,0.000000,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.200000
...,...,...,...,...,...,...,...,...
763,0.588235,0.507538,0.622951,0.484848,0.212766,0.490313,0.039710,0.700000
764,0.117647,0.613065,0.573770,0.272727,0.000000,0.548435,0.111870,0.100000
765,0.294118,0.608040,0.590164,0.232323,0.132388,0.390462,0.071307,0.150000
766,0.058824,0.633166,0.491803,0.000000,0.000000,0.448584,0.115713,0.433333


<h2>Euclidean Distance Table</h2>

In [9]:
# Distance of each (normalized) feature to the label.
# For two scalars sqrt((a - b)**2) is simply |a - b|, so the per-feature score is the
# mean absolute difference between the feature column and the label column.
abs_diff = X[features].sub(y, axis=0).abs()

# One row per feature, one column named after the label -- the same layout that
# cal_best reads through df_ed[label][feature].
# skipna=False keeps np.mean's behaviour of yielding NaN when a value is missing
# instead of silently averaging over fewer rows.
df_ed = pd.DataFrame({label: abs_diff.mean(skipna=False)})
df_ed

Unnamed: 0,Outcome
Pregnancies,0.375383
Glucose,0.461062
BloodPressure,0.510235
SkinThickness,0.400147
Insulin,0.360511
BMI,0.460225
DiabetesPedigreeFunction,0.376333
Age,0.366081


<h1>Functions</h1>

In [10]:
def grouping_attribute_score(attrubutes, value):
    """Bundle a feature subset and its score into a single record.

    Returns a dict with the keys "attributes" (the subset as passed in)
    and "value" (its score).
    """
    record = dict(attributes=attrubutes, value=value)
    return record
# could be turned into a dedicated object in the future

In [11]:
def find_best_subset(subset_score, max_min='max'):
    """Pick the best-scoring record from a list of subset records.

    subset_score : list of dicts as built by grouping_attribute_score,
        each carrying a 'value' entry used for the comparison.
    max_min (str) : 'min' selects the lowest value; any other string
        (normally 'max') selects the highest value.

    Returns a one-element list holding the winning record, or an empty
    list when subset_score is empty. On ties the record that appears
    first in subset_score wins. The input list is left untouched.
    Prints a banner, the result, and a closing rule.
    """
    #still available for only a group of attributes
    print('========================================== find best value ===================================')

    # min()/max() return the first extreme element, which is the same record the
    # previous stable sort put at index 0 -- without reordering the caller's list.
    pick = min if max_min == 'min' else max
    winner = pick(subset_score, key=lambda entry: entry['value'], default=None)
    best_subset_score = [] if winner is None else [winner]

    print(best_subset_score)
    print("=============================================================================================")
    return best_subset_score
# return [{'attributes': ('Glucose', 'BMI'), 'value': 0.3796380304756596}]

In [12]:
def cal_best(algo_round, max_min='max'):
    """Score every candidate subset and return the best one.

    Each subset's score is the mean of its features' entries in the
    `label` column of the module-level `df_ed` table.

    algo_round : list of feature-name tuples to evaluate
    max_min (str) : can be only 'max' or 'min'; forwarded to find_best_subset

    Prints every scored record, then returns find_best_subset's result.
    """
    subset_score = []
    for atrs in algo_round:
        # mean of the per-feature distances of this subset
        mean_score = np.mean([df_ed[label][name] for name in atrs])
        subset_score.append(grouping_attribute_score(atrs, mean_score))

    for record in subset_score:
        print(record)

    return find_best_subset(subset_score=subset_score, max_min=max_min)

<h1>SFFS (Sequential Floating Forward Selection)</h1>

In [13]:
def _supersets_of(base, size):
    """Return, in itertools.combinations order, every `size`-feature tuple drawn
    from `features` that contains all attributes of `base`."""
    return [combo for combo in itertools.combinations(features, size)
            if all(attr in combo for attr in base)]


# best_subset_per_round[k] is the one-element list returned by cal_best for subsets
# of size k+1; combination_history collects every combination scored in a forward step.
best_subset_per_round = []
combination_history = []
for num in range(1, num_feats+1):
    print("========================================== Subset_Score Round", num,'=============================')
    if num == 1:
        # 1st round: score every single feature; there is nothing to extend yet.
        sfs_round = list(itertools.combinations(features, num))
    else:
        if num > 3:
            # ---- backward ("floating") check, only from round 4 on ----
            print()
            current_best = best_subset_per_round[-1][0]['attributes']
            sfs_check = list(itertools.combinations(current_best, num-2))
            print('combination of previous round:===========>', sfs_check)

            # Filter into new lists instead of calling .remove() while iterating:
            # removing during iteration skips the element after each removal, so
            # already-evaluated combinations used to slip through.
            seen = set(combination_history)
            sfs_check = [combo for combo in sfs_check if combo not in seen]
            print('never occured combination:===============>', sfs_check)

            # Subsets of the previous round's size that contain an unseen sub-combination.
            # dict.fromkeys drops duplicates (a subset can contain several of them)
            # while keeping first-occurrence order.
            candidates = itertools.chain.from_iterable(
                _supersets_of(base, num-1) for base in sfs_check
            )
            comb_temp = [combo for combo in dict.fromkeys(candidates) if combo not in seen]

            print()
            print('------------------------------------------calculate sbs check------------------------------------------')
            if comb_temp:
                sbs_check = cal_best(comb_temp)
                print('sbs check:',sbs_check)
                print('current best', best_subset_per_round[-1])
                if sbs_check[0]['value'] > best_subset_per_round[-1][0]['value']: #max is the best
                    print('privoue best_subset_score ----->', best_subset_per_round)
                    print('new value')
                    best_subset_per_round[-1][0] = sbs_check[0]
            else:
                # Nothing new to compare against; keep the current best as it is.
                print('sbs check: no unseen combination to evaluate')
            print('current best_subset_score ----->', best_subset_per_round)
            print('------------------------------------------calculate sbs check------------------------------------------')
            print()

        # ---- forward step (rounds 2+): extend the current best subset by one feature ----
        # At this point the list holds num-1 entries, so [-1] is the previous round's best.
        sfs_round = _supersets_of(best_subset_per_round[-1][0]['attributes'], num)
        #record the history of combination
        combination_history.extend(sfs_round)
        print('combination_hist:=====================>', combination_history)

    print()
    best_subset_per_round.append(cal_best(algo_round=sfs_round, max_min='max'))
    print()

print()
print('best_subset_per_round :',"\n", '=============== SubsetRanking ===============')

# Unwrap the one-element lists and rank the per-round winners, highest value first.
best_subset_per_round = sorted(
    (round_best[0] for round_best in best_subset_per_round),
    key=lambda x: x['value'],
    reverse=True, # max is the best
)
best_subset_per_round


{'attributes': ('Pregnancies',), 'value': 0.37538296568627444}
{'attributes': ('Glucose',), 'value': 0.4610618195142379}
{'attributes': ('BloodPressure',), 'value': 0.5102352288251365}
{'attributes': ('SkinThickness',), 'value': 0.4001473063973064}
{'attributes': ('Insulin',), 'value': 0.360510859929078}
{'attributes': ('BMI',), 'value': 0.46022533221559864}
{'attributes': ('DiabetesPedigreeFunction',), 'value': 0.37633266172075147}
{'attributes': ('Age',), 'value': 0.3660807291666666}
[{'attributes': ('BloodPressure',), 'value': 0.5102352288251365}]


{'attributes': ('Pregnancies', 'BloodPressure'), 'value': 0.4428090972557055}
{'attributes': ('Glucose', 'BloodPressure'), 'value': 0.4856485241696872}
{'attributes': ('BloodPressure', 'SkinThickness'), 'value': 0.4551912676112214}
{'attributes': ('BloodPressure', 'Insulin'), 'value': 0.4353730443771072}
{'attributes': ('BloodPressure', 'BMI'), 'value': 0.48523028052036754}
{'attributes': ('BloodPressure', 'DiabetesPedigreeFunction'), '

[{'attributes': ('BloodPressure',), 'value': 0.5102352288251365},
 {'attributes': ('Glucose', 'BloodPressure'), 'value': 0.4856485241696872},
 {'attributes': ('Glucose', 'BloodPressure', 'BMI'),
  'value': 0.47717412685165767},
 {'attributes': ('Glucose', 'BloodPressure', 'SkinThickness', 'BMI'),
  'value': 0.4579174217380699},
 {'attributes': ('Glucose',
   'BloodPressure',
   'SkinThickness',
   'BMI',
   'DiabetesPedigreeFunction'),
  'value': 0.4416004697346062},
 {'attributes': ('Pregnancies',
   'Glucose',
   'BloodPressure',
   'SkinThickness',
   'BMI',
   'DiabetesPedigreeFunction'),
  'value': 0.43056421905988423},
 {'attributes': ('Pregnancies',
   'Glucose',
   'BloodPressure',
   'SkinThickness',
   'BMI',
   'DiabetesPedigreeFunction',
   'Age'),
  'value': 0.42135229193228174},
 {'attributes': ('Pregnancies',
   'Glucose',
   'BloodPressure',
   'SkinThickness',
   'Insulin',
   'BMI',
   'DiabetesPedigreeFunction',
   'Age'),
  'value': 0.41374711293188127}]

<h2>The Best Candidate Subset by SFFS</h2>

In [14]:
# Overall winner across all rounds (highest value is best).
find_best_subset(subset_score=best_subset_per_round, max_min='max')

[{'attributes': ('BloodPressure',), 'value': 0.5102352288251365}]


[{'attributes': ('BloodPressure',), 'value': 0.5102352288251365}]