### Part-1 Predicting Heart Disease

In [None]:
import pandas as pd
import numpy as np
import itertools
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_recall_fscore_support, accuracy_score,recall_score 
import matplotlib.pyplot as plt

In [None]:
# Load the Cleveland heart-disease data and rename the target column 'num' to 'disease'.
df = pd.read_csv('cleveland.csv')
df = df.rename({'num':'disease'}, axis=1)
display(df.head())
#Dropping columns which have high correlation
df = df.drop(['thalach', 'oldpeak', 'ca', 'thal'], axis=1)
# '?' marks a missing value. Rows containing one are removed, and the index is
# renumbered so that index labels always equal row positions; code further down
# mixes positional (iloc / KFold) and label-based row access.
df = df.replace('?', np.nan).dropna().reset_index(drop=True)
# Cap the target at 1 so it becomes a binary label (every value above 1 becomes 1).
df['disease'] = df['disease'].apply(lambda x: min(x, 1))
# Add a z-scored copy '<column>_s' of each of these feature columns
# (pandas' std() is the sample standard deviation).
for col in ['age', 'cp', 'trestbps', 'chol', 'restecg', 'slope']:
    df[col + '_s'] = (df[col] - df[col].mean()) / df[col].std()
display(df.head())
# Hold-out split; X keeps both the raw and the standardised feature columns.
X = df.drop("disease", axis=1)
y = df["disease"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
#The cell below calls get_scores for every possible combination of attributes, with k-values from 1 to 10.
#Running all the combinations and producing the output takes around an hour.
def get_scores(att, kl, *, data=None, target='disease', n_splits=10, random_state=None):
    """Cross-validate a majority-vote k-nearest-neighbour classifier.

    For each ``k`` in ``kl`` the rows are split into ``n_splits`` shuffled
    folds. A Euclidean nearest-neighbour index is built from the training
    rows of each fold only, every held-out row is assigned the majority label
    of its ``k`` nearest training rows, and precision/recall/F1 for label 1
    are computed over the predictions gathered so far.

    Args:
        att: List of feature column names.
        kl: Iterable of neighbour counts ``k`` to evaluate.
        data: DataFrame holding the features and the target column; defaults
            to the module-level ``df``.
        target: Name of the 0/1 label column.
        n_splits: Number of cross-validation folds.
        random_state: Seed passed to ``KFold`` for a reproducible shuffle;
            ``None`` leaves the shuffle unseeded.

    Returns:
        Dict mapping each ``k`` to ``(precision, recall, f1)`` for label 1,
        computed over the out-of-fold predictions of all folds.
    """
    frame = df if data is None else data
    features = frame[att].values
    labels = frame[target].values
    r = {}
    for k in kl:
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        actual = []
        predicts = []
        print(f"k-value: {k}")
        for fold, (train_pos, test_pos) in enumerate(kf.split(features), start=1):
            # Index the training rows only: a held-out row can then never be
            # its own neighbour, and no test-fold label takes part in a vote.
            nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
            nn.fit(features[train_pos])
            nbr_pos = nn.kneighbors(features[test_pos], return_distance=False)
            # nbr_pos holds positions inside the training subset; translate
            # them into labels, one row of k labels per held-out sample.
            nbr_labels = labels[train_pos][nbr_pos]
            healthy = (nbr_labels == 0).sum(axis=1)
            sick = (nbr_labels == 1).sum(axis=1)
            # A tie between the two counts is resolved towards label 1.
            fold_predicts = np.where(healthy > sick, 0, 1)
            actual.extend(labels[test_pos].tolist())
            predicts.extend(fold_predicts.tolist())
            # The metrics are cumulative: they cover every fold processed so
            # far, so the value stored after the last fold spans all rows.
            precision, recall, f1_score, support = precision_recall_fscore_support(actual, predicts, labels=[1])
            r[k] = (precision[0], recall[0], f1_score[0])
            print(f"Fold Val {fold}: Precision: {precision[0]}, ReCall: {recall[0]}, Mean F1 Val: {f1_score[0]}")
    return r

# Exhaustive search: score every non-empty subset of the candidate attributes
# with get_scores for k = 1..10. Results are stored in d, keyed by the
# comma-joined attribute names of the subset.
d = {}
attl = ['age_s', 'sex', 'trestbps_s', 'chol_s',  'cp_s', 'slope_s', 'fbs', 'restecg_s', 'exang'] 
k_values = list(range(1, 11))
for i in range(1, len(attl)+1):
    for att in itertools.combinations(attl, i):
        att_list = list(att)
        att_key = ','.join(att_list)
        res = get_scores(att_list, k_values)
        d[att_key] = res
        print(f"Attribute: {att_key}")
        # One summary line per k for this attribute subset.
        for k, k_scores in res.items():
            precision, recall, f1_score = k_scores
            print(f"k={k}, Precision: {precision}, ReCall: {recall}, Mean F1: {f1_score}")
        print("\n")


In [None]:
#Best combination of attributes and K-Value which produced the highest F1-Score
#Report the scores returned by this call, one line per evaluated k-value.
best_res = get_scores(['sex','cp_s','slope_s'], [4,7])
for k, (precision, recall, f1_score) in best_res.items():
    print(f"k={k}, Precision: {precision}, ReCall: {recall}, Mean F1: {f1_score}")

### Part-2 Predicting Diabetes 

In [None]:
# Load the diabetes data. '?' marks a missing value: rows containing one are
# removed, and the index is renumbered so that index labels always equal row
# positions; code further down mixes positional (iloc / KFold) and label-based
# row access.
df = pd.read_csv('diabetes.csv')
df = df.replace('?', np.nan)
df = df.dropna().reset_index(drop=True)
display(df.head())
# Add a z-scored copy '<column>_s' of each feature column
# (pandas' std() is the sample standard deviation).
for col in ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
            'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']:
    df[col + '_s'] = (df[col] - df[col].mean()) / df[col].std()
display(df.head())
# Hold-out split; X keeps both the raw and the standardised feature columns.
X = df.drop("Outcome", axis=1)
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
#The cell below calls get_scores for every possible combination of attributes, with k-values from 1 to 10.
#Running all the combinations and producing the output takes around an hour.
def get_scores(att, kl, *, data=None, target='Outcome', n_splits=10, random_state=None):
    """Cross-validate a majority-vote k-nearest-neighbour classifier.

    For each ``k`` in ``kl`` the rows are split into ``n_splits`` shuffled
    folds. A Euclidean nearest-neighbour index is built from the training
    rows of each fold only, every held-out row is assigned the majority label
    of its ``k`` nearest training rows, and precision/recall/F1 for label 1
    are computed over the predictions gathered so far.

    Args:
        att: List of feature column names.
        kl: Iterable of neighbour counts ``k`` to evaluate.
        data: DataFrame holding the features and the target column; defaults
            to the module-level ``df``.
        target: Name of the 0/1 label column.
        n_splits: Number of cross-validation folds.
        random_state: Seed passed to ``KFold`` for a reproducible shuffle;
            ``None`` leaves the shuffle unseeded.

    Returns:
        Dict mapping each ``k`` to ``(precision, recall, f1)`` for label 1,
        computed over the out-of-fold predictions of all folds.
    """
    frame = df if data is None else data
    features = frame[att].values
    labels = frame[target].values
    r = {}
    for k in kl:
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        actual = []
        predicts = []
        print(f"k-value: {k}")
        for fold, (train_pos, test_pos) in enumerate(kf.split(features), start=1):
            # Index the training rows only: a held-out row can then never be
            # its own neighbour, and no test-fold label takes part in a vote.
            nn = NearestNeighbors(n_neighbors=k, metric='euclidean', algorithm='auto')
            nn.fit(features[train_pos])
            nbr_pos = nn.kneighbors(features[test_pos], return_distance=False)
            # nbr_pos holds positions inside the training subset; translate
            # them into labels, one row of k labels per held-out sample.
            nbr_labels = labels[train_pos][nbr_pos]
            healthy = (nbr_labels == 0).sum(axis=1)
            sick = (nbr_labels == 1).sum(axis=1)
            # A tie between the two counts is resolved towards label 1.
            fold_predicts = np.where(healthy > sick, 0, 1)
            actual.extend(labels[test_pos].tolist())
            predicts.extend(fold_predicts.tolist())
            # The metrics are cumulative: they cover every fold processed so
            # far, so the value stored after the last fold spans all rows.
            precision, recall, f1_score, support = precision_recall_fscore_support(actual, predicts, labels=[1])
            r[k] = (precision[0], recall[0], f1_score[0])
            print(f"Fold Val {fold}: Precision: {precision[0]}, ReCall: {recall[0]}, Mean F1 Val: {f1_score[0]}")
    return r

# Exhaustive search: score every non-empty subset of the candidate attributes
# with get_scores for k = 1..10. Results are stored in d, keyed by the
# comma-joined attribute names of the subset.
d = {}
attl = ['Pregnancies_s' ,'Glucose_s','BloodPressure_s' ,'SkinThickness_s' ,'Insulin_s' ,'BMI_s' ,'DiabetesPedigreeFunction_s','Age_s']
k_values = list(range(1, 11))
for i in range(1, len(attl)+1):
    for att in itertools.combinations(attl, i):
        att_list = list(att)
        att_key = ','.join(att_list)
        res = get_scores(att_list, k_values)
        d[att_key] = res
        print(f"Attribute: {att_key}")
        # One summary line per k for this attribute subset.
        for k, k_scores in res.items():
            precision, recall, f1_score = k_scores
            print(f"k={k}, Precision: {precision}, ReCall: {recall}, Mean F1: {f1_score}")
        print("\n")


In [None]:
#Best combination of attributes and K-Value which produced the highest F1-Score
#Report the scores returned by this call, one line per evaluated k-value.
best_res = get_scores(['Glucose_s','Insulin_s','BMI_s','DiabetesPedigreeFunction_s','Age_s'], [7])
for k, (precision, recall, f1_score) in best_res.items():
    print(f"k={k}, Precision: {precision}, ReCall: {recall}, Mean F1: {f1_score}")