In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import pickle

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
import xgboost as xgb

In [None]:
# Load the dataset from heart.csv in the working directory.
raw_df = pd.read_csv('heart.csv')

In [None]:
# (rows, columns) of the raw table.
raw_df.shape

In [None]:
# Lower-case the column names; later cells access them as attributes (e.g. raw_df.heartdisease).
raw_df.columns = raw_df.columns.str.lower()

In [None]:
raw_df.columns 

In [None]:
# Number of missing values per column.
raw_df.isnull().sum()

In [None]:
# Relative frequency of each value of the target column.
raw_df.heartdisease.value_counts(normalize=True)

In [None]:
# Mean of the target column (presumably 0/1 coded, making this the positive rate — confirm).
raw_df.heartdisease.mean()

In [None]:
# Number of distinct values per column.
raw_df.nunique()

In [None]:
# Frequency of each distinct age value.
raw_df.age.value_counts()

# Data Scaling 

In [None]:

# Scale the continuous features. Each scaler is fitted on the training rows only
# and then applied to the whole frame, so statistics of the validation and test
# rows do not leak into the features the models are trained on.
mms = MinMaxScaler() # Normalization
ss = StandardScaler() # Standardization

# train_test_split is deterministic for a fixed row count and random_state, so
# repeating the two splits used later in the notebook selects the same training rows.
_fit_rows = train_test_split(raw_df, test_size=0.2, random_state=1)[0]
_fit_rows = train_test_split(_fit_rows, test_size=0.25, random_state=1)[0]

# One scaler instance per column, so every fitted transform stays available
# afterwards (a single shared instance only remembers the last column it saw).
scalers = {
    'oldpeak': mms,
    'age': StandardScaler(),
    'restingbp': StandardScaler(),
    'cholesterol': StandardScaler(),
    'maxhr': ss,
}
for column, scaler in scalers.items():
    scaler.fit(_fit_rows[[column]])
    raw_df[column] = scaler.transform(raw_df[[column]])
raw_df.head()

# Feature Analysis 

In [None]:
# Hold out 20% of the rows as the test set, then 25% of the remainder as the
# validation set (60/20/20 overall); random_state pins both shuffles.
df_full_train, df_test = train_test_split(raw_df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [None]:
# Row counts of the three partitions.
len(df_train), len(df_val), len(df_test)

In [None]:
# Replace the row labels inherited from raw_df with a fresh 0..n-1 index.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [None]:
# Pull the target out of each partition: pop() returns the column and removes
# it from the frame, leaving only the feature columns behind.
y_train = df_train.pop('heartdisease').values
y_val = df_val.pop('heartdisease').values
y_test = df_test.pop('heartdisease').values

In [None]:
# df_full_train still contains the heartdisease column; only df_train, df_val
# and df_test had it removed.
df_full_train.head(5)

In [None]:
# Count of rows per target value in train+validation.
df_full_train.heartdisease.value_counts()

In [None]:
# Feature names to score with mutual information (df_train no longer holds the target).
mutual_col = list(df_train.columns)
mutual_col

# Convert qualitative data to quantitative

In [None]:
# Columns stored with dtype 'object' are treated as categorical.
categorical_columns = list(df_full_train.dtypes[df_full_train.dtypes == 'object'].index)
categorical_columns

In [None]:
# Every other column is treated as numerical.
# NOTE(review): df_full_train still holds heartdisease, so presumably it is listed here too — confirm.
numerical_columns = list(df_full_train.dtypes[df_full_train.dtypes != 'object'].index)
numerical_columns

In [None]:
# Replace each categorical column of df_full_train with integer codes. The same
# encoder object is refitted per column, so afterwards it only holds the classes
# of the last column processed.
le = LabelEncoder()
for col in categorical_columns:
    df_full_train[col] = le.fit_transform(df_full_train[col])
df_full_train.head(5)

In [None]:
# Mutual information between the target and every feature, rounded to 3 decimals.
# NOTE(review): mutual_info_score compares label assignments, so the continuous
# (scaled) columns are presumably treated as one class per distinct value — confirm
# that this is the intended measure for them.
mutual_scores = []
for c in df_full_train[mutual_col].columns:
    feature_values = df_full_train[c]
    score = round(mutual_info_score(df_full_train.heartdisease, feature_values), 3)
    print(f"mutual score for {c} is {score}")
    mutual_scores.append(score)

In [None]:
# Bar chart of the mutual-information scores, features ordered from lowest to
# highest score. matplotlib is already imported as plt in the first cell, so it
# is not re-imported here.
sorted_mutual_scores, sorted_mutual_col_names = zip(*sorted(zip(mutual_scores, mutual_col)))
plt.bar(sorted_mutual_col_names, sorted_mutual_scores)
plt.xlabel("Features")
plt.ylabel("Mutual Information Scores")
plt.title("Mutual Information Scores for Features")
plt.xticks(rotation='vertical')
plt.show()

# Feature importance: Heart disease rate and risk ratio

In [None]:
# For every categorical column, print the mean of the target within each of its
# values. The column values are taken from the data itself via unique().
for column in categorical_columns:
    print(f"Heart disease rate for {column}:")
    unique_categories = df_full_train[column].unique()
    for category in unique_categories:
        in_category = df_full_train[column] == category
        mean = df_full_train[in_category].heartdisease.mean()
        print(f"{category}: {mean}")
    print("\n")

# Risk Ratio

In [None]:
# Mean of the target over df_full_train; the baseline the per-group rates below are compared with.
global_heartdisease = df_full_train.heartdisease.mean()

In [None]:
# For each categorical column, tabulate per-value target mean and row count,
# then the difference from and ratio to the global rate.
for c in categorical_columns:
    print(c)
    df_group = df_full_train.groupby(c).heartdisease.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_heartdisease
    df_group['risk'] = df_group['mean'] / global_heartdisease
    # A bare `df_group` expression inside a loop displays nothing, so only the print is kept.
    print(df_group)
    print()

# Feature importance: Correlation

In [None]:
# Reuse the numeric column list captured before label encoding. Re-deriving it
# from the dtypes at this point would also select the label-encoded categorical
# columns, whose integer codes are arbitrary and not meaningful for correlation.
numeric_columns = list(numerical_columns)
numeric_columns

In [None]:
# Restrict df_full_train to the columns listed in numeric_columns and summarise them.
data_numeric = df_full_train[numeric_columns]
data_numeric.describe()

In [None]:
# Pairwise correlation matrix of those columns.
data_numeric.corr()

In [None]:
# Heatmap of the pairwise correlations of data_numeric.
plt.figure(figsize=(9, 6))
correlations = data_numeric.corr()
sns.heatmap(correlations)
plt.title('Heatmap showing correlations between numerical data')
plt.show()

In [None]:
# Drop features whose mutual information with the target is below the threshold.
# NOTE(review): the next cell rebuilds df_full_train from raw_df, so this
# selection does not reach the models trained further down.
threshold = 0.1

# The target is excluded from the candidates: scoring it against itself says
# nothing about feature relevance, and it must never be dropped by this filter.
columns_to_drop = [
    col for col in df_full_train.columns
    if col != 'heartdisease'
    and mutual_info_score(df_full_train.heartdisease, df_full_train[col]) < threshold
]

df_full_train = df_full_train.drop(columns=columns_to_drop)
df_full_train

In [None]:
# Rebuild the partitions from raw_df with the same seeds as before. This also
# resets df_full_train, which the feature-analysis cells label-encoded and
# filtered.
df_full_train, df_test = train_test_split(raw_df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# pop() returns the target column and removes it from the frame.
y_train = df_train.pop('heartdisease').values
y_val = df_val.pop('heartdisease').values
y_test = df_test.pop('heartdisease').values

# One Hot Encoding

In [None]:
# Turn each row into a dict and let DictVectorizer build dense feature matrices.
# The vectorizer is fitted on the training records only and reused for validation.
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

# Logistic Regression

In [None]:
# Fit a logistic regression (lbfgs solver, all other settings at their defaults)
# on the training matrix.
model = LogisticRegression(solver='lbfgs')
model.fit(X_train,y_train)

In [None]:
# Second column of predict_proba (presumably the heartdisease == 1 class — confirm via model.classes_).
y_pred = model.predict_proba(X_val)[:, 1]

In [None]:
# Hard decision at a 0.5 probability cut-off.
heartdisease_decision = (y_pred >= 0.5)

In [None]:
# Fraction of validation rows whose decision equals the label.
(y_val == heartdisease_decision).mean()

In [None]:
# Side-by-side view of predicted probability, thresholded prediction and true label.
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': heartdisease_decision.astype(int),
    'actual': y_val,
})
df_pred

# Logistic Regression Score

In [None]:
# ROC AUC of the logistic-regression probabilities on the validation set.
roc_auc_score(y_val, y_pred)

# Decision Tree 

In [None]:
# Rebuild the train/validation/test partitions from raw_df with the same seeds
# as the earlier split cells.
df_full_train, df_test = train_test_split(raw_df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# pop() returns the target column and removes it from the frame.
y_train = df_train.pop('heartdisease').values
y_val = df_val.pop('heartdisease').values
y_test = df_test.pop('heartdisease').values

In [None]:
# Grid-search max_depth x min_samples_leaf for a decision tree, scoring every
# combination by its validation AUC.
scores = []
depths = [4,5,6]

# Vectorising does not depend on the hyper-parameters, so it is done once up
# front instead of refitting the DictVectorizer on every iteration.
x_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
x_train = dv.fit_transform(x_dict)
val_dicts = df_val.to_dict(orient='records')
x_val = dv.transform(val_dicts)

for depth in depths:
    for s in [1, 5, 10, 15, 20, 500, 100, 200]:
        dt = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=s)
        dt.fit(x_train, y_train)
        y_pred = dt.predict_proba(x_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)

        scores.append((depth, s, auc))

In [None]:
# Column names follow the (depth, s, auc) tuples appended by the grid-search loop.
columns = ['max_depth', 'min_samples_leaf', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)

In [None]:
# Reshape to one row per min_samples_leaf and one column per max_depth.
df_scores_pivot = df_scores.pivot(index='min_samples_leaf', columns=['max_depth'], values=['auc'])
df_scores_pivot.round(3)

In [None]:
# annot=True writes the AUC into each cell; fmt only takes effect when annotations are on.
sns.heatmap(df_scores_pivot, annot=True, fmt=".3f");

In [None]:
# Refit with the chosen settings on x_train, the matrix produced by the `dv`
# fitted in the grid-search cell. The scoring cell below transforms the
# validation rows with that same `dv`, so training and scoring share one
# vectorizer instead of relying on the X_train left over from the
# logistic-regression section.
dt = DecisionTreeClassifier(max_depth=5, min_samples_leaf=5)
dt.fit(x_train, y_train)

# Decision Tree Score

In [None]:
# Validation AUC of the refitted tree. `dv` is the vectorizer left behind by the
# grid-search cell.
val_dicts = df_val.to_dict(orient='records')
x_val = dv.transform(val_dicts)
y_pred = dt.predict_proba(x_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)
auc

# Random Forest Classifier

In [None]:
# Rebuild the train/validation/test partitions from raw_df with the same seeds
# as the earlier split cells.
df_full_train, df_test = train_test_split(raw_df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# pop() returns the target column and removes it from the frame.
y_train = df_train.pop('heartdisease').values
y_val = df_val.pop('heartdisease').values
y_test = df_test.pop('heartdisease').values

In [None]:
# Validation AUC for every (max_depth, n_estimators) pair of the grid.
scores = []

for d in [5, 10, 15]:
    for n in range(10, 201, 10):
        # fit() returns the estimator itself, so construction and training chain.
        rf = RandomForestClassifier(
            max_depth=d, n_estimators=n, random_state=1
        ).fit(X_train, y_train)

        y_pred = rf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        scores.append((d, n, auc))

In [None]:
# Column names follow the (d, n, auc) tuples appended by the grid loop.
columns = ['max_depth', 'n_estimators', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)

In [None]:
# Reshape to one row per n_estimators and one column per max_depth.
df_scores_pivot = df_scores.pivot(index='n_estimators', columns=['max_depth'], values=['auc'])
df_scores_pivot.round(3)

In [None]:
# annot=True writes the AUC into each cell; fmt only takes effect when annotations are on.
sns.heatmap(df_scores_pivot, annot=True, fmt=".3f");

In [None]:
# Validation AUC as a function of forest size, one line per max_depth.
for d in [5, 10, 15]:
    df_subset = df_scores[df_scores.max_depth == d]

    plt.plot(df_subset.n_estimators, df_subset.auc,
             label='max_depth=%d' % d)

# Label the axes so the figure can be read on its own.
plt.xlabel('n_estimators')
plt.ylabel('validation AUC')
plt.legend();

In [None]:
# Tree depth used by the random forests fitted in the following cells.
max_depth = 15

In [None]:
# Validation AUC for every (min_samples_leaf, n_estimators) pair at the chosen max_depth.
scores = []

for s in [1, 3, 5, 10, 50]:
    for n in range(10, 201, 10):
        # fit() returns the estimator itself, so construction and training chain.
        rf = RandomForestClassifier(
            max_depth=max_depth,
            min_samples_leaf=s,
            n_estimators=n,
            random_state=1,
        ).fit(X_train, y_train)

        y_pred = rf.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        scores.append((s, n, auc))

In [None]:
# Column names follow the (s, n, auc) tuples appended by the grid loop.
columns = ['min_samples_leaf', 'n_estimators', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)

In [None]:
# Rebuild the pivot from the min_samples_leaf grid; without this the heatmap
# would redraw the stale max_depth pivot computed earlier. annot=True is needed
# for fmt to have any effect.
df_scores_pivot = df_scores.pivot(index='n_estimators', columns=['min_samples_leaf'], values=['auc'])
sns.heatmap(df_scores_pivot, annot=True, fmt=".3f");

In [None]:
# Validation AUC as a function of forest size, one coloured line per min_samples_leaf.
colors = ['black', 'blue', 'orange', 'red', 'grey']
values = [1, 3, 5, 10, 50]

for s, col in zip(values, colors):
    df_subset = df_scores[df_scores.min_samples_leaf == s]

    plt.plot(df_subset.n_estimators, df_subset.auc,
             color=col,
             label='min_samples_leaf=%d' % s)

# Label the axes so the figure can be read on its own.
plt.xlabel('n_estimators')
plt.ylabel('validation AUC')
plt.legend();

In [None]:
# Leaf size used by the random forests fitted in the following cells.
min_samples_leaf = 3

In [None]:
# Sweep n_estimators with max_depth and min_samples_leaf fixed at the chosen values.
scores = []
for n in range(10, 201, 10):
    rf = RandomForestClassifier(n_estimators=n,
                                max_depth=max_depth,
                                min_samples_leaf=min_samples_leaf,
                                random_state=1)
    rf.fit(X_train, y_train)

    y_pred = rf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, y_pred)

    # Record the min_samples_leaf actually used by this forest rather than `s`,
    # a loop variable left over from an earlier cell.
    scores.append((min_samples_leaf, n, auc))

In [None]:
# The sweep appends (leaf size, n_estimators, auc) tuples, so the columns are
# named in that order.
columns = ['min_samples_leaf', 'n_estimators', 'auc']
df_scores = pd.DataFrame(scores, columns=columns)

In [None]:
# Final random forest with the selected hyper-parameters. It is fitted on
# X_train, the same matrix the grid searches used and the counterpart of the
# X_val it is scored on, rather than the decision-tree section's x_train.
rf = RandomForestClassifier(n_estimators=200,
                            max_depth=max_depth,
                            min_samples_leaf=min_samples_leaf,
                            random_state=1)

rf.fit(X_train, y_train)

# Random Forest Score

In [None]:
# Validation AUC of the refitted random forest.
y_pred = rf.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, y_pred)
auc

# Gradient boosting and XGBoost

In [None]:
# Rebuild the train/validation/test partitions from raw_df with the same seeds
# as the earlier split cells.
df_full_train, df_test = train_test_split(raw_df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# pop() returns the target column and removes it from the frame.
y_train = df_train.pop('heartdisease').values
y_val = df_val.pop('heartdisease').values
y_test = df_test.pop('heartdisease').values

In [None]:
# Fresh vectorizer for the boosting section: fitted on the training records,
# then applied unchanged to the validation records.
dv = DictVectorizer(sparse=False)

x_dict = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

x_train = dv.fit_transform(x_dict)
x_val = dv.transform(val_dicts)

In [None]:
# Names of the columns produced by the vectorizer.
features = dv.get_feature_names_out()
features

In [None]:
# Same names again, converted to a plain list and attached to both DMatrix
# objects together with the labels.
features = dv.get_feature_names_out()
features = features.tolist()
dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(x_val, label=y_val, feature_names=features)

In [None]:
# Booster settings for a first fit: binary logistic objective, fixed seed, 10 boosting rounds.
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'binary:logistic',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=10)

In [None]:
# Validation AUC of the first booster.
y_pred = model.predict(dval)
roc_auc_score(y_val, y_pred)

In [None]:
# Datasets passed as `evals` to the next training run, each with a display name.
watchlist = [(dtrain, 'train'), (dval, 'val')]

In [None]:
%%capture output
# The cell magic above stores this cell's printed output in `output`; later
# cells read output.stdout to recover the per-iteration scores.

# Same settings as the first fit plus eval_metric='auc'.
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'binary:logistic',
    'eval_metric': 'auc',

    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# evals=watchlist with verbose_eval=5: presumably logs the metric every 5 rounds — confirm.
model = xgb.train(xgb_params, dtrain, num_boost_round=10,
                  verbose_eval=5,
                  evals=watchlist)
y_pred = model.predict(dval)

# Gradient boosting and XGBoost Score

In [None]:
# Validation AUC of the monitored booster.
roc_auc_score(y_val, y_pred)

In [None]:
# Text printed by the captured training cell.
s = output.stdout

In [None]:
# Peek at the first 200 characters of the captured log.
print(s[:200])

In [None]:
def parse(output):
    """Turn captured XGBoost evaluation output into a DataFrame of scores.

    Parameters
    ----------
    output : object with a ``stdout`` string attribute (e.g. the result of
        ``%%capture``). Evaluation lines are expected to look like
        ``[5]<TAB>train-auc:0.95<TAB>val-auc:0.85``.

    Returns
    -------
    pandas.DataFrame with columns ``iteration``, ``train_auc`` and ``val_auc``,
    one row per evaluation line. Blank lines and lines that do not have the
    three tab-separated fields starting with ``[`` are skipped, so empty
    output yields an empty frame instead of raising.
    """
    columns = ['iteration', 'train_auc', 'val_auc']
    scores = []
    for line in output.stdout.strip().split('\n'):
        fields = line.strip().split('\t')
        # Ignore anything that is not an "[iteration]\ttrain\tval" record
        # (empty output, warnings or other messages mixed into stdout).
        if len(fields) != 3 or not fields[0].startswith('['):
            continue
        itr_field, train_field, val_field = fields
        itr = int(itr_field.strip('[]'))
        train = float(train_field.split(':')[1])
        val = float(val_field.split(':')[1])

        scores.append((itr, train, val))
    df_scores = pd.DataFrame(scores, columns=columns)
    return df_scores

In [None]:
# Per-iteration train/validation scores recovered from the captured training log.
df_score = parse(output)

In [None]:
# Training vs validation AUC over the logged boosting iterations.
plt.plot(df_score.iteration, df_score.train_auc, label='train')
plt.plot(df_score.iteration, df_score.val_auc, label='val')
# Label the axes so the figure can be read on its own.
plt.xlabel('iteration')
plt.ylabel('AUC')
plt.legend();

In [None]:
def final_train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the given rows.

    Parameters
    ----------
    df_train : DataFrame of feature columns; each row is converted to a dict
        and vectorized.
    y_train : target values aligned with the rows of ``df_train``.
    C : value passed to ``LogisticRegression`` as ``C``. The default of 1.0
        matches the value the function has always been called with.

    Returns
    -------
    (dv, model) : the fitted DictVectorizer and the fitted LogisticRegression.
    """
    dicts = df_train.to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    x_train = dv.fit_transform(dicts)

    # Forward C to the model; it used to be accepted but silently ignored.
    model = LogisticRegression(solver='lbfgs', C=C)
    model.fit(x_train, y_train)

    return dv, model


def final_predict(df_test, dv, model):
    """Score rows with an already-fitted vectorizer and model.

    The rows of ``df_test`` are converted to dicts, transformed with ``dv``
    and passed to ``model.predict_proba``; the second probability column is
    returned.
    """
    records = df_test.to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [None]:
# Train the final model on the train+validation rows. df_full_train still
# carries the target column, so it is split off first; otherwise the label
# would be fed to the model as a feature (and be missing from df_test at
# prediction time).
y_full_train = df_full_train.heartdisease.values
dv, model = final_train(df_full_train.drop(columns=['heartdisease']), y_full_train, C=1.0)
y_pred = final_predict(df_test, dv, model)

In [None]:
# Display the fitted vectorizer, the fitted model and the test-set probabilities.
dv, model,y_pred

In [None]:
# Path the fitted objects are pickled to (no file extension).
output_file =  'mid_term_model'

In [None]:
# Persist the vectorizer and the model as one pickled tuple.
# NOTE(review): only (dv, model) are saved. The scalers applied to raw_df in the
# Data Scaling cell are not part of the file, so anything loading it presumably
# has to scale inputs the same way first — confirm.
with open(output_file,'wb')as f_out:
    pickle.dump((dv,model),f_out)
print(f'the model is saved to {output_file}')