In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import matplotlib.pyplot as plt
import xgboost as xgb
import subprocess
import logging
logging.getLogger('lightgbm').setLevel(logging.ERROR)

In [None]:
def Impute(df):
  # Imputation of Missing Values
  df[['education','BPMeds','cigsPerDay']] = SimpleImputer(missing_values=np.nan,strategy='most_frequent').fit_transform(df[['education','BPMeds','cigsPerDay']])
  df[['totChol','BMI','glucose']] = SimpleImputer(missing_values=np.nan,strategy='mean').fit_transform(df[['totChol','BMI','glucose']])
  df[['totChol','glucose']] = df[['totChol','glucose']].astype(int)
  df.loc[df.query('heartRate.isna()').index[0],'heartRate'] = np.mean(df['heartRate']).astype(int)
  return df

In [None]:
def handle_outliers(df):
  continuous_cols = ['age', 'cigsPerDay', 'totChol', 'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose']
  for column in continuous_cols:
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[column] = df[column].clip(lower=lower_bound, upper=upper_bound)
  return df

In [None]:
def feature(df):
  # Feature Engineering
  df.insert(df.columns.get_loc('sysBP'),'BP',df['sysBP']+df['diaBP'])
  df.insert(df.columns.get_loc('currentSmoker'),'Cigarettes',df['currentSmoker']*df['cigsPerDay'])
  df.drop(columns=['sysBP','diaBP'],inplace = True)
  df.drop(columns=['currentSmoker','cigsPerDay'],inplace = True)
  return df

In [None]:
def cor(df):
  cor=df.iloc[:,:-1].corr()
  plt.figure(figsize=(12,12))
  sns.heatmap(cor,annot=True,cmap='coolwarm',center=0)
  plt.title("Correlation Heatmap")
  plt.show()

In [None]:
def Sample(data):
  from sklearn.utils import resample
  target_column = 'TenYearCHD'
  data_majority = data[data[target_column] == 0]
  data_minority = data[data[target_column] == 1]
  data_minority_upsampled = resample(data_minority, replace=True, n_samples=int(len(data_majority)*.5), random_state=123)
  data_upsampled = pd.concat([data_majority, data_minority_upsampled])
  return data_upsampled

In [None]:
def Scale(df):
  X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1], test_size=0.2)
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)
  return X_train, X_test, y_train, y_test

In [None]:
def NoScale(df):
  X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,-1], test_size=0.2)
  return X_train, X_test, y_train, y_test

In [None]:
from sklearn.metrics import confusion_matrix

def print_confusion_matrix(y_test, y_pred):

    cm = confusion_matrix(y_test, y_pred)

    num_classes = len(cm)

    # Print confusion matrix header
    print("\nConfusion Matrix:")
    print("True Labels ->")
    print("Predicted Labels v")
    print(f"{'':<10}", end="")
    for i in range(num_classes):
        print(f"{i:<10}", end="")
    print()

    # Print confusion matrix contents
    for i in range(num_classes):
        print(f"{i:<10}", end="")
        for j in range(num_classes):
            print(f"{cm[i][j]:<10}", end="")
        print()

In [None]:
def lazyclassify(X_train, X_test, y_train, y_test):

  subprocess.check_call(["pip", "install", "lazypredict"])
  from lazypredict.Supervised import LazyClassifier

  clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
  models,predictions = clf.fit(X_train, X_test, y_train, y_test)
  return models,predictions

# print(predictions.iloc[0,:].name)

  print(tabulate(models,headers=['Model Name','Accuracy','Balanced Accuracy','ROC AUC','F1 Score','Time Taken'],tablefmt='simple_grid'))

In [None]:
def LDA(X_train, X_test, y_train, y_test):
  from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

  lda_model = LinearDiscriminantAnalysis()
  lda_model.fit(X_train, y_train)

  y_pred = lda_model.predict(X_test)
  y_prob = lda_model.predict_proba(X_test)[:,1]

  print_confusion_matrix(y_test, y_pred)
  Metrics(y_test,y_pred,y_prob)

In [None]:
def XGB(X_train, X_test, y_train, y_test):
  clf = xgb.XGBClassifier(
      objective='binary:logistic',  # for binary classification
      n_estimators=100,  # number of trees (boosting rounds)
      learning_rate=0.2, # step size shrinkage to prevent overfitting
      probability=True
      # random_state=42
  )

  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  y_prob = clf.predict_proba(X_test)[:,1]

  print_confusion_matrix(y_test, y_pred)
  Metrics(y_test,y_pred ,y_prob)

In [None]:
def ExtraTree(X_train, X_test, y_train, y_test):
  from sklearn.ensemble import ExtraTreesClassifier
  clf = ExtraTreesClassifier(n_estimators=100, random_state=0)

  clf.fit(X_train, y_train)

  y_pred = clf.predict(X_test)
  y_prob = clf.predict_proba(X_test)[:,1]

  print_confusion_matrix(y_test, y_pred)
  Metrics(y_test,y_pred ,y_prob)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def Metrics(y_test, y_pred, y_pred_proba):

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    # Create dictionary to store metrics
    metrics_dict = {
        'Accuracy': round(accuracy*100, 4),
        'Precision': round(precision*100, 4),
        'Recall': round(recall*100, 4),
        'F1': round(f1*100, 4),
        'AUC': round(auc*100, 4)
    }

    # Print the metrics dictionary neatly
    print("\nClassification Metrics:")
    for metric, value in metrics_dict.items():
        print(f"{metric} Score: [{value}] %")

    return metrics_dict

In [None]:
def preProcc(df):
  df = Impute(df)
  df = handle_outliers(df)
  df = Sample(df)
  df = feature(df)
  # cor(df)
  return NoScale(df)

def preProccScale(df):
  df = Impute(df)
  df = handle_outliers(df)
  df = Sample(df)
  df = feature(df)
  # cor(df)
  return Scale(df)

In [None]:
def Predict(X_train, X_test, y_train, y_test):
  print("\nFor Lazy Predict Classifier: -")
  lazyclassify(X_train, X_test, y_train, y_test)
  print("\nFor XGB Classifier: -")
  XGB(X_train, X_test, y_train, y_test)
  print("\nFor LDA Classifier: -")
  LDA(X_train, X_test, y_train, y_test)
  # print("\nFor ANN Classification: -")
  # ANN(X_train, X_test, y_train, y_test)
  print("\nFor ExtraTrees Classification: -")
  ExtraTree(X_train, X_test, y_train, y_test)

In [None]:
df = pd.read_csv('framingham.csv')
print("No Scaling of Data: -\n")
X_train, X_test, y_train, y_test = preProcc(df.copy())
Predict(X_train, X_test, y_train, y_test)
print("\nWith Scaling of Data: -")
X_train, X_test, y_train, y_test = preProccScale(df)
Predict(X_train, X_test, y_train, y_test)

No Scaling of Data: -


For Lazy Predict Classifier: -


100%|██████████| 29/29 [00:09<00:00,  2.93it/s]


[LightGBM] [Info] Number of positive: 1416, number of negative: 2899
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000360 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 886
[LightGBM] [Info] Number of data points in the train set: 4315, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.328158 -> initscore=-0.716530
[LightGBM] [Info] Start training from score -0.716530

For XGB Classifier: -

Confusion Matrix:
True Labels ->
Predicted Labels v
          0         1         
0         634       63        
1         63        319       

Classification Metrics:
Accuracy Score: [88.3225] %
Precision Score: [83.5079] %
Recall Score: [83.5079] %
F1 Score: [83.5079] %
AUC Score: [92.3704] %

For LDA Classifier: -

Confusion Matrix:
True Labels ->
Predicted Labels v
          0         1         
0         587    

100%|██████████| 29/29 [00:09<00:00,  2.99it/s]


[LightGBM] [Info] Number of positive: 1433, number of negative: 2882
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 887
[LightGBM] [Info] Number of data points in the train set: 4315, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.332097 -> initscore=-0.698714
[LightGBM] [Info] Start training from score -0.698714

For XGB Classifier: -

Confusion Matrix:
True Labels ->
Predicted Labels v
          0         1         
0         662       52        
1         53        312       

Classification Metrics:
Accuracy Score: [90.2688] %
Precision Score: [85.7143] %
Recall Score: [85.4795] %
F1 Score: [85.5967] %
AUC Score: [95.0443] %

For LDA Classifier: -

Confusion Matrix:
True Labels ->
Predicted Labels v
          0         1         
0         624    