## Imports

In [2]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from imblearn.under_sampling import TomekLinks
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, auc
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier


#### Import the CSV Data as Pandas DataFrame

In [20]:
# Read the training and test CSVs from the local Data/ folder into DataFrames.
data, test_data = pd.read_csv("Data/train.csv"), pd.read_csv("Data/test.csv")

In [21]:
# Preview the first rows of the training frame to eyeball the columns that were loaded.
data.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1


In [22]:
def reduce_features(df):
    """Collapse the paired left/right eyesight and hearing columns into differences.

    Two columns are appended, ``eyesight_diff`` and ``hearing_diff``, each
    computed as the left-side value minus the right-side value. The four
    source columns are then removed.

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``eyesight(left)``, ``eyesight(right)``,
        ``hearing(left)`` and ``hearing(right)``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the two difference columns appended and the four
        left/right columns dropped.

    Raises
    ------
    KeyError
        If any of the four left/right columns is missing.
    """
    # `assign` works on a copy, so the caller's frame does not silently gain
    # the derived columns as a side effect.
    with_diffs = df.assign(
        eyesight_diff=df['eyesight(left)'] - df['eyesight(right)'],
        hearing_diff=df['hearing(left)'] - df['hearing(right)'],
    )

    # Drop the original left/right columns now that the differences exist.
    return with_diffs.drop(
        columns=['eyesight(left)', 'eyesight(right)', 'hearing(left)', 'hearing(right)']
    )

# Run the same feature reduction over the training and test frames.
# NOTE: this cell is not idempotent — the left/right columns are gone after the
# first run, so running it a second time without reloading the CSVs raises KeyError.
data, test_data = (reduce_features(frame) for frame in (data, test_data))

In [23]:
# Preview again: eyesight_diff / hearing_diff should now appear in place of the four left/right columns.
data.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,...,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking,eyesight_diff,hearing_diff
0,0,55,165,60,81.0,135,87,94,172,300,...,16.5,1,1.0,22,25,27,0,1,-0.1,0
1,1,70,165,65,89.0,146,83,147,194,55,...,16.2,1,1.1,27,23,37,1,0,-0.1,0
2,2,20,170,75,81.0,118,75,79,178,197,...,17.4,1,0.8,27,31,53,0,1,-0.1,0
3,3,35,180,95,105.0,131,88,91,180,203,...,15.9,1,1.0,20,27,30,1,0,0.3,0
4,4,30,165,60,80.5,121,76,91,155,87,...,15.4,1,0.8,19,13,17,0,1,0.5,0


#### Preparing X and Y variables


In [24]:
# Feature matrix: everything except the target ('smoking') and the row identifier ('id').
X = data.drop(columns=['smoking', 'id'])

In [35]:
# Test-set features: drop only the row identifier.
X_testdata = test_data.drop(columns=['id'])

In [28]:
# Preview the feature matrix; 'smoking' and 'id' should no longer be present.
X.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),systolic,relaxation,fasting blood sugar,Cholesterol,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,eyesight_diff,hearing_diff
0,55,165,60,81.0,135,87,94,172,300,40,75,16.5,1,1.0,22,25,27,0,-0.1,0
1,70,165,65,89.0,146,83,147,194,55,57,126,16.2,1,1.1,27,23,37,1,-0.1,0
2,20,170,75,81.0,118,75,79,178,197,45,93,17.4,1,0.8,27,31,53,0,-0.1,0
3,35,180,95,105.0,131,88,91,180,203,38,102,15.9,1,1.0,20,27,30,1,0.3,0
4,30,165,60,80.5,121,76,91,155,87,44,93,15.4,1,0.8,19,13,17,0,0.5,0


In [29]:
# Target vector: the 'smoking' column of the training frame.
y = data['smoking']

In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Split the feature columns by dtype: object columns are treated as categorical,
# everything else as numeric.
# NOTE(review): in the preview of X every column looks numeric, so the one-hot
# branch presumably receives no columns here — confirm against X.dtypes.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# Column transformer with two branches: one-hot encoding for the categorical
# columns and standard scaling for the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [32]:
# Fit the preprocessor on the full feature frame and replace X with the transformed result.
# NOTE(review): this runs before the train/test split, so the fitted scaler statistics
# also cover rows that later land in X_test — consider fitting on the training split only.
# NOTE(review): X is overwritten, so re-running this cell on the already-transformed X
# will presumably fail (the column selections were taken from the original DataFrame).
X = preprocessor.fit_transform(X)

In [34]:
# Sanity-check the dimensions of the transformed feature matrix.
X.shape

(159256, 20)

In [36]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the resulting shapes of both partitions.
X_train.shape, X_test.shape

((127404, 20), (31852, 20))

#### Create an evaluation function that returns all metrics after model training

In [37]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(true, predicted):
    """Score a set of class predictions against the ground truth.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Labels predicted by a model, aligned with ``true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, in that order.
    """
    # Every scorer shares the (y_true, y_pred) call signature, so apply them in
    # the order the result tuple is documented to have.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(true, predicted) for scorer in scorers)


In [54]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Seed for the scikit-learn estimators that draw random numbers while fitting
# (tree construction, bootstrapping). Same value as the train/test split's
# random_state, so a Restart & Run All reproduces the reported scores.
MODEL_RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in the results table.
# The remaining estimators are left at their library defaults.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=MODEL_RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=MODEL_RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=MODEL_RANDOM_STATE),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(),
    "CatBoost": CatBoostClassifier(verbose=False)
}

# Parallel lists consumed by the results table: model name and test accuracy.
# They are reset here so that re-running the cell does not append duplicates.
model_list = []
accuracy_list = []

for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Predict on both partitions so train and test accuracy can be compared
    # (a large gap points to overfitting).
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Only accuracy is reported for the training set; the held-out set gets the
    # full metric set from the shared evaluate_model helper.
    accuracy_train = accuracy_score(y_train, y_train_pred)
    accuracy_test, precision, recall, f1 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)
    accuracy_list.append(accuracy_test)

    print('Model performance for Training set')
    print("- Accuracy: {:.4f}".format(accuracy_train))
    print('----------------------------------')

    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(accuracy_test))
    print("- Precision: {:.4f}".format(precision))
    print("- Recall: {:.4f}".format(recall))
    print("- F1 Score: {:.4f}".format(f1))
    print('='*35)
    print('\n')


Logistic Regression
Model performance for Training set
- Accuracy: 0.7495
----------------------------------
Model performance for Test set
- Accuracy: 0.7447
- Precision: 0.7030
- Recall: 0.7308
- F1 Score: 0.7166


Decision Tree
Model performance for Training set
- Accuracy: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.6962
- Precision: 0.6589
- Recall: 0.6472
- F1 Score: 0.6530


Random Forest
Model performance for Training set
- Accuracy: 1.0000
----------------------------------
Model performance for Test set
- Accuracy: 0.7717
- Precision: 0.7170
- Recall: 0.7981
- F1 Score: 0.7554


AdaBoost
Model performance for Training set
- Accuracy: 0.7685
----------------------------------
Model performance for Test set
- Accuracy: 0.7650
- Precision: 0.7138
- Recall: 0.7814
- F1 Score: 0.7461


K-Nearest Neighbors
Model performance for Training set
- Accuracy: 0.8192
----------------------------------
Model performance for Test set
- Accuracy: 0.7

0.7814579932186362

In [55]:
# Rank the models by held-out accuracy, best first.
pd.DataFrame(
    {'Model Name': model_list, 'Accuracy': accuracy_list}
).sort_values(by='Accuracy', ascending=False)

Unnamed: 0,Model Name,Accuracy
6,CatBoost,0.781458
5,XGBoost,0.777063
2,Random Forest,0.771694
3,AdaBoost,0.765038
0,Logistic Regression,0.744694
4,K-Nearest Neighbors,0.733172
1,Decision Tree,0.69622


In [56]:
from sklearn.ensemble import VotingClassifier

# Fresh, unfitted instances of the two top-scoring models from the comparison table.
catboost_model = CatBoostClassifier(verbose=False)
xgboost_model = XGBClassifier()

# Hard (majority-vote) ensemble over the two classifiers.
# NOTE(review): with only two voters a 1-1 split is a tie; scikit-learn documents
# that ties are broken by ascending class label, which would favour class 0 here —
# confirm this is intended, or consider voting='soft'.
voting_ensemble = VotingClassifier(
    estimators=[
        ('CatBoost', catboost_model),
        ('XGBoost', xgboost_model),
    ],
    voting='hard',
)

# Fit both base models on the training partition.
voting_ensemble.fit(X_train, y_train)

# Accuracy of the ensemble on the held-out partition.
accuracy = voting_ensemble.score(X_test, y_test)
print("Ensemble Accuracy:", accuracy)


Ensemble Accuracy: 0.7792917242245385


CatBoostClassifier

In [58]:
# from sklearn.ensemble import StackingClassifier
# # Define base models
# base_models = [
#     ('CatBoost', CatBoostClassifier(verbose=False)),
#     ('XGBoost', XGBClassifier())
# ]

# # Define meta-model
# meta_model = LogisticRegression()

# # Create stacking classifier
# stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model)

# # Train stacking classifier
# stacking_clf.fit(X_train, y_train)

# # Make predictions
# y_pred = stacking_clf.predict(X_test)

# # Evaluate stacking classifier
# accuracy = accuracy_score(y_test, y_pred)
# print("Stacking Accuracy:", accuracy)

Stacking Accuracy: 0.7801707899033028


In [60]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

# Build a standalone CatBoost classifier with training output silenced.
# This rebinds `catboost_model`, replacing the instance handed to the voting ensemble.
catboost_model = CatBoostClassifier(verbose=False)

# Fit on the training partition only.
catboost_model.fit(X_train, y_train)

# Predict labels for the held-out partition; `y_pred` is reused by the
# actual-vs-predicted comparison table further down.
y_pred = catboost_model.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)

# Report as a percentage with two decimals.
print("Accuracy of the CatBoost model is {:.2f}%".format(accuracy * 100))


Accuracy of the CatBoost model is 78.15%


In [59]:
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from xgboost import XGBClassifier
# from catboost import CatBoostClassifier

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define base models
# catboost_model = CatBoostClassifier(verbose=False)
# xgboost_model = XGBClassifier()

# # Train base models
# catboost_model.fit(X_train, y_train)
# xgboost_model.fit(X_train, y_train)

# # Generate predictions
# catboost_pred = catboost_model.predict(X_test)
# xgboost_pred = xgboost_model.predict(X_test)

# # Blend predictions (simple averaging)
# blended_pred = (catboost_pred + xgboost_pred) / 2

# # Round to the nearest integer for classification
# blended_pred_rounded = [round(pred) for pred in blended_pred]

# # Evaluate blended predictions
# accuracy = accuracy_score(y_test, blended_pred_rounded)
# print("Blending Accuracy:", accuracy)


Blending Accuracy: 0.7792917242245385


In [62]:
# Line up the true labels and the CatBoost predictions for the held-out rows.
# Difference is 0 for a correct prediction, 1 for a missed positive
# (actual 1, predicted 0) and -1 for a false positive (actual 0, predicted 1).
pred_df = pd.DataFrame({'Actual Value': y_test, 'Predicted Value': y_pred})
pred_df['Difference'] = pred_df['Actual Value'] - pred_df['Predicted Value']
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
145654,1,1,0
49118,1,1,0
21769,1,1,0
108299,0,0,0
117130,0,0,0
...,...,...,...
11934,1,0,1
115975,0,0,0
158289,1,1,0
142308,0,1,-1


In [63]:
# Count each distinct (actual, predicted, difference) row — in effect a confusion-matrix tally.
pred_df.value_counts()

Actual Value  Predicted Value  Difference
0             0                 0            13519
1             1                 0            11372
0             1                -1             4264
1             0                 1             2697
Name: count, dtype: int64