# Machine Learning Term Project
##### Name: Tanni Dev, id: gp2996

#### For this project we are using "Diabetes Health Indicators Dataset" from Kaggle. This is a classification problem where I am trying to identify if a person has diabetics or not based on several parameters

###### data link:https://www.kaggle.com/datasets/julnazz/diabetes-health-indicators-dataset/data

#### Importing necessary library and data

In [1]:
import pandas as pd

# datapath for the csv file
file_path1 = './diabetes_health indicators_dataset/diabetes_012_health_indicators.csv'
file_path2 = './diabetes_health indicators_dataset/diabetes_binary_5050split_health_indicators.csv'
file_path3 = './diabetes_health indicators_dataset/diabetes_binary_health_indicators.csv'


data1 = pd.read_csv(file_path1)
data1_copy = data1.copy()
data2 = pd.read_csv(file_path2)
data3 = pd.read_csv(file_path3)

data1_copy['Diabetes_012'] = data1_copy['Diabetes_012'].replace({1: 1, 2: 1, 0: 0})
data1_copy.rename(columns={'Diabetes_012': 'Diabetes_binary'}, inplace=True)

# combining all 3 datasets
data_merged = pd.concat([data1_copy, data2, data3], ignore_index=True)

print("Dataset 1 length", len(data1))
print("Dataset 2 length", len(data2)) 
print("Dataset 3 length", len(data3)) 
print("Data merged length", len(data_merged))

Dataset 1 length 236378
Dataset 2 length 67136
Dataset 3 length 236378
Data merged length 539892


In [3]:
data_merged.columns

Index(['Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

#### Checking if i need to pre process different types of data

In [8]:
data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236378 entries, 0 to 236377
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Diabetes_012          236378 non-null  float64
 1   HighBP                236378 non-null  int64  
 2   HighChol              236378 non-null  float64
 3   CholCheck             236378 non-null  int64  
 4   BMI                   236378 non-null  float64
 5   Smoker                236378 non-null  float64
 6   Stroke                236378 non-null  float64
 7   HeartDiseaseorAttack  236378 non-null  float64
 8   PhysActivity          236378 non-null  int64  
 9   Fruits                236378 non-null  int64  
 10  Veggies               236378 non-null  int64  
 11  HvyAlcoholConsump     236378 non-null  int64  
 12  AnyHealthcare         236378 non-null  int64  
 13  NoDocbcCost           236378 non-null  float64
 14  GenHlth               236378 non-null  float64
 15  

#### Checkin if data contains any missing value

In [9]:
# Checking for any missing values in the dataset
missing_values1 = data1.isnull().sum()
missing_values2 = data2.isnull().sum()
missing_values3 = data3.isnull().sum()
missing_values_merged = data_merged.isnull().sum()

missing_values_counts1 = missing_values1[missing_values1 > 0]
missing_values_counts2 = missing_values2[missing_values2 > 0]
missing_values_counts3 = missing_values3[missing_values3 > 0]
missing_values_counts_merged = missing_values_merged[missing_values_merged > 0]


print("Dataset 1: ", missing_values_counts1)
print("Dataset 2: ", missing_values_counts2)
print("Dataset 3: ", missing_values_counts3)
print("Dataset_merged: ", missing_values_counts_merged)


Dataset 1:  Series([], dtype: int64)
Dataset 2:  Series([], dtype: int64)
Dataset 3:  Series([], dtype: int64)
Dataset_merged:  Series([], dtype: int64)


#### Splitting the data in training, validation and test set

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features and target variable
# Dataset1 with 3 class 
# X = data1.drop('Diabetes_012', axis=1)
# y = data1['Diabetes_012']

# # Dataset2 with binary class
# X = data2.drop('Diabetes_binary', axis=1)
# y = data2['Diabetes_binary']

# # Dataset2 with binary class
# X = data3.drop('Diabetes_binary', axis=1)
# y = data3['Diabetes_binary']

# # merged dataset with binary class
X = data_merged.drop('Diabetes_binary', axis=1)
y = data_merged['Diabetes_binary']



# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training, validation, and test sets (70%, 10%, 20%)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=(2/3), random_state=42)

(X_train.shape, y_train.shape), (X_val.shape, y_val.shape), (X_test.shape, y_test.shape)


(((377924, 21), (377924,)), ((53989, 21), (53989,)), ((107979, 21), (107979,)))

#### Applying 5 different ML algorithms

In [11]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


def train_evaluate_model(model, model_name, X_train, y_train, X_val, y_val):

    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    f1 = f1_score(y_val, y_pred, average='weighted')
    
    model_performance[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }


model_performance = {}

lr_param = {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
knn_param = {'n_neighbors':10,'weights':'uniform', 'metric':'euclidean'}
rf_param= {'n_estimators': 100 , 'max_features': 'sqrt'}
XGB_hyper_param = {'booster': 'gbtree', 'lambda': 0.43403998704508867, 'alpha': 2.252282347254054e-06, 'max_depth': 12, 'eta': 0.03730224167231534, 'gamma': 1.6105377714235158e-07, 'subsample': 0.436607283293611, 'colsample_bytree': 0.5631258519693639}


# List of models to train
models = {
    'Logistic Regression': LogisticRegression(**lr_param, max_iter=1000),
    'KNN': KNeighborsClassifier(),
    'Bagged Decision Tree': BaggingClassifier(n_estimators=100),
    'Random Forest': RandomForestClassifier(**rf_param, random_state=42),
    'XGB classifier': XGBClassifier(**XGB_hyper_param, random_state=42)   
}



for name, model in models.items():
    train_evaluate_model(model, name, X_train, y_train, X_val, y_val)


print(model_performance)



{'Logistic Regression': {'Accuracy': 0.8160551223397359, 'Precision': 0.7866369887164912, 'Recall': 0.8160551223397359, 'F1 Score': 0.7871224386053229}, 'KNN': {'Accuracy': 0.815592065050288, 'Precision': 0.8076070885475376, 'Recall': 0.815592065050288, 'F1 Score': 0.8110594535018403}, 'Bagged Decision Tree': {'Accuracy': 0.949582322324918, 'Precision': 0.9494134411606437, 'Recall': 0.949582322324918, 'F1 Score': 0.9494931910510094}, 'Random Forest': {'Accuracy': 0.9521198762710923, 'Precision': 0.9516783958428123, 'Recall': 0.9521198762710923, 'F1 Score': 0.9518527846538939}, 'XGB classifier': {'Accuracy': 0.8360036303691493, 'Precision': 0.8176976633328947, 'Recall': 0.8360036303691493, 'F1 Score': 0.8094109344999457}}


#### Training, Validation and Testing accuracy

In [12]:
for name, model in models.items():
    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    y_pred_test = model.predict(X_test)

    # Evaluating the model on training, validation, and test sets
    train_accuracy = accuracy_score(y_train, y_pred_train)
    val_accuracy = accuracy_score(y_val, y_pred_val)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Output results
    results = {
        'Model name ': name, 
        'Train Accuracy': train_accuracy,
        'Validation Accuracy': val_accuracy,
        'Test Accuracy': test_accuracy,
    }

    print(results)



{'Model name ': 'Logistic Regression', 'Train Accuracy': 0.8176935045141351, 'Validation Accuracy': 0.8160551223397359, 'Test Accuracy': 0.8178071662082442}
{'Model name ': 'KNN', 'Train Accuracy': 0.8941295075200305, 'Validation Accuracy': 0.815592065050288, 'Test Accuracy': 0.8188073606905046}
{'Model name ': 'Bagged Decision Tree', 'Train Accuracy': 0.9872355288364857, 'Validation Accuracy': 0.949582322324918, 'Test Accuracy': 0.9512312579297827}
{'Model name ': 'Random Forest', 'Train Accuracy': 0.9872434669404431, 'Validation Accuracy': 0.9521198762710923, 'Test Accuracy': 0.9538058326156011}
{'Model name ': 'XGB classifier', 'Train Accuracy': 0.8524359395010637, 'Validation Accuracy': 0.8360036303691493, 'Test Accuracy': 0.8391631706165088}
