In [1]:
import os

import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset.
# The CSV location can be overridden with the DIABETES_DATASET_PATH environment
# variable so the notebook is not tied to one machine's directory layout; when
# the variable is unset, the original hard-coded path is used unchanged.
DATASET_PATH = os.environ.get(
    "DIABETES_DATASET_PATH",
    '/Users/astha/Desktop/Diabetes Prediction/Datasets/Dataset2/diabetes_prediction_dataset.csv',
)
print("Loading the dataset from the specified path.")
df = pd.read_csv(DATASET_PATH)

# Display the first few rows of the dataset
print("\nDisplaying the first 5 rows of the dataset to understand its structure.")
print(df.head())

# Replace every object-dtype column with the integer codes produced by a
# LabelEncoder fitted on that column. The frame is modified in place.
print("\nEncoding categorical columns into numeric values.")
categorical_columns = df.select_dtypes(include=['object']).columns
if categorical_columns.empty:
    print("No categorical columns found.")
else:
    print(f"Categorical columns found: {categorical_columns}")
    # A fresh encoder is fitted per column, so codes are independent across columns.
    for column in categorical_columns:
        le = LabelEncoder()
        encoded_values = le.fit_transform(df[column])
        df[column] = encoded_values

# Check dataset structure to decide feature and label columns.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
print("\nDataset Info:")
df.info()

# The target is assumed to be the final column; everything before it is a feature.
print("\nSeparating the dataset into features (all except the last column) and labels (last column).")
labels = df.iloc[:, -1]     # the last column only
features = df.iloc[:, :-1]  # every column up to, but not including, the last

# Report the dimensions of both pieces
print("\nDisplaying the shape of the features (rows, columns) and labels (rows).")
print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")

# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle.
print("\nSplitting the dataset into training and testing sets (80% training, 20% testing).")
split_parts = train_test_split(features, labels, test_size=0.2, random_state=0)
features_train, features_test, label_train, label_test = split_parts

# Report the dimensions of each resulting partition
print("\nDisplaying the shapes of training and testing sets for features and labels.")
for description, part in (
    ("Features training set", features_train),
    ("Labels training set", label_train),
    ("Features testing set", features_test),
    ("Labels testing set", label_test),
):
    print(f"{description} shape:", part.shape)

# Wrap both splits in DMatrix objects, the input format used with xgb.train
dtrain = xgb.DMatrix(data=features_train, label=label_train)
dtest = xgb.DMatrix(data=features_test, label=label_test)

# Model parameters, grouped by purpose and merged in the original key order
objective_settings = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
}
boosting_settings = {
    "max_depth": 5,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "gamma": 0.1,
    "min_child_weight": 3,
}
regularization_settings = {
    "alpha": 0.01,
    "lambda": 1,  # reg_lambda
}
params = {
    **objective_settings,
    **boosting_settings,
    **regularization_settings,
    "random_state": 0,
}

# Carve a validation set out of the training split for early stopping.
# xgb.train early-stops on the LAST entry of `evals`; using the test set there
# lets the test data choose the number of boosting rounds, which biases the
# reported test accuracy upward. The test set is therefore kept out of training.
features_fit, features_val, label_fit, label_val = train_test_split(
    features_train, label_train, test_size=0.2, random_state=0)
dfit = xgb.DMatrix(features_fit, label=label_fit)
dval = xgb.DMatrix(features_val, label=label_val)

# Set evaluation set: training loss for reference, validation loss (last entry)
# drives early stopping.
evals = [(dfit, "train"), (dval, "validation")]

# Train model with early stopping.
# NOTE: the model is fitted on `dfit` only; the `dtrain` matrix built earlier in
# this file still covers the full training split (fit + validation rows).
print("\nTraining the XGBoost model with early stopping.")
xgb_model = xgb.train(
    params,
    dfit,
    num_boost_round=100,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=True
)

def _predict_and_score(model, dmatrix, true_labels, dataset_name, accuracy_title):
    """Predict binary labels for one split, print them, and report the accuracy.

    Parameters
    ----------
    model : trained booster returned by ``xgb.train``.
    dmatrix : ``xgb.DMatrix`` holding the features to predict on.
    true_labels : ground-truth labels aligned with the rows of ``dmatrix``.
    dataset_name : lower-case name used in the progress messages
        (e.g. ``"test"`` or ``"training"``).
    accuracy_title : capitalised name used in the final accuracy line
        (e.g. ``"Test"`` or ``"Training"``).

    Returns
    -------
    tuple
        ``(predicted_labels, accuracy)`` where ``predicted_labels`` is the 0/1
        integer array and ``accuracy`` is the value from ``accuracy_score``.
    """
    print(f"\nPredicting the labels for the {dataset_name} dataset.")
    # With the native API, Booster.predict uses every boosted round even after
    # early stopping, so the rounds trained past the best one would otherwise
    # be included. Restrict prediction to the best iteration when it is known;
    # (0, 0) is XGBoost's "use all rounds" default.
    best_iteration = getattr(model, "best_iteration", None)
    iteration_range = (0, 0) if best_iteration is None else (0, best_iteration + 1)
    probabilities = model.predict(dmatrix, iteration_range=iteration_range)
    predicted = (probabilities > 0.5).astype(int)  # Convert probabilities to binary
    print(f"Predicted labels for the {dataset_name} dataset:")
    print(predicted)

    print(f"\nCalculating the accuracy of the model on the {dataset_name} dataset.")
    accuracy = accuracy_score(true_labels, predicted)
    print(f"{accuracy_title} Accuracy of the model: " + str(round(accuracy * 100, 2)) + "%")
    return predicted, accuracy


# Evaluate on the held-out test set
labels_test_pred, test_accuracy = _predict_and_score(
    xgb_model, dtest, label_test, "test", "Test")

# Evaluate on the training set
labels_train_pred, train_accuracy = _predict_and_score(
    xgb_model, dtrain, label_train, "training", "Training")

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <F2F42313-BF4F-3B95-A853-AE1DE94D4C87> /Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]
