In [217]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
import joblib

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

X = train_data.drop(columns=['Target'])  # Drop target column
y = train_data['Target']  # Target column

# X = X.head(10000)
# y = y.head(10000)

X_apply = test_data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

In [258]:
categorical_cols = ['Age','Sex','Education','Income'] # here we will use OneHot (convert to numeric data)
numerical_cols = ['BMI','MentHlth','PhysHlth','GenHlth'] # here we will use scaling to give a better representation
# the rest of the columns are binary

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

model = ImbPipeline([
    ('preprocessor', preprocessor),
    # ('classifier', RandomForestClassifier(
    #     random_state=27, 
    #     class_weight='balanced', 
    #     criterion='entropy', 
    #     n_estimators=100,
    #     max_depth=14,
    #     min_samples_split=4,
    #     min_samples_leaf=6,
    # )),
    ('classifier', XGBClassifier(
        scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]), 
        random_state=27,
        max_depth=10,
        colsample_bytree=0.8,
        gamma=0.1,
        learning_rate=0.1,
        min_child_weight=3,
        n_estimators=100,
        reg_alpha=0,
        reg_lambda=100,
        subsample=0.6,
    ))
])

# param_grid = {
#     'classifier__n_estimators': [50, 100, 200],
#     'classifier__max_depth': [None, 10, 20, 30],
#     'classifier__min_samples_split': [2, 5, 10],
#     'classifier__min_samples_leaf': [1, 2, 4],
# }

# param_grid = {
#     'classifier__n_estimators': [100, 200, 500],
#     'classifier__max_depth': [3, 6, 10],
#     'classifier__learning_rate': [0.01, 0.05, 0.1],
#     'classifier__subsample': [0.6, 0.8, 1.0],
#     'classifier__colsample_bytree': [0.6, 0.8, 1.0],
#     'classifier__min_child_weight': [1, 3, 5],
#     'classifier__gamma': [0, 0.1, 0.3],
#     'classifier__reg_alpha': [0, 0.01, 0.1],
#     'classifier__reg_lambda': [1, 10, 100]
# }

# grid_search = GridSearchCV(model, param_grid, cv=3, scoring='balanced_accuracy', n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# 
# print("Best parameters found: ", grid_search.best_params_)
# print("Best balanced accuracy score: ", grid_search.best_score_)

In [269]:
# model.fit(X_train,y_train)
# 
# joblib.dump(model, 'diabeticPredictor-XGB.joblib')
model = joblib.load('diabeticPredictor-XGB.joblib')

predictions = model.predict(X_test)
score = balanced_accuracy_score(y_test, predictions)

print(score)
# print(classification_report(y_test, predictions))

threshold = 0.48

y_pred_proba = model.predict_proba(X_apply)[:, 1]
predictions = (y_pred_proba > threshold).astype(int)

# # Print classification report (for your validation data)
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Evaluate the model with the validation set
y_pred_val_proba = model.predict_proba(X_val)[:, 1]
val_predictions = (y_pred_val_proba > threshold).astype(int)
val_score = balanced_accuracy_score(y_val, val_predictions)

# submission_df = pd.DataFrame({
#     'Id': X_apply['Id'],  # Assuming 'Id' is the name of the ID column in X_apply
#     'Target': predictions
# })
# 
# # Save the predictions to a CSV file
# submission_df.to_csv('submission-xgb_17-09-24.csv', index=False)

# Output evaluation metrics for validation data
print(f"Balanced Accuracy Score on Validation: {val_score}")
print(classification_report(y_val, val_predictions))

0.730360500881937
Balanced Accuracy Score on Validation: 0.7495970049598162
              precision    recall  f1-score   support

           0       0.96      0.69      0.80     24435
           1       0.30      0.81      0.43      3977

    accuracy                           0.70     28412
   macro avg       0.63      0.75      0.62     28412
weighted avg       0.87      0.70      0.75     28412

