In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from xgboost import XGBClassifier
import lightgbm as lgb
import joblib

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

categorical_cols = ['Age','Sex','Education','Income'] # here we will use OneHot (convert to numeric data)
numerical_cols = ['BMI','MentHlth','PhysHlth','GenHlth'] # here we will use scaling to give a better representation
# the rest of the columns are binary

##Feature Engineering
#BMI_PhysActivity
# train_data['BMI_PhysActivity'] = train_data['BMI'] * train_data['PhysActivity']
# test_data['BMI_PhysActivity'] = test_data['BMI'] * test_data['PhysActivity']


# Binning BMI into categories
# bins = [0, 18.5, 25, 30, 35, 100]
# labels = ['Underweight', 'Normal', 'Overweight', 'Obese', 'Severely Obese']
# train_data['BMI_binned'] = pd.cut(train_data['BMI'], bins=bins, labels=labels)
# test_data['BMI_binned'] = pd.cut(test_data['BMI'], bins=bins, labels=labels)
# 
# # You'll need to add 'BMI_binned' to your categorical columns list for OneHotEncoding
# categorical_cols.append('BMI_binned')


print(train_data.head())
 
X = train_data.drop(columns=['Target'])  # Drop target column
y = train_data['Target']  # Target column

# X = X.head(10000)
# y = y.head(10000)

X_apply = test_data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

   Id  HighBP  HighChol  CholCheck  BMI  Smoker  Stroke  HeartDiseaseorAttack  \
0   1       1         1          1   21       0       0                     0   
1   2       1         0          1   26       1       0                     0   
2   3       1         1          1   29       0       0                     1   
3   4       1         1          1   27       0       0                     0   
4   5       1         1          1   26       1       0                     0   

   PhysActivity  Fruits  ...  NoDocbcCost  GenHlth  MentHlth  PhysHlth  \
0             1       1  ...            0        4         0         0   
1             1       1  ...            0        3         0         0   
2             0       0  ...            0        3        15         5   
3             1       1  ...            0        2         0         0   
4             0       0  ...            0        2         0         0   

   DiffWalk  Sex  Age  Education  Income  Target  
0         0    0 

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(), categorical_cols)
    ])

model = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=27)),
    # ('classifier', RandomForestClassifier(
    #     random_state=27, 
    #     class_weight='balanced', 
    #     criterion='entropy', 
    #     n_estimators=100,
    #     max_depth=14,
    #     min_samples_split=4,
    #     min_samples_leaf=6,
    # )),
    # ('classifier', lgb.LGBMClassifier(random_state=27)),
    ('classifier', XGBClassifier(
        scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]), 
        random_state=27,
        max_depth=10,
        colsample_bytree=0.8,
        gamma=0.1,
        learning_rate=0.1,
        min_child_weight=3,
        n_estimators=100,
        reg_alpha=0,
        reg_lambda=100,
        subsample=0.6,
    ))
])

# param_grid = {
#     'classifier__n_estimators': [50, 100, 200],
#     'classifier__max_depth': [None, 10, 20, 30],
#     'classifier__min_samples_split': [2, 5, 10],
#     'classifier__min_samples_leaf': [1, 2, 4],
# }

# param_grid = {
#     'classifier__n_estimators': [100, 200, 500],
#     'classifier__max_depth': [3, 6, 10],
#     'classifier__learning_rate': [0.01, 0.05, 0.1],
#     'classifier__subsample': [0.6, 0.8, 1.0],
#     'classifier__colsample_bytree': [0.6, 0.8, 1.0],
#     'classifier__min_child_weight': [1, 3, 5],
#     'classifier__gamma': [0, 0.1, 0.3],
#     'classifier__reg_alpha': [0, 0.01, 0.1],
#     'classifier__reg_lambda': [1, 10, 100]
# }

# param_grid = {
#     'classifier__learning_rate': [0.01, 0.1],
#     'classifier__n_estimators': [100, 200],
#     'classifier__max_depth': [10, 20, 30],
#     'classifier__subsample': [0.8, 1.0],
#     'classifier__colsample_bytree': [0.8, 1.0]
# }
# 
# grid_search = GridSearchCV(model, param_grid, cv=3, scoring='balanced_accuracy', verbose=2)
# grid_search.fit(X_train, y_train)
# 
# print("Best parameters found: ", grid_search.best_params_)
# print("Best balanced accuracy score: ", grid_search.best_score_)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
[LightGBM] [Info] Number of positive: 81466, number of negative: 81466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008285 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7988
[LightGBM] [Info] Number of data points in the train set: 162932, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[CV] END classifier__colsample_bytree=0.8, classifier__learning_rate=0.01, classifier__max_depth=10, classifier__n_estimators=100, classifier__subsample=0.8; total time=   4.2s
[LightGBM] [Info] Number of positive: 81466, number of negative: 81466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007859 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

KeyboardInterrupt: 

In [6]:
model.fit(X_train,y_train)
# 
# joblib.dump(model, 'diabeticPredictor-XGB.joblib')
# model = joblib.load('diabeticPredictor-XGB.joblib')

predictions = model.predict(X_test)
score = balanced_accuracy_score(y_test, predictions)

print(score)
# print(classification_report(y_test, predictions))

threshold = 0.46

y_pred_proba = model.predict_proba(X_apply)[:, 1]
predictions = (y_pred_proba > threshold).astype(int)

# # Print classification report (for your validation data)
X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Evaluate the model with the validation set
y_pred_val_proba = model.predict_proba(X_val)[:, 1]
val_predictions = (y_pred_val_proba > threshold).astype(int)
val_score = balanced_accuracy_score(y_val, val_predictions)

# submission_df = pd.DataFrame({
#     'Id': X_apply['Id'],  # Assuming 'Id' is the name of the ID column in X_apply
#     'Target': predictions
# })
# 
# # Save the predictions to a CSV file
# submission_df.to_csv('submission-xgb_17-09-24.csv', index=False)

# Output evaluation metrics for validation data
print(f"Balanced Accuracy Score on Validation: {val_score}")
print(classification_report(y_val, val_predictions))

[LightGBM] [Info] Number of positive: 19861, number of negative: 122199
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 203
[LightGBM] [Info] Number of data points in the train set: 142060, number of used features: 33
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.139807 -> initscore=-1.816893
[LightGBM] [Info] Start training from score -1.816893
0.5432503673911095
Balanced Accuracy Score on Validation: 0.5714564084183873
              precision    recall  f1-score   support

           0       0.88      0.98      0.93     24435
           1       0.58      0.16      0.25      3977

    accuracy                           0.87     28412
   macro avg       0.73      0.57      0.59     28412
weighted avg       0.84      0.87      0.83     28412

