In [14]:
!pip install scikit-learn pandas xgboost lightgbm




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split,  GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import xgboost as xgb
import lightgbm as lgb
from sklearn.svm import SVC

In [16]:
dataset = pd.read_csv('../dataset/dataset_2.csv')

for column in dataset.columns:
    dataset = dataset.dropna(subset = [column])

dataset['Risk_Level_Binary'] = dataset['RiskLevel'].map({'high risk': 1, 'mid risk':1, 'low risk': 0})
y = dataset['Risk_Level_Binary']
dataset = dataset.drop(['RiskLevel', 'Risk_Level_Binary'], axis =1)


In [17]:

X_train, X_test, y_train, y_test = train_test_split(dataset, y, test_size = .2, random_state = 42)

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [18]:
print(dataset.info())
print(dataset.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1014 entries, 0 to 1013
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          1014 non-null   int64  
 1   SystolicBP   1014 non-null   int64  
 2   DiastolicBP  1014 non-null   int64  
 3   BS           1014 non-null   float64
 4   BodyTemp     1014 non-null   float64
 5   HeartRate    1014 non-null   int64  
dtypes: float64(2), int64(4)
memory usage: 47.7 KB
None
               Age   SystolicBP  DiastolicBP           BS     BodyTemp  \
count  1014.000000  1014.000000  1014.000000  1014.000000  1014.000000   
mean     29.871795   113.198225    76.460552     8.725986    98.665089   
std      13.474386    18.403913    13.885796     3.293532     1.371384   
min      10.000000    70.000000    49.000000     6.000000    98.000000   
25%      19.000000   100.000000    65.000000     6.900000    98.000000   
50%      26.000000   120.000000    80.000000     7.500000 

In [19]:
results = {}
models = {
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'MLP': MLPClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XGBoost': xgb.XGBClassifier(),
    'LightGBM': lgb.LGBMClassifier(),
    'SVC': SVC(),
}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results[name] = {
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1': f1,
    }
    
    print(f"Model: {name}")
    print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}, F1: {f1:.4f}")
    print("-" * 50) 

Model: Logistic Regression
Accuracy: 0.7192, Precision: 0.8113, Recall: 0.6992, F1: 0.7511
--------------------------------------------------
Model: KNN
Accuracy: 0.7044, Precision: 0.7944, Recall: 0.6911, F1: 0.7391
--------------------------------------------------
Model: Decision Tree
Accuracy: 0.8621, Precision: 0.8626, Recall: 0.9187, F1: 0.8898
--------------------------------------------------
Model: Random Forest
Accuracy: 0.8670, Precision: 0.8582, Recall: 0.9350, F1: 0.8949
--------------------------------------------------




Model: MLP
Accuracy: 0.7291, Precision: 0.8778, Recall: 0.6423, F1: 0.7418
--------------------------------------------------
Model: Gradient Boosting
Accuracy: 0.7980, Precision: 0.8306, Recall: 0.8374, F1: 0.8340
--------------------------------------------------
Model: XGBoost
Accuracy: 0.8867, Precision: 0.8968, Recall: 0.9187, F1: 0.9076
--------------------------------------------------
[LightGBM] [Info] Number of positive: 485, number of negative: 326
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000060 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 118
[LightGBM] [Info] Number of data points in the train set: 811, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.598027 -> initscore=0.397252
[LightGBM] [Info] Start training from score 0.397252
Model: LightGBM
Accuracy: 0.8719, Precision: 0.8880, Recall: 0.9024, F1: 0.8952
----------------------------------------

In [20]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 1, 5],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [1, 1.5, 2],
}

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_grid,
    n_iter=50,
    scoring='f1',
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train_scaled, y_train)
print("Best paramss:", random_search.best_params_)

best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters: {'subsample': 1.0, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 200, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 0.8}
              precision    recall  f1-score   support

           0       0.87      0.84      0.85        80
           1       0.90      0.92      0.91       123

    accuracy                           0.89       203
   macro avg       0.88      0.88      0.88       203
weighted avg       0.89      0.89      0.89       203



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
