<hr>

# LIBRARIES

In [75]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

<hr>

# DATASET

In [76]:
df = pd.read_csv("final_df_merged.csv")

In [77]:
df.head()

Unnamed: 0,case,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d30,d31,d32,d33,d34,d35,d36,d37,y1,y2
0,9002116,12.754902,12.745283,13.036364,13.434783,13.372881,13.341667,13.090164,12.975806,13.070312,...,17.583333,18.382166,18.923077,19.74359,20.655844,21.228758,21.947368,23.42,3,B
1,9005075,31.630952,31.597826,29.6,27.888889,26.76699,25.409524,24.179245,23.28972,23.146789,...,35.964539,36.77305,36.23741,38.524823,38.414286,38.690647,39.597122,40.304348,0,A
2,9005132,12.60274,11.950617,11.37931,11.054348,11.212121,11.603774,11.594595,11.443478,11.697479,...,13.598485,14.115385,15.955224,17.777778,17.8,17.896296,18.014925,16.230088,4,B
3,9026934,14.588235,14.337079,14.852632,15.080808,15.529412,16.103774,16.834862,16.630631,16.530973,...,23.943548,23.209677,24.226562,23.75,23.48062,23.7,22.131783,23.427481,2,A
4,9030718,31.166667,30.41,28.771429,27.907407,26.609091,25.027273,24.892857,24.535714,28.33871,...,31.465517,31.543103,31.061404,31.327434,22.785714,19.689655,17.989247,26.477876,3,B


* df['y1'] = Kellgren-Lawrence (KL) grades [0,1,2,3,4] (V00XRKL)

* df['y2'] = Knee osteoarthritis (OA), A=non-OA and B=OA (V03KL)

In [78]:
y1_labels = df['y1'].unique().tolist()
y1_labels.sort()
print(y1_labels)
print(type(y1_labels[0]))

[0, 1, 2, 3, 4]
<class 'int'>


In [79]:
df['y2'] = df['y2'].map({'A': 0, 'B': 1})

In [80]:
y2_labels = df['y2'].unique().tolist()
y2_labels.sort()
print(y2_labels)
print(type(y2_labels[0]))

[0, 1]
<class 'int'>


In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197 entries, 0 to 196
Data columns (total 40 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   case    197 non-null    int64  
 1   d1      197 non-null    float64
 2   d2      197 non-null    float64
 3   d3      197 non-null    float64
 4   d4      197 non-null    float64
 5   d5      197 non-null    float64
 6   d6      197 non-null    float64
 7   d7      197 non-null    float64
 8   d8      197 non-null    float64
 9   d9      197 non-null    float64
 10  d10     197 non-null    float64
 11  d11     197 non-null    float64
 12  d12     197 non-null    float64
 13  d13     197 non-null    float64
 14  d14     197 non-null    float64
 15  d15     197 non-null    float64
 16  d16     197 non-null    float64
 17  d17     197 non-null    float64
 18  d18     197 non-null    float64
 19  d19     197 non-null    float64
 20  d20     197 non-null    float64
 21  d21     197 non-null    float64
 22  d2

<hr>

# DEFINE X & Y

In [82]:
X = df.iloc[:, 1:-2]  # Features

In [83]:
y1 = df['y1']  # Target variable for KL grades
y2 = df['y2']  # Target variable for OA classification

<hr>
<hr>

# PREDICT 'Y1' (KL GRADES)

<hr>

## train / test / split

In [84]:
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=16)

<hr>

## Models

In [85]:
y1_model_DecisionTree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    random_state=16
)

In [86]:
y1_model_RandomForest = RandomForestClassifier(
    n_estimators=200,
    random_state=16
)

In [87]:
y1_model_XGBoost = XGBClassifier(
    objective="multi:softmax",  # outputs class labels directly
    num_class=5,                # number of classes
    n_estimators=300,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric="mlogloss",     # avoids warning
    tree_method="hist",         # fast on CPU
    random_state=42
)

In [88]:
y1_model_gnb = GaussianNB()

In [89]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

y1_model_clf = LogisticRegression(
    solver="lbfgs",
    max_iter=1000,
    random_state=42
)

<hr>

## Train models

In [90]:
# Train the model

y1_model_DecisionTree.fit(X_train, y1_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,16
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [91]:
y1_model_RandomForest.fit(X_train, y1_train)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [92]:
y1_model_XGBoost.fit(X_train, y1_train)

0,1,2
,objective,'multi:softmax'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [93]:
y1_model_gnb.fit(X_train, y1_train)

0,1,2
,priors,
,var_smoothing,1e-09


In [94]:
y1_model_clf.fit(X_train_scaled, y1_train)

  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  raw_prediction = X @ weights.T + intercept  # ndarray, likely C-contiguous
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


<hr>

## Predictions

In [95]:
y1_pred_DecisionTree = y1_model_DecisionTree.predict(X_test)

In [96]:
y1_pred_RandomForest = y1_model_RandomForest.predict(X_test)

In [97]:
y1_pred_XGBoost = y1_model_XGBoost.predict(X_test)

In [98]:
y1_pred_gnb = y1_model_gnb.predict(X_test)

In [99]:
y1_pred_clf = y1_model_clf.predict(X_test_scaled)

  ret = a @ b
  ret = a @ b
  ret = a @ b


<hr>

## Accuracy Score

In [100]:
print("y1 DecisionTree Accuracy:", accuracy_score(y1_test, y1_pred_DecisionTree))
print("y1 RandomForest Accuracy:", accuracy_score(y1_test, y1_pred_RandomForest))
print("y1 XGBoost Accuracy:", accuracy_score(y1_test, y1_pred_XGBoost))
print("y1 Gaussian NB Accuracy:", accuracy_score(y1_test, y1_pred_gnb))
print("y1 LogReg Accuracy:", accuracy_score(y1_test, y1_pred_clf))

y1 DecisionTree Accuracy: 0.35
y1 RandomForest Accuracy: 0.275
y1 XGBoost Accuracy: 0.325
y1 Gaussian NB Accuracy: 0.2
y1 LogReg Accuracy: 0.125


<hr>

## Models comparison with loop

In [101]:
models = {
    "DecisionTree": DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        random_state=16
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        random_state=16
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softmax",  # outputs class labels directly
        num_class=5,                # number of classes
        n_estimators=300,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss",     # avoids warning
        tree_method="hist",         # fast on CPU
        random_state=42
    ),
    "GaussianNB": GaussianNB()
}

In [102]:
for name, model in models.items():
    model.fit(X_train, y1_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y1_test, y_pred)
    print(f"{name}: accuracy = {acc:.3f}")

DecisionTree: accuracy = 0.350
RandomForest: accuracy = 0.275
XGBoost: accuracy = 0.325
GaussianNB: accuracy = 0.200


In [103]:
best_model = models["DecisionTree"]
y1_pred = best_model.predict(X_test)
print(classification_report(y1_test, y1_pred))
cm = confusion_matrix(y1_test, y1_pred)
print(cm)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         4
           2       0.29      0.33      0.31        12
           3       0.50      0.42      0.45        12
           4       0.36      0.56      0.43         9

    accuracy                           0.35        40
   macro avg       0.23      0.26      0.24        40
weighted avg       0.32      0.35      0.33        40

[[0 0 1 1 1]
 [0 0 3 0 1]
 [0 0 4 4 4]
 [0 1 3 5 3]
 [1 0 3 0 5]]


<hr>
<hr>

# PREDICT 'Y2' (A/B)

<hr>

## train / test / split

In [104]:
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=16)

In [105]:
for name, model in models.items():
    model.fit(X_train, y2_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y2_test, y_pred)
    print(f"{name}: accuracy = {acc:.3f}")

DecisionTree: accuracy = 0.575
RandomForest: accuracy = 0.675
XGBoost: accuracy = 0.675
GaussianNB: accuracy = 0.550


In [106]:
best_model_y2 = models["XGBoost"]
y2_pred = best_model_y2.predict(X_test)
print(classification_report(y2_test, y2_pred))
y2_cm = confusion_matrix(y2_test, y2_pred)
print(y2_cm)

              precision    recall  f1-score   support

           0       0.38      0.27      0.32        11
           1       0.75      0.83      0.79        29

    accuracy                           0.68        40
   macro avg       0.56      0.55      0.55        40
weighted avg       0.65      0.68      0.66        40

[[ 3  8]
 [ 5 24]]
