<hr>
<hr>

# Libraries

In [247]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [248]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

<hr>
<hr>

# Dataset

In [249]:
df = pd.read_csv("secondAttempt_final_df_merged.csv")

* df['y1'] = Kellgren-Lawrence (KL) grades [0,1,2,3,4] (V00XRKL)

* df['y2'] = Knee osteoarthritis (OA), A=non-OA and B=OA (V03KL)

In [250]:
y1_labels = df['y1'].unique().tolist()
y1_labels.sort()
print(y1_labels)
print(type(y1_labels[0]))

[0, 1, 2, 3, 4]
<class 'int'>


In [251]:
df['y2'] = df['y2'].map({'A': 0, 'B': 1})

In [252]:
y2_labels = df['y2'].unique().tolist()
y2_labels.sort()
print(y2_labels)
print(type(y2_labels[0]))

[0, 1]
<class 'int'>


In [253]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   case    196 non-null    int64  
 1   d1      196 non-null    float64
 2   d2      196 non-null    float64
 3   d3      196 non-null    float64
 4   d4      196 non-null    float64
 5   d5      196 non-null    float64
 6   d6      196 non-null    float64
 7   d7      196 non-null    float64
 8   d8      196 non-null    float64
 9   d9      196 non-null    float64
 10  y1      196 non-null    int64  
 11  y2      196 non-null    int64  
dtypes: float64(9), int64(3)
memory usage: 18.5 KB


In [254]:
df.head()

Unnamed: 0,case,d1,d2,d3,d4,d5,d6,d7,d8,d9,y1,y2
0,9002116,107.338565,104.820274,107.061919,111.174675,95.146036,102.706782,110.308689,104.477525,106.551461,3,1
1,9005075,105.673943,112.80328,110.81303,111.818826,111.626515,113.517526,108.385382,107.01513,106.081957,0,0
2,9005132,120.823517,118.419393,122.192484,122.710571,120.421853,123.701605,128.661326,124.004919,121.071465,4,1
3,9026934,96.883826,100.49409,97.709887,96.756534,100.893474,97.613851,100.807411,98.778936,99.925939,2,0
4,9030718,101.240029,106.378096,105.156002,106.783332,106.301636,101.372332,102.736142,103.037982,105.186959,3,1


<hr>

# DEFINE X & Y

In [255]:
X = df.iloc[:, 1:10]  # Features

In [256]:
X.head(2)

Unnamed: 0,d1,d2,d3,d4,d5,d6,d7,d8,d9
0,107.338565,104.820274,107.061919,111.174675,95.146036,102.706782,110.308689,104.477525,106.551461
1,105.673943,112.80328,110.81303,111.818826,111.626515,113.517526,108.385382,107.01513,106.081957


In [257]:
y1 = df['y1']  # Target variable for KL grades
y2 = df['y2']  # Target variable for OA classification

In [258]:
y1.head(2)

0    3
1    0
Name: y1, dtype: int64

In [259]:
y2.head(2)

0    1
1    0
Name: y2, dtype: int64

<hr>
<hr>

# PREDICT 'Y1' (KL GRADES)

<hr>

## train / test / split

In [260]:
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.15, random_state=16)

In [261]:
y1_models = {
    "DecisionTree": DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        random_state=16
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        random_state=16
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softmax",  # outputs class labels directly
        num_class=5,                # number of classes
        n_estimators=300,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss",     # avoids warning
        tree_method="hist",         # fast on CPU
        random_state=42
    ),
    "GaussianNB": GaussianNB()
}

In [262]:
for name, model in y1_models.items():
    model.fit(X_train, y1_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y1_test, y_pred)
    print(f"{name}: accuracy = {acc:.3f}")

DecisionTree: accuracy = 0.300
RandomForest: accuracy = 0.233
XGBoost: accuracy = 0.100
GaussianNB: accuracy = 0.133


In [263]:
best_model_y1 = y1_models["DecisionTree"]
y1_pred = best_model_y1.predict(X_test)
print(classification_report(y1_test, y1_pred))

cm = confusion_matrix(y1_test, y1_pred)
print(f"Confusion matrix: \n{cm}\n")

y1_scores = cross_val_score(best_model_y1, X, y1, cv=10)
y1_10fold_accuracy_scores = y1_scores.tolist()
y1_10fold_accuracy_scores = [round(score, 2) for score in y1_10fold_accuracy_scores]

avg_y1_10fold_accuracy_scores = round(sum(y1_10fold_accuracy_scores) / len(y1_10fold_accuracy_scores), 2)
print(f"10-fold cross validation scores: \n\t{y1_10fold_accuracy_scores}")
print(f"Average score: {avg_y1_10fold_accuracy_scores}")
print(f"Best score: {max(y1_10fold_accuracy_scores)}")

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.00      0.00      0.00         2
           2       0.33      0.30      0.32        10
           3       0.42      0.56      0.48         9
           4       0.20      0.17      0.18         6

    accuracy                           0.30        30
   macro avg       0.19      0.20      0.19        30
weighted avg       0.28      0.30      0.28        30

Confusion matrix: 
[[0 1 0 1 1]
 [0 0 0 1 1]
 [0 0 3 5 2]
 [0 1 3 5 0]
 [0 2 3 0 1]]

10-fold cross validation scores: 
	[0.25, 0.3, 0.3, 0.3, 0.3, 0.3, 0.26, 0.21, 0.21, 0.11]
Average score: 0.25
Best score: 0.3


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


<hr>
<hr>

# PREDICT 'Y2' (A/B)

<hr>

## train / test / split

In [264]:
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=16)

In [265]:
y2_models = {
    "DecisionTree": DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        random_state=16
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        random_state=16
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softmax",  # outputs class labels directly
        num_class=5,                # number of classes
        n_estimators=300,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss",     # avoids warning
        tree_method="hist",         # fast on CPU
        random_state=42
    ),
    "GaussianNB": GaussianNB()
}

In [266]:
for name, model in y2_models.items():
    model.fit(X_train, y2_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y2_test, y_pred)
    print(f"{name}: accuracy = {acc:.3f}")

DecisionTree: accuracy = 0.525
RandomForest: accuracy = 0.625
XGBoost: accuracy = 0.500
GaussianNB: accuracy = 0.525


In [268]:
best_model_y2 = y2_models["XGBoost"]
y2_pred = best_model_y2.predict(X_test)
print(classification_report(y2_test, y2_pred))

y2_cm = confusion_matrix(y2_test, y2_pred)
print(f"Confusion matrix: \n{y2_cm}\n")

y2_scores = cross_val_score(best_model_y2, X, y2, cv=10)
y2_10fold_accuracy_scores = y2_scores.tolist()
y2_10fold_accuracy_scores = [round(score, 2) for score in y2_10fold_accuracy_scores]

avg_y2_10fold_accuracy_scores = round(sum(y2_10fold_accuracy_scores) / len(y2_10fold_accuracy_scores), 2)

print(f"10-fold cross validation scores: \n\t{y2_10fold_accuracy_scores}")
print(f"Average score: {avg_y2_10fold_accuracy_scores}")
print(f"Best score: {max(y2_10fold_accuracy_scores)}")

              precision    recall  f1-score   support

           0       0.23      0.23      0.23        13
           1       0.63      0.63      0.63        27

    accuracy                           0.50        40
   macro avg       0.43      0.43      0.43        40
weighted avg       0.50      0.50      0.50        40

Confusion matrix: 
[[ 3 10]
 [10 17]]

10-fold cross validation scores: 
	[0.55, 0.5, 0.55, 0.55, 0.55, 0.55, 0.58, 0.53, 0.63, 0.79]
Average score: 0.58
Best score: 0.79
