<hr>
<hr>

# Libraries

In [119]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

In [120]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

<hr>
<hr>

# Dataset

In [121]:
df = pd.read_csv("secondAttempt_final_df_merged.csv")

* df['y1'] = Kellgren-Lawrence (KL) grades [0,1,2,3,4] (V00XRKL)

* df['y2'] = Knee osteoarthritis (OA), A=non-OA and B=OA (V03KL)

In [122]:
y1_labels = df['y1'].unique().tolist()
y1_labels.sort()
print(y1_labels)
print(type(y1_labels[0]))

[0, 1, 2, 3, 4]
<class 'int'>


In [126]:
df['y2'] = df['y2'].map({'A': 0, 'B': 1})

In [129]:
y2_labels = df['y2'].unique().tolist()
y2_labels.sort()
print(y2_labels)
print(type(y2_labels[0]))

[0, 1]
<class 'int'>


In [127]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      196 non-null    int64  
 1   fv1_mean  196 non-null    float64
 2   fv2_mean  196 non-null    float64
 3   fv3_mean  196 non-null    float64
 4   y1        196 non-null    int64  
 5   y2        196 non-null    int64  
dtypes: float64(3), int64(3)
memory usage: 9.3 KB


In [128]:
df.head()

Unnamed: 0,case,fv1_mean,fv2_mean,fv3_mean,y1,y2
0,9002116,75.003809,108.119251,96.902299,3,1
1,9005075,98.129495,113.714378,106.785496,0,0
2,9005132,110.778392,124.761203,112.053324,4,1
3,9026934,96.534927,99.721493,86.627444,2,0
4,9030718,90.30299,104.339804,119.286584,3,1


<hr>

# DEFINE X & Y

In [130]:
X = df.iloc[:, 1:-2]  # Features

In [132]:
y1 = df['y1']  # Target variable for KL grades
y2 = df['y2']  # Target variable for OA classification

<hr>
<hr>

# PREDICT 'Y1' (KL GRADES)

<hr>

## train / test / split

In [133]:
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, test_size=0.2, random_state=16)

In [134]:
y1_models = {
    "DecisionTree": DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        random_state=16
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        random_state=16
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softmax",  # outputs class labels directly
        num_class=5,                # number of classes
        n_estimators=300,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss",     # avoids warning
        tree_method="hist",         # fast on CPU
        random_state=42
    ),
    "GaussianNB": GaussianNB()
}

In [135]:
for name, model in y1_models.items():
    model.fit(X_train, y1_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y1_test, y_pred)
    print(f"{name}: accuracy = {acc:.3f}")

DecisionTree: accuracy = 0.250
RandomForest: accuracy = 0.225
XGBoost: accuracy = 0.250
GaussianNB: accuracy = 0.225


<hr>
<hr>

# PREDICT 'Y2' (A/B)

<hr>

## train / test / split

In [136]:
X_train, X_test, y2_train, y2_test = train_test_split(X, y2, test_size=0.2, random_state=16)

In [137]:
y2_models = {
    "DecisionTree": DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        random_state=16
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200,
        random_state=16
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softmax",  # outputs class labels directly
        num_class=5,                # number of classes
        n_estimators=300,
        max_depth=5,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="mlogloss",     # avoids warning
        tree_method="hist",         # fast on CPU
        random_state=42
    ),
    "GaussianNB": GaussianNB()
}

In [138]:
for name, model in y2_models.items():
    model.fit(X_train, y2_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y2_test, y_pred)
    print(f"{name}: accuracy = {acc:.3f}")

DecisionTree: accuracy = 0.700
RandomForest: accuracy = 0.600
XGBoost: accuracy = 0.575
GaussianNB: accuracy = 0.575
