In [3]:
# imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# building the model
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# metric functions
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [5]:
# reshaping data -- split data into training and testing

file = pd.read_csv('final_avg_hr_data_updated.csv')
df = pd.DataFrame(file)

X = df[['heartrate','steps']]
y = df[['promis score']]

encoder = OrdinalEncoder(categories=[['Not fatigued at all','A little bit fatigued','Somewhat fatigued','Very fatigued']])
# y_encoded = encoder.fit_transform(y)
y_encoded = encoder.fit_transform(y.values.reshape(-1,1)).ravel().astype(int)

df_y_encoded = pd.DataFrame(y_encoded)
y_names = encoder.inverse_transform(y_encoded.reshape(-1,1))
df_y_names = pd.DataFrame(y_names)

map_data = [df_y_names, df_y_encoded] 
result = pd.concat(map_data, axis=1) # promis score labels and corresponding numerical value

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42) # split data into training and testing sets

In [7]:
# one vs the rest strategy
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(LogisticRegression())
model.fit(X, y_encoded)

predictions = model.predict(X)
probabilities = model.predict_proba(X)

Predictions shape: (14204,)
Probabilities shape: (14204, 4)


In [61]:
# metrics !
from sklearn.metrics import cohen_kappa_score
from scipy.stats import kendalltau

# true test -- y_test, pred -- y_pred_test
mae = mean_absolute_error(y_test, y_pred_test)
tau, _ = kendalltau(y_test,y_pred_test)
qwk = cohen_kappa_score(y_test,y_pred_test,weights='quadratic')

print(f"Mean Absolute Error: {mae:.4f}")
print(f"Kendall's Tau: {tau:.4f}")
print(f"Quadratic Weighted Kappa: {qwk:.4f}")

Mean Absolute Error: 0.6959
Kendall's Tau: 0.0308
Quadratic Weighted Kappa: 0.0022


In [79]:
# using cross validation -- helps prevent overfitting (maintains distribution by training on "folds")
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

def ordinal_scorer(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    tau, _ = kendalltau(y_true,y_pred)
    qwk = cohen_kappa_score(y_true,y_pred,weights='quadratic')
    return (1/mae)*(tau+qwk)/2

cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
scorer = make_scorer(ordinal_scorer)

model = OrdinalRidge()
# model.fit(X_train.values,y_train)

scores = cross_val_score(model, X, y, cv=cv, scoring=scorer,error_score='raise')

print(f"Cross-validation scores: {scores}")
print(f"Mean score: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

ValueError: could not convert string to float: 'A little bit fatigued'

In [59]:
from mord import LogisticIT
from mord import LogisticAT
from mord import OrdinalRidge
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

model = OrdinalRidge()
model.fit(X_train.values,y_train)

# Predict class labels
y_pred_train = model.predict(X_train.values)
y_pred_test = model.predict(X_test.values)

# Calculate classification accuracy
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)
mae = mean_absolute_error(y_test, y_pred_test)

print(f"Train Accuracy: {train_accuracy:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"MAE: {mae:.4f}")

for feature, coef in zip(X.columns, model.coef_):
    print(f"{feature}: {coef:.4f}")
    
# for a new prediction
# new_log = np.array([[160,10000]])  # heartrate, steps
# predicted_level = model.predict(new_log)
# fatigue_levels = ['Not fatigued at all','A little bit fatigued','Somewhat fatigued','Very fatigued']
# print(f"Predicted Fatigue Level: {fatigue_levels[predicted_level[0]]}")

Train Accuracy: 0.3945
Test Accuracy: 0.3851
MAE: 0.6959
heartrate: -0.0011
steps: 0.0000


In [25]:
df.dtypes

patient id        object
month              int64
day                int64
heartrate        float64
steps            float64
promis score    category
dtype: object