In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import (mean_absolute_error, mean_squared_error,
                             confusion_matrix, classification_report,
                             precision_score, recall_score, f1_score)

In [2]:
# Load dataset
data = pd.read_csv('/Users/macuser/Documents/Coding/RStudio/SIOP_ML_2023_Discord/data/cleaned_smushed_words.csv')

In [3]:
# Grab text exercise columns
text_columns = [f"text_exercise_{i}" for i in range(4, 20)] + ["text_exercise_final"]


# Define target and feature columns
target_columns = [
    "rating_chooses_appropriate_action", "rating_commits_to_action",
    "rating_gathers_information", "rating_identifies_issues_opportunities",
    "rating_interprets_information", "rating_involves_others",
    "rating_decision_making_final_score"
]

In [4]:
# Replace "NA" with "" in text columns and concatenate them
data = data.dropna(subset=target_columns)
data.loc[:, text_columns] = data.loc[:, text_columns].fillna('')
data['all_answers'] = data[text_columns].apply(lambda x: ' '.join(x), axis=1)

In [5]:
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['all_answers'], data[target_columns], test_size=0.2, random_state=42)

In [6]:
# Transform the text data using TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000)
X_train_transformed = vectorizer.fit_transform(X_train)
X_test_transformed = vectorizer.transform(X_test)

In [15]:
# Train ordinal classification models for each rating variable
ordinal_clf = MultiOutputClassifier(LogisticRegression(max_iter=1000), n_jobs=-1)
ordinal_clf.fit(X_train_transformed, y_train)

In [16]:
# Make predictions
y_pred = ordinal_clf.predict(X_test_transformed)

In [17]:
# Evaluate model performance - MAE and RMSE
mae = mean_absolute_error(y_test, y_pred, multioutput='raw_values')
mse = mean_squared_error(y_test, y_pred, multioutput='raw_values')
rmse = np.sqrt(mse)

In [18]:
print("Mean Absolute Error: ", mae)
print("Root Mean Squared Error: ", rmse)

Mean Absolute Error:  [0.31698113 0.34716981 0.52075472 0.37358491 0.38490566 0.46037736
 1.0490566 ]
Root Mean Squared Error:  [0.56301077 0.59558121 0.74225561 0.62344145 0.6672953  0.68405009
 1.50219965]


## What are MAE and RMSE, and what do they tell me?
The **Mean Absolute Error (MAE)** and **Root Mean Squared Error (RMSE)** are metrics used to evaluate the performance of a regression model. In this case, we have an ordinal classification model, which is a form of regression, so these metrics are appropriate.

**Mean Absolute Error (MAE)** is the average of the absolute differences between the predicted and actual values. It represents the average error magnitude without considering the direction. A *lower* MAE value indicates better performance of the model.

**Root Mean Squared Error (RMSE)** is the square root of the average of the squared differences between the predicted and actual values. RMSE gives more weight to larger errors, so it is more sensitive to outliers than MAE. A *lower* RMSE value indicates better performance of the model.

### Interpretation of MAE and RMSE results
The arrays correspond to the seven rating variables in the same order as they appear in the `target_columns` list. The first six rating variables are indicators of decision making, and the last variable is the final decision-making score.

The results says that the model has a lower error for the first six rating variables compared to the final decision-making score. Interesting. So, the model performs better at predicting the six indicators of decision making than predicting the overall final score 😑 The errors for the first six rating variables are in a similar range, which means the model's performance is pretty consistent across those variables.

Other notable information after researching (and help from chatGPT).  
- The first six rating variables have a scale of 1 to 4, while the final decision-making score has a scale of 1 to 7. So, the model has a decent performance for the first six rating variables. However, the final decision-making score has a much higher error.

In [21]:
# Evaluate model performance - Confusion Matrix, Classification Report, Precision, Recall, F1 Score
for i, target in enumerate(target_columns):
    print(f"Target column: {target}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test.iloc[:, i], y_pred[:, i]))
    print("Classification Report:")
    print(classification_report(y_test.iloc[:, i], y_pred[:, i]))
    print(f"Precision: {precision_score(y_test.iloc[:, i], y_pred[:, i], average='weighted')}")
    print(f"Recall: {recall_score(y_test.iloc[:, i], y_pred[:, i], average='weighted')}")
    print(f"F1 Score: {f1_score(y_test.iloc[:, i], y_pred[:, i], average='weighted')}")
    print("\n\n")

Target column: rating_chooses_appropriate_action
Confusion Matrix:
[[ 25  51   0]
 [  8 156   0]
 [  0  25   0]]
Classification Report:
              precision    recall  f1-score   support

         2.0       0.76      0.33      0.46        76
         3.0       0.67      0.95      0.79       164
         4.0       0.00      0.00      0.00        25

    accuracy                           0.68       265
   macro avg       0.48      0.43      0.42       265
weighted avg       0.63      0.68      0.62       265

Precision: 0.6334023382819739
Recall: 0.6830188679245283
F1 Score: 0.619149081257442



Target column: rating_commits_to_action
Confusion Matrix:
[[  0   0   1   0]
 [  0   0  36   0]
 [  0   0 163   3]
 [  0   0  51  11]]
Classification Report:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00        36
           3       0.65      0.98      0.78       166
           4       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

## Summarize the Confusion Matrix, Classification Report, Precision, Recall, F1 Score results tell me?
**I love these illustrations!**

<div style="display: flex; flex-flow: row wrap; justify-content: space-around; ">
    <img src="https://miro.medium.com/v2/resize:fit:984/format:webp/1*f5ZeXvhsNFZ4q91M4Lotgg.jpeg" width=400px>
    <img src="https://miro.medium.com/v2/resize:fit:1018/format:webp/1*5CATnJ2FyNOF9xTpJ7qA5w.jpeg" width=400px>
</div>
