# Evaluating `TfidfLogisticRegression` Model

In [1]:
import json

import numpy as np

from text_classification import defs
from text_classification.data import Samples
from text_classification.models import TfidfLogisticRegression
from text_classification.metrics import report_from_base64
from text_classification.transforms import extract_label, LabelTransform

In [2]:
# TODO: find a way to retrieve dagster asset metadata programmatically, instead of copying it by hand!

# F1 scores for the train and validation splits, kept as the string values that were pasted in.
f1_train = "0.788"
f1_valid = "0.672"

valid_report_base64 = "ICAgICAgICAgICAgICAgICAgICAgICAgICBwcmUgICAgICAgcmVjICAgICAgIHNwZSAgICAgICAgZjEgICAgICAgZ2VvICAgICAgIGliYSAgICAgICBzdXAKCiAgICAgICAgICAgIEE6IEFydCAgICAgICAwLjc5ICAgICAgMC40MCAgICAgIDEuMDAgICAgICAwLjUzICAgICAgMC42MyAgICAgIDAuMzcgICAgICAgNDI0CiAgICBCOiBFbnZpcm9ubWVudCAgICAgICAwLjc0ICAgICAgMC41MCAgICAgIDAuOTkgICAgICAwLjYwICAgICAgMC43MSAgICAgIDAuNDggICAgICAgNDA3CiAgICAgICAgICBDOiBDcmltZSAgICAgICAwLjY1ICAgICAgMC40NCAgICAgIDAuOTkgICAgICAwLjUyICAgICAgMC42NiAgICAgIDAuNDEgICAgICAgNDA1CiAgICAgIEQ6IERpdmVyc2l0eSAgICAgICAwLjc0ICAgICAgMC43MiAgICAgIDAuOTUgICAgICAwLjczICAgICAgMC44MyAgICAgIDAuNjcgICAgICAyMDA5CiAgIEU6IFJlbGF0aW9uc2hpcCAgICAgICAwLjkxICAgICAgMC44MCAgICAgIDAuOTkgICAgICAwLjg1ICAgICAgMC44OSAgICAgIDAuNzggICAgICAgOTk1CiAgICAgICAgRjogRmFzaGlvbiAgICAgICAwLjgzICAgICAgMC44OSAgICAgIDAuOTcgICAgICAwLjg2ICAgICAgMC45MyAgICAgIDAuODYgICAgICAyMDE0CiAgICBHOiBVUyBQb2xpdGljcyAgICAgICAwLjgwICAgICAgMC45MiAgICAgIDAuOTAgICAgICAwLjg1ICAgICAgMC45MSAgICAgIDAuODIgICAgICA0MDIxCkg6IEZvcmVpZ24gQWZmYWlycyAgICAgICAwLjcyICAgICAgMC4zNiAgICAgIDEuMDAgICAgICAwLjQ4ICAgICAgMC42MCAgICAgIDAuMzMgICAgICAgNDE2CiAgICAgICAgSTogQml6YXJyZSAgICAgICAwLjYzICAgICAgMC4zNyAgICAgIDAuOTkgICAgICAwLjQ2ICAgICAgMC42MCAgICAgIDAuMzQgICAgICAgMzc4CiAgICAgIEo6IFBhcmVudGluZyAgICAgICAwLjc4ICAgICAgMC44OSAgICAgIDAuOTYgICAgICAwLjgzICAgICAgMC45MiAgICAgIDAuODUgICAgICAxOTMxCgogICAgICAgYXZnIC8gdG90YWwgICAgICAgMC43OSAgICAgIDAuNzkgICAgICAwLjk1ICAgICAgMC43OCAgICAgIDAuODYgICAgICAwLjc0ICAgICAxMzAwMAo="
# Turn the pasted base64 payload back into the per-class validation report and print it.
valid_report = report_from_base64(valid_report_base64)
print(valid_report)

                          pre       rec       spe        f1       geo       iba       sup

            A: Art       0.79      0.40      1.00      0.53      0.63      0.37       424
    B: Environment       0.74      0.50      0.99      0.60      0.71      0.48       407
          C: Crime       0.65      0.44      0.99      0.52      0.66      0.41       405
      D: Diversity       0.74      0.72      0.95      0.73      0.83      0.67      2009
   E: Relationship       0.91      0.80      0.99      0.85      0.89      0.78       995
        F: Fashion       0.83      0.89      0.97      0.86      0.93      0.86      2014
    G: US Politics       0.80      0.92      0.90      0.85      0.91      0.82      4021
H: Foreign Affairs       0.72      0.36      1.00      0.48      0.60      0.33       416
        I: Bizarre       0.63      0.37      0.99      0.46      0.60      0.34       378
      J: Parenting       0.78      0.89      0.96      0.83      0.92      0.85      1931

       avg / total       0.79      0.79      0.95      0.78      0.86      0.74     13000

* Best Precision: "E: Relationship" 91%
* Worst Precision: "I: Bizarre" 63%

* Best Recall: "G: US Politics" 92%
* Worst Recall: "H: Foreign Affairs" 36%
    
🧠 Quite different performance between categories, that's hidden by a (macro) averaged F1 Score.

❓ Is performance correlated to number of samples per category?

🧠 Yes, the majority class in the training set is "G: US Politics", and "I: Bizarre" is a minority class.

💡 Could try balancing the classes.

💪 Add undersampling (as simplest approach) and retest.

In [3]:
# Metrics after retraining with undersampling applied.
# NOTE: this overwrites f1_train / f1_valid from the earlier cell.

f1_train = "0.883"
f1_valid = "0.652"

valid_report_base64 = "ICAgICAgICAgICAgICAgICAgICAgICAgICBwcmUgICAgICAgcmVjICAgICAgIHNwZSAgICAgICAgZjEgICAgICAgZ2VvICAgICAgIGliYSAgICAgICBzdXAKCiAgICAgICAgICAgIEE6IEFydCAgICAgICAwLjM5ICAgICAgMC43MCAgICAgIDAuOTYgICAgICAwLjUwICAgICAgMC44MiAgICAgIDAuNjYgICAgICAgNDI0CiAgICBCOiBFbnZpcm9ubWVudCAgICAgICAwLjQ4ICAgICAgMC43NSAgICAgIDAuOTcgICAgICAwLjU4ICAgICAgMC44NSAgICAgIDAuNzEgICAgICAgNDA3CiAgICAgICAgICBDOiBDcmltZSAgICAgICAwLjQyICAgICAgMC43MyAgICAgIDAuOTcgICAgICAwLjU0ICAgICAgMC44NCAgICAgIDAuNjkgICAgICAgNDA1CiAgICAgIEQ6IERpdmVyc2l0eSAgICAgICAwLjc3ICAgICAgMC41OCAgICAgIDAuOTcgICAgICAwLjY2ICAgICAgMC43NSAgICAgIDAuNTQgICAgICAyMDA5CiAgIEU6IFJlbGF0aW9uc2hpcCAgICAgICAwLjg1ICAgICAgMC44MiAgICAgIDAuOTkgICAgICAwLjg0ICAgICAgMC45MCAgICAgIDAuODAgICAgICAgOTk1CiAgICAgICAgRjogRmFzaGlvbiAgICAgICAwLjg4ICAgICAgMC44MSAgICAgIDAuOTggICAgICAwLjg0ICAgICAgMC44OSAgICAgIDAuNzggICAgICAyMDE0CiAgICBHOiBVUyBQb2xpdGljcyAgICAgICAwLjkyICAgICAgMC43MSAgICAgIDAuOTcgICAgICAwLjgwICAgICAgMC44MyAgICAgIDAuNjcgICAgICA0MDIxCkg6IEZvcmVpZ24gQWZmYWlycyAgICAgICAwLjQwICAgICAgMC43NCAgICAgIDAuOTYgICAgICAwLjUyICAgICAgMC44NCAgICAgIDAuNzAgICAgICAgNDE2CiAgICAgICAgSTogQml6YXJyZSAgICAgICAwLjMyICAgICAgMC42MyAgICAgIDAuOTYgICAgICAwLjQyICAgICAgMC43OCAgICAgIDAuNTkgICAgICAgMzc4CiAgICAgIEo6IFBhcmVudGluZyAgICAgICAwLjgxICAgICAgMC44MSAgICAgIDAuOTcgICAgICAwLjgxICAgICAgMC44OSAgICAgIDAuNzcgICAgICAxOTMxCgogICAgICAgYXZnIC8gdG90YWwgICAgICAgMC43OSAgICAgIDAuNzMgICAgICAwLjk3ICAgICAgMC43NSAgICAgIDAuODQgICAgICAwLjY5ICAgICAxMzAwMAo="
# Turn the pasted base64 payload back into the per-class validation report and print it.
valid_report = report_from_base64(valid_report_base64)
print(valid_report)

                          pre       rec       spe        f1       geo       iba       sup

            A: Art       0.39      0.70      0.96      0.50      0.82      0.66       424
    B: Environment       0.48      0.75      0.97      0.58      0.85      0.71       407
          C: Crime       0.42      0.73      0.97      0.54      0.84      0.69       405
      D: Diversity       0.77      0.58      0.97      0.66      0.75      0.54      2009
   E: Relationship       0.85      0.82      0.99      0.84      0.90      0.80       995
        F: Fashion       0.88      0.81      0.98      0.84      0.89      0.78      2014
    G: US Politics       0.92      0.71      0.97      0.80      0.83      0.67      4021
H: Foreign Affairs       0.40      0.74      0.96      0.52      0.84      0.70       416
        I: Bizarre       0.32      0.63      0.96      0.42      0.78      0.59       378
      J: Parenting       0.81      0.81      0.97      0.81      0.89      0.77      1931

       avg / total       0.79      0.73      0.97      0.75      0.84      0.69     13000

👎 Worse validation metrics after undersampling

🧠 Minority classes have better recall, but worse precision.

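🧠 Most likely due to incorrect `LogisticRegression` intercepts: undersampling changes the class proportions seen during training, so the learned intercepts no longer match the class proportions in the (unbalanced) validation set.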

🧠 Will revert to using the original unbalanced training dataset.

❓ What are some examples that the model gets wrong?

In [4]:
# Load the materialised assets needed for error analysis from the dagster
# definitions, by asset key: the trained model, the validation samples and
# the label transform used to convert between label names and indices.
model: TfidfLogisticRegression = defs.load_asset_value("tfidf_logistic_regression_model")  # type: ignore
validation_set: Samples = defs.load_asset_value("validation_set")  # type: ignore
label_transform: LabelTransform = defs.load_asset_value("label_transform")  # type: ignore

2023-02-12 23:33:12 +0000 - dagster - DEBUG - system - Loading file from: /Users/thomelane/Projects/text_classification/data/storage/tfidf_logistic_regression_model
2023-02-12 23:33:12 +0000 - dagster - DEBUG - system - Loading file from: /Users/thomelane/Projects/text_classification/data/storage/validation_set
2023-02-12 23:33:12 +0000 - dagster - DEBUG - system - Loading file from: /Users/thomelane/Projects/text_classification/data/storage/label_transform


In [5]:
# Split the validation samples into model inputs (X) and ground-truth label indices (y).
X, y = extract_label(validation_set, label_transform)

In [6]:
# Predicted label index for each validation sample, in the same order as X / y.
y_pred = model.predict(X)

In [7]:
# Positions (into X / y / y_pred) of the validation samples the model got wrong.
# Both sides are coerced to arrays so the comparison is always element-wise:
# comparing two plain Python sequences with `!=` yields a single bool, which
# would silently produce `[0]` or `[]` instead of the real indices.
misclassified_idxs = np.nonzero(np.asarray(y) != np.asarray(y_pred))[0].tolist()

In [8]:
# Show a small window (the 11th to 20th) of the misclassified samples, each
# annotated with its true and predicted label names, as pretty-printed JSON.
for idx in misclassified_idxs[10:20]:
    # Work on a copy so the sample stored in X is left untouched.
    record = X[idx].copy()
    record.update(
        label_true=label_transform.idx_to_label(y[idx]),
        label_pred=label_transform.idx_to_label(y_pred[idx]),
    )
    pretty = json.dumps(record, indent=4)
    print(pretty + '\n')

{
    "headline": "ProFlowers Receives Barrage Of Twitter Complaints Following Bad Mother's Day Bouquets (PHOTO)",
    "short_description": "This is the second time they've dropped the ball during a major holiday this year.",
    "id": 23495,
    "label_true": "F: Fashion",
    "label_pred": "J: Parenting"
}

{
    "headline": "Mom Wants Justice For Boy Who Killed Himself After Social Media Hoax",
    "short_description": "Tyson Benz's mother says a 13-year-old girl charged in connection with her son's death is not facing a harsh enough punishment.",
    "id": 5901,
    "label_true": "C: Crime",
    "label_pred": "J: Parenting"
}

{
    "headline": "Texas Adoption Bill Could Allow Anti-Gay, Religious Discrimination",
    "short_description": "Critics of the bill say it legitimizes discrimination with taxpayer money.",
    "id": 41233,
    "label_true": "G: US Politics",
    "label_pred": "D: Diversity"
}

{
    "headline": "What the Paris Attack Is Really About (Hint -- Neither Free Sp

```
{
    "headline": "Cupid's Arrow",
    "short_description": "I remember, as a child, desperately wishing Cupid would hit me with his arrow so I'd fall instantly in love and live happily ever after. Cupid let me down.",
    "id": 16179,
    "label_true": "E: Relationship",
    "label_pred": "J: Parenting"
}
```

🧠 Can see how a bag-of-words approach could get confused here.

🧠 Contains the word "child", which was highly weighted for the "J: Parenting" class.

🧠 "Cupid" may have been filtered out by `max_features=10000`.

❓ Is "cupid" in the vocabulary?

In [9]:
# Check whether the token "cupid" is present in the fitted TF-IDF vocabulary.
"cupid" in model.tfidf_vectorizer.vocabulary_

False

🧠 A difficult tradeoff between adding more features (and over-fitting) and limiting them (and missing cases like this).

```
{
    "headline": "Texas Adoption Bill Could Allow Anti-Gay, Religious Discrimination",
    "short_description": "Critics of the bill say it legitimizes discrimination with taxpayer money.",
    "id": 41233,
    "label_true": "G: US Politics",
    "label_pred": "D: Diversity"
}
```

🚨 We see cases that could be the predicted class, even though ground truth label says otherwise.

🧠 Is this problem misspecified? Is it really a multi-label problem (but our dataset only gives a single class)?

🧠 Would the downstream application need a single class? Or would it support multi-label?

🧠 Where is this model being used? Just for search? In which case multi-label might work.

🚨 Would clarify these requirements in a real world assignment.

⭐️ Will focus on multi-class given time constraints, but would be interesting to map to a multi-label problem.

🧠 Some multi-label methods don't need complete labels to learn.

⭐️ Would also like to generate a confusion matrix to see which classes are being confused with each other.