In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.metrics import f1_score,  precision_score, recall_score, hamming_loss

from keras.preprocessing.text import Tokenizer
import tensorflow as tf
import tensorflow_addons as tfa
from keras.metrics import AUC
from tensorflow.keras.metrics import Precision, Recall

import spacy
from spacy.training import Example
from spacy.util import minibatch
import random

## **Load Cleaned Data**

In [None]:
# Load the cleaned datasets. The first column of each CSV is dropped by
# position (the notebook treats it as a leftover index column). Slicing with
# .iloc builds new frames instead of mutating in place, so the cell is
# idempotent when re-run.
df_user_inputs = pd.read_csv('../dataset/user_inputs_cleaned.csv').iloc[:, 1:]
df_labels = pd.read_csv('../dataset/labels_cleaned.csv').iloc[:, 1:]

# Both frames must have the same number of rows: texts and label rows are
# paired by position further down.
assert len(df_labels) == len(df_user_inputs), "Datasets do not align!"

print(df_user_inputs.shape)
df_user_inputs.head(10)

(3974, 1)


Unnamed: 0,text
0,er is een teek op mijn been ik ben bang dat di...
1,er is een teek op mijn rug en ik krijg hem er ...
2,op mijn been zit een teek ik heb hem geprobeer...
3,ik heb allergieen
4,huid
5,roodheid
6,schilfering
7,ik heb wratten onder mijn voet
8,ik heb gisteren naar het bos geweest en zie nu...
9,ik voelde iets prikken


In [None]:
# Preview the first rows of the label frame (one column per complaint label)
df_labels.head()

Unnamed: 0,"Niet lekker voelen, algehele malaise",Beenklachten,Bloedneus,Misselijkheid en overgeven,Brandwond,Buikpijn,Suikerziekte (ontregeld),Diarree,Duizeligheid,Gebitsklachten,...,Coronavirus,Knieklachten,Liesklachten,Elleboogklachten,Schouderklachten,Oorsuizen,Hand- en polsklachten,Enkelklachten,Dikke enkels of voeten,Vingerklachten
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Baseline: per-label counts (most common first), shown as a share of all rows
label_frequencies = df_labels.sum(axis=0).sort_values(ascending=False)
label_frequencies.div(len(df_labels))

Huidklachten                            0.088827
Beenklachten                            0.072723
Buikpijn                                0.062154
Oorklachten                             0.052843
Misselijkheid en overgeven              0.037242
                                          ...   
Liesklachten                            0.005033
Tekenbeet                               0.005033
Verdrinking                             0.005033
Verwonding aan de buik                  0.005033
Niet lekker voelen, algehele malaise    0.002516
Length: 74, dtype: float64

If we always predicted the most frequent label (Huidklachten), we would be correct about 9% of the time. Our model should therefore perform better than this 9% baseline.

## **Prepare Data to Model Format**

To accommodate the nature of multi-label classification, we use iterative stratified sampling (`iterative_train_test_split`) instead of the traditional `train_test_split`. This provides a well-balanced distribution of all label combinations in both the training and test sets.

In [None]:
## Split data into train and test sets
## (validation data comes from the K-fold split inside train_spacy_model)

# iterative_train_test_split expects a 2D feature array and a 2D binary label
# array, so the text column is reshaped to a single-column matrix.
X_texts = df_user_inputs['text'].values.reshape(-1, 1)
y = df_labels.values

# 80:20 train/test split with multi-label stratification
train_texts, y_train, test_texts, y_test = iterative_train_test_split(
    X_texts, y, test_size=0.2
)

# Sanity checks: every split must keep texts and labels aligned
assert train_texts.shape[0] == y_train.shape[0], "Mismatch in train data and labels"
assert test_texts.shape[0] == y_test.shape[0], "Mismatch in test data and labels"

# Back to flat 1D arrays of strings for the spaCy preparation step
train_texts = train_texts.ravel()
test_texts = test_texts.ravel()

print(train_texts.shape, y_train.shape, test_texts.shape)
train_texts

(3175,) (3175, 74) (799,)


array(['er is een teek op mijn been ik ben bang dat die er al een tijdje op heeft gezeten',
       'op mijn been zit een teek ik heb hem geprobeerd te verwijderen maar het lukt niet',
       'ik heb allergieen', ...,
       'vannacht met slapen denk ik gekke beweging gemaakt want mn nek is nu helemaal stijf kan niet meer naar rechts kijken',
       'heb al langere tijd pijn in mn nek krijg dan soms tintelingen over mijn arm heb dan ook minder kracht in mijn arm',
       'doet zeer als ik mn hoofd beweeg'], dtype=object)

## **Model Building: spaCy NL**

For simplicity, we train and evaluate base classifiers with OneVsRestClassifier. We select classifiers that are known to work well with multi-label text classification, such as Naive Bayes, SVM, and Logistic Regression.

To handle class imbalance, we consider the following:
- Class weights in loss function
- Evaluation metrics: F1 score (micro) for individual labels and Hamming loss for overall metrics (i.e. evaluate label prediction rather than label combination)

We identify the best-performing classifier based on the F1 score and Hamming loss. Then we proceed to test this classifier with more advanced techniques that consider label correlations, i.e. Classifier Chains (CC) and Random k-Labelsets (RAkEL).

### **A. Data & Model Preparation**

In [None]:
def initialize_spacy_model(model_size="sm", labels=None):
  """
  Load a Dutch spaCy pipeline and append a multi-label text classifier to it.

  Args:
      model_size (str): Suffix of the `nl_core_news_*` package to load
          (e.g. "sm" or "md").
      labels (iterable of str, optional): Category names to register on the
          classifier. Defaults to the columns of the global `df_labels`.
  Returns:
      spacy.Language: The loaded pipeline with a `textcat_multilabel`
      component added as its last pipe, after `nlp.initialize()` was called.
  """
  if labels is None:
      labels = df_labels.columns

  nlp = spacy.load(f'nl_core_news_{model_size}')

  # If we have more time, we could add in a transformer, which could boost
  # performance. Sketch of the configuration (not enabled):
  #
  #   transformer_config = {
  #       "model": {
  #           "@architectures": "spacy-transformers.TransformerModel.v1",
  #           "name": "wietsedv/bert-base-dutch-cased",
  #           "tokenizer_config": {"use_fast": True},
  #           "get_spans": {"@span_getters": "spacy-transformers.strided_spans.v1",
  #                         "window": 128, "stride": 96}
  #       }
  #   }
  #   transformer = nlp.add_pipe("transformer", config=transformer_config)

  # Add the text classification component at the end of the pipeline
  textcat = nlp.add_pipe("textcat_multilabel", last=True)

  # Register every label on the classifier
  for label in labels:
      textcat.add_label(label)

  # NOTE(review): nlp.initialize() is called on the whole pipeline, not only on
  # the new component; presumably this also re-initializes the pretrained
  # components that were just loaded -- confirm against the spaCy docs.
  nlp.initialize()

  return nlp

In [None]:
def prepare_spacy_data(set_texts, y_set, model_size="sm", nlp=None):
  """
  Convert texts and their label rows into spaCy Example objects.

  Args:
      set_texts (np.ndarray): Array of training/test texts.
      y_set (np.ndarray): Array of train/test label rows; column i belongs to
          the i-th column of the global `df_labels`.
      model_size (str): Size of the spaCy Dutch language model loaded when no
          `nlp` is passed. Defaults to "sm".
      nlp (spacy.Language, optional): Existing pipeline used to tokenize the
          texts. When omitted, a new one is created with
          `initialize_spacy_model(model_size)`.

  Returns:
      list: One spaCy Example per text, whose "cats" annotation maps every
      label name to the value found in the matching label row.

  Raises:
      ValueError: If `set_texts` and `y_set` differ in length (previously the
          longer one was silently truncated by `zip`).
  """
  if len(set_texts) != len(y_set):
      raise ValueError(
          f"Texts and labels differ in length: {len(set_texts)} != {len(y_set)}"
      )

  if nlp is None:
      # Only nlp.make_doc (tokenization) is used from this pipeline
      nlp = initialize_spacy_model(model_size)

  # Column order defines which position of a label row belongs to which label
  label_names = list(df_labels.columns)

  set_data = []
  for text, label_row in zip(set_texts, y_set):
      doc = nlp.make_doc(text)
      cats = {label: label_row[idx] for idx, label in enumerate(label_names)}
      set_data.append(Example.from_dict(doc, {"cats": cats}))

  return set_data

# Build the spaCy Example lists for both splits (default "sm" pipeline)
train_data = prepare_spacy_data(train_texts, y_train)
print(f"# training examples: {len(train_data)}")

# NOTE: the final evaluation cell rebuilds test_data with the "md" pipeline
test_data = prepare_spacy_data(test_texts, y_test)
print(f"# test examples: {len(test_data)}")

# training examples: 3175
# test examples: 799


In [None]:
def oversample_spacy(train_data, df_labels, threshold=30, n_copies=1):
    """
    Duplicate training examples that carry at least one minority label.

    A label is a minority label when its values, summed over `train_data`,
    are below `threshold`. Every example with a positive sum over the minority
    labels is appended `n_copies` more times to the end of the data.

    Args:
        train_data (list): Training examples; each must expose `y.cats` as a
            dict of label -> value (spaCy Example objects).
        df_labels (pd.DataFrame): Unused; kept so existing calls keep working.
        threshold (int): Labels with a total below this value are minority labels.
        n_copies (int): How many extra copies of each minority example to add.
            Defaults to 1, which matches the previous fixed behaviour.

    Returns:
        list: The original examples followed by the duplicated minority
        examples. An empty input yields an empty list.
    """
    # Nothing to oversample; also avoids indexing a 1D empty array below
    if not train_data:
        return list(train_data)

    # Rows = examples, columns = labels (in the dict order of `y.cats`)
    label_matrix = np.array([list(example.y.cats.values()) for example in train_data])

    # Boolean mask over label columns: True where the label is rare
    minority_labels = label_matrix.sum(axis=0) < threshold

    # Boolean mask over examples: True where any rare label is set
    minority_instances = label_matrix[:, minority_labels].sum(axis=1) > 0

    oversampled_data = [
        example
        for example, is_minority in zip(train_data, minority_instances)
        if is_minority
    ]

    return train_data + oversampled_data * n_copies

# NOTE(review): train_spacy_model rebuilds its own train_data from
# train_texts / y_train, so this oversampled list is not what the model is
# trained on. Re-running this cell oversamples the already oversampled data.
train_data = oversample_spacy(train_data, df_labels)
print(f"# training examples after oversampling: {len(train_data)}")

# training examples after oversampling: 3907


### **B. Set up Model Training & Evaluation**

In [None]:
def train_spacy_model(model_size="sm"):
  """
  Train a spaCy text classifier with 2-fold cross-validation.

  NOTE: a function with the same name is defined again further down in this
  notebook (with 5 folds); on a top-to-bottom run that later definition
  replaces this one.

  Args:
      model_size (str): Size of the spaCy Dutch language model.
  Returns:
      tuple: (nlp, validation_scores, hamming_losses), where `nlp` is the
      model trained in the last fold, `validation_scores` holds the last-epoch
      `nlp.evaluate` result on the validation part of each fold and
      `hamming_losses` the per-fold Hamming loss at a 0.5 threshold.
  """
  N_EPOCHS = 50
  BATCH_SIZE = 128
  DROPOUT = 0.2

  # Examples are rebuilt here from the global train_texts / y_train
  train_data = prepare_spacy_data(train_texts, y_train, model_size)

  kf = KFold(n_splits=2, shuffle=True, random_state=42)
  validation_scores = []
  hamming_losses = []

  print("Training the model...")

  # Cross-validation
  for fold, (train_indices, val_indices) in enumerate(kf.split(train_data)):
      print(f"\nFold {fold + 1}")
      print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format(
            'EPOCH', 'TR_LOSS', 'TR_AUC', 'VAL_AUC', 'TR_F1', 'VAL_F1', 'TR_P', 'VAL_P', 'TR_R', 'VAL_R'))

      nlp = initialize_spacy_model(model_size)  # Initialize a new model for each fold

      train_fold = [train_data[i] for i in train_indices]
      val_fold = [train_data[i] for i in val_indices]

      for epoch in range(N_EPOCHS):
          losses = {}
          # Shuffled in place with the (unseeded) global `random` generator
          random.shuffle(train_fold)
          for batch in minibatch(train_fold, size=BATCH_SIZE):
              nlp.update(batch, drop=DROPOUT, losses=losses)

          # Evaluate on train and validation set
          train_scores = nlp.evaluate(train_fold)
          val_scores = nlp.evaluate(val_fold)
          print('{0:d}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}\t{5:.3f}\t{6:.3f}\t{7:.3f}\t{8:.3f}\t{9:.3f}'.format(
                int(epoch), losses['textcat_multilabel'],
                train_scores['cats_macro_auc'], val_scores['cats_macro_auc'],
                train_scores['cats_micro_f'], val_scores['cats_micro_f'],
                train_scores['cats_micro_p'], val_scores['cats_micro_p'],
                train_scores['cats_micro_r'], val_scores['cats_micro_r']))


      # Evaluate on validation fold
      # Gold and predicted vectors are matched purely by dict value order
      y_true = np.array([list(example.y.cats.values()) for example in val_fold])
      y_pred = np.array([list(nlp(example.x.text).cats.values()) for example in val_fold])
      y_pred_binary = (y_pred >= 0.5).astype(int)
      hamming = hamming_loss(y_true, y_pred_binary)

      hamming_losses.append(hamming)
      # val_scores is the result of the last epoch of this fold
      validation_scores.append(val_scores)

  # nlp is the model of the last fold only
  return nlp, validation_scores, hamming_losses

In [None]:
## Evaluate spacy model on validation data from training averaged across k-folds
def spacy_val_hamming_loss(hamming_losses):
  """
  Print and return the Hamming loss averaged across validation folds.

  Args:
      hamming_losses (list): List of Hamming loss values from each validation fold.

  Returns:
      float: The mean of `hamming_losses`, or NaN when the list is empty.
  """
  losses = np.asarray(hamming_losses, dtype=float)

  # An empty list has no mean; report NaN explicitly instead of relying on
  # numpy's "mean of empty slice" warning.
  average_hamming = losses.mean() if losses.size else float('nan')
  print(f"Average Validation Hamming Loss: {average_hamming}")

  return average_hamming

In [None]:
## Evaluate spacy model on test data
def spacy_test_hamming_loss(nlp, test_data):
  """
  Compute and print the Hamming loss of a spaCy model on the test examples.

  Args:
      nlp (spacy.Language): The trained spaCy model.
      test_data (list): A list of spaCy Example objects representing the test data.
  """
  # Gold label vectors, one row per example (values in `y.cats` dict order)
  gold_rows = [list(example.y.cats.values()) for example in test_data]
  y_true_test = np.array(gold_rows)

  # Predicted category scores from running the pipeline on each raw text
  score_rows = []
  for example in test_data:
      predicted_doc = nlp(example.x.text)
      score_rows.append(list(predicted_doc.cats.values()))
  y_pred_test = np.array(score_rows)

  # Scores of at least 0.5 count as a positive prediction
  is_positive = y_pred_test >= 0.5
  y_pred_test_binary = is_positive.astype(int)

  hamming_test = hamming_loss(y_true_test, y_pred_test_binary)
  print(f"Test Hamming Loss: {hamming_test}")

### **C. Model Development and Experimentation**

In [None]:
def train_spacy_model(model_size="sm", n_epochs=50, batch_size=128, dropout=0.2,
                      n_splits=5, seed=None):
  """
  Train a spaCy text classifier with k-fold cross-validation.

  The examples are built from the global `train_texts` / `y_train`. For every
  fold a new model is initialized, trained for `n_epochs` epochs and scored on
  the fold's training and validation part after each epoch.

  Args:
      model_size (str): Size of the spaCy Dutch language model.
      n_epochs (int): Number of passes over the training part of each fold.
      batch_size (int): Number of examples per `nlp.update` call.
      dropout (float): Dropout rate passed to `nlp.update`.
      n_splits (int): Number of cross-validation folds.
      seed (int, optional): When given, seeds Python's `random` module, which
          drives the per-epoch shuffle. Model initialization is not affected.
  Returns:
      tuple: (nlp, validation_scores, hamming_losses), where `nlp` is the
      model trained in the LAST fold only, `validation_scores` is a list with
      the last-epoch `nlp.evaluate` result (a dict of scores) for each fold and
      `hamming_losses` is a list with each fold's Hamming loss at a 0.5
      threshold.
  """
  if seed is not None:
      random.seed(seed)

  train_data = prepare_spacy_data(train_texts, y_train, model_size)

  kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
  validation_scores = []
  hamming_losses = []

  header = '{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format(
      'EPOCH', 'TR_LOSS', 'TR_AUC', 'VAL_AUC', 'TR_F1', 'VAL_F1', 'TR_P', 'VAL_P', 'TR_R', 'VAL_R')
  row_format = '{0:d}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}\t{5:.3f}\t{6:.3f}\t{7:.3f}\t{8:.3f}\t{9:.3f}'

  print("Training the model...")

  nlp = None
  # Cross-validation
  for fold, (train_indices, val_indices) in enumerate(kf.split(train_data)):
      print(f"\nFold {fold + 1}")
      print(header)

      nlp = initialize_spacy_model(model_size)  # Initialize a new model for each fold

      train_fold = [train_data[i] for i in train_indices]
      val_fold = [train_data[i] for i in val_indices]

      val_scores = None
      for epoch in range(n_epochs):
          losses = {}
          random.shuffle(train_fold)
          for batch in minibatch(train_fold, size=batch_size):
              nlp.update(batch, drop=dropout, losses=losses)

          # Evaluate on train and validation set
          train_scores = nlp.evaluate(train_fold)
          val_scores = nlp.evaluate(val_fold)
          print(row_format.format(
                int(epoch), losses['textcat_multilabel'],
                train_scores['cats_macro_auc'], val_scores['cats_macro_auc'],
                train_scores['cats_micro_f'], val_scores['cats_micro_f'],
                train_scores['cats_micro_p'], val_scores['cats_micro_p'],
                train_scores['cats_micro_r'], val_scores['cats_micro_r']))

      # With n_epochs == 0 the loop never ran; still report a score for the fold
      if val_scores is None:
          val_scores = nlp.evaluate(val_fold)

      # Hamming loss on the validation fold. Gold and predicted values are
      # looked up by label name so the comparison does not depend on the two
      # dicts happening to share the same key order.
      label_names = list(val_fold[0].y.cats)
      y_true = np.array([[example.y.cats[name] for name in label_names]
                         for example in val_fold])
      predicted_cats = [nlp(example.x.text).cats for example in val_fold]
      y_pred = np.array([[cats[name] for name in label_names]
                         for cats in predicted_cats])
      y_pred_binary = (y_pred >= 0.5).astype(int)
      hamming = hamming_loss(y_true, y_pred_binary)

      hamming_losses.append(hamming)
      validation_scores.append(val_scores)

  return nlp, validation_scores, hamming_losses

In [None]:
## Evaluate spacy model on validation data from training averaged across k-folds
def evaluate_spacy_validation(f1_scores, hamming_losses):
  """
  Print and return the average F1 score and Hamming loss across validation folds.

  Args:
      f1_scores (list): One entry per validation fold. Each entry is either a
          plain F1 value or a score dict as produced by `nlp.evaluate` (the form
          `train_spacy_model` returns), from which 'cats_micro_f' is read.
      hamming_losses (list): List of Hamming loss values from each validation fold.

  Returns:
      tuple: (average_f1, average_hamming).
  """
  # train_spacy_model hands back whole score dicts; np.mean cannot average
  # those, so pull the micro F1 out of each dict first.
  f1_values = [
      score['cats_micro_f'] if isinstance(score, dict) else score
      for score in f1_scores
  ]

  # Calculate and print average metrics
  average_f1 = np.mean(f1_values)
  average_hamming = np.mean(hamming_losses)
  print(f"Average Validation F1 Score: {average_f1}")
  print(f"Average Validation Hamming Loss: {average_hamming}")

  return average_f1, average_hamming

In [None]:
## Evaluate spacy model on test data
def evaluate_spacy_test(nlp, test_data):
  """
  Evaluate the spaCy model on the test data; print and return micro F1 and Hamming loss.

  Args:
      nlp (spacy.Language): The trained spaCy model.
      test_data (list): A list of spaCy Example objects representing the test data.

  Returns:
      tuple: (f1_test, hamming_test), both computed on predictions binarized
      at a 0.5 threshold.

  Raises:
      ValueError: If `test_data` is empty.
  """
  test_data = list(test_data)
  if not test_data:
      raise ValueError("test_data is empty; nothing to evaluate")

  # Look values up by label name so gold and predicted columns are guaranteed
  # to line up, independent of the key order of the two dicts.
  label_names = list(test_data[0].y.cats)

  y_true_test = np.array([[example.y.cats[name] for name in label_names]
                          for example in test_data])
  predicted_cats = [nlp(example.x.text).cats for example in test_data]
  y_pred_test = np.array([[cats[name] for name in label_names]
                          for cats in predicted_cats])

  # Binarize y_pred_test
  y_pred_test_binary = (y_pred_test >= 0.5).astype(int)

  # Calculate F1 score and Hamming loss for the test set
  f1_test = f1_score(y_true_test, y_pred_test_binary, average='micro')
  hamming_test = hamming_loss(y_true_test, y_pred_test_binary)

  print(f"Test F1 Score: {f1_test}")
  print(f"Test Hamming Loss: {hamming_test}")

  return f1_test, hamming_test

### **D. Train spaCy model**

In [None]:
# NOTE(review): despite its name, f1_scores receives the per-fold score dicts
# returned by nlp.evaluate (train_spacy_model's `validation_scores`), not plain
# F1 values. `nlp` is the model trained in the last cross-validation fold.
nlp, f1_scores, hamming_losses = train_spacy_model(model_size="md")

Training the model...

Fold 1
EPOCH	TR_LOSS	TR_AUC	VAL_AUC	TR_F1	VAL_F1	TR_P 	VAL_P	TR_R 	VAL_R


  matches = self.matcher(doc, allow_missing=True, as_spans=False)


0	2.384	0.526	0.521	0.010	0.013	0.007	0.009	0.018	0.022
1	0.388	0.565	0.541	0.000	0.000	0.000	0.000	0.000	0.000
2	0.211	0.617	0.571	0.000	0.000	0.000	0.000	0.000	0.000
3	0.208	0.647	0.596	0.000	0.000	0.000	0.000	0.000	0.000
4	0.203	0.655	0.607	0.022	0.013	0.955	0.812	0.011	0.007
5	0.197	0.660	0.617	0.163	0.083	0.892	0.733	0.090	0.044
6	0.188	0.664	0.627	0.246	0.127	0.865	0.677	0.143	0.070
7	0.181	0.670	0.636	0.322	0.179	0.886	0.723	0.197	0.102
8	0.174	0.676	0.645	0.367	0.199	0.899	0.736	0.230	0.115
9	0.165	0.684	0.653	0.419	0.229	0.927	0.712	0.271	0.137
10	0.159	0.689	0.661	0.468	0.257	0.932	0.709	0.313	0.157
11	0.151	0.697	0.668	0.502	0.270	0.938	0.726	0.342	0.166
12	0.143	0.699	0.670	0.557	0.310	0.932	0.703	0.397	0.199
13	0.139	0.704	0.676	0.587	0.321	0.954	0.729	0.424	0.206
14	0.133	0.711	0.683	0.621	0.349	0.952	0.716	0.461	0.231
15	0.128	0.714	0.687	0.635	0.360	0.942	0.692	0.479	0.243
16	0.122	0.714	0.687	0.665	0.366	0.963	0.726	0.508	0.245
17	0.120	0.721	0.691	0.677	0.375	0.962	0.

  matches = self.matcher(doc, allow_missing=True, as_spans=False)


0	2.373	0.522	0.503	0.026	0.016	0.026	0.015	0.027	0.018
1	0.375	0.548	0.527	0.000	0.000	0.000	0.000	0.000	0.000
2	0.212	0.609	0.557	0.000	0.000	0.000	0.000	0.000	0.000
3	0.208	0.646	0.582	0.000	0.000	0.000	0.000	0.000	0.000
4	0.205	0.664	0.600	0.029	0.015	0.879	0.750	0.015	0.008
5	0.198	0.666	0.601	0.108	0.049	0.874	0.620	0.057	0.025
6	0.188	0.664	0.602	0.205	0.132	0.859	0.695	0.116	0.073
7	0.181	0.665	0.608	0.284	0.179	0.886	0.727	0.169	0.102
8	0.171	0.666	0.619	0.422	0.262	0.883	0.672	0.277	0.163
9	0.161	0.673	0.631	0.483	0.297	0.897	0.694	0.331	0.189
10	0.153	0.681	0.637	0.529	0.320	0.917	0.712	0.372	0.206
11	0.146	0.683	0.644	0.572	0.350	0.919	0.700	0.415	0.233
12	0.140	0.688	0.649	0.597	0.355	0.932	0.706	0.439	0.238
13	0.133	0.694	0.656	0.622	0.369	0.934	0.689	0.466	0.252
14	0.129	0.698	0.660	0.644	0.363	0.949	0.676	0.487	0.248
15	0.123	0.701	0.666	0.664	0.370	0.955	0.679	0.509	0.255
16	0.119	0.704	0.669	0.688	0.388	0.955	0.669	0.538	0.273
17	0.116	0.709	0.672	0.707	0.387	0.956	0.

In [None]:
# Hamming loss averaged over the cross-validation folds
spacy_val_hamming_loss(hamming_losses)

# Held-out test set: rebuild the examples with the "md" pipeline and score them
test_data = prepare_spacy_data(test_texts, y_test, model_size="md")
test_scores = nlp.evaluate(test_data)
spacy_test_hamming_loss(nlp, test_data)

# Shown as (macro AUC, micro F1, micro precision, micro recall)
tuple(
    test_scores[metric]
    for metric in ('cats_macro_auc', 'cats_micro_f', 'cats_micro_p', 'cats_micro_r')
)

Average Validation Hamming Loss: 0.013824256825285333


  matches = self.matcher(doc, allow_missing=True, as_spans=False)


Test Hamming Loss: 0.013564252613063627


(0.7637546840199227,
 0.4430555555555556,
 0.6744186046511628,
 0.3298862461220269)

The low validation and test F1 scores suggest that the model's performance in classifying the multi-label data is not that great. The fact that both validation and test scores are similar indicates that the model is not overfitting. However, the low F1 scores could point to underfitting or a model that struggles to capture the complexities of the data.

The Hamming loss being around 0.014 for both validation and test suggests a low rate of incorrect label assignments, which is good, but the F1 score implies there's room for improvement in the model's precision and recall balance.

Suggested next steps for future:
1. Implement Transformer-based Models: Try models like BERT or LLM (e.g. GPT, Llama2, Bloom, Mistral, etc) for better contextual understanding.
2. Hyperparameter Tuning: Adjust learning rates, epochs, and batch sizes to optimize performance.
3. Data Augmentation: Enhance the dataset, focusing on minority classes to address imbalance.
4. Advanced Preprocessing: Refine text preprocessing to improve feature extraction.
5. Error Analysis: Identify specific weaknesses of the model and target improvements accordingly.



### **2.2.4. Serialize Best spaCy NL Model**

In [None]:
# Save the trained model to a local directory (path is relative to the notebook)
output_directory = '../models/model_spacy'  # Replace with your desired output directory

# exist_ok=True creates the directory (and missing parents) when needed and is
# a no-op when it already exists, so the cell can be re-run safely.
os.makedirs(output_directory, exist_ok=True)

# Save the trained model to the output directory
nlp.to_disk(output_directory)

# Report where the model was written
print(f"Model saved to {output_directory}")

Model saved to /content/drive/MyDrive/Colab Notebooks/complaint_prediction_spacy
