In [8]:
import re
import pandas as pd
import numpy as np

from skmultilearn.model_selection import iterative_train_test_split

#import spacy
#import nltk
#from nltk.corpus import stopwords
#nltk.download('stopwords')

## **Load Data**

In [3]:
# Read the raw user inputs and the label matrix (both are ';'-separated files)
df_user_inputs = pd.read_csv('../dataset/user_inputs.csv', sep=';')
df_labels = pd.read_csv('../dataset/labels.csv', sep=';')

# The first column of each file is a leftover index column; discard it
df_user_inputs = df_user_inputs.drop(columns=df_user_inputs.columns[0])
df_labels = df_labels.drop(columns=df_labels.columns[0])

# Keep only the label columns that have at least 2 positive instances.
# The stratified split performed later needs this; according to the original
# note it only removes the 'no complaints' label, which has 0 instances.
label_counts = df_labels.sum(axis=0)
df_labels = df_labels.loc[:, label_counts >= 2]

# Inputs and labels must have the same number of rows
assert len(df_labels) == len(df_user_inputs), "Datasets do not align!"

print(df_user_inputs.shape)
df_user_inputs.head(10)

(3974, 1)


Unnamed: 0,text
0,Er is een teek op mijn been. Ik ben bang dat d...
1,Er is een teek op mijn rug en ik krijg hem er ...
2,Op mijn been zit een teek. Ik heb hem geprobee...
3,Ik heb allergieen
4,huid
5,roodheid
6,schilfering
7,Ik heb wratten onder mijn voet
8,Ik heb gisteren naar het bos geweest en zie nu...
9,Ik voelde iets prikken


## **Preprocess Data**

In [4]:
## Preprocess user input text

# Load the Dutch language model from Spacy
#nlp = spacy.load("nl_core_news_sm")

# Set of Dutch stopwords from NLTK
#dutch_stopwords = set(stopwords.words('dutch'))

def preprocess_text(text):
    """
    Normalises a user input string.

    The text is lowercased, every run of non-word characters (anything that
    is not a letter, digit or underscore) is replaced by a single space, and
    leading/trailing whitespace is stripped. Stopword removal and
    lemmatization are NOT applied by this function.

    Args:
        text (str): The text to preprocess.
    Returns:
        str: The preprocessed text.
    """
    # Convert to lowercase
    text = text.lower()

    # Replace each run of special characters (punctuation, whitespace, ...)
    # with one space
    text = re.sub(r'\W+', ' ', text)

    # Punctuation at the very start or end of the input would otherwise
    # leave a stray space behind (e.g. "teek." -> "teek ")
    return text.strip()

# Run the normaliser over every user input and store the result back in the 'text' column
df_user_inputs = df_user_inputs.assign(
    text=df_user_inputs['text'].map(preprocess_text)
)

df_user_inputs.head(10)

Unnamed: 0,text
0,er is een teek op mijn been ik ben bang dat di...
1,er is een teek op mijn rug en ik krijg hem er ...
2,op mijn been zit een teek ik heb hem geprobeer...
3,ik heb allergieen
4,huid
5,roodheid
6,schilfering
7,ik heb wratten onder mijn voet
8,ik heb gisteren naar het bos geweest en zie nu...
9,ik voelde iets prikken


## **Train-Test Split**

To accommodate the nature of multi-label classification, we do not use the traditional `train_test_split`. Instead, we employ iterative stratified sampling (`iterative_train_test_split`) to provide a well-balanced distribution of all label combinations across the training, validation, and test sets.

In [9]:
## Split data to train:val:test

# Prepare data for iterative train test split
# X must be 2D np.ndarray and y must be 2D binary np.ndarray
X_texts = df_user_inputs['text'].values
X_texts = X_texts.reshape(-1, 1)
y = df_labels.values

# Split the data 70:15:15 with multi-label stratification
# Seed NumPy's global RNG before splitting (no separate random state is passed to the splitter)
SEED = 42
np.random.seed(SEED)
# Step 1: hold out 30% of the rows; step 2: halve that hold-out into val and test (15% each)
train_texts, y_train, tmp_texts, y_tmp = iterative_train_test_split(X_texts, y, test_size = 0.3)
val_texts, y_val, test_texts, y_test = iterative_train_test_split(tmp_texts, y_tmp, test_size = 0.5)

# Sanity checks to confirm the shapes of the datasets
assert train_texts.shape[0] == y_train.shape[0], "Mismatch in train data and labels"
assert val_texts.shape[0] == y_val.shape[0], "Mismatch in val data and labels"
assert test_texts.shape[0] == y_test.shape[0], "Mismatch in test data and labels"
# Every input row must end up in exactly one of the three splits
assert (
    train_texts.shape[0] + val_texts.shape[0] + test_texts.shape[0] == X_texts.shape[0]
), "Splits do not cover the whole dataset"

# Flatten the (n, 1) text arrays back to 1D
train_texts, test_texts = train_texts.ravel(), test_texts.ravel()
val_texts = val_texts.ravel()

print(train_texts.shape, y_train.shape, val_texts.shape, test_texts.shape)
train_texts

(2773,) (2773, 74) (604,) (597,)


array(['er is een teek op mijn been ik ben bang dat die er al een tijdje op heeft gezeten',
       'roodheid', 'schilfering', ...,
       'vannacht met slapen denk ik gekke beweging gemaakt want mn nek is nu helemaal stijf kan niet meer naar rechts kijken',
       'heb al langere tijd pijn in mn nek krijg dan soms tintelingen over mijn arm heb dan ook minder kracht in mijn arm',
       'doet zeer als ik mn hoofd beweeg'], dtype=object)

## **Save data to csv**

In [17]:
# Wrap each split's texts in a single-column ('text') DataFrame
train_texts_df, val_texts_df, test_texts_df = (
    pd.DataFrame(split_texts, columns=['text'])
    for split_texts in (train_texts, val_texts, test_texts)
)

# Wrap each split's label matrix, reusing the label names as column headers
y_train_df, y_val_df, y_test_df = (
    pd.DataFrame(split_labels, columns=df_labels.columns)
    for split_labels in (y_train, y_val, y_test)
)

# Write texts first, then labels, each to its own ';'-separated CSV without the index
csv_targets = [
    (train_texts_df, '../dataset/train_texts.csv'),
    (val_texts_df, '../dataset/val_texts.csv'),
    (test_texts_df, '../dataset/test_texts.csv'),
    (y_train_df, '../dataset/y_train.csv'),
    (y_val_df, '../dataset/y_val.csv'),
    (y_test_df, '../dataset/y_test.csv'),
]
for frame, path in csv_targets:
    frame.to_csv(path, index=False, sep=';')
