# Model development

## Data & Package import

In [None]:
import sys
import os
# Put the parent of the current working directory on the import path
# (presumably so the project-local `utils` package resolves — confirm).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Parent of the working directory; used below as the base for data and model paths.
project_root = os.path.dirname(os.getcwd())

In [None]:
from dotenv import load_dotenv
load_dotenv()
from utils.gcp import load_data_from_gcs
from google.auth import credentials
from google.cloud import storage


import ast

# Build a GCS client from the service-account key, whose path (relative to
# the project root) comes from the environment.
project_root = os.path.dirname(os.getcwd())
service_account = os.path.join(project_root, os.getenv("GCP_SERVICE_ACCOUNT"))
client = storage.Client.from_service_account_json(service_account)

# Load data from GCS
bucket_name = os.getenv("GCP_BUCKET_NAME")
file_name = os.getenv("GCP_DATA_PATH")
data = load_data_from_gcs(bucket_name, file_name, client)

# SECURITY: the token column arrives as text from an external bucket, so it is
# parsed with ast.literal_eval (literals only) instead of eval, which would
# execute arbitrary expressions embedded in the file.
# Assumes each cell is the repr of a plain Python list — TODO confirm.
data["tokens"] = data["tokens"].apply(ast.literal_eval)
data["label"] = data["label"].astype(int)
# Shuffle data and reset_index
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Fold label 2 into label 1; every other label value is left untouched.
data['label'] = data['label'].replace({2: 1})

# Data Exploration

### EDA

#### Class balance

In [None]:
# Class distribution
# normalize=True yields the share of rows per label rather than raw counts.
class_distribution = data["label"].value_counts(normalize=True)
print(class_distribution)

#### Input Analysis

In [None]:
from tqdm import tqdm
# Explore input length and uniqueness

from collections import Counter
from itertools import islice

# Length analysis in preparation for padding
data['tokens_length'] = data['tokens'].apply(len)
mean_length = data['tokens_length'].mean()
quantiles_length = data['tokens_length'].quantile([0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])

# Unique tokens per sample
data["unique_tokens"] = data['tokens'].apply(lambda x: len(set(x)))
mean_unique_tokens = data['unique_tokens'].mean()
quantiles_unique_tokens = data['unique_tokens'].quantile([0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99])


# Vocabulary analysis: number of distinct tokens across all samples
vocab_size = len({tok for row in data['tokens'] for tok in row})


# Top most frequent tokens
# Counter gives constant-time updates; the previous membership test against
# list(dict.keys()) rescanned the whole vocabulary for every token.
_token_counter = Counter()
for row in tqdm(data.tokens, desc="Progress"):
    _token_counter.update(row)
# most_common() sorts by count, descending, keeping first-seen order for ties
token_count = dict(_token_counter.most_common())
top_tokens = dict(islice(token_count.items(), 20))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Bar labels (tokens) and bar heights (their counts), in dictionary order
x = [*top_tokens]
y = [*top_tokens.values()]

# One bar per token
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.barplot(x=x, y=y, ax=ax)
ax.set_title("Top 20 most frequent tokens")
ax.set_xlabel("Token")
ax.set_ylabel("Frequency")
# Tilt the tick labels so neighbouring tokens do not overlap
plt.xticks(rotation=45)
plt.show()

In [None]:
# Token length distribution and outlier detection
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

fig, ax = plt.subplots(1, 2, figsize=(10, 5))
hist_ax, diag_ax = ax

# Work on log(1 + length) rather than the raw token count
data['log_tokens_length'] = data['tokens_length'].map(lambda n: np.log(n + 1))
log_len = data['log_tokens_length']

# Left panel: histogram with a KDE overlay
sns.histplot(log_len, kde=True, ax=hist_ax)
hist_ax.set_title("Token length distribution")

# Right panel: one point per row, plotted against the row index
sns.scatterplot(x=list(data.index), y='log_tokens_length', data=data, ax=diag_ax)
diag_ax.set_title("Token length diagnostic plot")

# Location and spread estimates used for the two outlier bands
mean = log_len.mean()
median = log_len.median()
std = log_len.std()
mad = np.median(np.abs(log_len - median))

# Each bound is a pair: [0] = median +/- 2.5 * (1.48 * MAD), [1] = mean +/- 2.5 * std.
# (1.48 is presumably the usual MAD-to-sigma scale factor — confirm.)
mad_half_width = 2.5*(1.48*mad)
std_half_width = 2.5*std
upper_bound = median + mad_half_width, mean + std_half_width
lower_bound = median - mad_half_width, mean - std_half_width

# Red = median-based band and centre, green = mean-based band and centre
reference_lines = [
    (upper_bound[0], 'r', '--'),
    (lower_bound[0], 'r', '--'),
    (upper_bound[1], 'g', '--'),
    (lower_bound[1], 'g', '--'),
    (mean, 'g', '-'),
    (median, 'r', '-'),
]
for level, colour, dash in reference_lines:
    diag_ax.axhline(level, color=colour, linestyle=dash)

plt.show();

In [None]:
# Explore unique/length of tokens
# Rows with zero tokens produce NaN here (0 / 0).
data["unique_ratio"] = data['unique_tokens'] / data['tokens_length']

# Apply log transformation
# The column previously held an untransformed copy of the ratio even though
# its name and the unused epsilon indicate a log was intended; epsilon guards
# against log(0).
epsilon = 1e-6
data['log_unique_ratio'] = np.log(data['unique_ratio'] + epsilon)


# Unique ratio distribution (raw ratio, not the log)
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
sns.histplot(data['unique_ratio'], kde=True, ax=ax)
plt.title("Unique ratio distribution")
plt.show();

#### Remove outliers

In [None]:
# Keep a row only if it passes all three filters:
#   - log token length strictly inside the median-based band (bounds[0])
#   - at least one token
#   - unique-token ratio above 0.2
within_band = (data['log_tokens_length'] > lower_bound[0]) & (data['log_tokens_length'] < upper_bound[0])
has_tokens = data['tokens_length'] > 0
varied_enough = data['unique_ratio'] > 0.2
data = data[within_band & has_tokens & varied_enough]

In [None]:
# Save data locally
import pickle
path = os.path.join(project_root, "datasets/processed/data.pkl")
# Create the target folder on a fresh checkout; open() would otherwise raise
# FileNotFoundError.
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, "wb") as f:
    pickle.dump(data, f)

# Training Setup & Embedding

#### Tokenizer & Embedding

Tokenizer

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Sizing parameters, both derived from the pre-tokenised column:
# the number of distinct tokens and the longest token list.
tokens = list({tok for row in data['tokens'] for tok in row})
max_tokens = len(tokens)
max_length = data.tokens_length.max()

# Integer-encoding layer that pads/truncates every sample to max_length
vectorizer_options = {
    "max_tokens": max_tokens,
    "output_mode": "int",
    "output_sequence_length": int(max_length),
    "name": "text_vectorizer",
}
text_vectorizer = TextVectorization(**vectorizer_options)

# Learn the vocabulary from the raw text column.
# NOTE(review): sizes above come from `tokens`, the vocabulary from
# `corrected_text` — confirm both columns use the same tokenisation.
text_vectorizer.adapt(data["corrected_text"])

Embedding

In [None]:
# Load the pre-trained model
from gensim.models import KeyedVectors
word2vec = KeyedVectors.load_word2vec_format(
    os.path.join(project_root, 'datasets/.originals/Google-News-Vectors-Negative-300.bin'),
    binary=True,
)

# Initialize the embedding matrix: one row per vocabulary entry
embedding_dim = 300
vocabulary = text_vectorizer.get_vocabulary()  # fetched once and reused below
embedding_matrix = np.zeros((len(vocabulary), embedding_dim))

# Seeded generator so the random rows are identical from run to run, in line
# with the fixed seeds used for shuffling, splitting and model building.
rng = np.random.default_rng(42)

# Fill the embedding matrix with Word2Vec vectors
for i, word in tqdm(enumerate(vocabulary), total=len(vocabulary), desc='progression'):
    if i == 0:  # Reserved for padding, row stays all-zero
        continue
    if word != "[UNK]" and word in word2vec:
        embedding_matrix[i] = word2vec[word]
    else:
        # "[UNK]" and words missing from Word2Vec get a uniform [0, 1) vector
        embedding_matrix[i] = rng.random(embedding_dim)

# Release the pre-trained vectors once the matrix is built
del word2vec

In [None]:
from tensorflow.keras.layers import Embedding
embedding_dim = 300
# One embedding row per vocabulary entry, initialised from embedding_matrix
# and excluded from training.
vocabulary_size = len(text_vectorizer.get_vocabulary())
embedding_layer = Embedding(
    vocabulary_size,
    embedding_dim,
    weights=[embedding_matrix],
    trainable=False,
    name="embedding",
)

#### Model Setup (metrics, input, build_model)

Custom Metrics

In [None]:
import tensorflow as tf
# Define custom metrics
class PrecisionMulticlass(tf.keras.metrics.Metric):
    """Precision averaged over ``n_class`` classes.

    ``y_pred`` rows are reduced to a hard prediction with argmax; ``y_true``
    is read column by column and multiplied with the one-hot prediction, so
    one-hot rows are presumably expected (confirm against the caller).
    True and false positives are accumulated per class, and ``result``
    returns the unweighted mean of the per-class precisions.
    """

    def __init__(self, name='precision', n_class=2, **kwargs):
        super().__init__(name=name, **kwargs)
        # Never read by this class; kept so the metric creates the same
        # variables as before.
        self.precision = self.add_weight(shape=(n_class,), name='precision', initializer='zeros')
        self.n_class = n_class
        self.true_positives = self.add_weight(
            name='true_positives', shape=(self.n_class,), initializer='zeros')
        self.false_positives = self.add_weight(
            name='false_positives', shape=(self.n_class,), initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add one batch to the per-class counters (``sample_weight`` is ignored)."""
        truth = tf.cast(y_true, tf.int64)
        hard_pred = tf.cast(tf.one_hot(tf.argmax(y_pred, axis=1), self.n_class), tf.int64)

        classes = range(self.n_class)
        # Per class: predicted and actually that class
        batch_tp = tf.stack([
            tf.reduce_sum(truth[:, c] * hard_pred[:, c]) for c in classes
        ])
        # Per class: predicted that class while the true column is 0
        batch_fp = tf.stack([
            tf.reduce_sum(tf.cast(
                tf.logical_and(tf.equal(truth[:, c], 0), tf.equal(hard_pred[:, c], 1)),
                tf.int64))
            for c in classes
        ])

        self.true_positives.assign_add(tf.cast(batch_tp, self.true_positives.dtype))
        self.false_positives.assign_add(tf.cast(batch_fp, self.false_positives.dtype))

    def result(self):
        """Return the mean per-class precision; epsilon avoids division by zero."""
        predicted_positive = self.true_positives + self.false_positives
        per_class = self.true_positives / (predicted_positive + tf.keras.backend.epsilon())
        return tf.reduce_mean(per_class)

    def reset_state(self):
        """Zero both counters."""
        for counter in (self.true_positives, self.false_positives):
            counter.assign(tf.zeros(self.n_class))

    def get_config(self):
        """Return the base config extended with ``n_class``."""
        config = super().get_config()
        config["n_class"] = self.n_class
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the metric from ``get_config`` output."""
        return cls(**config)
        

class RecallMulticlass(tf.keras.metrics.Metric):
    """Recall averaged over ``n_class`` classes.

    ``y_pred`` rows are reduced to a hard prediction with argmax; ``y_true``
    is read column by column, so one-hot rows are presumably expected
    (confirm against the caller). True positives and false negatives are
    accumulated per class, and ``result`` returns the unweighted mean of the
    per-class recalls.
    """

    def __init__(self, name='recall', n_class=2, **kwargs):
        super().__init__(name=name, **kwargs)
        # Never read by this class; kept so the metric creates the same
        # variables as before.
        self.recall = self.add_weight(shape=(n_class,), name='recall', initializer='zeros')
        self.n_class = n_class
        self.true_positives = self.add_weight(
            name='true_positives', shape=(self.n_class,), initializer='zeros')
        self.false_negatives = self.add_weight(
            name='false_negatives', shape=(self.n_class,), initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add one batch to the per-class counters (``sample_weight`` is ignored)."""
        truth = tf.cast(y_true, tf.int64)
        hard_pred = tf.cast(tf.one_hot(tf.argmax(y_pred, axis=1), self.n_class), tf.int64)

        classes = range(self.n_class)
        # Per class: predicted and actually that class
        batch_tp = tf.stack([
            tf.reduce_sum(truth[:, c] * hard_pred[:, c]) for c in classes
        ])
        # Per class: true column is 1 but the class was not predicted
        batch_fn = tf.stack([
            tf.reduce_sum(tf.cast(
                tf.logical_and(tf.equal(truth[:, c], 1), tf.equal(hard_pred[:, c], 0)),
                tf.int64))
            for c in classes
        ])

        self.true_positives.assign_add(tf.cast(batch_tp, self.true_positives.dtype))
        self.false_negatives.assign_add(tf.cast(batch_fn, self.false_negatives.dtype))

    def result(self):
        """Return the mean per-class recall; epsilon avoids division by zero."""
        actual_positive = self.true_positives + self.false_negatives
        per_class = self.true_positives / (actual_positive + tf.keras.backend.epsilon())
        return tf.reduce_mean(per_class)

    def reset_state(self):
        """Zero both counters."""
        for counter in (self.true_positives, self.false_negatives):
            counter.assign(tf.zeros(self.n_class))

    def get_config(self):
        """Return the base config extended with ``n_class``."""
        config = super().get_config()
        config["n_class"] = self.n_class
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the metric from ``get_config`` output."""
        return cls(**config)
        
        
        
class F1ScoreMulticlass(tf.keras.metrics.Metric):
    """F1 score built from the averaged precision and averaged recall.

    Wraps one ``PrecisionMulticlass`` and one ``RecallMulticlass`` and returns
    the harmonic mean of their results. Note this combines the two class
    averages; it is not the average of per-class F1 scores.
    """

    def __init__(self, name='f1_score', n_class=2, **kwargs):
        super().__init__(name=name, **kwargs)
        self.n_class = n_class
        self.precision = PrecisionMulticlass(n_class=n_class)
        self.recall = RecallMulticlass(n_class=n_class)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Forward the batch to the precision metric, then the recall metric."""
        for part in (self.precision, self.recall):
            part.update_state(y_true, y_pred, sample_weight)

    def result(self):
        """Return 2PR / (P + R), with epsilon guarding the denominator."""
        p = self.precision.result()
        r = self.recall.result()
        return 2 * (p * r) / (p + r + tf.keras.backend.epsilon())

    def reset_state(self):
        """Reset both wrapped metrics."""
        self.precision.reset_state()
        self.recall.reset_state()

    def get_config(self):
        """Return the base config extended with ``n_class``."""
        config = super().get_config()
        config["n_class"] = self.n_class
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the metric from ``get_config`` output."""
        return cls(**config)
        
        

Custom Loss

In [None]:
# Define custom weighted loss
# value_counts() orders by frequency, not by label. The loss consumes the
# weights positionally (position i -> class i), so sort by label first;
# otherwise the weights are swapped whenever class 0 is not the majority.
class_distribution = data['label'].value_counts(normalize=True).sort_index()
weights = (1/class_distribution) # Inverse-frequency weight per class
weights = weights / weights.sum() # Normalize so the weights sum to 1

# Define custom loss
class WeightedCategoricalCrossEntropy(tf.keras.losses.Loss):
    """Categorical cross-entropy with one multiplicative weight per class.

    Args:
        weights: sequence of per-class weights; position ``i`` is applied to
            column ``i`` of ``y_true`` / ``y_pred``.
        name: name of the loss instance.
        **kwargs: forwarded to ``tf.keras.losses.Loss`` (e.g. ``reduction``).
    """

    def __init__(self, weights, name='weighted_categorical_crossentropy', **kwargs):
        # Forward name/kwargs: they were previously dropped, so the requested
        # name and any `reduction` restored by from_config were ignored.
        super().__init__(name=name, **kwargs)
        # Plain floats are kept so get_config stays serialisable.
        self._weight_values = [float(w) for w in weights]
        self.weights = tf.constant(self._weight_values, dtype=tf.float32)

    def call(self, y_true, y_pred):
        """Return the batch mean of the weighted per-sample cross-entropy."""
        y_true = tf.cast(y_true, tf.float32)
        y_pred = tf.cast(y_pred, tf.float32)
        # Clip y_pred to avoid log(0)
        y_pred = tf.clip_by_value(y_pred, tf.keras.backend.epsilon(), 1 - tf.keras.backend.epsilon())
        weighted_losses = -self.weights * y_true * tf.math.log(y_pred)
        return tf.reduce_mean(tf.reduce_sum(weighted_losses, axis=1))

    def get_config(self):
        """Return the base config plus the weights as a list of floats."""
        config = super().get_config()
        config.update({
            # A tf.Tensor here is not serialisable; export Python floats.
            "weights": list(self._weight_values),
        })
        return config

    @classmethod
    def from_config(cls, config):
        """Rebuild the loss from ``get_config`` output."""
        return cls(**config)

Callbacks

In [None]:
import os
import tensorflow as tf

# Early stopping

def early_stopping(monitor: str = 'val_recall', patience: int = 10, min_delta: float = 0.001):
    """Build an EarlyStopping callback that maximises ``monitor``.

    The default is the validation recall, the same quantity
    ``model_checkpoint`` tracks. The previous hard-coded ``'recall'`` is the
    training metric, so stopping and best-weight restoration ignored the
    validation data.

    Args:
        monitor: name of the logged metric to watch.
        patience: epochs without improvement before training stops.
        min_delta: smallest change that counts as an improvement.

    Returns:
        A ``tf.keras.callbacks.EarlyStopping`` that restores the best weights.
    """
    return tf.keras.callbacks.EarlyStopping(
        monitor=monitor,
        patience=patience,
        mode='max',
        min_delta=min_delta,
        restore_best_weights=True
    )

# TensorBoard
def tensorboard(log_dir:str = os.path.join(project_root, "logs", "fit")):
    """Return a TensorBoard callback writing to ``log_dir`` with histograms every epoch.

    The default directory is computed once, when this function is defined.
    """
    board = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    return board

# ModelCheckpoint
def model_checkpoint(model_name):
    """Return a ModelCheckpoint callback saving under ``<project_root>/models/<model_name>``.

    Only the best full model (not just weights) according to the highest
    ``val_recall`` is kept.
    """
    target = os.path.join(project_root, "models", f"{model_name}")
    options = {
        "filepath": target,
        "monitor": 'val_recall',
        "save_best_only": True,
        "save_weights_only": False,
        "save_format": 'tf',
        "mode": 'max',
        "verbose": 1,
    }
    return tf.keras.callbacks.ModelCheckpoint(**options)

Build Dataset

In [None]:
def build_dataset(data: pd.DataFrame, batch_size:int = 512):
    """Split ``data`` 80/20 and wrap both parts as batched tf.data datasets.

    Features are the ``corrected_text`` column; targets are the ``label``
    column one-hot encoded over two classes. The split uses a fixed seed (42).

    Returns:
        A 3-tuple ``(train_dataset, test_dataset, test_index)``, where
        ``test_index`` is the DataFrame index of the rows in the test part.
    """
    from tensorflow.keras.utils import to_categorical
    from sklearn.model_selection import train_test_split

    features = data["corrected_text"]
    one_hot_labels = to_categorical(data["label"], num_classes=2)

    X_train, X_test, y_train, y_test = train_test_split(
        features, one_hot_labels, test_size=0.2, random_state=42
    )

    def _batched(texts, targets):
        # (text, one-hot label) pairs, grouped into batches
        return tf.data.Dataset.from_tensor_slices((texts, targets)).batch(batch_size)

    return _batched(X_train, y_train), _batched(X_test, y_test), X_test.index

In [None]:
import os
import pickle

# Save the TextVectorization layer configuration and weights
# Snapshot the TextVectorization layer configuration and weights
text_vectorizer_config = text_vectorizer.get_config()
text_vectorizer_weights = text_vectorizer.get_weights()

# Create the output folder if needed; open() does not create parent directories.
text_vectorizer_dir = os.path.join(project_root, 'models', 'text_vectorizer')
os.makedirs(text_vectorizer_dir, exist_ok=True)

with open(os.path.join(text_vectorizer_dir, 'config.pkl'), 'wb') as f:
    pickle.dump(text_vectorizer_config, f)

with open(os.path.join(text_vectorizer_dir, 'weights.pkl'), 'wb') as f:
    pickle.dump(text_vectorizer_weights, f)

Train / Test class balance check

In [None]:
# build_dataset returns (train dataset, test dataset, test-row index); the
# previous 6-way unpacking raised ValueError. The one-hot labels are read back
# from the batched datasets instead.
train_dataset, test_dataset, test_indices = build_dataset(data)

# Inspect class balance on training set and test set
y_train = np.concatenate([labels for _, labels in train_dataset.as_numpy_iterator()], axis=0)
y_test = np.concatenate([labels for _, labels in test_dataset.as_numpy_iterator()], axis=0)
train_labels = y_train.argmax(axis=1)
test_labels = y_test.argmax(axis=1)

# Train balance analysis
class_count = np.unique(train_labels, return_counts=True)[1]
total = class_count.sum()
train_class_distribution = class_count / total
print(f"Train class distribution: {train_class_distribution}")

# Test balance analysis
class_count = np.unique(test_labels, return_counts=True)[1]
total = class_count.sum()
test_class_distribution = class_count / total
print(f"Test class distribution: {test_class_distribution}")

# Total class distrib
print(f"Total class distribution : {list(class_distribution)}")

Model

In [None]:
def build_model(layers: list, loss: tf.keras.losses.Loss, metrics: list, model_name: str):
    """Assemble and compile a text classifier around the shared front end.

    The graph is: string input -> ``text_vectorizer`` -> ``embedding_layer``
    -> the given ``layers`` in order -> 2-unit softmax output. The model is
    compiled with ``loss``, ``metrics`` and a legacy Adam optimizer, its
    summary is printed, and the compiled model is returned.
    """
    # Fixed seed for the TensorFlow random generator
    tf.random.set_seed(42)

    inputs = tf.keras.layers.Input(shape=(1,), dtype=tf.string, name='features')

    # Front end (module-level vectorizer and embedding), then the caller's stack
    x = inputs
    for stage in (text_vectorizer, embedding_layer, *layers):
        x = stage(x)

    # Two-class probability head
    outputs = tf.keras.layers.Dense(2, activation='softmax', name='output')(x)

    model = tf.keras.Model(inputs, outputs, name=model_name)
    optimizer = tf.keras.optimizers.legacy.Adam()
    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)

    model.summary()

    return model
    
    

In [None]:
def train_model(model, training_data, testing_data, epochs:int = 100, callbacks:list = None):
    """Fit ``model`` on ``training_data``, validating on ``testing_data``.

    ``epochs`` and ``callbacks`` are passed straight to ``model.fit``, whose
    return value is handed back to the caller.
    """
    fit_options = {
        "validation_data": testing_data,
        "epochs": epochs,
        "callbacks": callbacks,
    }
    return model.fit(training_data, **fit_options)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report

def plot_classification_report(model, testing_data):
    """
    Evaluate a model on a batched dataset and chart its classification report.

    The overall metrics from ``model.evaluate`` are printed, then the
    per-label precision, recall and F1 score (labels named "Neutral" and
    "Toxic") are drawn as a horizontal grouped bar chart.

    Parameters:
    - model: A trained Keras model
    - testing_data: A tf.data.Dataset whose batches are (features, one-hot labels).

    Returns:
    - (ax, metrics): the matplotlib axis of the bar chart and the values
      returned by ``model.evaluate``.
    """

    # Overall results on the test set
    metrics = model.evaluate(testing_data)
    print("Overall results")
    for metric_name, metric_value in zip(model.metrics_names, metrics):
        print(f"{metric_name}: {metric_value}")

    # Predicted class index per sample
    y_pred = model.predict(testing_data).argmax(axis=1)

    # True class index per sample: element 1 of every batch holds the one-hot labels
    label_batches = [batch[1] for batch in testing_data.as_numpy_iterator()]
    true_labels = np.concatenate(label_batches, axis=0).argmax(axis=1)

    # Per-label report as a DataFrame (one row per report entry)
    report = classification_report(
        true_labels, y_pred, target_names=["Neutral", "Toxic"], output_dict=True
    )
    report_df = pd.DataFrame(report).transpose()

    # Grouped horizontal bars for the three scores
    score_columns = ['precision', 'recall', 'f1-score']
    ax = report_df[score_columns].plot(kind='barh', figsize=(10, 6))
    ax.set_title('Classification Report Metrics per Label')
    ax.set_xlabel('Scores')
    ax.set_ylabel('Labels')
    ax.legend(title="Metrics")

    plt.tight_layout()
    plt.show()

    return ax, metrics  # Axis is returned so callers can tweak the figure



def plot_metrics_with_seaborn(history):
    """Draw one figure per training metric, with its validation curve if present.

    Args:
        history: object with a ``history`` dict mapping metric names to
            per-epoch value lists (as returned by ``model.fit``).

    Returns:
        None. Returns without plotting when no training metric was recorded.
    """
    recorded = history.history

    # Training metrics are the keys without the 'val_' prefix. A prefix test
    # is used because a substring test would also drop any metric whose name
    # merely contains 'val_'.
    metrics = [key for key in recorded if not key.startswith('val_')]
    if not metrics:
        # Nothing to plot; avoids an IndexError on metrics[0] below
        return
    epochs = range(1, len(recorded[metrics[0]]) + 1)

    for metric in metrics:
        plt.figure(figsize=(6, 3))

        # Plot training metric
        sns.lineplot(x=epochs, y=recorded[metric], label=f'Training {metric.capitalize()}', marker='o')

        # Plot validation metric if available
        val_key = f'val_{metric}'
        if val_key in recorded:
            sns.lineplot(x=epochs, y=recorded[val_key], label=f'Validation {metric.capitalize()}', marker='o')

        # Formatting
        plt.title(f'{metric.capitalize()} Over Epochs', fontsize=16)
        plt.xlabel('Epochs', fontsize=14)
        plt.ylabel(metric.capitalize(), fontsize=14)
        plt.legend(loc='best', fontsize=12)
        plt.grid(True)
        plt.xticks(epochs)  # Show each epoch number on x-axis
        plt.tight_layout()  # Adjust layout for better fit
        plt.show()

# Training

In [None]:
import tensorflow as tf

# List the GPU devices TensorFlow can see (an empty list means none)
gpus = tf.config.experimental.list_physical_devices('GPU')
print(f"Available GPUs: {gpus}")


## Baseline Model

Build model

In [None]:
# Data
# build_dataset returns three values (train, test, test-row index); unpacking
# only two raised "too many values to unpack".
training_data, testing_data, test_indices = build_dataset(data)

# Metrics
metrics = [
    PrecisionMulticlass(n_class=2),
    RecallMulticlass(n_class=2),
    F1ScoreMulticlass(n_class=2),
]

# Loss: cross-entropy weighted by the inverse class frequencies
loss = WeightedCategoricalCrossEntropy(weights)

# Callbacks
callbacks = [
    early_stopping(),
    #tensorboard(),
    model_checkpoint("baseline_model")
]

# Layers: a single LSTM followed by two dense layers
layers = [
    tf.keras.layers.LSTM(64, return_sequences=False, name='lstm'),
    tf.keras.layers.Dense(64, activation='relu', name='dense_1'),
    tf.keras.layers.Dense(32, activation='relu', name='dense_2')
]

# Model
model = build_model(layers=layers, loss=loss, metrics=metrics, 
                    model_name="baseline_model")

Training

In [None]:
# Fit the baseline model for at most 100 epochs, validating on the test
# split and using the callbacks configured above.
baseline_model_history = train_model(model, training_data, testing_data, epochs=100, callbacks=callbacks)

Model Evaluation

In [None]:
# Print the evaluation metrics and chart per-label precision/recall/F1.
plot_classification_report(model, testing_data)

In [None]:
# One training-vs-validation curve per metric recorded during the fit.
plot_metrics_with_seaborn(baseline_model_history)

## Bidirectional LSTM

In [None]:
# Data
# build_dataset returns three values (train, test, test-row index); unpacking
# only two raised "too many values to unpack".
training_data, testing_data, test_indices = build_dataset(data)

# Metrics
metrics = [
    PrecisionMulticlass(n_class=2),
    RecallMulticlass(n_class=2),
    F1ScoreMulticlass(n_class=2),
]

# Loss: cross-entropy weighted by the inverse class frequencies
loss = WeightedCategoricalCrossEntropy(weights)

# Callbacks
# NOTE(review): the checkpoint is saved as "bi_lstm" while the model is named
# "bi_lstm_2" — confirm which name is intended.
callbacks = [
    early_stopping(),
    #tensorboard(),
    model_checkpoint("bi_lstm")
]

# Layers: bidirectional LSTM -> LSTM -> three dense layers, with dropout
layers = [
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, activation="tanh",return_sequences=True, name='bi_lstm')),
    tf.keras.layers.Dropout(0.2, name='dropout'),
    tf.keras.layers.LSTM(64, activation='tanh', name='lstm', return_sequences=False),
    tf.keras.layers.Dense(64, activation='relu', name='dense_1'),
    tf.keras.layers.Dropout(0.2, name='dropout_2'),
    tf.keras.layers.Dense(64, activation='relu', name='dense_2'),
    tf.keras.layers.Dense(16, activation='relu', name='dense_3')
]

# Model
bi_lstm_model = build_model(layers=layers, loss=loss, metrics=metrics, 
                    model_name="bi_lstm_2")

In [None]:
# Fit the bidirectional LSTM for at most 100 epochs, validating on the test split.
bi_lstm_model_history = train_model(bi_lstm_model, training_data, testing_data, epochs=100, callbacks=callbacks)

In [None]:
# Print the evaluation metrics and chart per-label precision/recall/F1.
plot_classification_report(bi_lstm_model, testing_data)

In [None]:
# One training-vs-validation curve per metric recorded during the fit.
plot_metrics_with_seaborn(bi_lstm_model_history)

## GRU model

In [None]:
# Reload the processed frame saved earlier and rebuild the train/test datasets.
processed_path = os.path.join(project_root, "datasets/processed/data.pkl")
data = pd.read_pickle(processed_path)
# Fold label 2 into label 1 (has no effect if the saved frame was already merged)
data['label'] = data['label'].replace({2: 1})
training_data, testing_data, test_indices = build_dataset(data)


In [None]:
# Data
#training_data, testing_data = build_dataset(data)

# Metrics
metrics = [
    PrecisionMulticlass(n_class=2),
    RecallMulticlass(n_class=2),
    F1ScoreMulticlass(n_class=2),
]

# Loss: cross-entropy weighted by the inverse class frequencies
loss = WeightedCategoricalCrossEntropy(weights)

# Callbacks
# The checkpoint name was "bi_lstm" (copied from the previous section), which
# overwrote the Bi-LSTM checkpoint and left nothing under models/bi_gru,
# the path this notebook later loads the GRU model from.
callbacks = [
    early_stopping(),
    #tensorboard(),
    model_checkpoint("bi_gru")
]

# Layers: bidirectional GRU -> GRU -> two dense layers, with dropout
layers = [
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, activation="tanh",return_sequences=True, name='bi_GRU')),
    tf.keras.layers.Dropout(0.2, name='dropout'),
    tf.keras.layers.GRU(128, activation='tanh', name='GRU'),
    tf.keras.layers.Dropout(0.2, name='dropout_1'),
    tf.keras.layers.Dense(64, activation='relu', name='dense_1'),
    tf.keras.layers.Dropout(0.2, name='dropout_2'),
    tf.keras.layers.Dense(16, activation='relu', name='dense_2')
]

# Model
bi_gru_model = build_model(layers=layers, loss=loss, metrics=metrics, 
                    model_name="bi_gru")

In [None]:
# Fit the GRU model for at most 100 epochs, validating on the test split.
bi_gru_model_history = train_model(bi_gru_model, training_data, testing_data, epochs=100, callbacks=callbacks)

In [None]:
from functools import partial
from utils.custom_metrics import PrecisionMulticlass, RecallMulticlass, F1ScoreMulticlass, WeightedCategoricalCrossEntropy
# Class weights are hard-coded here instead of being recomputed from the data
weights = pd.Series([0.334298, 0.665702])

# Custom classes Keras needs in order to deserialise the saved model; the
# loss is pre-bound to its weights.
custom_objects = {
    'PrecisionMulticlass': PrecisionMulticlass,
    'RecallMulticlass': RecallMulticlass,
    'F1ScoreMulticlass': F1ScoreMulticlass,
    'WeightedCategoricalCrossEntropy': partial(WeightedCategoricalCrossEntropy, weights=weights),
}
saved_model_dir = os.path.join(project_root, "models", "bi_gru")
model = tf.keras.models.load_model(saved_model_dir, custom_objects=custom_objects)

In [None]:
# Load testing data
# Restores a previously saved tf.data.Dataset from disk, replacing the
# in-memory test split; the bare name on the last line displays it in the
# notebook output.
testing_data = tf.data.Dataset.load(os.path.join(project_root, 'datasets', 'training', 'testing_data_2labels_tf'))
testing_data

In [None]:
# Evaluate the reloaded model on the reloaded test dataset.
plot_classification_report(model, testing_data)

### Model results analysis

In [None]:
# Predict on the test dataset
y_pred_probs = model.predict(testing_data)
y_pred = tf.argmax(y_pred_probs, axis=1)

# Get the true labels
y_true = tf.concat([y for x, y in testing_data], axis=0)
y_true = tf.argmax(y_true, axis=1)

# Positions, within the test set, of wrong and right predictions. Taking
# column 0 keeps the result 1-D even when exactly one row matches, which
# tf.squeeze would turn into a scalar.
incorrect_indices = tf.where(y_pred != y_true)[:, 0].numpy()
corrected_indices = tf.where(y_pred == y_true)[:, 0].numpy()

# Those positions refer to the test split, not to `data`, so data.iloc[...]
# picked unrelated rows. Select the test rows first, via the index labels
# returned by build_dataset.
# NOTE(review): assumes `testing_data` holds the same rows, in the same order,
# as the split that produced `test_indices` — confirm for the dataset loaded
# from disk.
test_rows = data.loc[test_indices]
if len(test_rows) != len(y_pred_probs):
    raise ValueError(
        f"test_indices has {len(test_rows)} rows but the model produced "
        f"{len(y_pred_probs)} predictions"
    )

# Retrieve the corresponding rows from the original dataframe
failed_predictions = test_rows.iloc[incorrect_indices].copy()
correct_predictions = test_rows.iloc[corrected_indices].copy()

# Add predicted probabilities and final predictions as new columns
failed_predictions['predicted_probabilities'] = y_pred_probs[incorrect_indices].tolist()
failed_predictions['final_prediction'] = y_pred.numpy()[incorrect_indices]
correct_predictions['predicted_probabilities'] = y_pred_probs[corrected_indices].tolist()
correct_predictions['final_prediction'] = y_pred.numpy()[corrected_indices]

In [None]:
# Compare failed and correct predictions side by side, one histogram per panel
fig, ax = plt.subplots(1, 4, figsize=(15, 5))

# (rows, column to plot, panel title), left to right
panels = [
    (failed_predictions, 'log_tokens_length', "Tokens length in failed predictions"),
    (correct_predictions, 'log_tokens_length', "Tokens length in correct predictions"),
    (failed_predictions, 'unique_tokens', "Unique tokens in failed predictions"),
    (correct_predictions, 'unique_tokens', "Unique tokens in correct predictions"),
]
for axis, (frame, column, title) in zip(ax, panels):
    sns.histplot(frame[column], kde=True, ax=axis)
    axis.set_title(title)

plt.show();

In [None]:
# Sanity check: rows listed as failed whose stored label nevertheless equals
# the prediction; the first element of the shape is how many there are.
failed_predictions[failed_predictions.label == failed_predictions.final_prediction].shape

# MLFlow setup

In [None]:
import tensorflow as tf
import mlflow
import mlflow.tensorflow

project_root = os.path.dirname(os.getcwd())

# NOTE(review): `baseline_model`, `input_example`, `eval_loss`,
# `eval_precision`, `eval_recall` and `batch_size` are not defined in any cell
# visible in this notebook; they must exist before this cell is run.

# Set the experiment name (if it doesn't exist, it will be created)
mlflow.set_experiment("baseline_model_experiment")

# Start an MLflow run; the context manager ends the run on exit, so no
# explicit mlflow.end_run() is needed.
with mlflow.start_run(run_name="baseline_model"):

    # Save the model structure as JSON
    structure_path = os.path.join(project_root, "model_structure.json")
    with open(structure_path, "w") as json_file:
        json_file.write(baseline_model.to_json())
    # Log only after the file is closed, so the artifact holds the flushed,
    # complete JSON.
    mlflow.log_artifact(structure_path)

    mlflow.tensorflow.save_model(baseline_model, 'baseline_model',
                                 input_example=input_example)

    # Save the model's weights
    weights_path = os.path.join(project_root, "model_weights.h5")
    baseline_model.save_weights(weights_path)
    mlflow.log_artifact(weights_path)  # Save weights to MLflow

    # Alternatively, you can log the entire model in one go:
    mlflow.tensorflow.log_model(baseline_model, "full_model")

    # Save embedding matrix
    mlflow.log_artifact(os.path.join(project_root, 'models', 'embedding_matrix_300.npy'))

    # Log evaluation metrics
    mlflow.log_metric("eval_loss", eval_loss)
    mlflow.log_metric("eval_precision", eval_precision)
    mlflow.log_metric("eval_recall", eval_recall)

    # Log parameters
    mlflow.log_param("vocab_size", vocab_size)
    mlflow.log_param("embedding_dim", embedding_dim)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("epochs", len(baseline_model_history.history['loss']))
    # early_stopping is a factory function, not a callback: build one to read
    # the configured values (the function object has no such attributes).
    stopper = early_stopping()
    mlflow.log_param("early_stopping_patience", stopper.patience)
    mlflow.log_param("early_stopping_min_delta", stopper.min_delta)