In [1]:
# !pip install --upgrade pip

In [2]:
# !pip install transformers==4.29.0
# !pip install datasets
# !pip install huggingface-hub
# !pip install tensorflow==2.12.0 keras==2.12.0
# ! pip uninstall tensorflow
# ! pip install tensorflow-gpu

In [3]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel, TFDebertaV2ForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, f1_score, precision_recall_curve

2024-11-07 23:13:09.183854: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-11-07 23:13:09.302768: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-07 23:13:09.306480: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-11-07 23:13:09.306492: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore 

In [4]:
!nvidia-smi

Thu Nov  7 23:13:13 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Quadro RTX 6000                Off |   00000000:1B:00.0 Off |                  Off |
| 33%   34C    P8             14W /  260W |       3MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Quadro RTX 6000                Off

In [5]:
# ######Absolutely hide before submitting########

# login(token='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')

read the data from drive and remove empty values

In [6]:
url = "https://drive.google.com/uc?export=download&id=103IC2SS2II9Q0jdin4rTAXNiw3VshQ9Z"

data = pd.read_csv(url)
data = data.dropna(subset=['text_stopwords', 'written_text', 'emotions'])

data.head()

Unnamed: 0,text_stopwords,emotions,written_text,cluster
0,wife board insists eating real dinner,['amusement'],I couldn't get my wife on board. She insists o...,11
1,awful video wanted jump real cut start followe...,"['desire', 'disgust']",What an awful video. I just wanted to see the ...,14
2,imagining drug stay ping ponging air drone kee...,['excitement'],I love imagining that the drugs just stay ping...,1
3,completely correct might lost control blown ti...,['fear'],Completely correct. But he Might have lost con...,22
4,file keep phone accessing within,['caring'],This probably not a good file to keep on your ...,11


Check if all columns are correct

In [7]:
assert {'text_stopwords', 'written_text', 'emotions', 'cluster'}.issubset(data.columns), "Dataset must contain 'text_stopwords', 'written_text', 'emotions', 'cluster' columns."

Convert emotions from string representation of lists to actual lists

In [8]:
if isinstance(data['emotions'].iloc[0], str):
    import ast
    data['emotions'] = data['emotions'].apply(ast.literal_eval)

Prepare emotion labels using MultiLabelBinarizer

In [9]:
# Binarize emotion labels
mlb = MultiLabelBinarizer()
emotion_labels = mlb.fit_transform(data['emotions'])
emotion_classes = mlb.classes_
num_classes = len(emotion_classes)

Conver clusters to int

Tokenize texts

In [10]:
tokenizer = AutoTokenizer.from_pretrained("kamalkraj/deberta-v2-xlarge")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Tokenize 'written_text'
encodings_written = tokenizer(
    data['written_text'].tolist(),
    truncation=True,
    padding=True,
    max_length=128,
)

# Tokenize 'text_stopwords'
encodings_stopwords = tokenizer(
    data['text_stopwords'].tolist(),
    truncation=True,
    padding=True,
    max_length=128,
)

Prepare cluster input

In [12]:
# Prepare TensorFlow datasets
features = {
    'input_ids_written': encodings_written['input_ids'],
    'attention_mask_written': encodings_written['attention_mask'],
    'input_ids_stopwords': encodings_stopwords['input_ids'],
    'attention_mask_stopwords': encodings_stopwords['attention_mask'],
    'cluster': data['cluster'].astype(int).values,
}
labels = emotion_labels


In [13]:
print("First input text:", data['written_text'].iloc[0])
print("First label:", labels[0])

First input text: I couldn't get my wife on board. She insists on eating a "real dinner".
First label: [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [14]:
print("Sample labels:", labels[:5])
print("Labels shape:", labels.shape)

Sample labels: [[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Labels shape: (17966, 28)


In [15]:
import numpy as np
label_sums = labels.sum(axis=0)
print("Label counts per class:", label_sums)

Label counts per class: [1414  947  943 1234 1287  912  871 1059  844 1164 1018  946  820  921
  931 1102  670 1101  912  809  653 1118  713 1062  689  769 1139  861]


In [16]:
indices = np.arange(len(labels))  # len(labels) should be 17966

# Split indices
from sklearn.model_selection import train_test_split

train_indices, val_indices = train_test_split(indices, test_size=0.2, random_state=42)

# Initialize dictionaries to hold train and validation features
train_features = {}
val_features = {}

# Split each feature array using the indices
for key in features:
    feature_array = np.array(features[key])
    train_features[key] = feature_array[train_indices]
    val_features[key] = feature_array[val_indices]

# Split labels
train_labels = labels[train_indices]
val_labels = labels[val_indices]

Create Tensor dataset

In [17]:
def create_dataset(features, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(1000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

Split data into training and validation sets

In [18]:
batch_size = 16
train_dataset = create_dataset(train_features, train_labels, batch_size)
val_dataset = create_dataset(val_features, val_labels, batch_size)

2024-11-07 23:13:20.467440: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-11-07 23:13:20.467562: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2024-11-07 23:13:20.467598: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2024-11-07 23:13:20.467628: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcufft.so.10'; dlerror: libcufft.so.10: cannot open shared object file: No such file or directory
2024-11-07 23:13:20.468518: W tensorfl

Define Model

In [19]:
import tensorflow as tf

physical_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(physical_devices))
print("GPUs:", physical_devices)


Num GPUs Available:  0
GPUs: []


In [20]:
strategy = tf.distribute.MirroredStrategy(devices=["/gpu:0", "/gpu:1"])

with strategy.scope():

    # Model Definition
    from tensorflow.keras.layers import Input, Dense, Dropout, Concatenate, Embedding
    from tensorflow.keras.models import Model
    from transformers import TFDebertaV2ForSequenceClassification  # Updated import

    # Cluster Embedding
    num_clusters = data['cluster'].nunique()
    cluster_embedding_dim = 16

    # Inputs
    input_ids_written = Input(shape=(None,), dtype=tf.int32, name='input_ids_written')
    attention_mask_written = Input(shape=(None,), dtype=tf.int32, name='attention_mask_written')

    input_ids_stopwords = Input(shape=(None,), dtype=tf.int32, name='input_ids_stopwords')
    attention_mask_stopwords = Input(shape=(None,), dtype=tf.int32, name='attention_mask_stopwords')

    cluster_input = Input(shape=(), dtype=tf.int32, name='cluster_input')

    # Number of labels
    num_labels = num_classes  # Assuming num_classes is defined elsewhere (28 in your case)

    # DeBERTa V2 Models with Multi-Label Classification Heads
    deberta_model_written = TFDebertaV2ForSequenceClassification.from_pretrained(
        "kamalkraj/deberta-v2-xlarge",
        num_labels=num_labels,
        problem_type="multi_label_classification",
    )
    deberta_model_stopwords = TFDebertaV2ForSequenceClassification.from_pretrained(
        "kamalkraj/deberta-v2-xlarge",
        num_labels=num_labels,
        problem_type="multi_label_classification",
    )

    # Get logits from both models
    outputs_written = deberta_model_written(
        input_ids=input_ids_written,
        attention_mask=attention_mask_written,
        training=True
    )
    logits_written = outputs_written.logits  # Shape: (batch_size, num_labels)

    outputs_stopwords = deberta_model_stopwords(
        input_ids=input_ids_stopwords,
        attention_mask=attention_mask_stopwords,
        training=True
    )
    logits_stopwords = outputs_stopwords.logits  # Shape: (batch_size, num_labels)

    # Cluster Embedding
    cluster_embedding = Embedding(
        input_dim=num_clusters + 1,  # +1 if clusters start from 0
        output_dim=cluster_embedding_dim,
        name='cluster_embedding'
    )
    cluster_embeds = cluster_embedding(cluster_input)

    # Concatenate all features
    combined_output = Concatenate()([
        logits_written,
        logits_stopwords,
        cluster_embeds
    ])

    # Fully connected layers
    x = Dropout(0.3)(combined_output)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.3)(x)
    outputs = Dense(num_labels, activation='sigmoid')(x)

    # Define the model
    model = Model(
        inputs=[
            input_ids_written,
            attention_mask_written,
            input_ids_stopwords,
            attention_mask_stopwords,
            cluster_input
        ],
        outputs=outputs
    )

    # Compile the model
    optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
    loss_fn = tf.keras.losses.BinaryCrossentropy()
    model.compile(optimizer=optimizer, loss=loss_fn)


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1')


All model checkpoint layers were used when initializing TFDebertaV2ForSequenceClassification.

Some layers of TFDebertaV2ForSequenceClassification were not initialized from the model checkpoint at kamalkraj/deberta-v2-xlarge and are newly initialized: ['pooler', 'classifier', 'cls_dropout']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
All model checkpoint layers were used when initializing TFDebertaV2ForSequenceClassification.

Some layers of TFDebertaV2ForSequenceClassification were not initialized from the model checkpoint at kamalkraj/deberta-v2-xlarge and are newly initialized: ['pooler', 'classifier', 'cls_dropout']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/re

Callbacks

In [21]:
# Callbacks
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback

# Custom callback to compute F1 score at the end of each epoch
class F1ScoreCallback(Callback):
    def __init__(self, validation_data):
        super(F1ScoreCallback, self).__init__()
        self.validation_data = validation_data
        self.best_f1 = 0
        self.best_weights = None
        self.best_thresholds = None
        self.patience = 3
        self.epochs_no_improve = 0

    def on_epoch_end(self, epoch, logs=None):
        val_features, val_labels = self.validation_data
        val_preds = self.model.predict(val_features)

        optimal_thresholds = []
        all_preds_bin = np.zeros_like(val_preds)

        for i in range(num_classes):
            precision, recall, thresholds = precision_recall_curve(val_labels[:, i], val_preds[:, i])
            f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
            optimal_idx = np.argmax(f1_scores)
            optimal_threshold = thresholds[optimal_idx]
            optimal_thresholds.append(optimal_threshold)
            all_preds_bin[:, i] = (val_preds[:, i] >= optimal_threshold).astype(int)

        validation_f1 = f1_score(val_labels, all_preds_bin, average='micro')
        print(f"Epoch {epoch+1} - Validation F1 Score: {validation_f1:.4f}")

        print(classification_report(val_labels, all_preds_bin, target_names=emotion_classes))

        if validation_f1 > self.best_f1:
            self.best_f1 = validation_f1
            self.best_weights = self.model.get_weights()
            self.best_thresholds = optimal_thresholds.copy()
            self.epochs_no_improve = 0
            print(f"Validation F1 increased to {validation_f1:.4f}, saving model weights...")
        else:
            self.epochs_no_improve += 1
            print(f"No improvement in validation F1 for {self.epochs_no_improve} epoch(s)")
            if self.epochs_no_improve >= self.patience:
                print("Early stopping triggered.")
                self.model.stop_training = True

f1_callback = F1ScoreCallback(
    validation_data=(
        {
            'input_ids_written': np.array(val_features['input_ids_written']),
            'attention_mask_written': np.array(val_features['attention_mask_written']),
            'input_ids_stopwords': np.array(val_features['input_ids_stopwords']),
            'attention_mask_stopwords': np.array(val_features['attention_mask_stopwords']),
            'cluster_input': np.array(val_features['cluster']),
        },
        val_labels
    )
)

Train

In [None]:
# Train the model
num_epochs = 100

model.fit(
    {
        'input_ids_written': np.array(train_features['input_ids_written']),
        'attention_mask_written': np.array(train_features['attention_mask_written']),
        'input_ids_stopwords': np.array(train_features['input_ids_stopwords']),
        'attention_mask_stopwords': np.array(train_features['attention_mask_stopwords']),
        'cluster_input': np.array(train_features['cluster']),
    },
    train_labels,
    validation_data=(
        {
            'input_ids_written': np.array(val_features['input_ids_written']),
            'attention_mask_written': np.array(val_features['attention_mask_written']),
            'input_ids_stopwords': np.array(val_features['input_ids_stopwords']),
            'attention_mask_stopwords': np.array(val_features['attention_mask_stopwords']),
            'cluster_input': np.array(val_features['cluster']),
        },
        val_labels
    ),
    epochs=num_epochs,
    callbacks=[f1_callback],
    batch_size=batch_size
)


Epoch 1/100


2024-11-07 23:14:06.916501: W tensorflow/core/framework/dataset.cc:769] Input of GeneratorDatasetOp::Dataset will not be optimized because the dataset does not implement the AsGraphDefInternal() method needed to apply optimizations.


INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0', '/job:localhost/replica:0/task:0/device:CPU:0').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0', '/job:localhost/replica:0/task:0/device:CPU:0').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0', '/job:localhost/replica:0/task:0/device:CPU:0').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0', '/job:localhost/replica:0/task:0/device:CPU:0').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0', '/job:localhost/replica:0/task:0/device:CPU:0').
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:

Save model

In [None]:
# Load the best model weights
model.set_weights(f1_callback.best_weights)

# Save the model and tokenizer
model_save_name = 'deberta_emotion_classifier_tf'
model.save_pretrained(model_save_name)
tokenizer.save_pretrained(model_save_name)

# Save emotion classes and best thresholds
import json
with open(os.path.join(model_save_name, 'emotion_classes.json'), 'w') as f:
    json.dump(emotion_classes.tolist(), f)
with open(os.path.join(model_save_name, 'optimal_thresholds.json'), 'w') as f:
    json.dump(f1_callback.best_thresholds, f)

print(f"Model, tokenizer, and thresholds have been saved in '{model_save_name}'.")

Upload to my huggingface

In [None]:
# from huggingface_hub import HfApi

# # Log in to Hugging Face
# # If not already logged in earlier
# # login(token='your_huggingface_token')

# # Upload model
# model.push_to_hub(model_save_name)
# tokenizer.push_to_hub(model_save_name)

# # Prepare dataset for upload
# from datasets import Dataset

# # Convert pandas DataFrame to Hugging Face Dataset
# hf_dataset = Dataset.from_pandas(data)

# # Save dataset locally
# dataset_save_path = 'emotion_dataset_pt'
# hf_dataset.save_to_disk(dataset_save_path)

# # Upload dataset to Hugging Face
# hf_dataset.push_to_hub('emotion_dataset_pt')

# print("Model and dataset have been uploaded to Hugging Face Hub.")