In [None]:
!pip install transformers
!pip install tensorflow
!pip install sklearn
!pip install tf-keras



In [28]:
from transformers import AlbertTokenizer, TFAlbertForSequenceClassification
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score


In [3]:
from google.colab import files

# File upload window khulne ke liye
uploaded = files.upload()


Saving go-emotions-dataset.csv to go-emotions-dataset.csv


In [15]:
ds = pd.read_csv("go-emotions-dataset.csv")


In [None]:
label_ds = ds.iloc[:, 2:]  # Assuming labels are in columns starting from the 3rd column
bool_cols = label_ds.select_dtypes(include='bool').columns

for col in label_ds.columns:
    if label_ds[col].isin(['True', 'False']).all():
        label_ds[col] = label_ds[col].map({'True': True, 'False': False})

label_ds[bool_cols] = label_ds[bool_cols].astype(int)

print(label_ds.dtypes)
print(label_ds.head())


In [None]:
class_frequency = label_ds.sum(axis=0)
total_samples = len(label_ds)
classes = len(label_ds.columns)

class_frequency = class_frequency.astype(int)
class_frequency = class_frequency.replace(0, 1)
class_weight = total_samples / (classes * class_frequency)
print(class_weight)

class_weight_dict = class_weight.to_dict()
print(class_weight_dict)


In [7]:
def tokenized_data(text, batch_size=64, max_length=128):
    tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
    tokenized_text = []

    # Process text in batches
    for i in range(0, len(text), batch_size):
        batch = text[i:i+batch_size]
        # Tokenize the batch
        encoded_batch = tokenizer(batch, padding='max_length', truncation=True, return_tensors='tf', max_length=max_length)
        tokenized_text.append(encoded_batch)

    # Concatenate all batches and ensure padding/truncation is correct
    tokenized_text = {key: tf.concat([batch[key] for batch in tokenized_text], axis=0) for key in tokenized_text[0]}
    return tokenized_text


In [None]:
text = ds['text'].tolist()  # Ensure the 'text' column exists in your dataset
output = tokenized_data(text)


In [9]:
train_labels = label_ds.values  # Convert the label dataframe into a numpy array for multi-label classification

train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': output['input_ids'], 'attention_mask': output['attention_mask']},
    train_labels  # multi-label targets (shape: [num_samples, 31])
))

# Shuffle, batch, and prefetch for performance optimization
train_dataset = train_dataset.shuffle(1000).batch(8).prefetch(tf.data.experimental.AUTOTUNE)


In [None]:
num_emotions = 29  # Assuming 29 emotion categories
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=num_emotions, problem_type="multi_label_classification")


In [11]:
optimizer = Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.BinaryCrossentropy(from_logits=True)


In [12]:
def compute_f1_score(y_true, y_pred):
    y_true = y_true.numpy()  # Convert Tensor to Numpy
    y_pred = tf.round(tf.sigmoid(y_pred)).numpy()  # Sigmoid to probabilities, round to binary labels
    return f1_score(y_true, y_pred, average='macro', zero_division=1)  # Macro-average F1 score


In [13]:
num_epochs = 1  # Number of epochs you want to run
for epoch in range(num_epochs):
    epoch_f1_scores = []  # List to store F1 scores for each batch
    for batch in train_dataset:
        inputs, labels = batch  # Tokenized inputs and true labels

        with tf.GradientTape() as tape:
            model_output = model(inputs)  # Model output (logits)
            logits = model_output.logits  # Extract logits from output
            loss = loss_fn(labels, logits)  # Compute loss

        # Backpropagation and optimization
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Calculate F1 score for the current batch
        f1 = compute_f1_score(labels, logits)
        epoch_f1_scores.append(f1)

    # Calculate average F1 score for the epoch
    avg_f1 = np.mean(epoch_f1_scores)
    print(f"Epoch {epoch}: Loss = {loss.numpy()}, F1 Score = {avg_f1}")


Epoch 0: Loss = 0.1221495196223259, F1 Score = 0.7516326977047355


In [None]:
from transformers import AlbertTokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')


In [None]:
model.save_pretrained("emotion-model")
tokenizer.save_pretrained("emotion-model")


In [11]:
import shutil
shutil.make_archive("emotion-model", 'zip', "emotion-model")


'/content/emotion-model.zip'

In [12]:
from google.colab import files
files.download("emotion-model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
!pip install sentencepiece

