In [6]:
from google.colab import files

# Prompt for a file upload through google.colab's `files` helper. The recorded
# cell output shows dataset.csv being saved, which is the file the later
# pd.read_csv("dataset.csv") call reads. `uploaded` itself is not referenced
# by any code visible in this notebook.
uploaded = files.upload()

Saving dataset.csv to dataset.csv


In [2]:
# -*- coding: utf-8 -*-
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import tensorflow as tf

# 1. Load and preprocess data
df = pd.read_csv("dataset.csv", encoding='utf-8')
df.columns = ["text", "label"]
# Drop incomplete rows without mutating in place, so re-running the cell is safe.
df = df.dropna()

# 2. Convert string labels to numerical values FIRST
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

# Series.map turns every value missing from label_map into NaN. Because the
# dropna above runs before the mapping, such rows would otherwise slip through
# as NaN labels into the stratified split and the loss. Normalise surrounding
# whitespace and fail loudly on anything that still does not match.
raw_labels = df['label'].astype(str).str.strip()
unknown_labels = sorted(set(raw_labels) - set(label_map))
if unknown_labels:
    raise ValueError(
        f"Unexpected label values {unknown_labels}; expected one of {sorted(label_map)}"
    )

# Every label is known at this point, so the mapped column is purely integral.
df = df.assign(label=raw_labels.map(label_map).astype('int64'))

# 3. Split data
# First carve off 20% of the rows, then halve that slice into validation and
# test parts. Both splits are stratified on the label column and share a seed.
# Note: test_df is produced here but is not used by the code visible below.
SPLIT_SEED = 42
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=df['label'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=SPLIT_SEED,
    stratify=temp_df['label'],
)

# 4. Load tokenizer
# Checkpoint identifier; the same name is passed to the model loader further down.
model_name = "google/muril-base-cased"
# Load the tokenizer published under that checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# 5. Tokenize datasets
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of texts into fixed-length encodings.

    Args:
        examples: Texts to encode; forwarded unchanged to the module-level
            ``tokenizer``.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value this function previously hard-coded.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(examples, padding="max_length", truncation=True, max_length=max_length)

# Pull the raw texts out of each split as plain Python lists, then encode
# both with the same tokenizer settings.
train_texts = train_df['text'].tolist()
val_texts = val_df['text'].tolist()
train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# 6. Create TensorFlow datasets
def create_tf_dataset(encodings, labels, batch_size=16, shuffle=False, seed=42):
    """Build a batched ``tf.data.Dataset`` of (features, label) pairs.

    Args:
        encodings: Mapping that provides 'input_ids' and 'attention_mask'.
        labels: One label per encoded example.
        batch_size: Number of examples per batch. Defaults to 16, the value
            this function previously hard-coded.
        shuffle: When True, examples are reshuffled on every pass over the
            dataset. Defaults to False, which keeps the original fixed order.
        seed: Seed handed to the shuffle so runs are repeatable.

    Returns:
        A dataset yielding ``({'input_ids', 'attention_mask'}, labels)`` batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask']
        },
        labels
    ))
    if shuffle:
        # Shuffle before batching so batch composition changes between epochs.
        # The buffer spans the whole dataset; max(..., 1) avoids an invalid
        # zero-sized buffer when there are no examples.
        dataset = dataset.shuffle(
            buffer_size=max(len(labels), 1),
            seed=seed,
            reshuffle_each_iteration=True,
        )
    return dataset.batch(batch_size)

# Keras ignores fit(shuffle=...) for tf.data inputs, so the training set has to
# be shuffled here; validation keeps a fixed order.
train_dataset = create_tf_dataset(train_encodings, train_df['label'].values, shuffle=True)
val_dataset = create_tf_dataset(val_encodings, val_df['label'].values)

# 7. Load model with correct label mappings
# Derive the id -> name mapping and the class count from label_map instead of
# repeating them by hand, so the two directions can never drift apart.
id2label = {label_id: label_name for label_name, label_id in label_map.items()}
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_map),
    id2label=id2label,
    label2id=label_map
)

# 8. Compile with legacy optimizer (for TF compatibility)
LEARNING_RATE = 5e-5
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=LEARNING_RATE)
# from_logits=True: the loss is configured to receive raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)

# 9. Train the model
# Run three passes over the training batches, scoring on the validation
# batches as well; the returned object is kept in `history`.
NUM_EPOCHS = 3
fit_options = {"validation_data": val_dataset, "epochs": NUM_EPOCHS}
history = model.fit(train_dataset, **fit_options)

# 10. Optional: Save the model
# Model and tokenizer are written to the same directory.
save_dir = "muril_sentiment_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at google/muril-base-cased and are newly initialized: ['classifier', 'bert/pooler/dense/bias:0', 'bert/pooler/dense/kernel:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


('muril_sentiment_model/tokenizer_config.json',
 'muril_sentiment_model/special_tokens_map.json',
 'muril_sentiment_model/vocab.txt',
 'muril_sentiment_model/added_tokens.json',
 'muril_sentiment_model/tokenizer.json')