In [1]:
from google.colab import files

# Colab-only step: prompts for a file upload from the local machine. The
# recorded output of this cell shows the chosen file being saved as
# "dataset.csv", which is the path the training cell reads with pd.read_csv.
# `uploaded` keeps whatever files.upload() returns
# (presumably a mapping of file name -> file contents — confirm in Colab docs).
uploaded = files.upload()

Saving dataset.csv to dataset.csv


In [8]:
# Imports
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, TFBertForSequenceClassification

# Load data
df = pd.read_csv("dataset.csv")

# Rows without a text or a label cannot be used for supervised training; left
# in place they surface later as NaN floats inside the tokenizer input or the
# int32 label cast, far away from the real cause.
df = df.dropna(subset=["text", "label"]).reset_index(drop=True)
if df.empty:
    raise ValueError(
        "dataset.csv has no rows with both a 'text' and a 'label' value"
    )

# Force every text to str so numeric-looking entries (e.g. "123" parsed as an
# int by read_csv) are still valid tokenizer input.
texts = df["text"].astype(str).to_numpy()
labels = df["label"].to_numpy()

# Convert string labels to integers (if needed).
# LabelEncoder assigns ids in sorted order of the distinct label strings.
# Safe to inspect only the first element: missing labels were dropped above.
if isinstance(labels[0], str):
    le = LabelEncoder()
    labels = le.fit_transform(labels)

# Split data: 80% train / 20% test, fixed seed so the split is reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Tokenize
# Loads the tokenizer that belongs to the "bert-base-uncased" checkpoint. The
# same checkpoint name is passed to TFBertForSequenceClassification.from_pretrained
# later in this cell; tokenizer and model are expected to come from the same
# checkpoint, so change both names together or neither.
# NOTE(review): the inference cell of this notebook feeds Hindi and Bangla
# sentences to the model. "bert-base-uncased" is presumably an English-oriented
# checkpoint — confirm whether a multilingual checkpoint fits this data better.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(texts, tokenizer, max_len=128):
    """Encode a batch of texts into fixed-length TensorFlow tensors.

    Args:
        texts: The texts to encode. May be a NumPy array or pandas Series
            (anything exposing ``.tolist()``), a list/tuple/other iterable of
            strings, or a single string, which is treated as a batch of one.
        tokenizer: Callable invoked as ``tokenizer(list_of_texts, **options)``;
            in this notebook a Hugging Face tokenizer.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the batch when called with
        ``padding="max_length"``, ``truncation=True``, ``max_length=max_len``
        and ``return_tensors="tf"``.
    """
    if isinstance(texts, str):
        # A bare string must not be split into a batch of single characters.
        batch = [texts]
    elif hasattr(texts, "tolist"):
        # ndarray / Series path — identical to the previous behaviour.
        batch = texts.tolist()
    else:
        batch = list(texts)

    return tokenizer(
        batch,
        padding="max_length",
        truncation=True,
        max_length=max_len,
        return_tensors="tf",
    )

def _as_tf_dataset(encodings, labels):
    """Pair the model inputs from `encodings` with int32 `labels` in a tf.data.Dataset.

    Only "input_ids" and "attention_mask" are forwarded as features.
    The returned dataset is neither shuffled nor batched.
    """
    features = {key: encodings[key] for key in ("input_ids", "attention_mask")}
    return tf.data.Dataset.from_tensor_slices((features, labels.astype(np.int32)))


# Encode both splits with the shared tokenizer
train_encodings = tokenize(train_texts, tokenizer)
test_encodings = tokenize(test_texts, tokenizer)

# Create datasets: the training split is shuffled (buffer of 1000) before
# batching; the test split keeps its order. Both use batches of 16.
train_dataset = _as_tf_dataset(train_encodings, train_labels).shuffle(1000).batch(16)
test_dataset = _as_tf_dataset(test_encodings, test_labels).batch(16)

# Load model
NUM_LABELS = 3
ID2LABEL = {0: "Negative", 1: "Neutral", 2: "Positive"}

# Fail fast if the integer labels do not fit the 3-way classification head:
# class ids outside [0, NUM_LABELS) are not valid targets for the sparse
# categorical cross-entropy loss used below.
for split_name, split_labels in (("train", train_labels), ("test", test_labels)):
    if split_labels.size == 0:
        continue
    if split_labels.min() < 0 or split_labels.max() >= NUM_LABELS:
        raise ValueError(
            f"{split_name} labels must lie in [0, {NUM_LABELS}); "
            f"found range [{split_labels.min()}, {split_labels.max()}]"
        )

# NOTE: the checkpoint name must match the one the tokenizer was loaded from.
model = TFBertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=NUM_LABELS,
    id2label=ID2LABEL,
    # Keep the reverse mapping in the config consistent with id2label.
    label2id={name: idx for idx, name in ID2LABEL.items()},
)

# Compile and train
# The model outputs raw logits, hence from_logits=True.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

# train_dataset / test_dataset are already batched (16 per batch), so no
# batch_size is passed to fit(): Keras expects it to be omitted for
# tf.data.Dataset inputs.
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics and the final test score come from the same rows.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3,
)

# Evaluate
loss, accuracy = model.evaluate(test_dataset)
print(f"Test Accuracy: {accuracy:.4f}")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Test Accuracy: 0.4901


In [9]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import numpy as np

# 1. Reuse the ALREADY TRAINED objects from the training cell (still in memory):
#    model     -> the fine-tuned TFBertForSequenceClassification
#    tokenizer -> the tokenizer that produced the training encodings
# The tokenizer is deliberately NOT re-created here: inference must preprocess
# text exactly as training did, and re-assigning `tokenizer` would silently
# replace the training tokenizer for every cell that runs afterwards.

# 2. Define sample texts (code-mixed English-Hindi-Bangla)
samples = [
    "I am very happy!",                    # English (Positive)
    "यह बहुत बुरा है",                     # Hindi (Negative)
    "আমি একদম খুশি নই",                    # Bangla (Negative)
    "The product is okay",                 # English (Neutral)
    "मध्यम quality है",                    # Hindi (Neutral)
    "এটা ভালো ছিল না",                     # Bangla (Negative)
    "This is awesome!",                    # English (Positive)
    "कामचलाऊ है",                         # Hindi (Neutral)
    "আমি খুব খুশি!"                       # Bangla (Positive)
]

# 3. Tokenize samples with the training tokenizer.
# padding=True pads only up to the longest sample in this batch.
inputs = tokenizer(
    samples,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="tf"
)

# 4. Predict: take the highest-scoring class per sample from the logits.
raw_preds = model.predict(dict(inputs))
preds = tf.argmax(raw_preds.logits, axis=1).numpy()

# 5. Map predictions to labels.
# Read the mapping from the model config (set when the model was loaded) so
# there is a single source of truth for class names.
label_map = model.config.id2label

# 6. Print human-readable results
print("=== Code-Mixed Sentiment Predictions ===")
print(f"{'Text':<40} | {'Prediction':<10}")
print("-" * 55)
for text, pred in zip(samples, preds):
    # Keep the text column narrow: cut long samples to 35 characters plus "...".
    truncated_text = text[:35] + "..." if len(text) > 35 else text
    # int() turns the NumPy integer into a plain int key for the lookup.
    print(f"{truncated_text:<40} | {label_map[int(pred)]:<10}")

=== Code-Mixed Sentiment Predictions ===
Text                                     | Prediction
-------------------------------------------------------
I am very happy!                         | Positive  
यह बहुत बुरा है                          | Positive  
আমি একদম খুশি নই                         | Positive  
The product is okay                      | Positive  
मध्यम quality है                         | Positive  
এটা ভালো ছিল না                          | Positive  
This is awesome!                         | Positive  
कामचलाऊ है                               | Positive  
আমি খুব খুশি!                            | Positive  
