In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm

# Reproducibility: one seed drives the train/test split and PyTorch's RNG
# (classifier-head initialisation, DataLoader shuffling). GPU kernels may
# still introduce small run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load Dataset
df = pd.read_csv("Meta_Hinglish_annotated.csv")

# Rename columns as per your dataset
df = df.rename(columns={'Sentence':'text','Sentiment': 'label'})

# Map Labels to Integers
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
# Derived from label_map so the two mappings cannot drift apart.
label_map_reverse = {index: name for name, index in label_map.items()}

# Rows without text or label cannot be used for training or evaluation.
df = df.dropna(subset=['text', 'label']).copy()

# Fail fast on labels outside label_map: `.map` would silently turn them into
# NaN, which only surfaces later as an obscure int() conversion error.
unknown_labels = sorted(set(df['label']) - set(label_map), key=str)
if unknown_labels:
    raise ValueError(
        f"Unexpected sentiment labels {unknown_labels}; expected one of {list(label_map)}"
    )
df['label'] = df['label'].map(label_map).astype(int)
# The tokenizer expects strings; guard against numeric-looking cells.
df['text'] = df['text'].astype(str)

# Train Test Split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=SEED
)

# Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBERTv2-mlm-only")

# Encode Data (truncate to 128 tokens, pad each split to its longest sequence)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=128)

# Dataset Class
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence
    (indexed by example position); ``labels`` is a sequence of values that
    ``int()`` accepts. Each item is a dict of tensors placed on the
    module-level ``device``, with the label stored under ``'labels'``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per label.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx]).to(device)
        target = int(self.labels[idx])
        sample['labels'] = torch.tensor(target).to(device)
        return sample

# Wrap the tokenized splits so PyTorch can batch them.
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
test_dataset = SentimentDataset(encodings=test_encodings, labels=test_labels)

# Only the training batches are shuffled; evaluation keeps the split order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Load Model: pretrained encoder with a 3-way classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "ai4bharat/IndicBERTv2-mlm-only",
    num_labels=3,
)
model = model.to(device)

# Optimizer & Scheduler
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)
# One scheduler step per batch, for 3 epochs.
num_training_steps = 3 * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Training Function
def train_model(model, train_loader, num_epochs=3):
    """Fine-tune ``model`` on the batches yielded by ``train_loader``.

    Uses the module-level ``optimizer`` and ``lr_scheduler``; the scheduler is
    stepped once per batch.

    Args:
        model: Model whose forward pass accepts the batch dict as keyword
            arguments and returns an object with a ``loss`` attribute.
        train_loader: Iterable of dict batches mapping field name to tensor.
        num_epochs: Number of passes over ``train_loader``. The module-level
            scheduler is built for ``len(train_loader) * 3`` steps, so keep
            the two in agreement when changing this value.
    """
    model.train()
    # Send every batch to wherever the model lives; this is a no-op when the
    # tensors are already there, and keeps the loop correct if they are not.
    model_device = next(model.parameters()).device
    for epoch in range(num_epochs):
        loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch + 1}/{num_epochs}")
        for batch in loop:
            batch = {key: value.to(model_device) for key, value in batch.items()}
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # Show the latest batch loss on the progress bar.
            loop.set_postfix(loss=loss.item())

train_model(model, train_loader)

# Evaluation Function
def evaluate(model, test_loader):
    """Print a per-class classification report for ``model`` on ``test_loader``.

    Args:
        model: Model whose forward pass accepts the batch dict as keyword
            arguments and returns an object with a ``logits`` attribute.
        test_loader: Iterable of dict batches; each must contain ``'labels'``.
    """
    model.eval()
    model_device = next(model.parameters()).device

    # Take ids and names from label_map_reverse so the report always matches
    # the label encoding. Passing `labels` explicitly keeps the report working
    # when a class is missing from this split; `target_names` alone would
    # raise on the length mismatch.
    class_ids = sorted(label_map_reverse)
    class_names = [label_map_reverse[class_id] for class_id in class_ids]

    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            # No-op when the batch already sits on the model's device.
            batch = {key: value.to(model_device) for key, value in batch.items()}
            logits = model(**batch).logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().tolist())
            true_labels.extend(batch['labels'].cpu().tolist())
    print(classification_report(
        true_labels,
        predictions,
        labels=class_ids,
        target_names=class_names,
        zero_division=0,  # report 0 instead of warning for classes never predicted
    ))

evaluate(model, test_loader)

# Prediction Function
def predict_sentiment(sentence):
    """Return the predicted sentiment name for a single sentence.

    Tokenizes ``sentence`` with the module-level ``tokenizer`` (truncated to
    128 tokens), runs the module-level ``model`` without gradient tracking and
    maps the highest-scoring class index through ``label_map_reverse``.
    """
    model.eval()
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    encoded = encoded.to(device)
    with torch.no_grad():
        logits = model(**encoded).logits
        class_index = torch.argmax(logits, dim=-1).item()
    return label_map_reverse[class_index]

# Test Predictions: see the test-sentence cells further down (In [40] and In [41]).





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.75M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/IndicBERTv2-mlm-only and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 37/37 [00:07<00:00,  4.65it/s, loss=0.957]
100%|██████████| 37/37 [00:07<00:00,  5.25it/s, loss=0.879]
100%|██████████| 37/37 [00:07<00:00,  5.19it/s, loss=0.47]


              precision    recall  f1-score   support

    Negative       0.91      0.93      0.92        46
     Neutral       1.00      0.46      0.63        41
    Positive       0.73      0.98      0.84        60

    accuracy                           0.82       147
   macro avg       0.88      0.79      0.80       147
weighted avg       0.86      0.82      0.81       147



In [2]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.25.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 (

In [3]:
import gradio as gr
import pandas as pd

# Function to predict sentiment for a CSV
def predict_from_csv(file):
    """Label every row of an uploaded CSV with its predicted sentiment.

    Args:
        file: The uploaded file as delivered by the Gradio file input: either
            a path string or an object exposing the path as ``.name``. ``None``
            when the upload has been cleared.

    Returns:
        A ``(preview, path)`` tuple: the first rows of the labelled frame and
        the path of the full labelled CSV. ``(None, None)`` when no file is
        present.

    Raises:
        gr.Error: If the CSV has no ``Sentence`` column.
    """
    import os
    import tempfile

    # The change event also fires when the user removes the file.
    if file is None:
        return None, None

    csv_path = getattr(file, "name", file)
    df = pd.read_csv(csv_path)
    if "Sentence" not in df.columns:
        # Surface the problem in the UI instead of handing a plain string to
        # the table output.
        raise gr.Error("Uploaded CSV must have a 'Sentence' column.")

    # Empty cells are left unlabelled; everything else is coerced to text so
    # numeric-looking cells do not break the tokenizer.
    df["Predicted_Sentiment"] = df["Sentence"].map(
        lambda value: predict_sentiment(str(value)) if pd.notna(value) else None
    )

    # A fresh directory per request: keeps the download name stable, avoids
    # concurrent requests overwriting each other, and does not depend on a
    # Colab-only path.
    output_file = os.path.join(tempfile.mkdtemp(), "labeled_output.csv")
    df.to_csv(output_file, index=False)
    return df.head(), output_file


In [4]:
# Gradio UI with two sections: single-sentence prediction and batch labelling
# of an uploaded CSV. Components render in the order they are created.
with gr.Blocks() as demo:
    gr.Markdown("# Hinglish Sentiment Classifier (IndicBERTv2)")

    # Single-sentence section: text input, predicted label and trigger button.
    with gr.Row():
        input_text = gr.Textbox(label="Enter a Hinglish sentence", placeholder="e.g. Yeh movie bahut acchi thi!")
        output_label = gr.Label(label="Predicted Sentiment")
        predict_btn = gr.Button("Predict Sentiment")

    # Clicking the button sends the textbox content to predict_sentiment.
    predict_btn.click(fn=predict_sentiment, inputs=input_text, outputs=output_label)

    # Batch section: CSV upload, preview table and download of the labelled file.
    gr.Markdown("## 📄 Upload a CSV file to label all sentences")
    with gr.Row():
        file_input = gr.File(label="Upload CSV with 'Sentence' column")
        output_df = gr.Dataframe(label="Sample Predictions")
        download_link = gr.File(label="Download Labeled CSV")

    # predict_from_csv returns two values, matched to the two outputs in order.
    file_input.change(fn=predict_from_csv, inputs=file_input, outputs=[output_df, download_link])

# share=True additionally serves the app through a temporary public URL.
demo.launch(share=True)  # Use share=True to get a public link


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b91814c354ceaf8c68.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [40]:
# Hand-written Hinglish probes; print the predicted sentiment for each one.
test_sentences = [
    "Yeh movie bahut acchi thi!",
    "Mujhe yeh jagah bilkul pasand nahi aayi.",
    "Main kal shopping gaya tha.",
    "vaha bahut kharab admi hai",
    "vaha se jaa raha hye,wo aadmi, lekin faltu hye wo",
    "kal milte hai fir",
    "wo bahut kaam kar rahe hye bheta, lekin kya hi use hye faltu ka",
    "chal aaj to accha din hye",
    "tu chutya hye",
]

for sentence in test_sentences:
    sentiment = predict_sentiment(sentence)
    print(f"Sentence: {sentence} -> Sentiment: {sentiment}")


Sentence: Yeh movie bahut acchi thi! -> Sentiment: Positive
Sentence: Mujhe yeh jagah bilkul pasand nahi aayi. -> Sentiment: Negative
Sentence: Main kal shopping gaya tha. -> Sentiment: Positive
Sentence: vaha bahut kharab admi hai -> Sentiment: Negative
Sentence: vaha se jaa raha hye,wo aadmi, lekin faltu hye wo -> Sentiment: Negative
Sentence: kal milte hai fir -> Sentiment: Positive
Sentence: wo bahut kaam kar rahe hye bheta, lekin kya hi use hye faltu ka -> Sentiment: Negative
Sentence: chal aaj to accha din hye -> Sentiment: Positive
Sentence: tu chutya hye -> Sentiment: Negative


In [41]:
# Second set of hand-written probes; print the predicted sentiment for each one.
test_sentences = [
    "tu kya kar rahe ho, acche kar rahe ho",
    "milenge kal",
    "shaam ko kaam karte hye, lekin mujhe pasand nahi hye uske saath kaam karna",
    "jo kaam wo kar rahe wo kharabh hye",
    "kal hamare saath bahut hi bhekhar hua",
    "tu aaja kal, lekin waha se mat aa wo log bahut hi danger hye",
    "jaldi jaldi se kaam kar, varna late ho jayega",
    "to be honest, maine ye kaam kiya",
    "aaram se jaa",
    "toda kaam kar jaldi se",
]

for sentence in test_sentences:
    sentiment = predict_sentiment(sentence)
    print(f"Sentence: {sentence} -> Sentiment: {sentiment}")

Sentence: tu kya kar rahe ho, acche kar rahe ho -> Sentiment: Positive
Sentence: milenge kal -> Sentiment: Negative
Sentence: shaam ko kaam karte hye, lekin mujhe pasand nahi hye uske saath kaam karna -> Sentiment: Negative
Sentence: jo kaam wo kar rahe wo kharabh hye -> Sentiment: Negative
Sentence: kal hamare saath bahut hi bhekhar hua -> Sentiment: Negative
Sentence: tu aaja kal, lekin waha se mat aa wo log bahut hi danger hye -> Sentiment: Negative
Sentence: jaldi jaldi se kaam kar, varna late ho jayega -> Sentiment: Negative
Sentence: to be honest, maine ye kaam kiya -> Sentiment: Negative
Sentence: aaram se jaa -> Sentiment: Negative
Sentence: toda kaam kar jaldi se -> Sentiment: Negative


In [35]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline

# Use dedicated names for the masked-LM objects. Rebinding the globals
# `tokenizer` and `model` here would replace the fine-tuned classifier that
# predict_sentiment (and the Gradio callbacks) read at call time.
mlm_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBERTv2-mlm-only")
mlm_model = AutoModelForMaskedLM.from_pretrained("ai4bharat/IndicBERTv2-mlm-only")

mlm_pipeline = pipeline("fill-mask", model=mlm_model, tokenizer=mlm_tokenizer)

mlm_pipeline("यह फिल्म [MASK] है।")  # Fills in the masked word


Some weights of the model checkpoint at ai4bharat/IndicBERTv2-mlm-only were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


[{'score': 0.05188636854290962,
  'token': 41057,
  'token_str': 'ऐतिहासिक',
  'sequence': 'यह फिल्म ऐतिहासिक है ।'},
 {'score': 0.0473288930952549,
  'token': 144308,
  'token_str': 'सुपरहिट',
  'sequence': 'यह फिल्म सुपरहिट है ।'},
 {'score': 0.04317407310009003,
  'token': 100946,
  'token_str': 'कॉमेडी',
  'sequence': 'यह फिल्म कॉमेडी है ।'},
 {'score': 0.025333136320114136,
  'token': 37991,
  'token_str': 'शानदार',
  'sequence': 'यह फिल्म शानदार है ।'},
 {'score': 0.024923335760831833,
  'token': 80887,
  'token_str': 'पारिवारिक',
  'sequence': 'यह फिल्म पारिवारिक है ।'}]

In [38]:
# Longer multi-clause prompt; the pipeline fills the single [MASK] at the end.
masked_prompt = "मुझे पानी चाहिए, आप कैसे हैं, आज मौसम बहुत अच्छा है, मैं कल दिल्ली जाऊंगा, वह स्कूल में [MASK] है"
mlm_pipeline(masked_prompt)

[{'score': 0.10856267809867859,
  'token': 16176,
  'token_str': 'नहीं',
  'sequence': 'मुझे पानी चाहिए , आप कैसे हैं , आज मौसम बहुत अच्छा है , मैं कल दिल्ली जाऊंगा , वह स्कूल में नहीं है'},
 {'score': 0.07749965041875839,
  'token': 19996,
  'token_str': 'आता',
  'sequence': 'मुझे पानी चाहिए , आप कैसे हैं , आज मौसम बहुत अच्छा है , मैं कल दिल्ली जाऊंगा , वह स्कूल में आता है'},
 {'score': 0.07490992546081543,
  'token': 23866,
  'token_str': 'जाती',
  'sequence': 'मुझे पानी चाहिए , आप कैसे हैं , आज मौसम बहुत अच्छा है , मैं कल दिल्ली जाऊंगा , वह स्कूल में जाती है'},
 {'score': 0.053766004741191864,
  'token': 38955,
  'token_str': 'रहता',
  'sequence': 'मुझे पानी चाहिए , आप कैसे हैं , आज मौसम बहुत अच्छा है , मैं कल दिल्ली जाऊंगा , वह स्कूल में रहता है'},
 {'score': 0.04996186122298241,
  'token': 19607,
  'token_str': 'जाता',
  'sequence': 'मुझे पानी चाहिए , आप कैसे हैं , आज मौसम बहुत अच्छा है , मैं कल दिल्ली जाऊंगा , वह स्कूल में जाता है'}]