In [7]:
import pandas as pd

# Load annotated dataset
file_path = "/content/hindi-english_annotated.csv"  # Change path if needed
df = pd.read_csv(file_path)

# Display dataset structure
print(df.head())
print("\nDataset shape:", df.shape)


   ID                                           Sentence  Category
0   1  Tumhari smile se lagta hai sab kuch theek ho j...  Positive
1   2  Tumhare bina toh yeh task kabhi complete hi na...  Negative
2   3  Aaj mausam pleasant hai, toh meeting ke baad w...   Neutral
3   4  Wah! Tumne toh project mein hamesha ki tarah a...  Negative
4   5     Tumhari dedication sab ke liye ek example hai.  Positive

Dataset shape: (300, 3)


In [8]:
import re
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')

# Define preprocessing function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+", "", text)  # Remove URLs
    text = re.sub(r"@\w+|#\w+", "", text)  # Remove mentions and hashtags
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters & numbers
    text = " ".join([word for word in text.split() if word not in stopwords.words('english')])  # Remove stopwords
    return text

# Apply preprocessing
df["cleaned_text"] = df["Sentence"].apply(preprocess_text)

# Show results
print(df.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


   ID                                           Sentence  Category  \
0   1  Tumhari smile se lagta hai sab kuch theek ho j...  Positive   
1   2  Tumhare bina toh yeh task kabhi complete hi na...  Negative   
2   3  Aaj mausam pleasant hai, toh meeting ke baad w...   Neutral   
3   4  Wah! Tumne toh project mein hamesha ki tarah a...  Negative   
4   5     Tumhari dedication sab ke liye ek example hai.  Positive   

                                        cleaned_text  
0  tumhari smile se lagta hai sab kuch theek ho j...  
1  tumhare bina toh yeh task kabhi complete hi na...  
2  aaj mausam pleasant hai toh meeting ke baad wa...  
3  wah tumne toh project mein hamesha ki tarah ap...  
4      tumhari dedication sab ke liye ek example hai  


In [9]:
from transformers import AutoTokenizer

# Load XLM-RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize text
def tokenize_text(text):
    return tokenizer(text, padding="max_length", truncation=True, max_length=50, return_tensors="pt")

# Apply tokenization
df["tokenized_text"] = df["cleaned_text"].apply(tokenize_text)

# Display results
print(df.head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

   ID                                           Sentence  Category  \
0   1  Tumhari smile se lagta hai sab kuch theek ho j...  Positive   
1   2  Tumhare bina toh yeh task kabhi complete hi na...  Negative   
2   3  Aaj mausam pleasant hai, toh meeting ke baad w...   Neutral   
3   4  Wah! Tumne toh project mein hamesha ki tarah a...  Negative   
4   5     Tumhari dedication sab ke liye ek example hai.  Positive   

                                        cleaned_text  \
0  tumhari smile se lagta hai sab kuch theek ho j...   
1  tumhare bina toh yeh task kabhi complete hi na...   
2  aaj mausam pleasant hai toh meeting ke baad wa...   
3  wah tumne toh project mein hamesha ki tarah ap...   
4      tumhari dedication sab ke liye ek example hai   

                tokenized_text  
0  [input_ids, attention_mask]  
1  [input_ids, attention_mask]  
2  [input_ids, attention_mask]  
3  [input_ids, attention_mask]  
4  [input_ids, attention_mask]  


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

# Convert sentiment labels to numerical values
label_map = {"Positive": 2, "Neutral": 1, "Negative": 0}
df["label"] = df["Category"].map(label_map)

# Define a PyTorch dataset class
class HinglishDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return {
            "input_ids": torch.tensor(item["tokenized_text"]["input_ids"]).squeeze(),
            "attention_mask": torch.tensor(item["tokenized_text"]["attention_mask"]).squeeze(),
            "labels": torch.tensor(item["label"], dtype=torch.long),
        }

# Create dataset & dataloader
dataset = HinglishDataset(df)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Print sample batch
sample_batch = next(iter(dataloader))
print(sample_batch)


{'input_ids': tensor([[     0,  38674,    161,  58269,    337,  21620,  98893,   8980,      2,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1],
        [     0,  76021,  13452,   1563,  28484,    739,  26774,   1337,  45458,
         127752,      2,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1,      1,      1,      1,      1,
              1,      1,      1,      1,      1],
        [     0,   5201,     10,  39036,   4092,    739,    191,    151,   6

  "input_ids": torch.tensor(item["tokenized_text"]["input_ids"]).squeeze(),
  "attention_mask": torch.tensor(item["tokenized_text"]["attention_mask"]).squeeze(),


In [11]:
import torch
import torch.nn as nn
from transformers import AutoModel

# Define the Sentiment Classification Model
class HinglishSentimentModel(nn.Module):
    def __init__(self):
        super(HinglishSentimentModel, self).__init__()
        self.xlm_roberta = AutoModel.from_pretrained("xlm-roberta-base")
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768, 3)  # Output: 3 sentiment classes (Positive, Neutral, Negative)

    def forward(self, input_ids, attention_mask):
        outputs = self.xlm_roberta(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]
        dropout_output = self.dropout(pooled_output)
        logits = self.classifier(dropout_output)
        return logits

# Initialize model
model = HinglishSentimentModel()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Print model summary
print(model)


model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

HinglishSentimentModel(
  (xlm_roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tru

In [12]:
import torch.optim as optim
from transformers import get_scheduler

# Define Loss Function and Optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Learning Rate Scheduler
num_epochs = 3  # Adjust based on dataset size
num_training_steps = len(dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Move model to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("Training setup is ready!")


Training setup is ready!


In [13]:
from tqdm import tqdm

# Training Loop
def train_model(model, dataloader, optimizer, loss_fn, lr_scheduler, num_epochs=3):
    model.train()  # Set model to training mode
    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")
        loop = tqdm(dataloader, leave=True)

        total_loss = 0
        for batch in loop:
            # Move batch to GPU (if available)
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_fn(outputs, labels)

            # Backward pass
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            total_loss += loss.item()
            loop.set_description(f"Loss: {total_loss / (loop.n + 1):.4f}")

    print("\nTraining complete!")

# Start Training
train_model(model, dataloader, optimizer, loss_fn, lr_scheduler, num_epochs=3)



Epoch 1/3


  "input_ids": torch.tensor(item["tokenized_text"]["input_ids"]).squeeze(),
  "attention_mask": torch.tensor(item["tokenized_text"]["attention_mask"]).squeeze(),
Loss: 1.2753: 100%|██████████| 38/38 [06:02<00:00,  9.54s/it]



Epoch 2/3


Loss: 1.1716: 100%|██████████| 38/38 [05:39<00:00,  8.93s/it]



Epoch 3/3


Loss: 1.2146: 100%|██████████| 38/38 [05:39<00:00,  8.94s/it]


Training complete!





In [14]:
from sklearn.metrics import accuracy_score, classification_report

# Function to evaluate the model
def evaluate_model(model, dataloader):
    model.eval()  # Set to evaluation mode
    predictions, true_labels = [], []

    with torch.no_grad():  # Disable gradient computation
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1).cpu().numpy()
            labels = labels.cpu().numpy()

            predictions.extend(preds)
            true_labels.extend(labels)

    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=["Negative", "Neutral", "Positive"])

    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:\n", report)

# Evaluate on the same dataset (for now)
evaluate_model(model, dataloader)


  "input_ids": torch.tensor(item["tokenized_text"]["input_ids"]).squeeze(),
  "attention_mask": torch.tensor(item["tokenized_text"]["attention_mask"]).squeeze(),


Accuracy: 0.3167

Classification Report:
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00        98
     Neutral       0.32      1.00      0.48        95
    Positive       0.00      0.00      0.00       107

    accuracy                           0.32       300
   macro avg       0.11      0.33      0.16       300
weighted avg       0.10      0.32      0.15       300



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [22]:
import torch
from torch.utils.data import Dataset, DataLoader

# Modify HinglishDataset class to handle unlabeled data
class HinglishDataset(Dataset):
    def __init__(self, dataframe, labeled=True):
        self.data = dataframe
        self.labeled = labeled  # Flag to check if dataset has labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        data = {
            "input_ids": torch.tensor(item["tokenized_text"]["input_ids"]).squeeze(),
            "attention_mask": torch.tensor(item["tokenized_text"]["attention_mask"]).squeeze(),
        }
        if self.labeled:
            data["labels"] = torch.tensor(item["label"], dtype=torch.long)
        return data

# Load Hinglish dataset
file_path = "/content/comments INDIA'S GOT LATENT  EP 10 ft. Raghu Ram _@tanmaybhat_  _@Sidwarrier_ buSdqtdn_4I.csv"
df_unlabeled = pd.read_csv(file_path)

# Use the correct text column
text_column = "simpleText"

# Preprocess text
df_unlabeled["cleaned_text"] = df_unlabeled[text_column].apply(preprocess_text)

# Tokenize text
df_unlabeled["tokenized_text"] = df_unlabeled["cleaned_text"].apply(tokenize_text)

# Create DataLoader for unlabeled dataset
unlabeled_dataset = HinglishDataset(df_unlabeled, labeled=False)
unlabeled_dataloader = DataLoader(unlabeled_dataset, batch_size=8, shuffle=False)

# Predict sentiment labels
df_unlabeled["predicted_sentiment"] = predict_sentiment(model, unlabeled_dataloader)

# Save pseudo-labeled dataset
df_unlabeled.to_csv("hinglish_pseudo_labeled.csv", index=False)

print("✅ Pseudo-labeling complete! File saved as 'hinglish_pseudo_labeled.csv'.")


  "input_ids": torch.tensor(item["tokenized_text"]["input_ids"]).squeeze(),
  "attention_mask": torch.tensor(item["tokenized_text"]["attention_mask"]).squeeze(),


✅ Pseudo-labeling complete! File saved as 'hinglish_pseudo_labeled.csv'.


In [29]:
import pandas as pd

# Load the pseudo-labeled dataset
df_check = pd.read_csv("hinglish_pseudo_labeled.csv")

# Display first few rows
print(df_check.head())

# Check if 'tokenized_text' column exists
if "tokenized_text" not in df_check.columns:
    print("❌ ERROR: 'tokenized_text' column is missing!")

# Check how many tokenized_text values are missing or empty
missing_values = df_check["tokenized_text"].isnull().sum()
print(f"⚠ Missing tokenized_text values: {missing_values}")

# Display some samples
print(df_check["tokenized_text"].sample(10))


  publishedTimeText                                         simpleText  votes  \
0      23 hours ago  Underrated guest Sid Warrier, Positive energy ...    804   
1      19 hours ago                        Asli Id se aao warrior bhai     23   
2      17 hours ago                                                Yep      1   
3      23 hours ago  No matter how big this show has gone, seeing S...    645   
4      16 hours ago                                    100 % right 😍🤩🙌      4   

                author  isReply  isHearted  isPinned  isPaid  paidAmount  \
0   @HumanityNotDoomed    False      False     False   False         NaN   
1   @sarcasticaloo2220     True      False     False   False         NaN   
2  @himangimahajan3631     True      False     False   False         NaN   
3            @mesatin2    False      False     False   False         NaN   
4      @mehtasanketd07     True      False     False   False         NaN   

   isSponsor  sponsorshipMonths  \
0       True         

In [31]:
import json

# Function to safely convert tokenized_text from string to dictionary
def safe_convert(text):
    try:
        return json.loads(text.replace("'", '"'))  # Fix any formatting issues
    except (ValueError, TypeError, json.JSONDecodeError):
        return None  # Return None for corrupt entries

# Function to safely tokenize text
def re_tokenize_text(text):
    if not isinstance(text, str):  # Ensure text is a string
        return None
    return tokenizer(text, padding="max_length", truncation=True).data

# Load the dataset again
df_check = pd.read_csv("hinglish_pseudo_labeled.csv")

# Convert tokenized_text back to dictionary format
df_check["tokenized_text"] = df_check["tokenized_text"].apply(safe_convert)

# Drop rows where safe_convert couldn't fix the data
df_check = df_check.dropna(subset=["tokenized_text"]).reset_index(drop=True)

# **Ensure cleaned_text is a string**
df_check["cleaned_text"] = df_check["cleaned_text"].astype(str)

# Apply tokenization again
df_check["tokenized_text"] = df_check["cleaned_text"].apply(re_tokenize_text)

# Save fixed dataset
df_check.to_csv("hinglish_pseudo_labeled_fixed.csv", index=False)
print("✅ Tokenized text fixed! Saved as 'hinglish_pseudo_labeled_fixed.csv'.")


✅ Tokenized text fixed! Saved as 'hinglish_pseudo_labeled_fixed.csv'.


In [35]:
# Load the dataset
file_path = "hinglish_pseudo_labeled_fixed.csv"
df_debug = pd.read_csv(file_path)

# Check missing values in key columns
print("Missing Values Count:")
print(df_debug.isnull().sum())

# Show rows where 'tokenized_text' is NaN
df_missing = df_debug[df_debug["tokenized_text"].isna()]
print("\n⚠️ Rows with missing tokenized_text:")
print(df_missing)

# Show the first 5 values of tokenized_text
print("\n🛠 First 5 tokenized_text values:")
print(df_debug["tokenized_text"].head())


Missing Values Count:
publishedTimeText      0
simpleText             0
votes                  0
author                 0
isReply                0
isHearted              0
isPinned               0
isPaid                 0
paidAmount             0
isSponsor              0
sponsorshipMonths      0
cleaned_text           0
tokenized_text         0
predicted_sentiment    0
dtype: int64

⚠️ Rows with missing tokenized_text:
Empty DataFrame
Columns: [publishedTimeText, simpleText, votes, author, isReply, isHearted, isPinned, isPaid, paidAmount, isSponsor, sponsorshipMonths, cleaned_text, tokenized_text, predicted_sentiment]
Index: []

🛠 First 5 tokenized_text values:
Series([], Name: tokenized_text, dtype: object)


In [36]:
import json

# Check if tokenized_text is stored as a string instead of JSON
def check_json_format(text):
    try:
        return json.loads(text.replace("'", '"'))  # Convert single quotes to double quotes
    except json.JSONDecodeError:
        return None  # Invalid format

df_debug["fixed_tokenized_text"] = df_debug["tokenized_text"].apply(check_json_format)

# Count invalid rows
invalid_count = df_debug["fixed_tokenized_text"].isna().sum()
print(f"\n❌ Invalid tokenized_text rows: {invalid_count}")

# Drop invalid rows
df_debug = df_debug.dropna(subset=["fixed_tokenized_text"])
df_debug.to_csv("hinglish_pseudo_labeled_fixed_v2.csv", index=False)
print("\n✅ Fixed dataset saved as 'hinglish_pseudo_labeled_fixed_v2.csv'.")



❌ Invalid tokenized_text rows: 0

✅ Fixed dataset saved as 'hinglish_pseudo_labeled_fixed_v2.csv'.


In [37]:
file_path = "hinglish_pseudo_labeled_fixed_v2.csv"
df_labeled = pd.read_csv(file_path)

# Check again
print(f"✅ Reloaded dataset shape: {df_labeled.shape}")


✅ Reloaded dataset shape: (0, 15)


In [38]:
import pandas as pd

# Load the dataset BEFORE preprocessing
file_path = "hinglish_pseudo_labeled_fixed.csv"
df_original = pd.read_csv(file_path)

# Show dataset shape
print(f"🟢 Original dataset shape: {df_original.shape}")

# Show first few rows
print(df_original.head())


🟢 Original dataset shape: (0, 14)
Empty DataFrame
Columns: [publishedTimeText, simpleText, votes, author, isReply, isHearted, isPinned, isPaid, paidAmount, isSponsor, sponsorshipMonths, cleaned_text, tokenized_text, predicted_sentiment]
Index: []


In [39]:
# Check missing values
print("\n🔍 Missing values in each column:")
print(df_original.isnull().sum())

# Check tokenized_text before cleaning
print("\n🛠 First 5 tokenized_text values BEFORE cleaning:")
print(df_original["tokenized_text"].head())

# Check predicted_sentiment before cleaning
print("\n🛠 First 5 predicted_sentiment values BEFORE cleaning:")
print(df_original["predicted_sentiment"].head())



🔍 Missing values in each column:
publishedTimeText      0
simpleText             0
votes                  0
author                 0
isReply                0
isHearted              0
isPinned               0
isPaid                 0
paidAmount             0
isSponsor              0
sponsorshipMonths      0
cleaned_text           0
tokenized_text         0
predicted_sentiment    0
dtype: int64

🛠 First 5 tokenized_text values BEFORE cleaning:
Series([], Name: tokenized_text, dtype: object)

🛠 First 5 predicted_sentiment values BEFORE cleaning:
Series([], Name: predicted_sentiment, dtype: object)


In [40]:
# Fill missing tokenized_text with empty JSON
df_original["tokenized_text"] = df_original["tokenized_text"].fillna("{}")

# Fill missing predicted_sentiment with "neutral"
df_original["predicted_sentiment"] = df_original["predicted_sentiment"].fillna("neutral")

# Save fixed dataset
df_original.to_csv("hinglish_fixed_final.csv", index=False)
print("✅ Fixed dataset saved as 'hinglish_fixed_final.csv'.")


✅ Fixed dataset saved as 'hinglish_fixed_final.csv'.


In [41]:
df_labeled = pd.read_csv("hinglish_fixed_final.csv")

# Show dataset shape
print(f"✅ Reloaded dataset shape after fixing: {df_labeled.shape}")


✅ Reloaded dataset shape after fixing: (0, 14)


In [42]:
import pandas as pd

# Load dataset
file_path = "hinglish_pseudo_labeled.csv"
df_check = pd.read_csv(file_path)

# Show dataset shape
print(f"🟢 Dataset shape BEFORE pseudo-labeling: {df_check.shape}")

# Show first few rows
print(df_check.head())


🟢 Dataset shape BEFORE pseudo-labeling: (4309, 14)
  publishedTimeText                                         simpleText  votes  \
0      23 hours ago  Underrated guest Sid Warrier, Positive energy ...    804   
1      19 hours ago                        Asli Id se aao warrior bhai     23   
2      17 hours ago                                                Yep      1   
3      23 hours ago  No matter how big this show has gone, seeing S...    645   
4      16 hours ago                                    100 % right 😍🤩🙌      4   

                author  isReply  isHearted  isPinned  isPaid  paidAmount  \
0   @HumanityNotDoomed    False      False     False   False         NaN   
1   @sarcasticaloo2220     True      False     False   False         NaN   
2  @himangimahajan3631     True      False     False   False         NaN   
3            @mesatin2    False      False     False   False         NaN   
4      @mehtasanketd07     True      False     False   False         NaN   

   is

In [43]:
# Check if cleaned text exists
if "cleaned_text" not in df_check.columns:
    raise ValueError("❌ Column 'cleaned_text' is missing! Preprocessing failed.")

print(f"🟢 Sample cleaned_text:\n{df_check['cleaned_text'].head()}")


🟢 Sample cleaned_text:
0    underrated guest sid warrier positive energy n...
1                          asli id se aao warrior bhai
2                                                  yep
3    matter big show gone seeing samay tanmay toget...
4                                                right
Name: cleaned_text, dtype: object


In [46]:
import json
import torch

# Function to safely convert tokenized_text
def safe_convert(text):
    try:
        return eval(text, {"tensor": torch.tensor}) if isinstance(text, str) else text
    except (SyntaxError, ValueError, NameError):
        return None  # Return None for corrupt entries

# Apply safe conversion
df_check["tokenized_text"] = df_check["tokenized_text"].apply(safe_convert)

# Drop rows where conversion failed
df_check = df_check.dropna(subset=["tokenized_text"])

# Save fixed dataset
df_check.to_csv("hinglish_fixed_final.csv", index=False)
print("✅ Tokenized text fixed! Saved as 'hinglish_fixed_final.csv'.")


✅ Tokenized text fixed! Saved as 'hinglish_fixed_final.csv'.


In [51]:
import pandas as pd

# Load fixed dataset
df_labeled = pd.read_csv("hinglish_fixed_final.csv")

# Check dataset shape & sample rows
print(f"✅ Reloaded dataset shape: {df_labeled.shape}")
df_labeled.head()


✅ Reloaded dataset shape: (4309, 14)


Unnamed: 0,publishedTimeText,simpleText,votes,author,isReply,isHearted,isPinned,isPaid,paidAmount,isSponsor,sponsorshipMonths,cleaned_text,tokenized_text,predicted_sentiment
0,23 hours ago,"Underrated guest Sid Warrier, Positive energy ...",804,@HumanityNotDoomed,False,False,False,False,,True,1.0,underrated guest sid warrier positive energy n...,"{'input_ids': tensor([[ 0, 1379, 2175,...",1
1,19 hours ago,Asli Id se aao warrior bhai,23,@sarcasticaloo2220,True,False,False,False,,False,,asli id se aao warrior bhai,"{'input_ids': tensor([[ 0, 50802, 3447,...",1
2,17 hours ago,Yep,1,@himangimahajan3631,True,False,False,False,,False,,yep,"{'input_ids': tensor([[ 0, 113, 4517, 2,...",1
3,23 hours ago,"No matter how big this show has gone, seeing S...",645,@mesatin2,False,False,False,False,,False,,matter big show gone seeing samay tanmay toget...,"{'input_ids': tensor([[ 0, 26866, 6957, 7...",1
4,16 hours ago,100 % right 😍🤩🙌,4,@mehtasanketd07,True,False,False,False,,False,,right,"{'input_ids': tensor([[ 0, 7108, 2, 1,...",1


In [52]:
# Check for missing values
print("\n🔍 Missing values in each column:")
print(df_labeled.isnull().sum())

# If empty, stop and debug
if df_labeled.empty:
    raise ValueError("❌ No valid data left after preprocessing! Check tokenized_text format.")



🔍 Missing values in each column:
publishedTimeText         0
simpleText                0
votes                     0
author                    1
isReply                   0
isHearted                 0
isPinned                  0
isPaid                    0
paidAmount             4309
isSponsor                 0
sponsorshipMonths      3814
cleaned_text            504
tokenized_text            0
predicted_sentiment       0
dtype: int64


In [53]:
import numpy as np

# Fill missing 'cleaned_text' values with an empty string to avoid errors
df_labeled["cleaned_text"].fillna("", inplace=True)

# Drop rows where tokenized_text is missing (these can't be used for training)
df_labeled = df_labeled.dropna(subset=["tokenized_text"])

# Ensure 'predicted_sentiment' column has valid numerical labels
sentiment_map = {"Negative": 0, "Neutral": 1, "Positive": 2}
df_labeled["label"] = df_labeled["predicted_sentiment"].map(sentiment_map)

# Drop any rows where sentiment mapping failed
df_labeled = df_labeled.dropna(subset=["label"])

# Convert 'label' to integer type
df_labeled["label"] = df_labeled["label"].astype(int)

# Save the cleaned dataset
df_labeled.to_csv("hinglish_final_cleaned.csv", index=False)

# Check the dataset shape after cleaning
print(f"✅ After cleaning, dataset shape: {df_labeled.shape}")


✅ After cleaning, dataset shape: (0, 15)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_labeled["cleaned_text"].fillna("", inplace=True)


In [55]:
print(f"Dataset Shape: {df_labeled.shape}")
print(df_labeled.head())  # Display the first few rows


Dataset Shape: (0, 15)
Empty DataFrame
Columns: [publishedTimeText, simpleText, votes, author, isReply, isHearted, isPinned, isPaid, paidAmount, isSponsor, sponsorshipMonths, cleaned_text, tokenized_text, predicted_sentiment, label]
Index: []


In [56]:
print("🔍 Missing values in each column:\n", df_labeled.isnull().sum())


🔍 Missing values in each column:
 publishedTimeText      0
simpleText             0
votes                  0
author                 0
isReply                0
isHearted              0
isPinned               0
isPaid                 0
paidAmount             0
isSponsor              0
sponsorshipMonths      0
cleaned_text           0
tokenized_text         0
predicted_sentiment    0
label                  0
dtype: int64


In [57]:
sentiment_map = {"Negative": 0, "Neutral": 1, "Positive": 2}

# Convert sentiment labels
df_labeled["label"] = df_labeled["predicted_sentiment"].map(sentiment_map)

# Drop rows where sentiment mapping failed
df_labeled = df_labeled.dropna(subset=["label"])

# Convert to integer type
df_labeled["label"] = df_labeled["label"].astype(int)

# Ensure text is properly formatted
df_labeled["cleaned_text"].fillna("", inplace=True)
df_labeled = df_labeled.dropna(subset=["tokenized_text"])  # Drop empty tokenized rows

# Save the cleaned dataset
df_labeled.to_csv("hinglish_final_cleaned.csv", index=False)

print(f"✅ After re-cleaning, dataset shape: {df_labeled.shape}")


✅ After re-cleaning, dataset shape: (0, 15)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_labeled["cleaned_text"].fillna("", inplace=True)


In [59]:
import pandas as pd

# Load the actual Hinglish dataset (replace with correct filename if needed)
file_path = "/content/comments INDIA'S GOT LATENT  EP 10 ft. Raghu Ram _@tanmaybhat_  _@Sidwarrier_ buSdqtdn_4I.csv"
df = pd.read_csv(file_path)

print(f"✅ Original dataset loaded! Shape: {df.shape}")
print(df.head())  # Show first few rows


✅ Original dataset loaded! Shape: (4309, 11)
  publishedTimeText                                         simpleText  votes  \
0      23 hours ago  Underrated guest Sid Warrier, Positive energy ...    804   
1      19 hours ago                        Asli Id se aao warrior bhai     23   
2      17 hours ago                                                Yep      1   
3      23 hours ago  No matter how big this show has gone, seeing S...    645   
4      16 hours ago                                    100 % right 😍🤩🙌      4   

                author  isReply  isHearted  isPinned  isPaid  paidAmount  \
0   @HumanityNotDoomed    False      False     False   False         NaN   
1   @sarcasticaloo2220     True      False     False   False         NaN   
2  @himangimahajan3631     True      False     False   False         NaN   
3            @mesatin2    False      False     False   False         NaN   
4      @mehtasanketd07     True      False     False   False         NaN   

   isSponso

In [60]:
print("🔍 Available columns:", df.columns)
print("🔍 Sample comments:\n", df["simpleText"].head())


🔍 Available columns: Index(['publishedTimeText', 'simpleText', 'votes', 'author', 'isReply',
       'isHearted', 'isPinned', 'isPaid', 'paidAmount', 'isSponsor',
       'sponsorshipMonths'],
      dtype='object')
🔍 Sample comments:
 0    Underrated guest Sid Warrier, Positive energy ...
1                          Asli Id se aao warrior bhai
2                                                  Yep
3    No matter how big this show has gone, seeing S...
4                                      100 % right 😍🤩🙌
Name: simpleText, dtype: object


In [61]:
import re

# Define a simple text cleaner function
def clean_text(text):
    if isinstance(text, str):  # Check if text is valid
        text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
        text = text.lower().strip()  # Convert to lowercase
        return text
    return ""  # If NaN or missing, return empty string

df["cleaned_text"] = df["simpleText"].apply(clean_text)

print("✅ Text cleaning complete! Sample:\n", df[["simpleText", "cleaned_text"]].head())


✅ Text cleaning complete! Sample:
                                           simpleText  \
0  Underrated guest Sid Warrier, Positive energy ...   
1                        Asli Id se aao warrior bhai   
2                                                Yep   
3  No matter how big this show has gone, seeing S...   
4                                    100 % right 😍🤩🙌   

                                        cleaned_text  
0  underrated guest sid warrier positive energy n...  
1                        asli id se aao warrior bhai  
2                                                yep  
3  no matter how big this show has gone seeing sa...  
4                                         100  right  


In [62]:
from transformers import AutoTokenizer

# Load a multilingual tokenizer (XLM-R, mBERT, etc.)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Tokenize the text
df["tokenized_text"] = df["cleaned_text"].apply(lambda x: tokenizer(x, padding="max_length", truncation=True).data)

print("✅ Tokenization complete! Sample:\n", df[["cleaned_text", "tokenized_text"]].head())


✅ Tokenization complete! Sample:
                                         cleaned_text  \
0  underrated guest sid warrier positive energy n...   
1                        asli id se aao warrior bhai   
2                                                yep   
3  no matter how big this show has gone seeing sa...   
4                                         100  right   

                                      tokenized_text  
0  {'input_ids': [0, 1379, 2175, 297, 121399, 78,...  
1  {'input_ids': [0, 50802, 3447, 40, 10, 11, 31,...  
2  {'input_ids': [0, 113, 4517, 2, 1, 1, 1, 1, 1,...  
3  {'input_ids': [0, 110, 26866, 3642, 6957, 903,...  
4  {'input_ids': [0, 805, 7108, 2, 1, 1, 1, 1, 1,...  


In [63]:
if "predicted_sentiment" not in df.columns:
    print("❌ No sentiment labels found! Generating pseudo-labels...")
    df["predicted_sentiment"] = 1  # Default neutral


❌ No sentiment labels found! Generating pseudo-labels...


In [64]:
df.to_csv("hinglish_cleaned.csv", index=False)
print("✅ Hinglish dataset saved as 'hinglish_cleaned.csv'! Ready for training.")


✅ Hinglish dataset saved as 'hinglish_cleaned.csv'! Ready for training.


In [65]:
train_dataset = HinglishDataset(df)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

print(f"✅ Training dataset loaded! Total samples: {len(df)}")


✅ Training dataset loaded! Total samples: 4309


In [67]:
import pandas as pd

# Load the cleaned dataset
file_path = "hinglish_cleaned.csv"
df = pd.read_csv(file_path)

print(f"✅ Reloaded dataset shape: {df.shape}")
print(df.head())  # Show first few rows


✅ Reloaded dataset shape: (4309, 14)
  publishedTimeText                                         simpleText  votes  \
0      23 hours ago  Underrated guest Sid Warrier, Positive energy ...    804   
1      19 hours ago                        Asli Id se aao warrior bhai     23   
2      17 hours ago                                                Yep      1   
3      23 hours ago  No matter how big this show has gone, seeing S...    645   
4      16 hours ago                                    100 % right 😍🤩🙌      4   

                author  isReply  isHearted  isPinned  isPaid  paidAmount  \
0   @HumanityNotDoomed    False      False     False   False         NaN   
1   @sarcasticaloo2220     True      False     False   False         NaN   
2  @himangimahajan3631     True      False     False   False         NaN   
3            @mesatin2    False      False     False   False         NaN   
4      @mehtasanketd07     True      False     False   False         NaN   

   isSponsor  spons

In [68]:
import json
import torch

# Function to safely convert tokenized_text back to dictionary
def convert_tokenized_text(text):
    try:
        return json.loads(text.replace("'", '"'))  # Fix string format
    except (json.JSONDecodeError, TypeError):
        return None  # Return None for corrupt entries

df["tokenized_text"] = df["tokenized_text"].apply(convert_tokenized_text)

# Drop rows where tokenized_text couldn't be fixed
df = df.dropna(subset=["tokenized_text"])

print("✅ Tokenized text successfully converted back!")


✅ Tokenized text successfully converted back!


In [None]:
from torch.utils.data import Dataset, DataLoader

# Define a PyTorch dataset class
class HinglishDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return {
            "input_ids": torch.tensor(item["tokenized_text"]["input_ids"]),
            "attention_mask": torch.tensor(item["tokenized_text"]["attention_mask"]),
            "label": torch.tensor(int(item["predicted_sentiment"]))
        }

# Create dataset & dataloader
train_dataset = HinglishDataset(df)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

print(f"✅ Training dataset loaded! Total samples: {len(df)}")


In [58]:
# Create DataLoader
train_dataset = HinglishDataset(df_labeled)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

print(f"✅ Training dataset loaded! Total samples: {len(df_labeled)}")


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [54]:
import torch
from torch.utils.data import Dataset, DataLoader

# Define Hinglish Dataset class
class HinglishDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        return {
            "input_ids": torch.tensor(eval(item["tokenized_text"])["input_ids"]).squeeze(),
            "attention_mask": torch.tensor(eval(item["tokenized_text"])["attention_mask"]).squeeze(),
            "label": torch.tensor(item["label"], dtype=torch.long),
        }

# Create DataLoader
train_dataset = HinglishDataset(df_labeled)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

print(f"✅ Training dataset loaded! Total samples: {len(df_labeled)}")


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [44]:
import torch

# Ensure dataset has sentences before labeling
if df_check.empty:
    raise ValueError("❌ Dataset is empty! Cannot pseudo-label.")

# Tokenize sentences
df_check["tokenized_text"] = df_check["cleaned_text"].apply(tokenize_text)

# Create dataset & dataloader
unlabeled_dataset = HinglishDataset(df_check)
unlabeled_dataloader = DataLoader(unlabeled_dataset, batch_size=8, shuffle=False)

# Predict labels
df_check["predicted_sentiment"] = predict_sentiment(model, unlabeled_dataloader)

# Save new labeled dataset
df_check.to_csv("hinglish_fixed_final.csv", index=False)
print("✅ Pseudo-labeling fixed! Saved as 'hinglish_fixed_final.csv'.")


ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [33]:
import torch
from torch.utils.data import Dataset, DataLoader

# Load the fixed dataset
file_path = "hinglish_pseudo_labeled_fixed.csv"
df_labeled = pd.read_csv(file_path)

# Convert predicted sentiment labels to integers
df_labeled.rename(columns={"predicted_sentiment": "label"}, inplace=True)
df_labeled["label"] = df_labeled["label"].astype(int)

# HinglishDataset class (Ensure correct tensor data type)
class HinglishDataset(Dataset):
    def __init__(self, dataframe, labeled=True):
        self.data = dataframe
        self.labeled = labeled  # Flag to check if dataset has labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        tokenized_text = json.loads(item["tokenized_text"].replace("'", '"'))  # Ensure JSON format

        data = {
            "input_ids": torch.tensor(tokenized_text["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(tokenized_text["attention_mask"], dtype=torch.long),
        }
        if self.labeled:
            data["labels"] = torch.tensor(item["label"], dtype=torch.long)

        return data

# Create DataLoader for training
train_dataset = HinglishDataset(df_labeled, labeled=True)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

print(f"✅ Training dataset loaded! Total samples: {len(df_labeled)}")


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [28]:
# Check if DataFrame is empty after preprocessing
if df_labeled.empty:
    raise ValueError("❌ No valid data left after preprocessing! Check tokenized_text format.")

# Proceed only if data exists
train_dataset = HinglishDataset(df_labeled, labeled=True)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Retrain the model
train_model(model, train_dataloader, optimizer, loss_fn, lr_scheduler, num_epochs=5)

print("✅ Retraining complete! The model is now updated with pseudo-labeled data.")


ValueError: ❌ No valid data left after preprocessing! Check tokenized_text format.

In [27]:
import torch
from torch.utils.data import Dataset, DataLoader
import json

# Function to safely convert tokenized_text
def safe_convert(text):
    try:
        return json.loads(text.replace("'", '"'))  # Replace single quotes with double quotes for JSON
    except (ValueError, TypeError, json.JSONDecodeError):
        return None  # Return None for corrupt entries

# Load pseudo-labeled dataset
file_path = "hinglish_pseudo_labeled.csv"
df_labeled = pd.read_csv(file_path)

# Convert tokenized_text safely
df_labeled["tokenized_text"] = df_labeled["tokenized_text"].apply(safe_convert)

# Drop rows with invalid tokenized_text
df_labeled = df_labeled.dropna(subset=["tokenized_text"]).reset_index(drop=True)

# Rename 'predicted_sentiment' as 'label' for training
df_labeled.rename(columns={"predicted_sentiment": "label"}, inplace=True)

# Convert labels to integers
df_labeled["label"] = df_labeled["label"].astype(int)

# Fix HinglishDataset class to ensure correct tensor data type
class HinglishDataset(Dataset):
    def __init__(self, dataframe, labeled=True):
        self.data = dataframe
        self.labeled = labeled  # Flag to check if dataset has labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]

        tokenized_text = item["tokenized_text"]

        data = {
            "input_ids": torch.tensor(tokenized_text["input_ids"], dtype=torch.long),  # Fix dtype
            "attention_mask": torch.tensor(tokenized_text["attention_mask"], dtype=torch.long),  # Fix dtype
        }
        if self.labeled:
            data["labels"] = torch.tensor(item["label"], dtype=torch.long)  # Fix dtype

        return data

# Create DataLoader for training
train_dataset = HinglishDataset(df_labeled, labeled=True)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Retrain the model
train_model(model, train_dataloader, optimizer, loss_fn, lr_scheduler, num_epochs=5)

print("✅ Retraining complete! The model is now updated with pseudo-labeled data.")


ValueError: num_samples should be a positive integer value, but got num_samples=0