In [1]:
!pip install transformers
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.preprocessing import LabelEncoder



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('codemix-main/train.csv')
print("Number of rows in data =",data.shape[0])
#shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

Number of rows in data = 115000


In [3]:
X = data['tweets']
y = data['labels']
#take only 3000 samples
X = X[:80000]
y = y[:80000]

In [4]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [5]:
# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.1, random_state=42)

In [6]:
# Initialize the HingBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/hing-mbert")


In [7]:
# Tokenize the input data
X_train_encodings = tokenizer.batch_encode_plus(
    X_train.tolist(),
    truncation=True,
    padding=True,
    max_length=512,  # Adjust the max length as per your requirements
    return_tensors='pt'
)

X_test_encodings = tokenizer.batch_encode_plus(
    X_test.tolist(),
    truncation=True,
    padding=True,
    max_length=512,  # Adjust the max length as per your requirements
    return_tensors='pt'
)

In [8]:
# Create the DataLoader for training and testing sets
train_dataset = TensorDataset(
    X_train_encodings['input_ids'],
    X_train_encodings['attention_mask'],
    torch.tensor(y_train, dtype=torch.long)
)

test_dataset = TensorDataset(
    X_test_encodings['input_ids'],
    X_test_encodings['attention_mask'],
    torch.tensor(y_test, dtype=torch.long)
)

In [9]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [10]:
# Create the DistilBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("l3cube-pune/hing-mbert", num_labels=2, ignore_mismatched_sizes=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at l3cube-pune/hing-mbert and are newly initialized: ['bert.pooler.dense.weight', 'classifier.weight', 'bert.pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)



In [12]:
# Fine-tune the mBERT model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [13]:
model.train()
for epoch in range(12):  # Set the number of training epochs
    total_loss = 0
    for batch in train_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1} - Average Loss: {total_loss/len(train_loader)}")


Epoch 1 - Average Loss: 0.32573047817539835
Epoch 2 - Average Loss: 0.22131142333460352
Epoch 3 - Average Loss: 0.16581369336559953
Epoch 4 - Average Loss: 0.11148681185635118
Epoch 5 - Average Loss: 0.08208528314757213
Epoch 6 - Average Loss: 0.06435111194345194
Epoch 7 - Average Loss: 0.05143170700818559
Epoch 8 - Average Loss: 0.04706224017137558
Epoch 9 - Average Loss: 0.04384139218358582
Epoch 10 - Average Loss: 0.038430325873409554
Epoch 11 - Average Loss: 0.03488959990163049
Epoch 12 - Average Loss: 0.03262718330887987


In [14]:
# Evaluate the model on the test set
model.eval()
with torch.no_grad():
    predicted_labels = []
    true_labels = []

    for batch in test_loader:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

In [15]:
# Decode the predicted and true labels
predicted_labels = label_encoder.inverse_transform(predicted_labels)
true_labels = label_encoder.inverse_transform(true_labels)

In [16]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)

f1 = f1_score(true_labels, predicted_labels, average='macro')
precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')

print("F1:", f1)
print("Precision:", precision)
print("Recall:", recall)


Accuracy: 0.885625
F1: 0.8855763252414797
Precision: 0.8860200452440747
Recall: 0.8855383500350886


In [17]:
data2 = pd.read_csv('codemix-main/test.csv')
#calculate accuracy on test data
X2 = data2['tweets']
y2 = data2['labels']
y2_encoded = label_encoder.fit_transform(y2)
X2 = X2
y2 = y2_encoded
X2_encodings = tokenizer.batch_encode_plus(
    X2.tolist(),
    truncation=True,
    padding=True,
    max_length=512,  # Adjust the max length as per your requirements
    return_tensors='pt'
)
test_dataset2 = TensorDataset(
    X2_encodings['input_ids'],
    X2_encodings['attention_mask'],
    torch.tensor(y2, dtype=torch.long)
)
test_loader2 = DataLoader(test_dataset2, batch_size=16, shuffle=False)
model.eval()
with torch.no_grad():
    predicted_labels = []
    true_labels = []

    for batch in test_loader2:
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

# Decode the predicted and true labels
predicted_labels = label_encoder.inverse_transform(predicted_labels)
true_labels = label_encoder.inverse_transform(true_labels)
accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)
f1 = f1_score(true_labels, predicted_labels, average='macro')
precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')
print("F1:", f1)
print("Precision:", precision)
print("Recall:", recall)


Accuracy: 0.8846333333333334
F1: 0.8845598301401133
Precision: 0.8854344752834016
Recall: 0.8845669267625342
