In [1]:
# Importing the libraries needed
# !pip install transformers==3.0.2
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer
import logging
logging.basicConfig(level=logging.ERROR)

In [2]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
train = pd.read_csv('/kaggle/input/train-tsv/train.tsv', delimiter='\t')

In [4]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
import pandas as pd

# Specify the paths to the Parquet files
parquet_file_path1 = "/kaggle/input/hindi-sentiments/train-00000-of-00001.parquet"
parquet_file_path2 = "/kaggle/input/hindi-sa/train-00000-of-00001 (1).parquet"

# Read the Parquet files into pandas DataFrames
df1 = pd.read_parquet(parquet_file_path1)
df2 = pd.read_parquet(parquet_file_path2)
df = pd.concat([df1, df2], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)



In [6]:
df.head()

Unnamed: 0,text,label
0,हालाकि इसमें दो स्‍क्रीन की खूबी दी गई थी जिसम...,1
1,इसके जरिये आप इन‍कमिंग मैसेज का रिप्‍लाई भी कर...,1
2,अगर केतन केहता की 'रंग रसिया' समय से रिलीज हो ...,1
3,यह डिसप्ले ज्यादातर मामलों में अच्छा प्रदर्शन ...,2
4,इसका नाम बदलकर केवलादेव घना नेशनल पार्क रखा गया ।,1


In [7]:
df['label'].unique()

array([1, 2, 0])

In [8]:
df.describe()

Unnamed: 0,label
count,6662.0
mean,1.234164
std,0.756368
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,2.0


In [9]:
new_df = df[['text', 'label']]

In [10]:
new_df.head()

Unnamed: 0,text,label
0,हालाकि इसमें दो स्‍क्रीन की खूबी दी गई थी जिसम...,1
1,इसके जरिये आप इन‍कमिंग मैसेज का रिप्‍लाई भी कर...,1
2,अगर केतन केहता की 'रंग रसिया' समय से रिलीज हो ...,1
3,यह डिसप्ले ज्यादातर मामलों में अच्छा प्रदर्शन ...,2
4,इसका नाम बदलकर केवलादेव घना नेशनल पार्क रखा गया ।,1


In [11]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 256
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 4
# EPOCHS = 1
LEARNING_RATE = 0.00001
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [12]:
class SentimentData(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [13]:
train_size = 0.8
train_data=new_df.sample(frac=train_size,random_state=200)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = SentimentData(train_data, tokenizer, MAX_LEN)
testing_set = SentimentData(test_data, tokenizer, MAX_LEN)

FULL Dataset: (6662, 2)
TRAIN Dataset: (5330, 2)
TEST Dataset: (1332, 2)


In [14]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [15]:
class RobertaClass(torch.nn.Module):
    def __init__(self):
        super(RobertaClass, self).__init__()
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [16]:
model = RobertaClass()
model.to(device)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [17]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [18]:
def calcuate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

In [19]:
from tqdm import tqdm

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    
    # Wrap training_loader with tqdm for progress tracking
    for _, data in enumerate(tqdm(training_loader, desc=f"Epoch {epoch}")):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
        
        if nb_tr_steps % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct * 100) / nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

EPOCHS = 14
for epoch in range(EPOCHS):
    train(epoch)


Epoch 0:   0%|          | 0/334 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Epoch 0: 100%|██████████| 334/334 [02:06<00:00,  2.63it/s]


The Total Accuracy for Epoch 0: 40.61913696060037
Training Loss Epoch: 1.1073498656292875
Training Accuracy Epoch: 40.61913696060037


Epoch 1: 100%|██████████| 334/334 [02:05<00:00,  2.65it/s]


The Total Accuracy for Epoch 1: 43.63977485928705
Training Loss Epoch: 1.0594262518211752
Training Accuracy Epoch: 43.63977485928705


Epoch 2: 100%|██████████| 334/334 [02:05<00:00,  2.66it/s]


The Total Accuracy for Epoch 2: 50.150093808630395
Training Loss Epoch: 0.9947055008953917
Training Accuracy Epoch: 50.150093808630395


Epoch 3: 100%|██████████| 334/334 [02:05<00:00,  2.65it/s]


The Total Accuracy for Epoch 3: 53.26454033771107
Training Loss Epoch: 0.9562325543629195
Training Accuracy Epoch: 53.26454033771107


Epoch 4: 100%|██████████| 334/334 [02:05<00:00,  2.65it/s]


The Total Accuracy for Epoch 4: 55.40337711069419
Training Loss Epoch: 0.9106192606651854
Training Accuracy Epoch: 55.40337711069419


Epoch 5: 100%|██████████| 334/334 [02:05<00:00,  2.65it/s]


The Total Accuracy for Epoch 5: 57.87992495309568
Training Loss Epoch: 0.8664897288123291
Training Accuracy Epoch: 57.87992495309568


Epoch 6: 100%|██████████| 334/334 [02:05<00:00,  2.65it/s]


The Total Accuracy for Epoch 6: 60.75046904315197
Training Loss Epoch: 0.8383265109119301
Training Accuracy Epoch: 60.75046904315197


Epoch 7: 100%|██████████| 334/334 [02:05<00:00,  2.65it/s]


The Total Accuracy for Epoch 7: 62.4015009380863
Training Loss Epoch: 0.807985848206246
Training Accuracy Epoch: 62.4015009380863


Epoch 8: 100%|██████████| 334/334 [02:06<00:00,  2.65it/s]


The Total Accuracy for Epoch 8: 63.86491557223265
Training Loss Epoch: 0.7814325184343818
Training Accuracy Epoch: 63.86491557223265


Epoch 9: 100%|██████████| 334/334 [02:05<00:00,  2.65it/s]


The Total Accuracy for Epoch 9: 65.29080675422139
Training Loss Epoch: 0.7531841788106336
Training Accuracy Epoch: 65.29080675422139


Epoch 10: 100%|██████████| 334/334 [02:05<00:00,  2.65it/s]


The Total Accuracy for Epoch 10: 67.5234521575985
Training Loss Epoch: 0.7129041882689128
Training Accuracy Epoch: 67.5234521575985


Epoch 11: 100%|██████████| 334/334 [02:05<00:00,  2.65it/s]


The Total Accuracy for Epoch 11: 69.30581613508443
Training Loss Epoch: 0.6820704017035262
Training Accuracy Epoch: 69.30581613508443


Epoch 12: 100%|██████████| 334/334 [02:05<00:00,  2.65it/s]


The Total Accuracy for Epoch 12: 72.13883677298311
Training Loss Epoch: 0.640739407724963
Training Accuracy Epoch: 72.13883677298311


Epoch 13: 100%|██████████| 334/334 [02:06<00:00,  2.65it/s]

The Total Accuracy for Epoch 13: 72.77673545966229
Training Loss Epoch: 0.6156333695182543
Training Accuracy Epoch: 72.77673545966229





In [24]:
from sklearn.metrics import f1_score, recall_score

model.eval()  # Set the model to evaluation mode
total_correct = 0
total_samples = 0

# For F1 and recall calculation
all_predicted_labels = []
all_true_labels = []

with torch.no_grad():
    for batch in test_loader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}
        
        # Forward pass
        outputs = model(**batch)
        
        # Compute predictions
        predicted_labels = torch.argmax(outputs.logits, dim=1)
        
        # Update total number of correct predictions
        total_correct += (predicted_labels == batch["labels"]).sum().item()
        
        # Update total number of samples
        total_samples += len(batch["labels"])
        
        # Collect predicted and true labels for F1 and recall calculation
        all_predicted_labels.extend(predicted_labels.cpu().numpy())
        all_true_labels.extend(batch["labels"].cpu().numpy())

# Calculate accuracy
accuracy = total_correct / total_samples
print(f"Model accuracy on test set: {accuracy}")

# Calculate F1 score and recall
f1 = f1_score(all_true_labels, all_predicted_labels, average='weighted')
recall = recall_score(all_true_labels, all_predicted_labels, average='weighted')

print(f"F1 score on test set: {f1}")
print(f"Recall on test set: {recall}")


NameError: name 'test_loader' is not defined