In [1]:
import json
import torch
from transformers import LlamaTokenizer
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

# Load your JSON data
with open(r'D:\Projects\VA_new\qa_data.json', 'r') as f:
    data = json.load(f)

# Create a mapping from question numbers to labels
question_to_label = {item['question_number']: idx for idx, item in enumerate(data)}

# Define a custom dataset
class CustomDataset(Dataset):
    def __init__(self, data):
        # Load tokenizer from local files
        self.tokenizer = RobertaTokenizer.from_pretrained(r'C:\Users\vinaydeekshitGarimel\Downloads\roberta')
        self.encodings = self.tokenizer(
            [item['question'] for item in data],
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors='pt'
        )
        # Ensure labels are using the mapped indices
        self.labels = torch.tensor([question_to_label[item['question_number']] for item in data])  # Map question_number to label

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.labels[idx]
        }

    def __len__(self):
        return len(self.labels)

# Prepare the dataset
dataset = CustomDataset(data)

# Load the model from local files with the correct number of labels and ignore size mismatches
model = RobertaForSequenceClassification.from_pretrained(
    r'C:\Users\vinaydeekshitGarimel\Downloads\roberta',
    num_labels=1291,
    ignore_mismatched_sizes=True
)

# Create and save the tokenizer after loading the model
tokenizer = RobertaTokenizer.from_pretrained(r'C:\Users\vinaydeekshitGarimel\Downloads\roberta')

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    per_device_train_batch_size=4,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=10,
    evaluation_strategy='no',  # Change this line
    save_total_limit=2,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Start training
trainer.train()

# Save the model
model.save_pretrained(r'C:\Users\vinaydeekshitGarimel\Downloads\roberta-finetuned')
tokenizer.save_pretrained(r'C:\Users\vinaydeekshitGarimel\Downloads\roberta-finetuned-tokenizer')


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at C:\Users\vinaydeekshitGarimel\Downloads\roberta and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at C:\Users\vinaydeekshitGarimel\Downloads\roberta and are newly initialized because the shapes did not match:
- roberta.embeddings.position_embeddings.weight: found shape torch.Size([514, 768]) in the checkpoint and torch.Size([512, 768]) in the model instantiated
- roberta.embeddings.token_type_embeddings.weight: found shape torch.Size([1, 768]) in the checkpoint and torch.Size([2, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predi

Step,Training Loss
10,7.2403
20,7.1865
30,7.2356
40,7.2238
50,7.2471
60,7.224
70,7.2153
80,7.2067
90,7.2254
100,7.2305


('C:\\Users\\vinaydeekshitGarimel\\Downloads\\roberta-finetuned-tokenizer\\tokenizer_config.json',
 'C:\\Users\\vinaydeekshitGarimel\\Downloads\\roberta-finetuned-tokenizer\\special_tokens_map.json',
 'C:\\Users\\vinaydeekshitGarimel\\Downloads\\roberta-finetuned-tokenizer\\vocab.json',
 'C:\\Users\\vinaydeekshitGarimel\\Downloads\\roberta-finetuned-tokenizer\\merges.txt',
 'C:\\Users\\vinaydeekshitGarimel\\Downloads\\roberta-finetuned-tokenizer\\added_tokens.json')

In [5]:
import json
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Load the fine-tuned model and tokenizer
model_path = r'C:\Users\vinaydeekshitGarimel\Downloads\roberta-finetuned'
tokenizer_path = r'C:\Users\vinaydeekshitGarimel\Downloads\roberta-finetuned-tokenizer'

tokenizer = RobertaTokenizer.from_pretrained(tokenizer_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)

# Ensure the model is in evaluation mode
model.eval()

# Load the question-to-answer mapping from your original JSON data
with open(r'D:\Projects\VA_new\qa_data.json', 'r') as f:
    data = json.load(f)

# Create a mapping from label index to the answer
label_to_answer = {idx: item['answer'] for idx, item in enumerate(data)}

def predict(question):
    inputs = tokenizer(
        question,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt'
    )

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    predicted_label = torch.argmax(logits, dim=1).item()
    
    # Debug print for predicted label
    print(f"Predicted Label for question '{question}': {predicted_label}")

    answer = label_to_answer.get(predicted_label, "No answer found for this label.")
    return answer


# Test the model with sample questions
test_questions = [
    "How can I find lost luggage?",
    "What airport facilities are available in the terminal?"
]

for question in test_questions:
    answer = predict(question)
    print(f"Question: {question} => Answer: {answer}")


Predicted Label for question 'How can I find lost luggage?': 646
Question: How can I find lost luggage? => Answer: The PRM Waiting Area is located at hyderabad airport at both Arrivals and Departures, offering PRM passengers a comfortable place to wait until their pre-booked assistance arrives or until they are ready to proceed.
Predicted Label for question 'What airport facilities are available in the terminal?': 646
Question: What airport facilities are available in the terminal? => Answer: The PRM Waiting Area is located at hyderabad airport at both Arrivals and Departures, offering PRM passengers a comfortable place to wait until their pre-booked assistance arrives or until they are ready to proceed.
