##**Multiclass Text Classification**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [16]:
!pip install -q transformers==4.34.0 datasets==2.14.5 accelerate==0.23.0 evaluate==0.4.1 peft==0.5.0

In [17]:
%cd /content/drive/MyDrive/Project/MCQA/MedMCQA

/content/drive/MyDrive/Project/MCQA/MedMCQA


###**1. Load Data**

In [18]:
import pandas as pd

train_df = pd.read_json('./data/train.json', lines=True)

In [5]:
train_df.head(2)

Unnamed: 0,question,exp,cop,opa,opb,opc,opd,subject_name,topic_name,id,choice_type
0,Chronic urethral obstruction due to benign pri...,Chronic urethral obstruction because of urinar...,3,Hyperplasia,Hyperophy,Atrophy,Dyplasia,Anatomy,Urinary tract,e9ad821a-c438-4965-9f77-760819dfa155,single
1,Which vitamin is supplied from only animal sou...,Ans. (c) Vitamin B12 Ref: Harrison's 19th ed. ...,3,Vitamin C,Vitamin B7,Vitamin B12,Vitamin D,Biochemistry,Vitamins and Minerals,e3d3c4e1-4fb2-45e7-9f88-247cc8f373b3,single


 - Single Choice: A => 0, B => 1, C => 2, D => 3, E => 4
 - Multiple Choice: A => 0, B => 1, C => 2, D => 3, E => 4, AB => 5, BC => 6,..., ABC => 10,..., ABCDE => 12.

In [19]:
from datasets import load_dataset
from datasets import DatasetDict

data_dir = "./data"

raw_dataset = {
    'train': load_dataset('json', data_files=f'{data_dir}/train.json')['train'],
    'valid': load_dataset('json', data_files=f'{data_dir}/dev.json')['train']
}
raw_dataset = DatasetDict(raw_dataset)

###**2. Dataloader**

In [20]:
import torch

id2label = {
    0: 'A',
    1: 'B',
    2: 'C',
    3: 'D',
    4: 'E'
}

label2id = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4
}

num_labels = len(id2label)

def preprocess_function(examples, max_seq_length, tokenizer):
    # Tokenize the texts
    sentences = []
    labels = []
    for example in zip(examples["question"], examples["exp"],
                       examples['opa'], examples['opb'], examples['opc'], examples['opd'],
                       examples['cop']):
        question = example[0]
        context = example[1]
        opa = example[2]
        opb = example[3]
        opc = example[4]
        opd = example[5]
        choices = f"A{opa}. \n B. {opb}. \n C. {opc} \n D. {opd}"
        prompt = f"Context: {context}. Question: {question}. Choice the correct answers from: {choices}"
        sentences.append(prompt)

        answer = example[6]-1
        labels.append(answer)


    model_inputs = tokenizer(
        sentences,
        padding="max_length",
        max_length=max_seq_length,
        truncation=True
    )
    model_inputs["labels"] = torch.tensor(labels)
    return model_inputs

In [21]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType

model_name = "allenai/scibert_scivocab_uncased"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
    use_fast=True
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)

# peft_config = LoraConfig(
#         r=8,
#         lora_alpha=32,
#         lora_dropout=0.05,
#         target_modules=["query", "vavlue"],
#         bias="none"
#     )

# model = get_peft_model(model, peft_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from functools import partial

processed_dataset = raw_dataset.map(
    partial(
        preprocess_function,
        max_seq_length=256,
        tokenizer=tokenizer
    ),
    batched=True,
    load_from_cache_file=False,
    remove_columns=['question', 'exp', 'cop', 'opa', 'opb', 'opc', 'opd', 'subject_name', 'topic_name', 'id', 'choice_type'],
    desc="Running tokenizer on dataset",
)

Running tokenizer on dataset:   0%|          | 0/182822 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/4183 [00:00<?, ? examples/s]

In [23]:
processed_dataset['train']['labels'][0]

2

###**3. Metric**

In [24]:
import numpy as np
import evaluate
from transformers import EvalPrediction

metric = evaluate.load("accuracy")
def compute_metrics(eval_pred: EvalPrediction):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    result = metric.compute(predictions=predictions, references=labels)
    return result

compute_metrics = compute_metrics

###**4. Trainer**

In [25]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    f"./model/{model_name}",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    num_train_epochs=1,
    evaluation_strategy = "steps",
    save_strategy = "steps",
    save_steps = 1000,
    eval_steps=1000,
    save_total_limit = 1,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=processed_dataset["train"],
        eval_dataset=processed_dataset["valid"],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

In [26]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
1000,0.7038,0.915117,0.552952
2000,0.6528,0.916879,0.557973
3000,0.6328,0.891839,0.578054
4000,0.6058,0.899996,0.578532
5000,0.5986,0.880475,0.586899


TrainOutput(global_step=5714, training_loss=0.6615255821155926, metrics={'train_runtime': 2527.5848, 'train_samples_per_second': 72.331, 'train_steps_per_second': 2.261, 'total_flos': 2.4051892520776704e+16, 'train_loss': 0.6615255821155926, 'epoch': 1.0})

In [27]:
%cd /content/drive/MyDrive/Project/MCQA/ViMedicalMCQA

/content/drive/MyDrive/Project/MCQA/ViMedicalMCQA


In [28]:
import pandas as pd
df = pd.read_csv("./data/public_test_with_context_en.csv")

In [36]:
convert_to_binary = {
    0: '10000',
    1: '01000',
    2: '00100',
    3: '00010',
    4: '00001'
}

def predict_per_sample(df, model, tokenizer):
    answers = []
    for idx, row in df.iterrows():
        context = row["context"].split("///")[0]
        question = row["question"]
        choices = [
            op for op in row[["option_1", "option_2", "option_3", "option_4", "option_5", "option_6"]].tolist()
            if isinstance(op, str)
        ]
        text_choices = " ".join(choices)
        prompt = f"Context: {context}. Question: {question}. Choice the correct answers from: {text_choices}"
        model_inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model(**model_inputs)
        prediction = torch.argmax(outputs[0], axis=1)
        answer = convert_to_binary[prediction.item()]
        answer = answer[:len(choices)]
        answers.append(answer)
    return answers

In [30]:
model = model.to("cpu")

In [37]:
answers = predict_per_sample(df, model, tokenizer)
answers

['0010',
 '1000',
 '0010',
 '10',
 '1000',
 '0100',
 '0001',
 '00',
 '100',
 '00',
 '10000',
 '010',
 '010',
 '010',
 '100',
 '010',
 '0001',
 '0010',
 '010',
 '000',
 '100',
 '001',
 '010',
 '100',
 '0100',
 '10',
 '100',
 '010',
 '1000',
 '0001',
 '1000',
 '0100',
 '000',
 '000',
 '000',
 '001',
 '0010',
 '10',
 '100',
 '10',
 '000',
 '010',
 '001',
 '0010',
 '00100',
 '010',
 '100',
 '01',
 '100',
 '001',
 '10',
 '100',
 '100',
 '010',
 '100',
 '0010',
 '01',
 '010',
 '100',
 '000',
 '100',
 '1000',
 '000',
 '1000',
 '100',
 '000',
 '000',
 '100',
 '1000',
 '1000',
 '100',
 '000',
 '000',
 '1000',
 '100',
 '100',
 '0001',
 '1000',
 '0001',
 '000',
 '10000',
 '000',
 '0001',
 '1000',
 '100',
 '10000',
 '100',
 '1000',
 '100',
 '100',
 '0001',
 '1000',
 '0100',
 '1000',
 '1000',
 '1000',
 '1000',
 '0001',
 '1000',
 '100']

In [38]:
df['answer'] = answers

In [39]:
result_df = df[["id", "answer"]]

In [40]:
result_df

Unnamed: 0,id,answer
0,level3_1,0010
1,level3_2,1000
2,level3_5,0010
3,level3_13,10
4,level3_14,1000
...,...,...
95,level4_4,1000
96,level4_9,1000
97,level4_27,0001
98,level4_28,1000


In [41]:
result_df.to_csv('./data/result_en_scibert_multilabel.csv', index=False)

#Multi-label Text Classification