## 💬 Tokenizer for Fine-Tuning Language Model

In [1]:
# Print the installed version of the transformers library.
import transformers
print(transformers.__version__)

  from .autonotebook import tqdm as notebook_tqdm


4.51.3


In [2]:
# Load the tokenizer for the "xlm-roberta-base" checkpoint.
# `tokenizer` is read as a module-level global by the labelling function below.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [3]:
# Load the prescription data splits (train / eval / test) from CSV files.
# The paths are relative to the current working directory.
import pandas as pd
dataset_for_train = pd.read_csv("finetuning_data/train.csv")
dataset_for_eval = pd.read_csv("finetuning_data/eval.csv")
dataset_for_test = pd.read_csv("finetuning_data/test.csv")

In [4]:
import random

def shuffle_token_label_chunks(chunks):
    """Shuffle chunks and flatten them into aligned token and label lists.

    Chunks are reordered as whole units, so the tokens (and labels) that
    belong to one chunk stay contiguous and in their original inner order.

    Args:
        chunks: Iterable of ``(chunk_tokens, chunk_labels)`` pairs.

    Returns:
        A ``(tokens, ner_tags)`` tuple of flat lists built by concatenating
        the chunks in a random order.
    """
    # Shuffle a copy so the caller's sequence is left untouched; this also
    # lets callers pass any iterable (e.g. a tuple), not only a list.
    shuffled_chunks = list(chunks)
    random.shuffle(shuffled_chunks)

    tokens = []
    ner_tags = []
    for chunk_tokens, chunk_labels in shuffled_chunks:
        tokens.extend(chunk_tokens)
        ner_tags.extend(chunk_labels)
    return tokens, ner_tags

In [5]:
# Function to automatically generate NER tag labels for each data row
def automate_ner_tags_xlm_roberta_tokenizer(sentence):
    """Tokenize one data row field by field and attach BIO NER labels.

    Each field is tokenized with the module-level ``tokenizer``. Fields
    mapped to an entity get ``B-<ENTITY>`` on their first token and
    ``I-<ENTITY>`` on the rest; all other fields are labelled ``"O"``.
    The per-field (tokens, labels) chunks are then passed to
    ``shuffle_token_label_chunks`` so the field order is randomized.

    Args:
        sentence: Row indexable by column name (the script passes the
            pandas rows yielded by ``DataFrame.iterrows()``).

    Returns:
        A dict with ``"tokens"`` (flat token list) and ``"labels"`` (flat
        label list) in the shuffled field order.
    """
    # (column, entity tag or None for "O", coerce value with str()).
    # The order fixes the chunk order handed to the shuffler.
    # NOTE(review): only the patient columns are coerced with str(), as in
    # the original code - presumably because pandas may parse them as
    # non-string values; confirm against the CSV schema.
    field_specs = (
        ("patient_name", None, True),
        ("patient_id", None, True),
        ("patient_birthdate", None, True),
        ("drug_name", "DRUG_NAME", False),
        ("dosage", "DOSAGE", False),
        ("form", "FORM", False),
        ("drug_reg_no", None, False),
        ("mfg_date", None, False),
        ("exp_date", None, False),
        ("warnings", "WARNINGS", False),
        ("indications", "INDICATIONS", False),
        ("usage_instructions", "USAGE_INSTRUCTIONS", False),
    )

    chunks = []
    for column, entity, coerce_to_str in field_specs:
        value = sentence[column]
        text = str(value) if coerce_to_str else value
        field_tokens = tokenizer.tokenize(text)

        if entity is None:
            field_labels = ["O"] * len(field_tokens)
        elif field_tokens:
            field_labels = [f"B-{entity}"] + [f"I-{entity}"] * (len(field_tokens) - 1)
        else:
            # A field that yields no tokens must yield no labels either;
            # emitting a lone "B-" tag here would leave tokens and labels
            # with different lengths.
            field_labels = []

        chunks.append((field_tokens, field_labels))

    # Shuffle the chunks
    shuffled_tokens, shuffled_ner_tags = shuffle_token_label_chunks(chunks)

    return {
        "tokens": shuffled_tokens,
        "labels": shuffled_ner_tags
    }

In [6]:
# Call the function for the training, evaluation and test datasets.
# NOTE: each name is rebound from a DataFrame to a list of dicts, so this
# cell cannot be re-run without reloading the CSV files first.
dataset_for_train = [automate_ner_tags_xlm_roberta_tokenizer(row) for _, row in dataset_for_train.iterrows()]
dataset_for_eval = [automate_ner_tags_xlm_roberta_tokenizer(row) for _, row in dataset_for_eval.iterrows()]
dataset_for_test = [automate_ner_tags_xlm_roberta_tokenizer(row) for _, row in dataset_for_test.iterrows()]

# Save each split as a JSON file
import json
import os

outputs = (
    ('./shuffled_finetuning_data/train.json', dataset_for_train),
    ('./shuffled_finetuning_data/eval.json', dataset_for_eval),
    ('./shuffled_finetuning_data/test.json', dataset_for_test),
)
for output_path, records in outputs:
    # open(..., 'w') does not create missing parent directories, so make
    # sure the output folder exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False writes non-ASCII token text as-is rather
        # than as \uXXXX escapes.
        json.dump(records, f, ensure_ascii=False, indent=4)