# Libraries

In [61]:
import os
import torch
import requests
from PIL import Image

from sklearn.metrics import classification_report
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoProcessor, AutoModelForImageTextToText, MllamaForConditionalGeneration, TrainingArguments, Trainer
from datasets import load_dataset, Dataset

# Meta Llama 3.2 3B Instruct

In [2]:
model_id = "models/Meta-Llama-3.2-3B-Instruct"

# Load model directly
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [4]:
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]
outputs = pipe(
    messages,
    max_new_tokens=256,
)

print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'role': 'assistant', 'content': "Yer lookin' fer a swashbucklin' chatbot, eh? Alright then, matey! I be Captain Clout, the scurviest pirate bot to ever sail the Seven Seas... o' cyberspace, that be! Me and me trusty AI crew be here to provide ye with treasure troves o' knowledge, answer yer questions, and maybe even engage in a bit o' pirate-y banter, savvy? So hoist the sails and set course fer a grand adventure, me hearty!"}


In [5]:
message1 = [
    {"role": "user", "content": "Who are you?"},
]

outputs = pipe(
    message1,
    max_new_tokens=256,
)

print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'role': 'assistant', 'content': 'I\'m an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."'}


In [6]:
message2 = [
    {"role": "user", "content": "What can you do?"},
]

outputs = pipe(
    message2,
    max_new_tokens=256,
)

print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'role': 'assistant', 'content': "I can be used in a variety of ways, from helping you plan a vacation to creating art. I'm here to assist you in finding the help or information you need. My strengths include answering questions, generating text and images and even just chatting with you."}


In [7]:
message3 = [
    {"role": "user", "content": "What are the symptoms of strep throat?"},
]

outputs = pipe(
    message3,
    max_new_tokens=256,
)

print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'role': 'assistant', 'content': 'The symptoms of strep throat, also known as streptococcal pharyngitis, typically include:\n\n1. Sudden onset of severe sore throat: The throat pain is often described as sharp, stabbing, or burning, and can be intense enough to make it difficult to swallow.\n2. High fever: A high temperature, usually above 101°F (38.3°C), is common in the early stages of strep throat.\n3. White or yellow patches on the tonsils: These patches, also known as exudates, can be seen on the tonsils and may be accompanied by a white or yellowish discharge.\n4. Swollen and tender lymph nodes: The lymph nodes in the neck may become swollen and tender to the touch.\n5. Headache: A headache can occur due to the inflammation and pain caused by the infection.\n6. General feeling of illness: People with strep throat often feel unwell, with symptoms such as fatigue, weakness, and loss of appetite.\n7. Difficulty swallowing: The pain and inflammation in the throat can make it uncomfor

In [8]:
message4 = [
    {"role": "user", "content": "Can you tell me something about Human Phenotype Ontology (HPO)?"},
]

outputs = pipe(
    message4,
    max_new_tokens=256,
)

print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'role': 'assistant', 'content': "Human Phenotype Ontology (HPO) is a controlled vocabulary for describing human phenotypes. It's a standardized way to represent and categorize the wide range of physical, behavioral, and other characteristics that are present in humans.\n\nHere are some key aspects of HPO:\n\n1. **Definition**: HPO is a collaborative effort between the National Center for Biotechnology Information (NCBI) and the Human Phenotype Ontology Consortium (HPOC). It was developed to provide a common language for describing human phenotypes, making it easier to share and compare data across different studies and databases.\n2. **Scope**: HPO covers a vast range of human phenotypes, including genetic, environmental, and lifestyle-related traits. It includes phenotypes related to development, growth, and disease, as well as behavioral and cognitive traits.\n3. **Structure**: HPO uses a hierarchical structure, with phenotypes organized into categories and subcategories. Each pheno

In [9]:
hpo1 = [
    {"role": "user", "content": "What human phenotype do I have if my symptoms are numerous basal cell carcinomas (BCCs), odontogenic keratocysts of the jaws, palmar and/or plantar pits, skeletal abnormalities and intracranial calcifications?"},
]

outputs = pipe(
    hpo1,
    max_new_tokens=256,
)

print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


{'role': 'assistant', 'content': "The human phenotype you're describing is known as Nevoid Basal Cell Carcinoma Syndrome (NBCCS), also referred to as Gorlin syndrome.\n\nNBCCS is a rare genetic disorder caused by mutations in the PTCH1 gene, which is a tumor suppressor gene. This syndrome is characterized by multiple basal cell carcinomas (BCCs), odontogenic keratocysts of the jaws, palmar and/or plantar pits, skeletal abnormalities, and intracranial calcifications.\n\nOther common features of NBCCS include:\n\n- Dental abnormalities, such as supernumerary teeth and impacted teeth\n- Cerebellar ataxia (loss of coordination)\n- Osteoporosis\n- Cleft palate\n- Skin lesions, such as seborrheic keratoses and keratoacanthomas\n- Eye abnormalities, such as cataracts and corneal calcifications\n\nNBCCS is usually inherited in an autosomal dominant pattern, meaning that a single copy of the mutated gene is enough to cause the condition.\n\nIt's essential to note that a diagnosis of NBCCS can o

# NLP4RARE-CM-UC3m

## Load Data

In [44]:
def read_files(data_dir):
    texts = []
    annotations = []
    for filename in os.listdir(data_dir):
        if filename.endswith(".txt"):
            with open(os.path.join(data_dir, filename), 'r') as f:
                texts.append(f.read())
        elif filename.endswith(".ann"):
            with open(os.path.join(data_dir, filename), 'r') as f:
                annotations.append(f.read())
    return texts, annotations

train_texts, train_annotations = read_files("dataset/RareDis-v1/train")
dev_texts, dev_annotations = read_files("dataset/RareDis-v1/dev")
test_texts, test_annotations = read_files("dataset/RareDis-v1/test")

In [45]:
# Debugging: Print the contents of train_texts and train_annotations
print("Train Texts:", train_texts)
print("Train Annotations:", train_annotations)

Train Texts: ['Nephrogenic systemic fibrosis (NSF) is a rare disorder that occurs in some individuals with reduced kidney function, who have been exposed to an intravenous contrast material that contains gadolinium. A contrast material is a dye that is sometimes used during magnetic resonance imaging (MRI). The term, fibrosis, refers to the thickening and scarring of connective tissue, most often the consequence of inflammation or injury. NSF is characterized by thickening and hardening (fibrosis) of the skin, subcutaneous tissues, and, sometimes, underlying skeletal muscle. The arms and legs are most often affected. In some cases, the skin on the trunk can also become involved. This proliferation of fibrotic tissue may become systemic, extending to other areas including the smooth, delicate membrane that surrounds the lungs (pleura), the sac surrounding the heart (pericardium), the thin sheet of muscle that aids respiration by moving up and down when breathing (diaphragm), and the out

In [46]:
# Debugging: Print the contents of train_texts and train_annotations
print("Dev Texts:", dev_texts)
print("Dev Annotations:", dev_annotations)

Dev Texts: ['Hereditary ataxias affect males and females in equal numbers. It is estimated that 150,000 people in the United States are affected by, or at risk for, hereditary ataxia. There is variation among the specific forms of hereditary ataxia as to when they typically first appear. Some ataxias are more common in certain ethnic groups.  For example, SCA3 is more common in the Portuguese population, SCA10 is more common in the Mexican population, and DRPLA is more common in Japan.', 'Infants with EI may be born with red, blistering and denuded skin with visible areas of skin thickening. Over time, there is a gradual decrease in blistering, but an increase in the severity of the scaling and skin thickening. Scales tend to form in parallel rows of spines or ridges. A generalized erythroderma (redness of the skin) may be present in some individuals. Skin infections with common bacteria can be a problem. Heat intolerance is common. A palmoplantar keratoderma may be present and can be 

In [47]:
# Debugging: Print the contents of train_texts and train_annotations
print("Test Texts:", test_texts)
print("Test Annotations:", test_annotations)

Test Texts: ['The autosomal recessive forms of glycogen storage disease IX affect males and females in equal numbers. The X-linked forms primarily affect males, although females can have symptoms, such as enlargement of the liver and, more rarely, females can have symptoms similar to those seen in males. GSD-IX types A, B and C are estimated to affect 1 in 100,000 individuals in the general population. These disorders account for approximately 25% of all glycogen storage disorders making GSD-IX one of the most common forms of these disorders. Because some affected individuals go undiagnosed or misdiagnosed, it is difficult to determine the true frequency of GSD-IX in the general population. GSD-IXd is extremely rare and its prevalence is unknown.', 'Functional neurological disorder (FND) is a medical condition in which there is a problem with the functioning of the nervous system and how the brain and body sends and/or receives signals, rather than a structural disease process such as 

## PreProcess

In [48]:
def preprocessing(texts, annotations):
    processed_data = []
    for text, ann in zip(texts, annotations):
        processed_data.append({"text": text, "annotations": ann})
    return processed_data

train_data = preprocessing(train_texts, train_annotations)
dev_data = preprocessing(dev_texts, dev_annotations)
test_data = preprocessing(test_texts, test_annotations)

In [49]:
print("Processed Training Data:", train_data)

Processed Training Data: [{'text': 'Nephrogenic systemic fibrosis (NSF) is a rare disorder that occurs in some individuals with reduced kidney function, who have been exposed to an intravenous contrast material that contains gadolinium. A contrast material is a dye that is sometimes used during magnetic resonance imaging (MRI). The term, fibrosis, refers to the thickening and scarring of connective tissue, most often the consequence of inflammation or injury. NSF is characterized by thickening and hardening (fibrosis) of the skin, subcutaneous tissues, and, sometimes, underlying skeletal muscle. The arms and legs are most often affected. In some cases, the skin on the trunk can also become involved. This proliferation of fibrotic tissue may become systemic, extending to other areas including the smooth, delicate membrane that surrounds the lungs (pleura), the sac surrounding the heart (pericardium), the thin sheet of muscle that aids respiration by moving up and down when breathing (di

In [50]:
print("Processed Dev Data:", dev_data)

Processed Dev Data: [{'text': 'Hereditary ataxias affect males and females in equal numbers. It is estimated that 150,000 people in the United States are affected by, or at risk for, hereditary ataxia. There is variation among the specific forms of hereditary ataxia as to when they typically first appear. Some ataxias are more common in certain ethnic groups.  For example, SCA3 is more common in the Portuguese population, SCA10 is more common in the Mexican population, and DRPLA is more common in Japan.', 'annotations': 'T1\tRAREDISEASE 0 22\tTownes-Brocks syndrome\nT2\tRAREDISEASE 24 27\tTBS\nT3\tDISEASE 35 70\tautosomal dominant genetic disorder\nT4\tDISEASE 117 133\timperforate anus\nT5\tSIGN 117 133\timperforate anus\nT6\tSIGN 136 149\tabnormal ears\nT7\tSIGN 166 184\thearing impairment\nT8\tSIGN 189 208\tthumb malformations\nT9\tSIGN 210 235\tAbnormalities in the feet\nT10\tSIGN 210 230;237 242\tAbnormalities in the heart\nT11\tSIGN 210 230;247 254\tAbnormalities in the kidneys\nT

In [51]:
print("Processed Test Data:", test_data)

Processed Test Data: [{'text': 'The autosomal recessive forms of glycogen storage disease IX affect males and females in equal numbers. The X-linked forms primarily affect males, although females can have symptoms, such as enlargement of the liver and, more rarely, females can have symptoms similar to those seen in males. GSD-IX types A, B and C are estimated to affect 1 in 100,000 individuals in the general population. These disorders account for approximately 25% of all glycogen storage disorders making GSD-IX one of the most common forms of these disorders. Because some affected individuals go undiagnosed or misdiagnosed, it is difficult to determine the true frequency of GSD-IX in the general population. GSD-IXd is extremely rare and its prevalence is unknown.', 'annotations': 'T1\tDISEASE 16 58\tinherited disorders of fructose metabolism\nT2\tRAREDISEASE 98 119\tEssential fructosuria\nT3\tRAREDISEASE 171 202\tHereditary fructose intolerance\nT4\tRAREDISEASE 204 207\tHFI\nT5\tRARED

In [52]:
# Convert to dictionary
def dictionary_converter(data):
    dict_data = {"text": [], "annotations": []}
    for item in data:
        dict_data["text"].append(item["text"])
        dict_data["annotations"].append(item["annotations"])
    return dict_data

# Convert the processed data to the required format
train_dictionary = dictionary_converter(train_data)
dev_dictionary = dictionary_converter(dev_data)
test_dictionary = dictionary_converter(test_data)

# Dataset Convertion
train_dataset = Dataset.from_dict(train_dictionary)
dev_dataset = Dataset.from_dict(dev_dictionary)
test_dataset = Dataset.from_dict(test_dictionary)

In [56]:
print(train_dataset)
print(dev_dataset)
print(test_dataset)

Dataset({
    features: ['text', 'annotations'],
    num_rows: 729
})
Dataset({
    features: ['text', 'annotations'],
    num_rows: 104
})
Dataset({
    features: ['text', 'annotations'],
    num_rows: 208
})


## Tokenization

In [58]:
# Use the End-of-Sequence Token as the Padding Token
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length")

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_dev = dev_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/729 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

## Fine Tuning the Model

In [62]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_dev,
)

Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


## Train

In [63]:
trainer.train()

OutOfMemoryError: Caught OutOfMemoryError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/torch/nn/parallel/parallel_apply.py", line 96, in _worker
    output = module(*input, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 1163, in forward
    outputs = self.model(
              ^^^^^^^^^^^
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/transformers/models/llama/modeling_llama.py", line 859, in forward
    inputs_embeds = self.embed_tokens(input_ids)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1736, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1747, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/torch/nn/modules/sparse.py", line 190, in forward
    return F.embedding(
           ^^^^^^^^^^^^
  File "/home/bsindala/.conda/envs/dmenv/lib/python3.11/site-packages/torch/nn/functional.py", line 2551, in embedding
    return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 GiB. GPU 0 has a total capacity of 15.89 GiB of which 151.12 MiB is free. Including non-PyTorch memory, this process has 15.74 GiB memory in use. Of the allocated memory 15.37 GiB is allocated by PyTorch, and 33.02 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)


## Evaluate

In [None]:
eval_results = trainer.evaluate()
print(eval_results)

## Test

In [None]:
test_results = trainer.predict(tokenized_test)
print(test_results)

## Validation

In [None]:
predictions = test_results.predictions.argmax(-1)
labels = test_results.label_ids
report = classification_report(labels, predictions)
print(report)