In [None]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 KB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.14-py39-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m42.3

In [None]:


import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset

# Load your dataset into a Pandas DataFrame
df = pd.read_csv('cleaned_radiology.csv')
print(df.columns)

# Define the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
# GPT-2 ships without a padding token; reuse EOS so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Tunables for this cell
MAX_INPUT_TOKENS = 512   # truncate each report to this many tokens
EVAL_FRACTION = 0.1      # share of reports held out for evaluation
SEED = 42                # makes the train/eval split reproducible
BEST_MODEL_DIR = 'output_dir/best_model'  # the testing cell loads from here


def preprocess_function(examples):
    """Tokenize a batch of reports for causal language modelling.

    GPT-2 is a decoder-only model, so only ``input_ids`` / ``attention_mask``
    are produced; the labels are created later by the data collator.

    Args:
        examples: Batch dict as passed by ``Dataset.map(batched=True)``;
            the ``'text'`` column (a list of strings) is read.

    Returns:
        The tokenizer output for the batch, truncated to
        ``MAX_INPUT_TOKENS`` tokens per example. No padding is applied here;
        the collator pads each batch dynamically.
    """
    return tokenizer(examples['text'], truncation=True, max_length=MAX_INPUT_TOKENS)


# Define a Dataset for your dataset (skip missing / blank reports, which
# would otherwise tokenize to empty sequences)
texts = [text for text in df['cleaned_text'].dropna().astype(str) if text.strip()]
dataset = Dataset.from_dict({'text': texts})

# Tokenize up front: Trainer expects already-tokenized features, it has no
# preprocessing hook of its own.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=['text'])

# Hold out an evaluation split; required by the step-wise evaluation and by
# `load_best_model_at_end` below.
splits = tokenized_dataset.train_test_split(test_size=EVAL_FRACTION, seed=SEED)

# Define a DataCollator for language modeling
# mlm=False -> causal LM objective (labels are a copy of input_ids with
# padding positions ignored); GPT-2 has no mask token for masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define the training arguments
# NOTE: recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; adjust the keyword if your installed version rejects it.
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='steps',
    eval_steps=500,
    save_strategy='steps',   # must match the evaluation strategy for load_best_model_at_end
    save_steps=500,
    save_total_limit=2,
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=500,
    load_best_model_at_end=True
)

# Define a Trainer object and fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splits['train'],
    eval_dataset=splits['test'],
    data_collator=data_collator
)

trainer.train()

# Persist the (best) fine-tuned weights where the testing cell expects them.
trainer.save_model(BEST_MODEL_DIR)


Index(['Unnamed: 0', 'icustay_id', 'hr', 'category', 'description', 'text',
       'mortality', 'cleaned_text'],
      dtype='object')


TypeError: ignored

In [None]:
''' Testing '''

# GPT-2 is a decoder-only model: it has to be loaded through the causal-LM
# auto class (the seq2seq auto class does not support GPT-2 checkpoints).
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "output_dir/best_model"  # Replace with the path of the saved model
tokenizer_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

# Upper bound on the number of *generated* tokens (the prompt is not counted).
MAX_NEW_TOKENS = 256

input_text = "Reason: CEREBRAL INFARCTION MEDICAL CONDITION: 74 year old man with cerebellar infarct hemorrhage 4th ventricle occlusion. REASON FOR THIS EXAMINATION: compare to outside hospital MRI. No contraindications for IV contrast FINAL REPORT HISTORY: Cerebellar infarction hemorrhage fourth ventricle occlusion. Compared to outside hospital MRI. COMPARISON: No previous studies in PACS. The outside hospital MRI is not available for comparison. TECHNIQUE: Noncontrast head CT. FINDINGS: There is a large subacute infarction in the right cerebellar hemisphere which extends to the vermis with associated compression of the fourth ventricle. The lateral and third ventricles are dilated. Evaluation of the lower posterior fossa for hemorrhage is limited by bone related artifact and motion artifact. Small amount of blood may be present within the right cerebellar infarction. There is a chronic infarction in the left cerebellar hemisphere. There are multiple small foci of low density in the subcortical and periventricular white matter of the cerebral hemispheres bilaterally consistent with chronic microvascular ischemia. The visualized osseous structures appear unremarkable. IMPRESSION: 1. Subacute right cerebellar infarction with compression of the fourth ventricle and moderate dilatation of the lateral and third ventricles. 2. Possible hemorrhage in the right cerebellar infarction. 3. When the outside hospital MRI is scanned into PACS an addendum to this report may be issued on request. DFDkq "

# Truncate the prompt so that prompt + generated tokens fit in the model's
# context window.
max_prompt_tokens = model.config.n_positions - MAX_NEW_TOKENS
encoded = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_prompt_tokens)
input_ids = encoded["input_ids"]

output_ids = model.generate(
    input_ids,
    attention_mask=encoded["attention_mask"],
    num_beams=4,
    length_penalty=2.0,
    # `max_length` would count the prompt tokens too, and the prompt alone is
    # longer than 256 tokens; bound only the continuation instead.
    max_new_tokens=MAX_NEW_TOKENS,
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id,  # GPT-2 defines no pad token
)

# A causal LM returns prompt + continuation; keep only the generated part.
prompt_length = input_ids.shape[1]
output_text = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

print("Generated Summary:", output_text)
