In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
import warnings
# NOTE(review): called without a category or module filter, so every warning
# raised after this cell is suppressed, not just a specific noisy one.
warnings.filterwarnings("ignore")

In [4]:
def pre_process_narrative(df):
    """Normalize whitespace in the ``Narrative`` column.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is removed. Missing values stay
    missing.

    The column is overwritten on ``df`` itself and that same frame is returned.

    :param df: pandas.DataFrame with a text ``Narrative`` column
    :return: the same DataFrame, with ``Narrative`` normalized
    """
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    # '\s+' already matches newlines, so no separate '\n' pass is needed.
    df['Narrative'] = (
        df['Narrative']
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip()
    )
    return df

In [5]:
def generate_input(input):
    """Build the question text for one radiology report.

    The report is placed after a fixed lead-in and is followed directly by the
    four numbered yes/no questions.

    :param input: report narrative to embed
    :return: str: lead-in, report text and the four questions, concatenated
    """
    questions = (
        '(1) Does the child have necrotizing enterocolitis? A. Yes. B. No. ',
        '(2) Does the child have pneumatosis? A. Yes. B. No. ',
        '(3) Does the child have portal venous gas? A. Yes. B. No. ',
        '(4) Does the child have free air? A. Yes. B. No. ',
    )
    header = f"""Q: This is a pediatric radiology report: {input}"""
    return header + ''.join(questions)


def generate_instruction():
    """Return the fixed task instruction attached to every example."""
    instruction = 'Please answer the four numbered questions. Write answer using A/B in between <answer></answer>.'
    return instruction
    
def get_ans(x):
    """Translate a label into its answer option.

    :param x: label value; only a value equal to 1 is treated as positive
    :return: str: 'A. Yes' when ``x == 1``, otherwise 'B. No'
    """
    return 'A. Yes' if x == 1 else 'B. No'
def generate_output(jk_nec_features, jk_pneumatosis, jk_pvg, jk_freeair):
    """Format the four labels as the answer block expected from the model.

    The labels are packed into a NumPy array first (so they share one dtype),
    then each is mapped through ``get_ans`` and written on its own line between
    ``<answer>`` tags, in argument order.

    :param jk_nec_features: label for question 1
    :param jk_pneumatosis: label for question 2
    :param jk_pvg: label for question 3
    :param jk_freeair: label for question 4
    :return: str: '<answer>' block with one ' A. Yes' / ' B. No' line per label
    """
    labels = np.array([jk_nec_features, jk_pneumatosis, jk_pvg, jk_freeair])
    body = ''.join(f' {get_ans(label)}\n' for label in labels)
    return '<answer>\n' + body + '</answer>'

def generate_prompt(data_point):
    """Render one example as a single chat-formatted training string.

    The user turn holds a generic preamble, the task instruction and, when the
    example has a non-empty 'input', the input text; the model turn holds the
    expected output.

    :param data_point: mapping with 'instruction', 'input' and 'output' entries
    :return: str: the user turn and model turn joined by a newline
    """
    prefix_text = ('Below is an instruction that describes a task. Write a response that '
                   'appropriately completes the request.\n\n')

    # The input clause is only present for examples that carry context text.
    context = f' here are the inputs {data_point["input"]}' if data_point['input'] else ''

    user_turn = f'<start_of_turn>user {prefix_text} {data_point["instruction"]}{context} <end_of_turn>'
    model_turn = f'<start_of_turn>model{data_point["output"]} <end_of_turn>'
    return user_turn + '\n' + model_turn


def load_training_data(TRAIN_SET):
    """Load the labelled reports CSV and derive the instruction-tuning fields.

    The narrative is whitespace-normalized, wrapped into the question prompt,
    paired with the fixed instruction, and the four label columns are turned
    into the expected answer block.

    :param TRAIN_SET: anything ``pandas.read_csv`` accepts; the file must
        contain the ``Narrative``, ``jk_nec_features``, ``jk_pneumatosis``,
        ``jk_pvg``, ``jk_freeair`` and ``notes`` columns
    :return: pandas.DataFrame with columns ``input``, ``instruction``, ``output``
    """
    raw_df = pd.read_csv(TRAIN_SET)
    # Explicit copy: columns are added below, and writing them onto a selection
    # of raw_df is chained assignment (SettingWithCopyWarning).
    df = raw_df[['Narrative', 'jk_nec_features', 'jk_pneumatosis', 'jk_pvg', 'jk_freeair', 'notes']].copy()
    df = pre_process_narrative(df)
    df['input'] = df['Narrative'].apply(generate_input)
    df['instruction'] = generate_instruction()
    df['output'] = df.apply(
        lambda row: generate_output(
            row['jk_nec_features'], row['jk_pneumatosis'], row['jk_pvg'], row['jk_freeair']
        ),
        axis=1,
    )
    # Return an independent frame so callers can add columns to it safely.
    return df[['input', 'instruction', 'output']].copy()

In [6]:
# NOTE(review): TRAIN_SET is not assigned in any cell shown above; it must be
# defined (it is passed straight to pandas.read_csv) before this cell can run
# on a fresh kernel.
results = load_training_data(TRAIN_SET)

In [7]:
# Show the column names of the frame returned by load_training_data.
results.columns

Index(['input', 'instruction', 'output'], dtype='object')

In [8]:
def generate_prompt(data_point):
    """Render one example as a single chat-formatted training string.

    NOTE(review): this redefines ``generate_prompt`` from an earlier cell with
    the same logic; this later definition is the one in effect afterwards.

    :param data_point: mapping with 'instruction', 'input' and 'output' entries
    :return: str: user turn (preamble, instruction, optional input) followed by
        the model turn containing the output
    """
    prefix_text = 'Below is an instruction that describes a task. Write a response that ' \
               'appropriately completes the request.\n\n'

    has_context = bool(data_point['input'])

    parts = [f"<start_of_turn>user {prefix_text} {data_point['instruction']}"]
    if has_context:
        # Only examples with non-empty input get the input clause.
        parts.append(f" here are the inputs {data_point['input']}")
    parts.append(f" <end_of_turn>\n<start_of_turn>model{data_point['output']} <end_of_turn>")
    return ''.join(parts)

# Render the full training prompt for every row and store it as a new column.
results['prompt'] = results.apply(generate_prompt, axis=1)

In [10]:
from datasets import Dataset

In [12]:
# Convert the pandas frame into a `datasets.Dataset` (the `Dataset` imported
# from the `datasets` package in the previous cell).
train_dataset = Dataset.from_pandas(results)

In [13]:
# Display the dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['input', 'instruction', 'output', 'prompt'],
    num_rows: 300
})

## Build a PyTorch dataset

In [34]:
import torch
from torch.utils.data import Dataset

class AXRDataset(Dataset):
    """Map-style dataset exposing each DataFrame row as a dict of text fields.

    NOTE(review): the base class is whichever ``Dataset`` is bound when this
    cell runs. This cell imports ``torch.utils.data.Dataset``, which shadows
    the ``datasets.Dataset`` imported under the same name in an earlier cell.
    """

    # Columns copied into every sample, in the order they appear in the dict.
    _FIELDS = ('input', 'output', 'instruction', 'prompt')

    def __init__(self, dataframe):
        """Keep a reference to the source frame (no copy is made).

        :param dataframe: pandas.DataFrame containing the ``_FIELDS`` columns
        """
        self.data = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample at positional index ``idx``.

        :param idx: position passed to ``DataFrame.iloc``
        :return: dict with 'input', 'output', 'instruction' and 'prompt' entries
        """
        # Fetch the row once instead of re-materializing it for every field.
        row = self.data.iloc[idx]
        return {field: row[field] for field in self._FIELDS}

# Wrap the prepared frame (including the 'prompt' column) in the dataset class.
AXR_dataset = AXRDataset(results)

# Fetch the first sample and print its four text fields as a sanity check.
sample = AXR_dataset[0]
print(sample)


{'input': 'Q: This is a pediatric radiology report: XR ABDOMEN PORTABLE 1 VIEW, 7/25/2020 at 0511 hours INDICATION: PICC Surveillance COMPARISON: Abdominal radiograph 7/18/2020. Chest radiograph 7/20/2020. IMPRESSION: 1. Lines and tubes: Unchanged gastric tube in the stomach and feeding tube with the tip in the ligament of Treitz. Left lower extremity PICC projects over the L1 vertebral body. 2. Nonobstructive bowel gas pattern. No evidence of pneumatosis. 3. Small left pleural effusion. Bibasilar coarse lung opacities.(1) Does the child have necrotizing enterocolitis? A. Yes. B. No. (2) Does the child have pneumatosis? A. Yes. B. No. (3) Does the child have portal venous gas? A. Yes. B. No. (4) Does the child have free air? A. Yes. B. No. ', 'output': '<answer>\n B. No\n B. No\n B. No\n B. No\n</answer>', 'instruction': 'Please answer the four numbered questions. Write answer using A/B in between <answer></answer>.', 'prompt': '<start_of_turn>user Below is an instruction that describe

In [37]:
# Inspect the fully rendered prompt string of the first sample.
sample['prompt']

'<start_of_turn>user Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n Please answer the four numbered questions. Write answer using A/B in between <answer></answer>. here are the inputs Q: This is a pediatric radiology report: XR ABDOMEN PORTABLE 1 VIEW, 7/25/2020 at 0511 hours INDICATION: PICC Surveillance COMPARISON: Abdominal radiograph 7/18/2020. Chest radiograph 7/20/2020. IMPRESSION: 1. Lines and tubes: Unchanged gastric tube in the stomach and feeding tube with the tip in the ligament of Treitz. Left lower extremity PICC projects over the L1 vertebral body. 2. Nonobstructive bowel gas pattern. No evidence of pneumatosis. 3. Small left pleural effusion. Bibasilar coarse lung opacities.(1) Does the child have necrotizing enterocolitis? A. Yes. B. No. (2) Does the child have pneumatosis? A. Yes. B. No. (3) Does the child have portal venous gas? A. Yes. B. No. (4) Does the child have free air? A. Yes. B. No.  <end_of_turn>

In [None]:
# Alpaca-style template with four positional `{}` slots: one leading slot
# before the section headers, then one under each of Instruction, Input and
# Response.
# NOTE(review): str.format on this template needs at least four positional
# arguments; fewer raises IndexError.
alpaca_prompt = """
{}


### Instruction:
{}

### Input:
{}

### Response:
{}"""

# NOTE(review): `tokenizer` is not created in any cell shown above; it must be
# loaded before this cell runs on a fresh kernel.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples, preamble=None):
    """Render a batch of examples into Alpaca-style training texts.

    ``alpaca_prompt`` has four ``{}`` slots (a leading slot, then instruction,
    input and response). Formatting it with only three values raises
    ``IndexError``, so a value for the leading slot is always supplied here.

    :param examples: batch mapping with parallel 'instruction', 'input' and
        'output' sequences
    :param preamble: text for the template's leading slot; when None, the
        generic task preamble used by ``generate_prompt`` in this notebook
        (without its trailing newlines) is used
    :return: dict: {'text': list of formatted strings, each ending in EOS_TOKEN}
    """
    if preamble is None:
        preamble = ('Below is an instruction that describes a task. Write a response that '
                    'appropriately completes the request.')

    columns = zip(examples["instruction"], examples["input"], examples["output"])
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    texts = [
        alpaca_prompt.format(preamble, instruction, context, response) + EOS_TOKEN
        for instruction, context, response in columns
    ]
    return {"text": texts}

from datasets import load_dataset
# NOTE(review): this loads and formats the "yahma/alpaca-cleaned" train split,
# not the `train_dataset` built from the radiology reports earlier in this
# notebook — confirm which dataset is intended here.
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)