In [1]:
import ast
import json
import random
import sys

import numpy as np
from tqdm import tqdm

# `utils` lives one directory up, so extend the module search path first.
sys.path.append('../')
import utils

# Seed the standard-library and NumPy global random generators with one
# shared value.
SEED = 2023
random.seed(SEED)
np.random.seed(SEED)

def append_data(finetune_data, data, med_names, num_visits, negative_ratio):
    '''
    Append one Yes/No sample per (visit, candidate drug) pair to finetune_data.

    For every selected visit, the drugs listed in the row's 'drug_name' field
    (positives) are combined with negative candidates taken from med_names,
    the combined list is shuffled with the global `random` generator, and one
    {"input": ..., "output": ...} record is appended per candidate.

    :param finetune_data: list that receives the samples; it is extended in place
    :param data: visit table supporting `.iloc` and `.iterrows()`; each row's
                 'drug_name' value is parsed with ast.literal_eval and is then
                 used as a list of drug names
    :param med_names: iterable of all the candidate drug names
    :param num_visits: number of leading visits to use, -1 means all the visits
    :param negative_ratio: number of negatives sampled per positive drug,
                           -1 means every non-prescribed drug is a negative
    :return: finetune_data (the same list object that was passed in)
    :raises ValueError, SyntaxError: if a 'drug_name' value is not a plain
                                     Python literal
    '''
    data_len = len(finetune_data)
    prompter = utils.Prompter()
    if num_visits == -1:   # use all the data
        num_visits = len(data)
    data_sample = data.iloc[:num_visits]

    # De-duplicate the drug vocabulary once, keeping the order of med_names.
    # Building candidates from a set would make their order depend on
    # per-session string hash randomisation, so random.sample/shuffle would
    # produce different data on every run even with a fixed seed.
    all_meds = list(dict.fromkeys(med_names))

    for _, row in tqdm(data_sample.iterrows(), total=len(data_sample)):
        # literal_eval only accepts literals; unlike eval it cannot execute
        # arbitrary expressions that might be embedded in the data file.
        pos_med_list = ast.literal_eval(row['drug_name'])
        pos_med_set = set(pos_med_list)
        neg_candidates = [med for med in all_meds if med not in pos_med_set]

        num_negatives = negative_ratio * len(pos_med_list)
        if negative_ratio == -1 or num_negatives > len(neg_candidates):
            # either all drugs were requested, or more negatives were asked
            # for than exist: use every non-prescribed drug
            neg_med_list = neg_candidates
        else:   # randomly sample negative samples
            neg_med_list = random.sample(neg_candidates, num_negatives)

        med_list = pos_med_list + neg_med_list
        random.shuffle(med_list)
        for med in med_list:
            prompt_text = prompter.generate_input(row, drug_candidate=med)
            output = 'Yes.' if med in pos_med_set else 'No.'
            finetune_data.append({"input": prompt_text, "output": output})
    print('data appended: {}'.format(len(finetune_data) - data_len))
    return finetune_data

def get_data4finetune(num_visits, negative_ratio, output_filename,
                      val_num_visits=10, val_negative_ratio=-1):
    '''
    Build the finetune data from the train and val splits and save it as JSON.

    The train split is loaded with utils.load_data(mode='train') and the val
    split with utils.load_data(mode='val'); both are converted to samples with
    append_data and collected in one list, train samples first.

    :param num_visits: the number of train visits to be used, -1 means all the visits
    :param negative_ratio: the ratio of negative samples to positive samples
                           for the train split, -1 means all the drugs
    :param output_filename: the output filename (JSON, written with indent=4)
    :param val_num_visits: the number of val visits to be used (default 10)
    :param val_negative_ratio: the negative ratio for the val split
                               (default -1, i.e. all the drugs)
    :return: the finetune data
    '''
    finetune_data = []

    # train data; the third value returned in 'train' mode is not used here
    data_train, med_names, _ = utils.load_data(mode='train')
    finetune_data = append_data(
        finetune_data, data_train, med_names, num_visits, negative_ratio)

    # val data
    data_val, med_names = utils.load_data(mode='val')
    finetune_data = append_data(
        finetune_data, data_val, med_names,
        num_visits=val_num_visits, negative_ratio=val_negative_ratio)

    print('all finetune data generated: {}'.format(len(finetune_data)))
    with open(output_filename, 'w', encoding='utf-8') as json_file:
        json.dump(finetune_data, json_file, indent=4)
    # report success only after the file has been closed
    print('finetune data saved to {}\n\n'.format(output_filename))
    return finetune_data


# Sweep settings
#   num_visits:     the number of visits to be used, -1 means all the visits
#                   (a small value such as 100 gives a quick trial run)
#   negative_ratio: the ratio of negative samples to positive samples,
#                   -1 means all the drugs
num_visits = -1

# Generate and save one finetune file per negative ratio. For a trial run
# that should not overwrite these files, point output_filename at a scratch
# path such as '../data/finetune_data_test.json' instead.
# After the loop, `data` holds the samples of the last ratio only.
banner_template = '----------------negative_ratio: {}----------------------'
filename_template = '../data/finetune_data_ratio_{}.json'
for negative_ratio in (4, 7, 10):
    print(banner_template.format(negative_ratio))
    output_filename = filename_template.format(negative_ratio)
    data = get_data4finetune(num_visits, negative_ratio, output_filename)

----------------negative_ratio: 4----------------------
Total number of drugs: 151
Train data size: 9960



100%|██████████| 9960/9960 [00:20<00:00, 475.59it/s]


data appended: 1053416
Total number of drugs: 151
Val data size: 2490



100%|██████████| 10/10 [00:00<00:00, 335.99it/s]

data appended: 1510
all finetune data generated: 1054926





finetune data saved to ../data/finetune_data_ratio_4.json


----------------negative_ratio: 7----------------------
Total number of drugs: 151
Train data size: 9960



100%|██████████| 9960/9960 [00:25<00:00, 390.43it/s]


data appended: 1325878
Total number of drugs: 151
Val data size: 2490



100%|██████████| 10/10 [00:00<00:00, 336.70it/s]

data appended: 1510
all finetune data generated: 1327388





finetune data saved to ../data/finetune_data_ratio_7.json


----------------negative_ratio: 10----------------------
Total number of drugs: 151
Train data size: 9960



100%|██████████| 9960/9960 [00:27<00:00, 365.10it/s]


data appended: 1423232
Total number of drugs: 151
Val data size: 2490



100%|██████████| 10/10 [00:00<00:00, 337.05it/s]

data appended: 1510
all finetune data generated: 1424742





finetune data saved to ../data/finetune_data_ratio_10.json




In [5]:
# Average number of whitespace-separated words in each sample's input text
instruction_len = [len(sample['input'].split()) for sample in data]
average_len = np.mean(instruction_len)
print('instruction average length: {}'.format(average_len))


instruction average length: 68.41432346723045


In [6]:
# Fraction of samples whose output is 'Yes.'
output_yes = sum(1 for sample in data if sample['output'] == 'Yes.')
yes_ratio = output_yes / len(data)
print('output Yes ratio: {}'.format(yes_ratio))

output Yes ratio: 0.2
