In [1]:
import json
import numpy as np
import random
import sys
from tqdm import tqdm
sys.path.append('../')
import utils

# Seed both global random number generators (the stdlib `random` module and
# NumPy's legacy global RNG) so repeated runs draw the same random numbers.
SEED = 2023
random.seed(SEED)
np.random.seed(SEED)

def append_data(finetune_data, data, med_names, num_visits, negative_ratio, med_weights=None):
    """Append one Yes/No fine-tuning sample per (visit, candidate drug) pair.

    For each of the first ``num_visits`` rows of ``data``, the drugs listed in
    the row's ``'drug_name'`` field are the positive candidates ("Yes.") and a
    set of other drugs from ``med_names`` are the negative candidates ("No.").
    The candidates of a visit are shuffled before their prompts are generated.

    :param finetune_data: list the generated ``{"input", "output"}`` dicts are
        appended to (mutated in place)
    :param data: table of visits supporting ``len()``, ``.iloc`` slicing and
        ``.iterrows()``; each row's ``'drug_name'`` is a string holding a Python
        list literal
    :param med_names: list of all candidate drug names
    :param num_visits: number of leading rows to use, -1 means all rows
    :param negative_ratio: number of negatives per positive; -1 means every
        drug that is not a positive. If the requested number exceeds the
        available negatives, all of them are used
    :param med_weights: optional per-drug sampling weights aligned with
        ``med_names``; 10 is added to every weight before normalising. ``None``
        samples negatives uniformly
    :return: ``finetune_data`` (the same list object)
    """
    data_len = len(finetune_data)
    prompter = utils.Prompter()
    if num_visits == -1:   # use all the data
        num_visits = len(data)
    data_sample = data.iloc[:num_visits]

    # Ordered, de-duplicated vocabulary. Iterating a set of strings would give a
    # different order in every process (hash randomisation), which would make
    # the shuffled output irreproducible even with fixed seeds.
    unique_meds = list(dict.fromkeys(med_names))
    base_weights = None   # built lazily, only if negatives are actually sampled

    for _, row in tqdm(data_sample.iterrows(), total=len(data_sample)):
        # NOTE(review): eval() executes whatever expression the data file
        # contains. If 'drug_name' is always a plain list literal,
        # ast.literal_eval would be the safe replacement -- confirm and switch.
        pos_med_list = eval(row['drug_name'])
        pos_med_set = set(pos_med_list)
        candidates = [med for med in unique_meds if med not in pos_med_set]

        if negative_ratio == -1 or negative_ratio * len(pos_med_list) > len(candidates):
            # Either all drugs were requested, or more negatives were requested
            # than exist: use every non-positive drug.
            neg_med_list = candidates
        else:
            if base_weights is None:
                if med_weights is None:
                    # No weights supplied: sample uniformly instead of crashing.
                    base_weights = np.ones(len(med_names), dtype=float)
                else:
                    base_weights = np.asarray(med_weights, dtype=float) + 10
            neg_med_weights = base_weights.copy()
            # Positives must never be drawn as negatives.
            pos_mask = np.array([med in pos_med_set for med in med_names], dtype=bool)
            neg_med_weights[pos_mask] = 0
            # Never ask for more distinct drugs than have a non-zero weight.
            num_neg = min(int(negative_ratio * len(pos_med_list)),
                          int(np.count_nonzero(neg_med_weights)))
            if num_neg > 0:
                # replace=False: each negative drug appears at most once per visit.
                neg_med_list = np.random.choice(
                    med_names,
                    num_neg,
                    replace=False,
                    p=neg_med_weights / neg_med_weights.sum(),
                ).tolist()
            else:
                neg_med_list = []

        med_list = pos_med_list + neg_med_list
        random.shuffle(med_list)
        for med in med_list:
            prompt = prompter.generate_input(row, drug_candidate=med)
            output = 'Yes.' if med in pos_med_set else 'No.'
            finetune_data.append({"input": prompt, "output": output})
    print(f'data appended: {len(finetune_data) - data_len}')
    return finetune_data

def get_data4finetune(num_visits, negative_ratio, output_filename, num_val_visits=10):
    '''
    Build the fine-tuning samples from the train and validation splits and save them as JSON.

    The train split is loaded with ``utils.load_data(mode='train')`` and expanded with
    ``append_data`` using ``num_visits`` / ``negative_ratio`` and the loaded drug weights.
    The validation split is loaded with ``utils.load_data(mode='val')`` and always uses
    every non-prescribed drug as a negative (``negative_ratio=-1``).

    :param num_visits: the number of training visits to use, -1 means all the visits
    :param negative_ratio: the ratio of negative samples to positive samples for the
        training split, -1 means all the drugs
    :param output_filename: path of the JSON file the samples are written to
    :param num_val_visits: the number of validation visits to append (default 10,
        the value that used to be hard-coded), -1 means all the visits
    :return: the finetune data, a list of {"input": ..., "output": ...} dicts
    '''
    finetune_data = []

    # train data
    data_train, med_names, med_weights = utils.load_data(mode='train')
    finetune_data = append_data(
        finetune_data, data_train, med_names, num_visits, negative_ratio, med_weights)

    # val data
    data_val, med_names = utils.load_data(mode='val')
    finetune_data = append_data(
        finetune_data, data_val, med_names, num_visits=num_val_visits, negative_ratio=-1)

    print(f'all finetune data generated: {len(finetune_data)}')
    with open(output_filename, 'w') as json_file:
        json.dump(finetune_data, json_file, indent=4)
    # Report success only after the file has been flushed and closed.
    print(f'finetune data saved to {output_filename}\n\n')
    return finetune_data


# Driver: build and save one fine-tuning file per negative-sampling ratio.
#   num_visits     - the number of training visits to use, -1 means all the visits
#                    (a smaller value such as 1000 gives a quicker trial run)
#   negative_ratio - the ratio of negative samples to positive samples,
#                    -1 means all the drugs
num_visits = -1

for negative_ratio in (2,):
    print(f'----------------negative_ratio: {negative_ratio}----------------------')
    output_filename = f'../data/finetune_data_ratio_{negative_ratio}_ips.json'
    # `data` keeps the samples of the last ratio for the inspection cells below.
    data = get_data4finetune(
        num_visits=num_visits,
        negative_ratio=negative_ratio,
        output_filename=output_filename,
    )

----------------negative_ratio: 2----------------------
Total number of drugs: 151
Train data size: 9960



100%|██████████| 9960/9960 [00:15<00:00, 631.81it/s]


data appended: 679403
Total number of drugs: 151
Val data size: 2490



100%|██████████| 10/10 [00:00<00:00, 330.62it/s]

data appended: 1510
all finetune data generated: 680913





finetune data saved to ../data/finetune_data_ratio_2_ips.json




In [1]:
# Scratch arithmetic: 151 expressed as a percentage of 716.
# 151 matches the "Total number of drugs" printed by the data-generation cell;
# what 716 counts is not shown in this notebook -- TODO confirm and document.
100*151/716

21.089385474860336

In [2]:
# Scratch arithmetic: how many times 151 (the printed total number of drugs)
# fits into 716. The meaning of 716 is not shown here -- TODO confirm.
716/151

4.741721854304636

In [5]:
# Compute the average number of whitespace-separated words per instruction
# (the 'input' field of each generated sample).
instruction_len = [len(item['input'].split()) for item in data]
print('instruction average length: {}'.format(np.mean(instruction_len)))


instruction average length: 68.41432346723045


In [6]:
# Compute the fraction of generated samples whose output is 'Yes.'
output_yes = sum(1 for item in data if item['output'] == 'Yes.')
print('output Yes ratio: {}'.format(output_yes / len(data)))

output Yes ratio: 0.2
