In [1]:
# Silence UserWarning and FutureWarning messages for the rest of the session.
# The filters are registered before pandas is imported, so they are already
# active while that import runs; keep this statement order.
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
import pandas as pd
# Display every column and keep wide frames on a single line instead of wrapping.
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)

# Read the raw ADR people extract and normalise it in one pipeline:
#   * drop the 'Unnamed: 0' column,
#   * force item_number to text and trim surrounding whitespace,
#   * parse create_datetime from day/month/two-digit-year strings.

df = (
    pd.read_csv('ADR_people.csv')
    .drop(columns='Unnamed: 0')
    .assign(
        item_number=lambda frame: frame['item_number'].astype(str).str.strip(),
        create_datetime=lambda frame: pd.to_datetime(frame['create_datetime'], format='%d/%m/%y'),
    )
)

# ADR code list, defined inline: item_number values that the cells below treat
# as an adverse drug reaction (ADR) event.
# Matching is exact and case-sensitive (`isin` / `in`), so 'T78' does not match 'T78.0'.
# NOTE(review): these look like ICD-10 diagnosis / external-cause codes — confirm the source list.

ADR_list = ['D52.1', 'D59.0','D59.2','D61.1', 'D64.2','D68.3','D69.5','E03.2',
            'E06.4','E15','E16.0','E23.1','E24.2','E27.3','E66.1','F11', 'F13',
            'F15', 'F19','F55','G21.0','G21.1','G21.2','G24.0','G25.0','G25.1',
            'G25.4','G25.6','G44.4','G62.0','G72.0','H26.3','H40.6','H91.0','I42.7',
            'I95.2','J70.2','J70.3','J70.4','K52.1','K71.1','K71.2','K71.6','K71.9',
            'K85.3','L10.5','L23.3','L24.4','L25.1','L27','L27.0','L27.1','L27.8',
            'L27.9','L43.2','L51','L51.1','L51.2','L51.8','L51.9','L56.0','L56.1',
            'L64.0','M10.2','M32.0','M34.2','M80.4','M81.4','M83.5','M87.1','N14.0',
            'N14.1','N14.2','N14.3','N14.4','N99.0','O35.5','O74.2','O74.3','O74.4',
            'P04.0','P04.1','P04.4','P58.4','P93','P96.1','P96.2','Q86.1','Q86.2',
            'R50.2','T36','T37','T38','T39','T39.1','T40','T40.3','T41','T42','T43',
            'T44','T45','T46','T46.0','T47','T48','T49','T50','T78','T78.2','T78.3',
            'T78.4','T78.8','T78.9','T80.1','T80.2','T80.3','T80.4','T80.5','T80.6',
            'T80.8','T80.9','T88.3','T88.6','T88.7','X40','X41','X42','X43','X44',
            'X45.5','Y40','Y41','Y42','Y42.7','Y43','Y44','Y44.0','Y44.2','Y45',
            'Y45.5','Y46','Y47','Y47.1','Y48','Y48.1','Y48.2','Y48.3','Y48.4','Y49',
            'Y50','Y51','Y52','Y53','Y54','Y55','Y55.1','Y56','Y57','Y57.5','Y57.9',
            'Y58','Y59','Y59.9','Y63','Y65.1','Y69','Y88.0','Z03.6']

# Single-letter names listed as "desired columns".
# NOTE(review): not referenced by any cell visible here; presumably the leading
# letters of the code groups to keep as features later — confirm or remove.
desired_columns = ['Y', 'T', 'R', 'K', 'D', 'I', 'L', 'G', 'X', 'Z']

In [2]:
# Show the loaded, normalised frame as this cell's output.
df

Unnamed: 0,person_id,age,gender,episode_id,drg_number,drg_version,item_number,create_datetime
0,0,104,1,168951,L63B,4.2,B96,2004-07-02
1,0,104,1,168951,L63B,4.2,n39,2004-07-02
2,0,104,1,168951,L63B,4.2,Z86,2004-07-02
3,0,104,1,195656,f62b,4.2,I25.5,2005-07-20
4,0,104,1,195656,f62b,4.2,I95.9,2005-07-20
...,...,...,...,...,...,...,...,...
123994,1659,45,0,846115,K04A,6.2,92514-39,2018-03-29
123995,1659,45,0,846115,K04A,6.2,E66.93,2018-03-29
123996,1659,45,0,846115,K04A,6.2,K52.1,2018-03-29
123997,1659,45,0,846115,K04A,6.2,U73.8,2018-03-29


In [3]:
def generate_dataset(df, ADR_list, n_days):
    """Build the positive (label 1) samples: items recorded before each ADR.

    For every person and every distinct ADR code that person has, the latest
    date on which that code was recorded is taken as the ADR date. All of the
    person's items dated within ``n_days`` before that date (both ends
    inclusive) are collected, except the ADR rows on the ADR date itself, and
    ordered by date and then by item code.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``person_id``, ``create_datetime`` (datetimes, or strings in
        ``%d/%m/%y`` format) and ``item_number`` (strings). It is not modified.
    ADR_list : list of str
        Item codes that count as an ADR; matching is exact.
    n_days : int
        Length of the look-back window in days.

    Returns
    -------
    pandas.DataFrame
        Columns ``person_id``, ``item_list`` (list of item codes) and
        ``label`` (always 1); one row per (person, ADR code) pair that has at
        least one item in its window. Every row keeps index 0, as before, so
        callers should reset the index after combining. The frame is empty,
        with the same three columns, when nothing qualifies.
    """
    # Normalise into a private frame so the caller's DataFrame is left untouched.
    events = df[['person_id', 'create_datetime', 'item_number']].assign(
        create_datetime=pd.to_datetime(df['create_datetime'], format='%d/%m/%y'),
        item_number=df['item_number'].str.strip(),
    )
    lookback = pd.DateOffset(days=n_days)

    samples = []
    # sort=False keeps people in order of first appearance, like Series.unique().
    for _, person_df in events.groupby('person_id', sort=False):
        items = person_df['item_number']
        dates = person_df['create_datetime']

        for adr in items[items.isin(ADR_list)].unique():
            is_adr = items == adr
            adr_date = dates[is_adr].max()

            in_window = (dates <= adr_date) & (dates >= adr_date - lookback)
            # Drop the ADR event itself; other items dated the same day are kept.
            is_adr_event = is_adr & (dates == adr_date)
            prior_period_sorted = person_df[in_window & ~is_adr_event].sort_values(
                ['create_datetime', 'item_number']
            )

            # Group by 'person_id' and collect item numbers
            prior_grouped = (
                prior_period_sorted.groupby('person_id')['item_number']
                .apply(list)
                .reset_index(name='item_list')
            )
            prior_grouped['label'] = 1

            # An empty window contributes no rows, so it is not collected.
            if not prior_grouped.empty:
                samples.append(prior_grouped)

    if not samples:
        return pd.DataFrame(columns=['person_id', 'item_list', 'label'])
    # One concat at the end instead of re-copying the result on every iteration.
    return pd.concat(samples)

In [4]:
# Positive samples, using a 100-day look-back window before each ADR.
generated_data = generate_dataset(df, ADR_list, n_days=100)

In [5]:
# Show the positive (label 1) samples as this cell's output.
generated_data

Unnamed: 0,person_id,item_list,label
0,0,[K31.8],1
0,1,"[1732, 1831, 3123002, 3125501, 4019, 4019, 401...",1
0,2,"[E87, I08, I10, I27, I48, I50, I65, I95, Y92, ...",1
0,3,"[13706-02, 47009-00, 49318-00, 92514-39, 92515...",1
0,4,"[I48, I50, I70, M80, R42, Z92, Z95, Z96, F40, ...",1
...,...,...,...
0,1655,"[T78.0, Y92.9]",1
0,1656,"[13706-03, 13706-05, 96199-02, 96199-19, 96202...",1
0,1657,"[E87.5, E87.7, I10, M32.1, N08.5, R11, R53, Y4...",1
0,1658,"[47459-01, 50106-00, 92511-99, 92514-99, 95550...",1


In [18]:
import numpy as np

def generate_negative_label(df, ADR_list, n_days, random_state=None):
    """Draw one random row and build a negative (label 0) sample from it.

    A single row is sampled from ``df``. If its item code is in ``ADR_list``
    the draw is rejected and ``None`` is returned. Otherwise the same person's
    items dated strictly before the sampled row's date, and no more than
    ``n_days`` before it, are collected in date-then-item order.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``person_id``, ``create_datetime`` (datetimes, or strings in
        ``%d/%m/%y`` format) and ``item_number`` (strings). It is not modified.
    ADR_list : list of str
        Item codes that count as an ADR; matching is exact.
    n_days : int
        Length of the look-back window in days.
    random_state : int, numpy Generator/RandomState or None, optional
        Passed to ``DataFrame.sample``. ``None`` (default) uses NumPy's global
        random state, as before.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` when the sampled row is an ADR. Otherwise a frame with
        columns ``person_id``, ``item_list`` and ``label`` (0); it has one row,
        or no rows when the person has nothing inside the window.

    Notes
    -----
    NOTE(review): the window excludes the sampled date (``<``) while
    ``generate_dataset`` includes the ADR date (``<=``), and the window itself
    is not checked for ADR codes. Confirm both are intended.
    """
    # Normalise into a private frame so the caller's DataFrame is left untouched.
    events = df[['person_id', 'create_datetime', 'item_number']].assign(
        create_datetime=pd.to_datetime(df['create_datetime'], format='%d/%m/%y'),
        item_number=df['item_number'].str.strip(),
    )

    # Randomly select a row
    random_row = events.sample(1, random_state=random_state)
    chosen_item = random_row['item_number'].iloc[0]

    # Reject the draw when the sampled event is itself an ADR.
    if chosen_item in ADR_list:
        return None

    person_id = random_row['person_id'].iloc[0]
    chosen_date = random_row['create_datetime'].iloc[0]
    window_start = chosen_date - pd.DateOffset(days=n_days)

    # The strict '<' already excludes every row dated on chosen_date, so no
    # separate exclusion of the sampled row is needed.
    in_window = (
        (events['person_id'] == person_id)
        & (events['create_datetime'] < chosen_date)
        & (events['create_datetime'] >= window_start)
    )

    # Order by 'create_datetime' and then 'item_number'
    prior_period_sorted = events[in_window].sort_values(['create_datetime', 'item_number'])

    # Group by 'person_id' and collect item numbers
    prior_grouped = (
        prior_period_sorted.groupby('person_id')['item_number']
        .apply(list)
        .reset_index(name='item_list')
    )
    prior_grouped['label'] = 0  # label these as 0, since they are not associated with an ADR

    return prior_grouped


def run_negative_label_generation(df, ADR_list, n_days, n, seed=None):
    """Collect negative (label 0) samples from ``n`` accepted random draws.

    ``generate_negative_label`` is called repeatedly. A draw that returns
    ``None`` (the sampled row was an ADR) is retried; any other result counts
    as one of the ``n`` accepted draws. An accepted draw whose look-back window
    is empty contributes no rows, so the result can hold fewer than ``n`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table passed through to ``generate_negative_label``.
    ADR_list : list of str
        Item codes that count as an ADR.
    n_days : int
        Length of the look-back window in days.
    n : int
        Number of accepted draws.
    seed : int or None, optional
        When given, seeds NumPy's global random state before sampling so the
        run can be repeated. ``None`` (default) leaves it untouched.

    Returns
    -------
    pandas.DataFrame
        Columns ``person_id``, ``item_list`` and ``label``, indexed 0..N-1.

    Raises
    ------
    ValueError
        If ``n > 0`` and ``df`` has no row whose item code is outside
        ``ADR_list``; no draw could ever be accepted, so the loop would not end.
    """
    columns = ['person_id', 'item_list', 'label']
    if n <= 0:
        return pd.DataFrame(columns=columns)

    # Without at least one non-ADR row every draw is rejected forever.
    if df['item_number'].str.strip().isin(ADR_list).all():
        raise ValueError('df contains no non-ADR rows to sample negative labels from')

    if seed is not None:
        # DataFrame.sample falls back to NumPy's global state when no
        # random_state is passed, so seeding it here fixes the draws.
        np.random.seed(seed)

    collected = []
    accepted = 0
    while accepted < n:
        result = generate_negative_label(df, ADR_list, n_days)
        if result is None:
            continue
        accepted += 1
        # Empty windows count as a draw but add no rows.
        if not result.empty:
            collected.append(result)

    if not collected:
        return pd.DataFrame(columns=columns)
    # One concat at the end instead of re-copying the result on every iteration.
    return pd.concat(collected, ignore_index=True)


In [31]:
# Negative samples: 4000 accepted random draws, each with a 100-day look-back window.
results = run_negative_label_generation(df, ADR_list, n_days=100, n=4000)

In [32]:
# Stack the label-1 samples on top of the label-0 samples and renumber the
# rows 0..N-1 in the same step.
appended_data = pd.concat([generated_data, results], ignore_index=True)

# Write the combined dataset to CSV, leaving the row index out of the file.
appended_data.to_csv('randomly_generated_data.csv', index=False)

In [33]:
# Show the combined positive and negative samples as this cell's output.
appended_data

Unnamed: 0,person_id,item_list,label
0,0,[K31.8],1
1,1,"[1732, 1831, 3123002, 3125501, 4019, 4019, 401...",1
2,2,"[E87, I08, I10, I27, I48, I50, I65, I95, Y92, ...",1
3,3,"[13706-02, 47009-00, 49318-00, 92514-39, 92515...",1
4,4,"[I48, I50, I70, M80, R42, Z92, Z95, Z96, F40, ...",1
...,...,...,...
4854,91,"[96200-00, 96200-00, 96200-00, Z51.1, Z51.1, Z...",0
4855,1501,[Z45.81],0
4856,903,"[M45.02, M45.04, M45.08, M45.02]",0
4857,1276,"[13100-08, 13100-08, 13100-08, 13100-08, 13100...",0
