* Extract 3 columns from all transactions: Patient_ID, Drug_ID, Dispense_Week
* All Drug_IDs in drug_with_illness_lookup_CN.csv have corresponding ATC codes
* Remove transactions whose Drug_ID is not listed in drug_with_illness_lookup_CN.csv

Input:
* Merged/patient_1.txt ~ Merged/patient_50.txt (files 1~25 form the training set, files 26~50 the test set)
* drug_with_illness_lookup_CN.csv

Output:
* train_16.pkl, train_16_ground_truth.pkl
* test_16.pkl
* train_15.pkl, train_15_ground_truth.pkl
* test_15.pkl, test_15_ground_truth.pkl

In [1]:
import numpy as np
import pandas as pd

import pickle

def load_data(filename):
    """Read and return the single pickled object stored in `filename`.

    Unpickling can execute arbitrary code, so only point this at files that
    were produced by a trusted source (e.g. by `save_data` in this notebook).
    """
    with open(filename, 'rb') as source:
        payload = pickle.load(source)
    return payload
    
def save_data(obj, filename):
    """Pickle `obj` into `filename`, creating the file or truncating an existing one."""
    with open(filename, 'wb+') as sink:
        # Pickler(file).dump(obj) is the object-style spelling of pickle.dump(obj, file).
        pickle.Pickler(sink).dump(obj)
        
import os

# Root folder of the datathon data: it holds the `Merged/` input directory and
# receives the generated .pkl files. Set the MELB_DATATHON_ROOT environment
# variable to run the notebook on another machine; when it is unset (or empty)
# the original location is used.
root_path = os.environ.get('MELB_DATATHON_ROOT') or 'C:/Users/yuanl4/Documents/MelbDatathon2017/'
# Paths are built by plain string concatenation (e.g. root_path + 'Merged'),
# so the root must end with a path separator.
if not root_path.endswith(('/', '\\')):
    root_path += '/'

def check_directory(directory):
    """Assert that `directory` is an existing directory.

    Raises AssertionError, with the offending path in the message, otherwise.
    """
    found = os.path.isdir(directory)
    assert found, "Directory doesn't exist: %s" % directory

# Fail early if the folder with the per-file patient transactions is missing.
check_directory(directory=root_path + 'Merged')

In [2]:
def read_patients(i):
    """Load the three needed columns of `Merged/patient_<i>.txt` (tab-separated).

    Parameters
    ----------
    i : int
        Number of the patient file to read, located under `root_path`.

    Returns
    -------
    pandas.DataFrame
        Columns `PID`, `DID`, `DispWeek` (in that order), taken from the
        file's `Patient_ID`, `Drug_ID` and `Dispense_Week` columns;
        `DispWeek` is parsed as dates.

    Raises
    ------
    AssertionError
        If consecutive patient IDs differ by anything other than 0 or 1,
        i.e. the IDs are not sorted and gap-free.
    """
    source_columns = ['Patient_ID', 'Drug_ID', 'Dispense_Week']
    df = pd.read_csv(root_path + 'Merged/patient_%d.txt' % i, sep='\t',
                     parse_dates=['Dispense_Week'],
                     usecols=source_columns)
    # `usecols` only selects columns; the frame keeps the file's own column
    # order. Reorder by name so the positional rename below cannot mislabel
    # columns if the file stores them in a different order.
    df = df[source_columns]
    df.columns = ['PID', 'DID', 'DispWeek']

    # Patient IDs must be sorted and continuous: every step is 0 or 1.
    # A file holding a single patient (all steps 0) is valid as well.
    steps = np.ediff1d(df.PID.values)
    assert ((steps == 0) | (steps == 1)).all(), \
        'patient IDs in file %d are not sorted and continuous' % i
    return df

def convert_to_matrix(df):
    """Encode an integer/date DataFrame as a compact int32 matrix.

    Integer (`int64`) columns are copied as they are; date
    (`datetime64[ns]`) columns are stored as whole days elapsed since
    2000-01-01.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all `int64` or `datetime64[ns]`.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The int32 matrix with the same shape as `df`, and `df.index.values`.

    Raises
    ------
    TypeError
        If a column has any other dtype.
    OverflowError
        If a value does not fit into int32. A missing date (NaT) also ends
        up here instead of being silently stored as a wrapped value.
    """
    int_dtype = np.dtype('int64')
    date_dtype = np.dtype('<M8[ns]')
    year2000 = np.datetime64('2000-01-01')
    limits = np.iinfo(np.int32)

    matrix = np.zeros(df.shape, dtype=np.int32)
    for i, (col_dtype, col_name) in enumerate(zip(df.dtypes.values, df.columns)):
        if col_dtype == int_dtype:
            values = df[col_name].values
        elif col_dtype == date_dtype:
            # Days since 2000-01-01, as plain integers.
            values = (df[col_name].values - year2000).astype('timedelta64[D]').astype(np.int64)
        else:
            raise TypeError('column %r has unsupported dtype %s; expected int64 or datetime64[ns]'
                            % (col_name, col_dtype))

        # Assigning int64 data into the int32 matrix would wrap around
        # silently, so check the range before narrowing.
        if values.size and (values.min() < limits.min or values.max() > limits.max):
            raise OverflowError('column %r holds values outside the int32 range '
                                '(or a missing date)' % (col_name,))
        matrix[:, i] = values
    return matrix, df.index.values

def read_data(file_range):
    """Load several patient files into one int32 transaction matrix.

    Parameters
    ----------
    file_range : iterable of int
        Numbers of the `Merged/patient_<i>.txt` files to load, in order.

    Returns
    -------
    numpy.ndarray
        Rows of all files stacked in loading order, with the columns
        produced by `convert_to_matrix(read_patients(i))`. The first column
        (patient ID) is shifted so that the smallest ID becomes 0.

    Raises
    ------
    ValueError
        If `file_range` yields no file numbers.
    AssertionError
        If the patient IDs of the stacked rows are not sorted and
        continuous (consecutive IDs must differ by 0 or 1).
    """
    transactions = []
    for i in file_range:
        print(i, end=' ')  # progress indicator
        matrix, _ = convert_to_matrix(read_patients(i))
        transactions.append(matrix)
    print('finish loading')

    if not transactions:
        raise ValueError('file_range is empty: no patient files to load')

    transactions = np.vstack(transactions)
    # Vectorised check: building a Python set from tens of millions of
    # differences is slow, and requiring exactly {0, 1} would reject data
    # in which every row belongs to the same patient.
    steps = np.diff(transactions[:, 0])
    assert ((steps == 0) | (steps == 1)).all(), \
        'patient IDs are not sorted and continuous across the loaded files'

    transactions[:, 0] -= transactions[:, 0].min() # now patient ID starts at 0
    return transactions

In [3]:
# Lookup table of the known drugs, indexed by drug ID ('DID'); the file is
# tab-separated and UTF-16 encoded.
drug_lookup = pd.read_csv('drug_with_illness_lookup_CN.csv', sep='\t',
                          encoding='utf-16', index_col='DID')
# IDs of every drug present in the lookup table.
all_drugs = drug_lookup.index.values
# IDs of the drugs whose ChronicIllness entry is 'Diabetes'.
diabetes_drugs = all_drugs[drug_lookup['ChronicIllness'] == 'Diabetes']

In [4]:
%%time
# Patient files 1..25 make up the training set.
transactions = read_data(range(1, 26))

# read_data shifts patient IDs to start at 0 and checks that they are sorted
# and continuous, so the last row carries the largest ID.
num_patients = 1 + transactions[-1, 0]
print('patients in the training set:', num_patients)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 finish loading
patients in the training set: 279200
Wall time: 1min 29s


In [5]:
# Drop transactions whose drug ID is missing from the lookup table, printing
# the array shape before and after the filter.
print(transactions.shape, end=' -> ')
# np.isin replaces np.in1d, which is deprecated since NumPy 2.0; for 1-D
# input both return the same boolean mask.
transactions = transactions[np.isin(transactions[:, 1], all_drugs)]
print(transactions.shape)

(34332818, 3) -> (34204142, 3)


In [6]:
# Year boundaries expressed as whole days since 2000-01-01, the same encoding
# convert_to_matrix uses for the date column.
year2015 = (np.datetime64('2015-01-01') - np.datetime64('2000-01-01')).astype('timedelta64[D]').astype(int)
year2016 = (np.datetime64('2016-01-01') - np.datetime64('2000-01-01')).astype('timedelta64[D]').astype(int)

# columns: ['PID', 'DID', 'DispWeek']
# First cut: everything before 2016 versus 2016 onwards.
pre2016_mask = transactions[:, 2] < year2016
transactions_pre2016, transactions_2016 = transactions[pre2016_mask], transactions[~pre2016_mask]

# Second cut, inside the pre-2016 rows: before 2015 versus during 2015.
pre2015_mask = transactions_pre2016[:, 2] < year2015
transactions_pre2015, transactions_2015 = (transactions_pre2016[pre2015_mask],
                                           transactions_pre2016[~pre2015_mask])

# One shape per line: pre-2015, 2015, pre-2016, 2016.
print(transactions_pre2015.shape, transactions_2015.shape,
      transactions_pre2016.shape, transactions_2016.shape, sep='\n')

(21731890, 3)
(5890452, 3)
(27622342, 3)
(6581800, 3)


In [7]:
# Persist the training histories: rows before 2015 and rows before 2016.
save_data(obj=transactions_pre2015, filename=root_path + 'train_15.pkl')
save_data(obj=transactions_pre2016, filename=root_path + 'train_16.pkl')

In [8]:
def get_ground_truth(transactions, num_patients=None):
    """Flag the patients who have at least one diabetes-drug transaction.

    Parameters
    ----------
    transactions : numpy.ndarray
        Transaction matrix with the patient ID in column 0 (non-negative
        integers) and the drug ID in column 1.
    num_patients : int, optional
        Total number of patients. When given, the result has at least this
        many entries, so patients without any row in `transactions` are
        reported as False. Without it the result only reaches up to the
        largest patient ID present in `transactions`, which silently yields
        a shorter label vector when the last patients have no transactions.

    Returns
    -------
    numpy.ndarray of bool
        Entry `p` is True when patient `p` has a transaction whose drug ID
        is in the module-level `diabetes_drugs`.
    """
    # np.isin replaces np.in1d, which is deprecated since NumPy 2.0.
    is_diabetes_purchase = np.isin(transactions[:, 1], diabetes_drugs)
    min_entries = 0 if num_patients is None else int(num_patients)
    # Per-patient count of diabetes-drug transactions; any positive count -> True.
    counts = np.bincount(transactions[:, 0], weights=is_diabetes_purchase, minlength=min_entries)
    return counts > 0

# Pass the patient count so both label vectors cover every training patient,
# including those without any transaction in the target year.
ground_truth_15 = get_ground_truth(transactions_2015, num_patients)
ground_truth_16 = get_ground_truth(transactions_2016, num_patients)

In [9]:
# Persist the per-patient diabetes labels derived from the 2015 and 2016 rows.
save_data(obj=ground_truth_15, filename=root_path + 'train_15_ground_truth.pkl')
save_data(obj=ground_truth_16, filename=root_path + 'train_16_ground_truth.pkl')

In [10]:
%%time
# Patient files 26..50 make up the test set.
transactions = read_data(range(26, 51))

# read_data shifts patient IDs to start at 0 and checks that they are sorted
# and continuous, so the last row carries the largest ID.
num_patients = 1 + transactions[-1, 0]
print('patients in the test set:', num_patients)

26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 finish loading
patients in the test set: 279152
Wall time: 42.1 s


In [11]:
# Drop transactions whose drug ID is missing from the lookup table, printing
# the array shape before and after the filter.
print(transactions.shape, end=' -> ')
# np.isin replaces np.in1d, which is deprecated since NumPy 2.0; for 1-D
# input both return the same boolean mask.
transactions = transactions[np.isin(transactions[:, 1], all_drugs)]
print(transactions.shape)

(27692655, 3) -> (27600052, 3)


In [12]:
# columns: ['PID', 'DID', 'DispWeek']
# All loaded test rows are used as the "pre-2016" history; no 2016 split is
# made in this cell.
transactions_pre2016 = transactions

# Split the history into rows before 2015 and rows from 2015 onwards.
pre2015_mask = transactions_pre2016[:, 2] < year2015
transactions_pre2015, transactions_2015 = (transactions_pre2016[pre2015_mask],
                                           transactions_pre2016[~pre2015_mask])

# One shape per line: pre-2015, 2015, pre-2016.
print(transactions_pre2015.shape, transactions_2015.shape,
      transactions_pre2016.shape, sep='\n')

(21713283, 3)
(5886769, 3)
(27600052, 3)


In [13]:
# Persist the test histories: rows before 2015 and the full pre-2016 set.
save_data(obj=transactions_pre2015, filename=root_path + 'test_15.pkl')
save_data(obj=transactions_pre2016, filename=root_path + 'test_16.pkl')

In [14]:
ground_truth_15 = get_ground_truth(transactions_2015)
# The label vector only reaches up to the largest patient ID that occurs in the
# 2015 rows. Pad it with False so that it has one entry per test patient even
# when the highest-numbered patients have no 2015 transaction.
if ground_truth_15.shape[0] < num_patients:
    ground_truth_15 = np.concatenate(
        [ground_truth_15, np.zeros(int(num_patients) - ground_truth_15.shape[0], dtype=bool)])

In [15]:
# Persist the per-patient diabetes labels derived from the 2015 test rows.
save_data(obj=ground_truth_15, filename=root_path + 'test_15_ground_truth.pkl')