This file is part of the submission of the Chair for Computer Aided
Medical Procedures, Technische Universität München, Germany to the
Prostate Cancer DREAM Challenge 2015.

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

__Change the paths below to where the challenge's raw data is located!__ The data files must be gzipped.

In [None]:
# Directories holding the gzipped training and final-scoring tables.
base_dir = '../../prostate_cancer_challenge/data_training'
test_base_dir = '../../prostate_cancer_challenge/data_final_scoring'

# File-name templates; `{0}` is replaced by the name of the data split.
tables = [
    'CoreTable_{0}.csv.gz',
    'LesionMeasure_{0}.csv.gz',
    'PriorMed_{0}.csv.gz',
    'LabValue_{0}.csv.gz',
    'MedHistory_{0}.csv.gz',
    'VitalSign_{0}.csv.gz',
]
filenames = [template.format('training') for template in tables]
test_filenames = [template.format('validation') for template in tables]

__Change the path to where the `survival` Python package is located!__

In [None]:
import sys
sys.path.append("..")

In [None]:
import csv
import itertools
from os.path import join

import numpy
import pandas

from dream_utils import *
from survival.io import writearff
from survival.util import safe_concat

In [None]:
def read_csv(filename, extra_na_values=None):
    """Read one gzip-compressed CSV table into a DataFrame.

    Parameters
    ----------
    filename : str
        Path of the gzipped CSV file.
    extra_na_values : iterable of str, optional
        Additional strings to treat as missing values, on top of the
        empty string and ".".

    Returns
    -------
    pandas.DataFrame
        The parsed table with a default integer index.
    """
    # A fresh list is built on every call instead of using a mutable
    # default argument.
    na_values = ["", "."]
    if extra_na_values is not None:
        na_values.extend(extra_na_values)

    return pandas.read_csv(filename, quoting=csv.QUOTE_NONNUMERIC, index_col=None,
                           na_values=na_values, encoding='latin1', low_memory=False,
                           compression='gzip')

## Core Table

It includes the dependent variables for the 2 Subchallenges as well as summarized clinical covariates. These clinical covariates are created from sets of standardized longitudinal data tables (including but not limited to the 5 longitudinal data tables released for the Challenge), which cover information about demographics, co-existing disease conditions, prior treatment of the tumor and other co-existing conditions, important baseline lab results and vital signs, lesion measures, and early response to therapy.

In [None]:
# Maps the numeric ECOG_C codes to labels; codes 2, 3 and 4 share one label.
ecog_value_map = {0: 'Fully active', 1: 'Restricted activity'}
ecog_value_map.update(dict.fromkeys((2, 3, 4), 'No activity'))
# Finer-grained labels for codes 2-4 that are currently not used:
#                  2: 'No activity, capable of selfcare',
#                  3: 'limited selfcare',
#                  4: 'bed bound'}

In [None]:
def read_core_table(filename, drop_columns=None, log_transform_columns=None):
    """Load the core table and split it into features and outcome labels.

    Parameters
    ----------
    filename : str
        Path of the gzipped core table, passed to `read_csv`.
    drop_columns : list of str, optional
        Columns removed right after loading. Nothing is dropped if None.
    log_transform_columns : optional
        If None, `detect_and_correct_skewness` chooses the columns to
        log-transform. Otherwise `log_transform` is applied in place to
        exactly these columns.

    Returns
    -------
    core_full : pandas.DataFrame
        Feature table indexed by `RPT`.
    core_q1_labels : pandas.DataFrame or None
        Columns `DEATH` (categorical 0/1) and `LKADT_P`, or None if `DEATH`
        is entirely missing.
    core_q2_labels : pandas.DataFrame or None
        Columns `DISCONT` (categorical "0"/"1", missing stays missing),
        `ENDTRS_C` and `ENTRT_PC`, or None if `DISCONT` is entirely missing.
    log_columns
        The columns that were log-transformed.
    """
    core_full = read_csv(filename)
    core_full.rename(columns={'NA.': 'NA'}, inplace=True)
    # Replace the numeric ECOG_C codes by their labels.
    core_full.replace({'ECOG_C': ecog_value_map}, inplace=True)
    if drop_columns is not None:
        core_full.drop(drop_columns, axis=1, inplace=True)

    # Height category: derived from HEIGHTBL when HGTBLCAT is entirely
    # missing, otherwise the existing values become an ordered categorical.
    height_labels = ['>=140-160', '>=160-180', '>=180-200']
    if pandas.isnull(core_full.HGTBLCAT).all():
        core_full.HGTBLCAT = pandas.cut(core_full.HEIGHTBL, [0, 160, 180, float('inf')], labels=height_labels)
    else:
        raw_height = pandas.Categorical(core_full.HGTBLCAT, categories=height_labels, ordered=True)
        core_full.HGTBLCAT = raw_height

    # Weight category: bin edges 0, 50, 60, ..., 140, inf. The first and
    # last labels are fixed strings; the ones in between are generated.
    bins = [0,] + list(range(50, 141, 10)) + [(float('inf'))]
    weight_labels = [">=40-50"]
    for i in range(1, len(bins) - 2):
        weight_labels.append(">=%d-%d" % (bins[i], bins[i + 1]))
    weight_labels.append(">=140-150")

    if pandas.isnull(core_full.WGTBLCAT).all():
        core_full.WGTBLCAT = pandas.cut(core_full.WEIGHTBL, bins, labels=weight_labels)
    else:
        raw_weight = pandas.Categorical(core_full.WGTBLCAT, categories=weight_labels, ordered=True)
        core_full.WGTBLCAT = raw_weight

    # Age group: derived from AGEGRP (with ">=85" read as 85) when AGEGRP2
    # is entirely missing.
    age_labels = ["18-64", "65-74", ">=75"]
    if pandas.isnull(core_full.AGEGRP2).all():
        bins = [18, 65, 75, float('inf')]
        age_num = core_full.AGEGRP.replace({">=85": "85"}).astype(int)
        core_full.AGEGRP2 = pandas.cut(age_num, bins, labels=age_labels)
    else:
        raw_age = pandas.Categorical(core_full.AGEGRP2, categories=age_labels, ordered=True)
        core_full.AGEGRP2 = pandas.Series(raw_age)

    if "GLEAS_DX" in core_full.columns:
        # Ordered categorical with the labels "2" .. "10"; the code is the
        # score minus 2, and -1 marks a missing value.
        gleason_score_labels = numpy.arange(2, 11, dtype=int).astype(numpy.unicode)
        codes = core_full.GLEAS_DX.map(lambda x: int(x)-2 if pandas.notnull(x) else -1)
        raw_gleas = pandas.Categorical.from_codes(codes, categories=gleason_score_labels, ordered=True, name="GLEAS_DX")
        core_full.GLEAS_DX = pandas.Series(raw_gleas)

    if "ECOG_C" in core_full.columns:
        core_full.ECOG_C = pandas.Categorical(core_full.ECOG_C,
                                              categories=["Fully active", "Restricted activity", "No activity"],
                                              ordered=True)

    # "Visceral Metastases" is 'Y' if any of the four site columns is 'Y'.
    visceral_mask = (core_full.LUNGS == 'Y') | (core_full.LIVER == 'Y') | (core_full.ADRENAL == 'Y') | (core_full.PANCREAS.astype(object) == 'Y')
    visceral_metastases = pandas.Series('N', index=core_full.index, name="Visceral Metastases")
    visceral_metastases[visceral_mask] = 'Y'
    core_full["Visceral Metastases"] = visceral_metastases

    # Missing values in every column from NON_TARGET onwards become 'N'.
    cols = core_full.columns.to_series()
    start_idx = int(numpy.flatnonzero(cols == 'NON_TARGET'))
    core_full.iloc[:, start_idx:] = core_full.iloc[:, start_idx:].fillna('N')

    # Collapse the treatment id columns into one two-level "Treatment"
    # feature: a row with TRT2_ID or TRT3_ID set gets the first label,
    # everything else 'PLACEBO'.
    treatment = pandas.Series(index=core_full.index, name="Treatment", dtype=object)
    has_docetaxel = pandas.notnull(core_full.TRT2_ID)
    has_prednisone = pandas.notnull(core_full.TRT3_ID)
    core_full.drop(["TRT1_ID", "TRT2_ID", "TRT3_ID"], axis=1, inplace=True)

    treatment[(has_docetaxel) | (has_prednisone)] = "DOCETAXEL or PREDNISONE"
    treatment.fillna('PLACEBO', inplace=True)
    core_full["Treatment"] = pandas.Series(pandas.Categorical(treatment, ordered=False))

    core_full.set_index('RPT', inplace=True)

    # Split off the outcome columns so they are not used as features.
    if core_full['DEATH'].notnull().any():
        core_q1_labels = core_full.loc[:, ['DEATH', 'LKADT_P']]
        death = core_full.DEATH == 'YES'
        core_q1_labels['DEATH'] = pandas.Categorical(death.astype(int), ordered=False)
        core_full.drop(['DEATH', 'LKADT_P'], axis=1, inplace=True)
    else:
        core_q1_labels = None

    if core_full['DISCONT'].notnull().any():
        core_q2_labels = core_full.loc[:, ['DISCONT', 'ENDTRS_C', 'ENTRT_PC']]
        # Codes: 1 where DISCONT == 1, -1 (missing) where it is null, else 0.
        discont = numpy.zeros(core_q2_labels.shape[0], dtype=int)
        discont[(core_q2_labels["DISCONT"] == 1).values] = 1
        discont[core_q2_labels["DISCONT"].isnull().values] = -1
        core_q2_labels['DISCONT'] = pandas.Categorical.from_codes(discont, categories=["0", "1"], ordered=False)
        core_full.drop(['DISCONT', 'ENDTRS_C', 'ENTRT_PC'], axis=1, inplace=True)
    else:
        core_q2_labels = None

    if log_transform_columns is None:
        # NOTE(review): presumably transforms core_full in place, like the
        # explicit branch below -- confirm in dream_utils.
        log_columns, _ = detect_and_correct_skewness(core_full)
    else:
        log_transform(core_full, log_transform_columns, inplace=True)
        log_columns = log_transform_columns
    print("Log-transformed %d columns" % len(log_columns))
    
    return core_full, core_q1_labels, core_q2_labels, log_columns

In [None]:
# Columns removed from the core table right after loading.
other_cols = ['PER_REF', 'LKADT_REF', 'LKADT_PER', 'DOMAIN']
(core,
 train_q1_labels,
 train_q2_labels,
 core_log_transform) = read_core_table(join(base_dir, filenames[0]), drop_columns=other_cols)

# Drop every column that `get_useless_columns` flags.
core_useless_cols = get_useless_columns(core)
core.drop(core_useless_cols[core_useless_cols].index, axis=1, inplace=True)
core.shape

In [None]:
# Load the test core table, log-transforming the same columns as for training.
f = join(test_base_dir, test_filenames[0])
core_test, _, _, _ = read_core_table(f, drop_columns=other_cols,
                                     log_transform_columns=core_log_transform)
core_test.shape

## PriorMed

It includes all medications a patient took or has taken before first treatment of the trial.

In [None]:
def check_units_consistency(table, groupby_col, unit_col):
    """Check that every group of measurements uses a single unit.

    Parameters
    ----------
    table : pandas.DataFrame
        Table with at least the columns `groupby_col` and `unit_col`.
    groupby_col : str
        Column that identifies the kind of measurement.
    unit_col : str
        Column that holds the unit of each measurement.

    Returns
    -------
    bool
        True if no group has more than one distinct unit.

    Raises
    ------
    ValueError
        If at least one group has two or more distinct non-missing units.
        The offending groups are printed first.
    """
    # Select the unit column before aggregating, so the result does not
    # depend on whether `aggregate` passes a whole group frame or a single
    # column to a callback.
    n_units = table.groupby(groupby_col)[unit_col].nunique()
    inconsistent = n_units[n_units >= 2]
    if len(inconsistent) > 0:
        print(inconsistent)
        raise ValueError('Measurements have multiple units')
    return True

In [None]:
def get_routes(series):
    """Normalise administration-route labels.

    Known route descriptions are mapped to a short canonical name. In all
    remaining values, any parenthesised part is removed together with the
    space in front of it (and one optional space after it).

    Parameters
    ----------
    series : pandas.Series
        Raw route labels.

    Returns
    -------
    pandas.Series
        New series with the normalised labels.
    """
    canonical = {
        'INHALATION, NASAL, BOTH (IHNB)': 'INHALATION',
        'INHALATION, ORAL (IHO)': 'INHALATION',
        'INHALATION, UNSPECIFIED (IH)': 'INHALATION',
        'INTRAMUSCULAR (IM)': 'INTRAMUSCULAR',
        'INTRAVENOUS (IV)': 'INTRAVENOUS',
        'INTRAVENOUS (NOT OTHERWISE SPECIFIED)': 'INTRAVENOUS',
        'ORAL (PO)': 'ORAL',
        'PER RECTAL (PR)': 'RECTAL',
        'SUBCUTANEOUS (SC)': 'SUBCUTANEOUS',
    }
    # Raw string: `\(` is not a valid escape in a normal string literal.
    return series.replace(canonical).replace(r" \(.+\) ?", regex=True, value="")

In [None]:
def fix_chemical_class(table, inplace=False):
    """Merge synonymous chemical class names into one canonical name.

    Every value of `table` that exactly equals one of the listed synonyms
    is replaced by the canonical name of its group; all other values are
    left untouched.

    Parameters
    ----------
    table : pandas.DataFrame
        Table whose values should be harmonised.
    inplace : bool, optional
        Passed on to `table.replace`.

    Returns
    -------
    Whatever `table.replace` returns for the given `inplace` value.
    """
    # canonical name -> synonyms. Each canonical name must appear only once:
    # a repeated key in a dict literal silently discards the earlier list.
    d = {'ACE INHIBITORS': ['ACE INHIBITORS AND CALCIUM CHANNEL BLOCKERS',
                            'ACE INHIBITORS AND DIURETICS',
                            'ACE INHIBITORS, PLAIN',
                            'ACE INHIBITORS, COMBINATIONS'],
         'ANALGESICS': ['OTHER ANALGESICS AND ANTIPYRETICS'],
         'ANTIINFLAMMATORY PREPARATIONS': ['ANTIINFL. PREP., NON-STEROIDS FOR TOPICAL USE',
                                           'ANTIINFLAMMATORY PREPARATIONS, NON-STEROIDS FOR TO',
                                           'ANTIINFLAM PREPS, NON-STEROIDS FOR TOPICAL USE'],
         'ANTIINFLAMMATORY AGENTS, NON-STEROIDS': ['ANTIINFLAMM/ANTIRHEUMATIC PRODUCTS, NON-STEROID',
                                                   'OTHER ANTIINFL./ANTIRHEUMATIC AGENTS, NON-STEROIDS',
                                                   'OTHER ANTIINFLAMMATORY AND ANTIRHEUMATIC AGENTS, N',
                                                   'ANTIINFLAMMATORY AND ANTIRHEUMATIC PRODUCTS, NON-S',
                                                   'OTHER ANTIINFLAM/ANTIRHEUMATIC AGENTS,NON-STEROIDS'],
         'ANTIINFLAMMATORY PRODUCTS FOR VAGINAL ADMINISTRAT.': ['ANTIINFLAMMATORY PRODUCTS FOR VAGINAL ADMINISTRATI'],
         'ANGIOTENSIN II ANTAGONISTS': ['ANGIOTENSIN II ANTAGONISTS AND DIURETICS',
                                        'ANGIOTENSIN II ANTAGONISTS, PLAIN',
                                        'ANGIOTENSIN II ANTAGONISTS, COMBINATIONS',
                                        'ANGIOTENSIN II ANTAGONISTS AND CALCIUM CHANNEL BLO'],
         'ANTI-ANDROGENS': ['ANTIANDROGENS, PLAIN'],
         'ANTIHISTAMINES': ['ANTIHISTAMINES FOR TOPICAL USE',
                            'OTHER ANTIHISTAMINES FOR SYSTEMIC USE',
                            'ETHERS, CHEMICALLY CLOSE TO ANTIHISTAMINES'],
         'ASCORBIC ACID (VITAMIN C)': ['ASCORBIC ACID (VITAMIN C), COMBINATIONS',
                                       'ASCORBIC ACID (VITAMIN C), PLAIN'],
         'BARBITURATES': ['BARBITURATES AND DERIVATIVES',
                          'BARBITURATES, COMBINATIONS',
                          'BARBITURATES, PLAIN'],
         'BELLADONNA ALKALOIDS': ['BELLADONNA ALKALOIDS, SEMISYNTHETIC, QUATERNARY AM',
                                  'BELLADONNA ALKALOIDS, TERTIARY AMINES',
                                  'BELLADONNA ALKALOIDS,SEMISYNTH,QUATERN AMMON COMPS'],
         'BENZODIAZEPINE RELATED DRUGS': ['BENZODIAZEPINE DERIVATIVES'],
         'BETA BLOCKING AGENTS': ['BETA BLOCKING AGENTS, SELECTIVE',
                                  'BETA BLOCKING AGENTS, NON-SELECTIVE',
                                  'BETA BLOCKING AGENTS, SELECTIVE, AND OTHER ANTIHYP',
                                  'BETA BLOCKING AGENTS, SELECTIVE, AND THIAZIDES',
                                  'BETA BLOCKING AGENTS,SELECTIVE,AND OTHER DIURETICS',
                                  'BETA BLOCKING AGENTS, SELECTIVE, AND OTHER DIURETI',
                                  'B BLOCKING AGENT,SELECTIVE/OTHER ANTIHYPERTENSIVE',
                                  'BETA BLOCKING AGENTS AND OTHER DIURETICS'],
         'BIGUANIDES': ['BIGUANIDES AND AMIDINES'],
         'CALCIUM': ['CALCIUM COMPOUNDS',
                     'CALCIUM, COMBINATIONS WITH OTHER DRUGS'],
         'COMB AND COMPL. OF ALUMIN., CALC. AND MAGNES. COMP': ['COMBINATIONS AND COMPLEXES OF ALUMINIUM, CALCIUM A',
                                                                'COMB/COMPLEXES ALUMINIUM, CALCIUM, MAGNESIUM COMPS'],
         'COMB OF SULFONAMIDES/TRIMETHOPRIM INCL DERIVATIVES': ['COMBINATIONS OF SULFONAMIDES AND TRIMETHOPRIM, INC',
                                                                'COMB.SULFONAMIDES & TRIMETHOPRIM INCL. DERIVATIVES'],
         'CORTICOSTEROIDS': ['CORTICOSTEROIDS ACTING LOCALLY',
                             'CORTICOSTEROIDS FOR LOCAL ORAL TREATMENT',
                             'CORTICOSTEROIDS, COMBINATIONS FOR TREATMENT OF ACN',
                             'CORTICOSTEROIDS, MODERATELY POTENT (GROUP II)',
                             'CORTICOSTEROIDS, PLAIN',
                             'CORTICOSTEROIDS, POTENT (GROUP III)',
                             'CORTICOSTEROIDS, WEAK (GROUP I)',
                             'CORTICOSTEROIDS FOR SYSTEMIC USE, COMBINATIONS',
                             'CORTICOSTEROIDS, POTENT, OTHER COMBINATIONS',
                             'CORTICOSTEROIDS, VERY POTENT (GROUP IV)',
                             'CORTICOSTEROIDS, MODERAT. POTENT, COMB W/ANTIBIOT.'],
         'FOLIC ACID AND DERIVATIVES': ['FOLIC ACID ANALOGUES'],
         'GONADOTROPIN-RELEASING HORMONES': ['GONADOTROPIN RELEASING HORMONE ANALOGUES'],
         'GENERAL NUTRIENTS': ['OTHER NUTRIENTS', 'OTHER COMBINATIONS OF NUTRIENTS'],
         'HMG COA REDUCTASE INHIBITORS': ['HMG COA REDUCTASE INHIBITORS IN COMBINATION WITH O',
                                          'HMG COA REDUCTASE INHIBITORS, OTHER COMBINATIONS'],
         'HYPNOTICS AND SEDATIVES': ['HYPNOTICS & SEDATIVES COMB., EXCL BARBITURATES'],
         'IMIDAZOLE DERIVATIVES': ['IMIDAZOLE AND TRIAZOLE DERIVATIVES'],
         'INSULINS AND ANALOGUES': ['INSULINS AND ANALOGUES, LONG-ACTING',
                                    'INSULINS AND ANALOGUES FOR INJECTION, FAST-ACTING',
                                    'INSULINS AND ANALOGUES FOR INJECTION, INTERMEDIATE',
                                    'INSULINS AND ANALOGUES FOR INJECTION, LONG-ACTING',
                                    'INSULINS AND ANALOGUES FOR INHALATION',
                                    'INSULINS AND ANALOGUES, FAST-ACTING',
                                    'INSULINS AND ANALOGUES, INTERMEDIATE-ACTING',
                                    'INSULINS AND ANAL.,INTERM.-ACTING,COMB.W/FAST ACT.',
                                    'INSULINS AND ANALOGUES FOR INJ,INTERMEDIATE-ACTING',
                                    'INSULINS/ANALOGUES FOR INJ,INTERMED-ACT+FAST-ACTIN'],
         'IRON TRIVALENT': ['IRON TRIVALENT, ORAL PREPARATIONS',
                            'IRON TRIVALENT, PARENTERAL PREPARATIONS'],
         'LAXATIVES': ['CONTACT LAXATIVES', 'OSMOTICALLY ACTING LAXATIVES'],
         'MAGNESIUM': ['MAGNESIUM COMPOUNDS', 'MAGNESIUM COMPS'],
         'MULTIVITAMINS': ['MULTIVITAMINS WITH MINERALS',
                           'MULTIVITAMINS, OTHER COMBINATIONS',
                           'MULTIVITAMINS, PLAIN'],
         'OPIUM ALKALOIDS': ['NATURAL OPIUM ALKALOIDS',
                             'OPIUM ALKALOIDS AND DERIVATIVES',
                             'OPIUM DERIVATIVES AND EXPECTORANTS'],
         'QUINOLINE DERIVATIVES': ['QUININE AND DERIVATIVES'],
         'SULFONAMIDES': ['SULFONAMIDES, PLAIN',
                          'SULFONAMIDES, UREA DERIVATIVES'],
         'SEROTONIN ANTAGONISTS': ['SELECTIVE SEROTONIN (5HT1) AGONISTS',
                                   'SEROTONIN (5HT3) ANTAGONISTS'],
         'SYNT ANTICHOLIN,ESTERS WITH TERTIARY AMINO GROUP': ['SYNTH ANTICHOLINERGICS,ESTERS/TERTIARY AMINO GROUP'],
         'THIAZIDES': ['THIAZIDES AND POTASSIUM IN COMBINATION', 'THIAZIDES, PLAIN'],
         'ANESTHETICS': ['OPIOID ANESTHETICS',
                         'ANESTHETICS, GENERAL',
                         'ANESTHETICS FOR TOPICAL USE',
                         'ANESTHETICS, LOCAL'],
         'ADRENERGICS AND OTH.DRUGS FOR OBSTRUCT.AIRWAY DIS.': ['ADRENERGICS AND OTH.DRUGS FOR OBSTRUC.AIRWAY DISEA',
                                                                'ADRENERGICS AND OTHER DRUGS FOR OBSTRUCTIVE AIRWAY',
                                                                'ADRENERGICS/OTHER DRUGS FOR OBSTR AIRWAY DISEASES',
                                                                'ADRENERGIC AND DOPAMINERGIC AGENTS'],
         'AGENTS FOR TREATMENT OF HEMORRHOIDS': ['OTHER AGENTS FOR TREATMENT OF HEMORRHOIDS AND ANAL',
                                                 'TREATMENT OF HEMORRHOIDS+ANAL FISSURES,TOPICAL USE'],
         'AMINO ACIDS': ['AMINO ACIDS AND DERIVATIVES',
                         'AMINO ACIDS, INCL. COMBINATIONS WITH POLYPEPTIDES'],
         'ANTACIDS': ['ANTACIDS WITH ANTIFLATULENTS',
                      'ANTACIDS WITH SODIUM BICARBONATE',
                      'ANTACIDS, OTHER COMBINATIONS'],
         'ANTIARRHYTHMICS': ['ANTIARRHYTHMICS, CLASS IA',
                             'ANTIARRHYTHMICS, CLASS IC',
                             'ANTIARRHYTHMICS, CLASS III'],
         'ANTIDEPRESSANTS': ['ANTIDEPRESSANTS IN COMBINATION WITH PSYCHOLEPTICS',
                             'OTHER ANTIDEPRESSANTS'],
         'ANTIBIOTICS': ['OTHER ANTIBIOTICS FOR TOPICAL USE',
                         'COMBINATIONS OF PENICILLINS, INCL. BETA-LACTAMASE',
                         'BETA-LACTAMASE RESISTANT PENICILLINS',
                         'PENICILLINS WITH EXTENDED SPECTRUM',
                         'FIRST-GENERATION CEPHALOSPORINS',
                         'THIRD-GENERATION CEPHALOSPORINS',
                         'COMBS OF PENICILLINS INCL BETA-LACTAMASE INHIBITOR',
                         'COMB OF PENICILLINS, INCL. BETA-LACTAMASE INHIB.'],
         'ANTICHOLINERGICS': ['SYNTHETIC ANTICHOLINERGICS',
                              'SYNTHETIC ANTICHOLINERGIC AGENTS IN COMBINATION WI',
                              'SYNTHETIC ANTICHOLINERGICS, ESTERS WITH TERTIARY A',
                              'SYNTHETIC ANTICHOLINERGICS, QUATERNARY AMMONIUM CO'],
         'ANTIINFECTIVES': ['ANTIINFECT. AND ANTISEPT. FOR LOCAL ORAL TREATMENT',
                            'ANTIINFECTIVES AND ANTISEPTICS FOR LOCAL ORAL TREA',
                            'OTHER ANTIINFECTIVES',
                            'OTHER ANTIINFECTIVES AND ANTISEPTICS'],
         'ESTROGENS': ['NATURAL AND SEMISYNTHETIC ESTROGENS, PLAIN',
                       'SYNTHETIC ESTROGENS, PLAIN'],
         'HEPARINS': ['HEPARIN GROUP', 'HEPARINS OR HEPARINOIDS FOR TOPICAL USE'],
         'PHENOTHIAZINES': ['PHENOTHIAZINE DERIVATIVES',
                            'PHENOTHIAZINES WITH ALIPHATIC SIDE-CHAIN',
                            'PHENOTHIAZINES WITH PIPERAZINE STRUCTURE'],
         'LIPID MODIFYING AGENTS': ['OTHER LIPID MODIFYING AGENTS'],
         'NUCLEOSIDE REVERSE TRANSCRIPTASE INHIBITORS': ['NUCLEOSIDES AND NUCLEOTIDES EXCL REV.TRANSCR.INHIB',
                                                         'NUCLEOSIDES AND NUCLEOTIDES EXCL. REVERSE TRANSCRI'],
         'OTHER THERAPEUTIC PRODUCTS': ['ALL OTHER THERAPEUTIC PRODUCTS'],
         'OTHER DRUGS AFFECTING BONE STRUCTURE AND MINERALIZ': ['OTHER DRUG AFFECTING BONE STRUCTURE/MINERALIZATION'],
         'SYMPATHOMIMETICS': ['SYMPATHOMIMETICS IN GLAUCOMA THERAPY', 'SYMPATHOMIMETICS, PLAIN'],
         'VITAMINS': ['VITAMINS WITH MINERALS', 'VITAMINS, OTHER COMBINATIONS', 'COMBINATIONS OF VITAMINS'],
         'VITAMIN B-COMPLEX': ['VITAMIN B-COMPLEX WITH VITAMIN C',
                               'VITAMIN B-COMPLEX, PLAIN',
                               'VITAMIN B-COMPLEX, INCL. COMBINATIONS',
                               'VITAMIN B-COMPLEX WITH MINERALS',
                               'VITAMIN B-COMPLEX, OTHER COMBINATIONS'],
         'XANTHINES': ['XANTHINE DERIVATIVES'],
         'ZINC': ['ZINC BANDAGES', 'ZINC PRODUCTS'],
        }
    # Invert the grouping into a flat synonym -> canonical name mapping.
    replace = {item: key for key, items in d.items() for item in items}
    return table.replace(replace, inplace=inplace, regex=False)

In [None]:
def get_med(group):
    """Summarise one group of medication records.

    The duration of a record is `CMDURN`; where that is missing, the
    absolute difference between `CMENDT_PC` and `CMSTDT_PC` is used.

    Returns
    -------
    pandas.Series
        "size": number of records in the group,
        "duration": mean duration, or NaN if no duration is available.
    """
    recorded = group['CMDURN']
    computed = (group['CMENDT_PC'] - group['CMSTDT_PC']).abs()
    durations = recorded.fillna(computed)

    if durations.notnull().any():
        mean_duration = durations.mean()
    else:
        mean_duration = numpy.nan

    return pandas.Series({"size": group.shape[0],
                          "duration": mean_duration})

def safe_has_yes(x):
    """Return whether any entry of `x` equals 'YES'.

    A series that consists only of missing values yields False.
    """
    return False if x.isnull().all() else (x == 'YES').any()


def discretize_duration(table):
    """Convert selected duration columns into ordered categories.

    Only the columns named in `intervals` below are converted; all other
    columns are returned unchanged. A duration is floor-divided by the
    column's "interval" and binned into whole units, with everything at or
    above "max" units in one open-ended bin. Values that are not greater
    than zero (including missing values) get the category "NONE".

    Parameters
    ----------
    table : pandas.DataFrame
        Table that may contain some of the duration columns.

    Returns
    -------
    pandas.DataFrame
        A new table; `table` itself is not modified.
    """
    intervals = {"HORMONOTHERAPY duration": {"interval": 365, "max": 3},
                 "OPIUM ALKALOIDS duration": {"interval": 30.5, "max": 2},
                 "GONADOTROPIN-RELEASING HORMONES duration": {"interval": 365, "max": 3},
                 "GLUCOCORTICOIDS duration": {"interval": 30.5, "max": 2},
                 "BISPHOSPHONATES duration": {"interval": 365, "max": 2},
                 "ANTI-ANDROGENS duration": {"interval": 365, "max": 3}}

    def _discretize(x):
        # Bin edges 0, 1, ..., max, inf, counted in units of `val`.
        bins = numpy.concatenate((numpy.arange(intervals[x.name]["max"] + 1, dtype=float), [float("inf")]))
        val = intervals[x.name]["interval"]
        unit = "years" if val == 365 else "months"
        labels = ["<1 " + unit]
        for i in range(1, len(bins) - 2):
            labels.append("%d-%d %s" % (bins[i], bins[i + 1], unit))
        labels.append(">=%d %s" % (bins[-2], unit))

        # Code 0 is reserved for "NONE"; the binned codes are shifted by one.
        positive = (x > 0).values
        codes = numpy.zeros(x.shape, dtype=int)
        if positive.any():
            v = pandas.cut(x[positive].floordiv(val), bins, labels=labels, include_lowest=True, right=False)
            codes[positive] = v.cat.codes.values + 1

        return pandas.Series(pandas.Categorical.from_codes(codes, categories=["NONE"] + labels, ordered=True),
                             index=x.index, name=x.name)

    # Convert column by column instead of `table.apply(..., reduce=False)`:
    # the `reduce` keyword has been removed from `DataFrame.apply`.
    result = table.copy()
    for column in table.columns:
        if column in intervals:
            result[column] = _discretize(table[column])
    return result
         

def read_priormed_table(filename, chem_classes_included=None, routes_included=None):
    """Build per-patient prior-medication features from the PriorMed table.

    Parameters
    ----------
    filename : str
        Path of the gzipped PriorMed table, passed to `read_csv`.
    chem_classes_included : pandas.Series, optional
        Series whose index lists the chemical classes (`cmatc4`) to use.
        If None, the classes with more than 60 records are selected and
        uninformative feature columns are dropped at the end.
    routes_included : pandas.Series, optional
        Series whose index lists the routes (`CMROUTE`) to use. It is only
        selected from the data when `chem_classes_included` is None, so both
        arguments have to be given together.

    Returns
    -------
    pre_meds : pandas.DataFrame
        One row per patient (`RPT`).
    chem_classes_included : pandas.Series
        The chemical classes that were used.
    routes_included : pandas.Series
        The routes that were used.
    """
    med_dirty = read_csv(filename, extra_na_values=["..."])

    med = fix_chemical_class(med_dirty)
    med.CMSCAT.replace({"HORMONAL THERAPY": "HORMONOTHERAPY"}, inplace=True)

    # Without a given selection (training data), choose the classes and
    # routes by frequency and drop useless columns at the end.
    do_drop_useless = chem_classes_included is None
    if do_drop_useless:
        chem_classes = med.cmatc4.value_counts()
        chem_classes_included = chem_classes[chem_classes > 60]
        
        # NOTE(review): routes are counted on the names normalised by
        # get_routes, but routes_mask below compares the raw CMROUTE values
        # against this index -- confirm that this is intended.
        routes = get_routes(med.CMROUTE).value_counts()
        routes_included = routes[routes > 10]

#    if intents_included is None:
#        intents = med.CMINTENT.value_counts()
#        intents_included = intents[intents > 100]

    check_units_consistency(med, 'cmatc4', 'CMDURU')
        
    # Number of records and mean duration per patient and chemical class.
    chemical_mask = med.cmatc4.isin(chem_classes_included.index)
    assert chemical_mask.any()
    chemical_grouped = med[chemical_mask].groupby(['RPT', 'cmatc4'])

    s = chemical_grouped.apply(get_med).reset_index()

    n_meds = s.pivot(index='RPT', columns='cmatc4', values='size')
    n_meds.fillna(value=0, inplace=True)

    # Keep a duration column only if it has at least 80 distinct values.
    duration = s.pivot(index='RPT', columns='cmatc4', values='duration')
    duration_unique = duration.apply(lambda x: x.nunique())
    duration.drop((duration_unique[duration_unique < 80]).index, axis=1, inplace=True)
    duration.fillna(value=0, inplace=True)

    # True for a patient if any class name contains "DIURET".
    # NOTE(review): this reads med_dirty rather than med, presumably so the
    # class names are matched before fix_chemical_class merges them --
    # confirm.
    diuretics = med_dirty.loc[:, ["RPT", "cmatc4"]].groupby('RPT').agg(
        lambda x: x.str.contains("DIURET").any())
    diuretics.rename(columns={"cmatc4": "DIURETICS"}, inplace=True)

    # One boolean column per included route.
    routes_mask = med.CMROUTE.isin(routes_included.index)
    assert routes_mask.any()
    routes_grouped = med[routes_mask].groupby(['RPT', 'CMROUTE']).size()
    routes_grouped.name = 'size'
    s = routes_grouped.reset_index().pivot(index='RPT', columns='CMROUTE', values='size')
    has_route = (s > 0)

#    intent_mask = med.CMINTENT.isin(intents_included.index)
#    assert intent_mask.any()
#    intent_grouped = med[intent_mask].groupby(['RPT', 'CMINTENT'])

#    s = intent_grouped.apply(get_med).reset_index()
#    n_intents = s.pivot(index='RPT', columns='CMINTENT', values='size')
#    n_intents.fillna(value=0, inplace=True)

#    intent_duration = s.pivot(index='RPT', columns='CMINTENT', values='duration')
#    intent_duration.fillna(value=0, inplace=True)

    # Hormonotherapy flag and mean duration per patient.
    hormone_grouped = med[med.CMSCAT == "HORMONOTHERAPY"].groupby(['RPT']).apply(get_med)
    horomontherapy = pandas.Series(False, index=hormone_grouped.index)
    horomontherapy[hormone_grouped["size"] > 0] = True
    hormone_grouped["HORMONOTHERAPY"] = horomontherapy
    hormone_grouped["duration"].fillna(0, inplace=True)
    hormone_grouped = hormone_grouped.drop("size", axis=1).rename(columns={"duration": "HORMONOTHERAPY duration"})

    if not do_drop_useless:
        # Add the requested classes and routes that do not occur in this
        # table as all-zero / all-False columns.
        d = chem_classes_included.index.difference(n_meds.columns)
        new_meds = pandas.DataFrame(0, index=n_meds.index, columns=d)
        n_meds = safe_concat((n_meds, new_meds), axis=1)

        new_duration = pandas.DataFrame(0, index=duration.index, columns=d)
        duration = safe_concat((duration, new_duration), axis=1)
        
        d = routes_included.index.difference(has_route.columns)
        new_routes = pandas.DataFrame(False, index=has_route.index, columns=d)
        has_route = safe_concat((has_route, new_routes), axis=1)

    pre_meds = safe_concat(((n_meds > 0),
                            diuretics,
                            duration.add_suffix(' duration'),
                            has_route,
                            hormone_grouped), axis=1)
    pre_meds = discretize_duration(pre_meds)

    adverse_event = med.loc[:, ['RPT', 'CM_AE']].groupby('RPT').aggregate(safe_has_yes)
    pre_meds['Treatment for Adverse Event'] = adverse_event.astype(bool)

    n_chem_classes = med.loc[:, ['RPT', 'cmatc4']].groupby('RPT').aggregate(lambda x: x.nunique())
    pre_meds['Total chemical classes'] = safe_anscombe(n_chem_classes)

#    total_intents = med.loc[:, ['RPT', 'CMINTENT']].groupby('RPT').aggregate(lambda x: x.nunique())
#    pre_meds['Total intents'] = total_intents

    def _fillna(x):
        # Fill value by dtype: 0 for numeric columns, the first category
        # for categorical columns, False for everything else.
        if pandas.core.common.is_numeric_dtype(x.dtype):
            return x.fillna(0)
        elif pandas.core.common.is_categorical_dtype(x.dtype):
            return x.fillna(x.cat.categories[0])
        else:
            return x.fillna(False)

    pre_meds = pre_meds.apply(_fillna, axis=0)

    if do_drop_useless:
        drop_useless(pre_meds, suffix=["duration"], inplace=True)
        # Report only the classes and routes whose column is still present.
        mask = chem_classes_included.index.isin(pre_meds.columns)
        chem_classes_included = chem_classes_included[mask]

        mask = routes_included.index.isin(pre_meds.columns)
        routes_included = routes_included[mask]

    return pre_meds, chem_classes_included, routes_included

In [None]:
def write_csv(counts, fp):
    """Write `counts` to `fp` as tab-separated lines of quoted label and count.

    A line containing only a tab is written before each run of labels that
    starts with a different first character than the previous label.
    Missing counts are written as 0.
    """
    current_initial = ""
    for label in counts.index:
        count = counts.loc[label]
        initial = label[0]
        if initial != current_initial:
            current_initial = initial
            fp.write("\t\n")
        fp.write('"%s"\t%d\n' % (label, 0 if pandas.isnull(count) else count))

def write_premed_chemical_classes():
    """Count the harmonised chemical classes and write them to a file.

    The class counts of the test table and of the three studies in the
    training table are summed, written to "premed-all.csv" via `write_csv`
    and returned.
    """
    train = read_csv(join(base_dir, filenames[2]), extra_na_values=["..."])
    test = read_csv(join(test_base_dir, test_filenames[2]), extra_na_values=["..."])
    for table in (train, test):
        fix_chemical_class(table, inplace=True)

    # Start with the counts of the test table, then append one column of
    # counts per training study.
    totals = test.loc[:, "cmatc4"].value_counts().sort_index()
    for study in ("ASCENT2", "CELGENE", "EFC6546"):
        in_study = train.STUDYID == study
        study_rows = train.loc[in_study, ["cmatc4", "CMSCAT", "STUDYID"]]
        study_counts = study_rows.cmatc4.value_counts().sort_index()
        totals = pandas.concat((totals, study_counts), axis=1)

    totals = totals.sum(axis=1)
    with open("premed-all.csv", "w") as fp:
        write_csv(totals, fp)

    return totals

#chem_list = write_premed_chemical_classes()
#chem_list.sort(ascending=False)
#chem_list.head(50)

In [None]:
# Build the prior-medication features of the training data and keep the
# selected chemical classes and routes for the test data.
pre_meds, chem_classes_included, routes_included = read_priormed_table(
    join(base_dir, filenames[2]))
print(pre_meds.shape)
chem_classes_included

In [None]:
# Build the test features with the classes and routes selected on the
# training data, then select the training columns in the training order.
pre_meds_test, _, _ = read_priormed_table(
    join(test_base_dir, test_filenames[2]),
    chem_classes_included=chem_classes_included,
    routes_included=routes_included)
pre_meds_test = pre_meds_test.loc[:, pre_meds.columns]
pre_meds_test.shape

## LabValue

It includes all lab tests a patient took from screening up to 84 days after first treatment date.

Check that measurement unit is consistent among all patients.

In [None]:
def fix_testosterone_limits(lab):
    """Replace wrong TESTO reference ranges of two patients.

    For the two affected patients the standardized limits are overwritten
    in place with the original limits.

    See http://support.sagebase.org/sagebase/topics/unusually-high-reference-range-limits-in-labvalue-table#reply_15637022

    Returns the modified `lab` table.
    """
    affected_patients = ['VEN-951001401', 'VEN-951001601']
    rows = (lab.RPT.isin(affected_patients)) & (lab.LBTESTCD == 'TESTO')
    original_limits = lab.loc[rows, ['LBORNRLO', 'LBORNRHI']]
    lab.loc[rows, ['LBSTNRLO', 'LBSTNRHI']] = original_limits.values
    return lab

In [None]:
def fix_total_bilirubin_limits(lab):
    """Replace wrong upper reference limits of total bilirubin (TBILI).

    A standardized upper limit above 185 is overwritten in place with the
    original upper limit.

    Values have already been converted from mg/dl to UMOL/L (1 mg/dl = 17.1 UMOL/L)

    See http://support.sagebase.org/sagebase/topics/unusually-high-reference-range-limits-in-labvalue-table#reply_15660569

    Returns the modified `lab` table.
    """
    is_bilirubin = lab.LBTESTCD == 'TBILI'
    limit_too_high = lab.LBSTNRHI > 185
    rows = is_bilirubin & limit_too_high

    original_upper = lab.loc[rows, 'LBORNRHI']
    lab.loc[rows, 'LBSTNRHI'] = original_upper.values
    return lab

In [None]:
def get_range(x, mean_value):
    """Classify `mean_value` against the reference range of a measurement group.

    Parameters
    ----------
    x : pandas.DataFrame
        Measurements with the columns `LBSTNRLO` and `LBSTNRHI`. Only the
        limits of the first row are used.
    mean_value : float
        Value to classify.

    Returns
    -------
    str or None
        'lowered', 'normal' or 'elevated'; None if both limits are missing.
    """
    # we assume that for each patient and marker, the reference ranges
    # are the same for all measurements
    limits = x.loc[:, ['LBSTNRLO', 'LBSTNRHI']].iloc[0, :]
    # Access the limits by label: integer keys on a label-indexed Series
    # are a deprecated positional fallback.
    lower = limits['LBSTNRLO']
    upper = limits['LBSTNRHI']
    has_lower = pandas.notnull(lower)
    has_upper = pandas.notnull(upper)

    if has_lower and has_upper:
        if mean_value < lower:
            return 'lowered'
        if mean_value > upper:
            return 'elevated'
        return 'normal'

    if has_lower:
        # only lower limit
        # NOTE(review): a value above the lower limit is reported as
        # 'elevated' and never as 'lowered'; kept as in the original --
        # confirm that this is the intended meaning.
        return 'normal' if mean_value <= lower else 'elevated'

    if has_upper:
        # only upper limit
        return 'normal' if mean_value <= upper else 'elevated'

    return None

def safe_mean(x):
    """Return the mean result of a measurement group and its range class.

    The character results (`LBSTRESC`) are converted to float if possible,
    if necessary after stripping leading '<' characters. If both attempts
    fail, the numeric results (`LBSTRESN`) are used instead.

    Returns
    -------
    pandas.Series
        "value": the mean, "range": the result of `get_range` for it.
    """
    text_results = x['LBSTRESC']
    try:
        values = text_results.astype(float)
    except ValueError:
        try:
            # results such as "<5" are read as 5
            values = text_results.str.lstrip('<').astype(float)
        except ValueError:
            # fall back to the numeric result column
            values = x['LBSTRESN']

    mean_value = values.mean()
    return pandas.Series({'value': mean_value, 'range': get_range(x, mean_value)})


def impute_testo_range(lab_baseline):
    """Fill missing TESTO reference limits with the most common limits per study.

    For each study, the most frequent `LBSTNRLO` and `LBSTNRHI` among its
    TESTO rows are determined; studies where either one cannot be
    determined are skipped. A new table is returned in which missing limits
    of TESTO rows are filled with the values of their study.
    """
    limit_columns = ['LBSTNRLO', 'LBSTNRHI']

    def get_most_common(x):
        counts = x.value_counts()
        return counts.index[0] if len(counts) != 0 else numpy.nan

    is_testo = lab_baseline.LBTESTCD == 'TESTO'
    testo_limits = lab_baseline.loc[is_testo, ['STUDYID'] + limit_columns]
    common = testo_limits.groupby('STUDYID').aggregate(get_most_common)
    common.dropna(inplace=True)

    # Fill values are set only for the TESTO rows of each study; all other
    # cells stay missing and therefore do not fill anything.
    tofill = pandas.DataFrame(index=lab_baseline.index, columns=limit_columns)
    for study, limits in common.iterrows():
        tofill.loc[is_testo & (lab_baseline.STUDYID == study), :] = limits.values

    return lab_baseline.fillna(tofill)


def aggregate_categorical(x, categories):
    """Return the highest-ranked ``LBSTRESC`` level of one group.

    ``x.name[1]`` (the second element of the group key) selects the ordered
    list of levels from ``categories``. The result is a Series with the
    single entry ``value``.
    """
    test_code = x.name[1]
    observed = pandas.Categorical(x["LBSTRESC"].values,
                                  categories=categories[test_code],
                                  ordered=True)
    highest = observed.max()
    return pandas.Series({"value": highest})


def to_ordered_categorical(x, categories):
    """Convert Series ``x`` to an ordered categorical Series.

    ``categories`` is either the ordered list of levels, or a dict mapping
    ``x.name`` to such a list. Only levels that actually occur in ``x`` are
    kept, in the order given. Name and index of ``x`` are preserved.
    """
    levels = categories[x.name] if isinstance(categories, dict) else categories
    levels = pandas.Index(levels)
    observed = pandas.Index(x.unique())
    present = levels[levels.isin(observed)]
    converted = pandas.Categorical(x.values, categories=present, ordered=True)
    return pandas.Series(converted, index=x.index, name=x.name)


def read_labvalue_table(filename, lab_tests_included=None, log_transform_columns=None, date_quantiles=None):
    """Build per-patient baseline lab features from a LabValue table.

    Parameters
    ----------
    filename : str
        Path of the gzipped LabValue CSV, read with ``read_csv``.
    lab_tests_included : optional
        Object whose ``.index`` lists the ``LBTESTCD`` codes to use. If None,
        codes with more than 100 baseline records are selected and
        ``drop_useless`` is applied to the result.
    log_transform_columns : optional
        Columns to log-transform. If None, ``detect_and_correct_skewness``
        chooses them.
    date_quantiles : optional
        Passed to ``cut_quantiles`` for binning dates. If None, 4 is passed
        and the quantiles returned by ``cut_quantiles`` are handed back.

    Returns
    -------
    tuple
        ``(pre_lab, lab_tests_included, log_columns, date_quantiles)`` so the
        selections made on one data set can be passed to a later call.
    """
    lab = read_csv(filename)
    # keep only records flagged as baseline measurements
    lab_baseline_mask = lab.LBBLFL == 'Y'
    lab_baseline = lab[lab_baseline_mask]

    # Fix issues
    lab_baseline.loc[lab_baseline.LBTEST == 'SODIUM', 'LBTESTCD'] = 'SODIUM'
    fix_testosterone_limits(lab_baseline)
    fix_total_bilirubin_limits(lab_baseline)
    lab_baseline = impute_testo_range(lab_baseline)

    check_units_consistency(lab_baseline, 'LBTESTCD', 'LBSTRESU')

    # without a given selection: derive it here and prune columns at the end
    do_drop_useless = lab_tests_included is None
    if do_drop_useless:
        lab_tests = lab_baseline.LBTESTCD.value_counts()
        lab_tests_included = lab_tests[lab_tests > 100]

    # tests with ordinal string results and the order of their levels
    lab_tests_categories = {"BACT": ["1+", "2+", "3+", "4+"],
                            "EPISQCE": ["1+", "2+", "3+", "4+"],
                            "MUCTHR": ["1+", "2+", "3+", "4+"],
                            "OCCBLD": ["NEGATIVE", "TRACE", "SMALL", "MODERATE", "LARGE"],
                            "RBCQL": ["ABSENT", "+", "++", "+++"],
                            "WBCQL": ["ABSENT", "+", "++", "+++"],
                            "PH": ['5', '5.5', '6', '6.5', '7', '7.5', '8', '8.5']}
    tests_categorical = list(lab_tests_categories.keys())

    # included tests that are not in the categorical list: mean per patient and test
    mask = lab_baseline.LBTESTCD.isin(lab_tests_included.index.difference(tests_categorical))
    columns = ['RPT', 'LBTESTCD', 'LBSTRESC', 'LBSTRESN', 'LBSTNRLO', 'LBSTNRHI', 'LBDT_PC']
    lab_grouped = lab_baseline.loc[mask, columns].groupby(['RPT', 'LBTESTCD'])

    s = lab_grouped.apply(safe_mean).reset_index()
    pre_lab_values = s.pivot(index='RPT', columns='LBTESTCD', values='value')

    # categorical tests: highest level per patient and test
    # (not restricted to lab_tests_included)
    mask_cat = lab_baseline.LBTESTCD.isin(tests_categorical)
    cat_grouped = lab_baseline.loc[mask_cat, ['RPT', 'LBTESTCD', 'LBSTRESC']].groupby(['RPT', 'LBTESTCD'])
    s_cat = cat_grouped.apply(lambda x: aggregate_categorical(x, lab_tests_categories))
    pre_lab_cat = s_cat.reset_index().pivot(index='RPT', columns='LBTESTCD', values='value')
    pre_lab_cat = pre_lab_cat.apply(lambda x: to_ordered_categorical(x, lab_tests_categories))

    # range label per patient and test; "NONE" where no value is available
    pre_lab_ranges = s.pivot(index='RPT', columns='LBTESTCD', values='range')
    pre_lab_ranges[pandas.isnull(pre_lab_values)] = "NONE"
    pre_lab_ranges = pre_lab_ranges.apply(lambda x: to_ordered_categorical(x, ["NONE", "lowered", "normal", "elevated"]))

    # smallest LBDT_PC per patient and test, over all included tests
    mask_all = lab_baseline.LBTESTCD.isin(lab_tests_included.index)
    n_date = lab_baseline.loc[mask_all, ['RPT', 'LBTESTCD', 'LBDT_PC']].groupby(['RPT', 'LBTESTCD']).min()

    pre_lab_dates = n_date.reset_index().pivot(index='RPT', columns='LBTESTCD', values='LBDT_PC')
    # Split date into quartiles
    if date_quantiles is None:
        pre_lab_dates, date_quantiles = cut_quantiles(pre_lab_dates, 4)
    else:
        pre_lab_dates, _ = cut_quantiles(pre_lab_dates, date_quantiles)

    if log_transform_columns is None:
        log_columns, new_columns = detect_and_correct_skewness(pre_lab_values)
    else:
        # only transform requested columns that exist in this data set
        log_trans_cols = pre_lab_values.columns.intersection(log_transform_columns)
        _, new_columns = log_transform(pre_lab_values, log_trans_cols, inplace=True)
        log_columns = log_trans_cols

    print("Log-transformed %d columns" % len(log_columns))
    # apply the new_columns renaming to the range and date columns as well
    pre_lab_ranges.rename(columns=new_columns, inplace=True)
    pre_lab_dates.rename(columns=new_columns, inplace=True)

    pre_lab = safe_concat((pre_lab_values,
                           pre_lab_cat,
                           pre_lab_ranges.add_suffix(' range'),
                           pre_lab_dates.add_suffix(' date')), axis=1)

#    pre_lab['N lab tests'] = safe_anscombe(lab_baseline.groupby('RPT').size())

    if do_drop_useless:
        drop_useless(pre_lab, suffix=["range", "date"], inplace=True)

    return pre_lab, lab_tests_included, log_columns, date_quantiles

In [None]:
# Build the lab features from the training table and keep the selections it returns.
pre_lab, lab_tests_included, lab_tests_log_columns, lab_tests_date_quantiles = read_labvalue_table(join(base_dir, filenames[3]))
print(pre_lab.shape)
lab_tests_included

In [None]:
# Build the lab features for the test table, reusing the training selections.
pre_lab_test, _, _, _ = read_labvalue_table(join(test_base_dir, test_filenames[3]),
                                      lab_tests_included, lab_tests_log_columns, lab_tests_date_quantiles)
pre_lab_test.shape

## VitalSign

It includes all vital signs recorded for a patient from screening up to 84 days after the reference day.

Convert height to centimeters

In [None]:
def convert_height_to_cm(vitals_baseline):
    """Convert HEIGHT results with unit 'IN' to 'CM' (factor 2.54), in place."""
    is_height = vitals_baseline.VSTESTCD == 'HEIGHT'
    in_inches = vitals_baseline.VSSTRESU == 'IN'
    rows = is_height & in_inches

    inches = vitals_baseline.loc[rows, 'VSSTRESC'].astype(float)
    vitals_baseline.loc[rows, 'VSSTRESC'] = inches * 2.54
    vitals_baseline.loc[rows, 'VSSTRESU'] = 'CM'

Convert weight to kilogram

In [None]:
def convert_weight_to_kg(vitals_baseline):
    """Convert WEIGHT results whose unit is not 'KG' to 'KG', in place.

    Every WEIGHT row with a unit other than 'KG' is multiplied by
    0.45359237, i.e. it is treated as pounds.
    """
    is_weight = vitals_baseline.VSTESTCD == 'WEIGHT'
    not_in_kg = vitals_baseline.VSSTRESU != 'KG'
    rows = is_weight & not_in_kg

    pounds = vitals_baseline.loc[rows, 'VSSTRESC'].astype(float)
    vitals_baseline.loc[rows, 'VSSTRESC'] = pounds * 0.45359237
    vitals_baseline.loc[rows, 'VSSTRESU'] = 'KG'

Convert temperature to °C

In [None]:
def convert_temp_to_c(vitals_baseline):
    """Convert TEMP results with unit '°F' to '°C', in place."""
    is_temp = vitals_baseline.VSTESTCD == 'TEMP'
    in_fahrenheit = vitals_baseline.VSSTRESU == '°F'
    rows = is_temp & in_fahrenheit

    fahrenheit = vitals_baseline.loc[rows, 'VSSTRESC'].astype(float)
    vitals_baseline.loc[rows, 'VSSTRESC'] = (fahrenheit - 32) / 1.8
    vitals_baseline.loc[rows, 'VSSTRESU'] = '°C'

In [None]:
def get_vitals(group):
    """Average the vital signs of one group per test code.

    Returns a frame indexed by ``VSTESTCD`` with the mean ``VSSTRESC`` of
    each test, plus an extra row ``'Date Collected'`` holding the mean of
    ``VSDT_PC`` over the whole group.
    """
    numeric = pandas.DataFrame({'VSTESTCD': group['VSTESTCD'],
                                'VSSTRESC': group['VSSTRESC'].astype(float)})
    res = numeric.groupby('VSTESTCD').mean()
    res.loc['Date Collected', :] = group['VSDT_PC'].mean()
    return res


def read_vitalsign_table(filename, vitals_included=None, date_quantiles=None):
    """Build per-patient baseline vital-sign features from a VitalSign table.

    Parameters
    ----------
    filename : str
        Path of the gzipped VitalSign CSV, read with ``read_csv``.
    vitals_included : optional
        If None, it is set to the unique ``VSTESTCD`` codes of the baseline
        records and useless columns are dropped at the end. Within this
        function the value is otherwise only returned.
    date_quantiles : optional
        Passed to ``cut_quantiles`` for binning "Date Collected". If None,
        4 is passed and the quantiles returned by ``cut_quantiles`` are
        handed back.

    Returns
    -------
    tuple
        ``(pre_vitals, vitals_included, date_quantiles)``
    """
    vitals = read_csv(filename)
    # keep only records flagged as baseline measurements
    vitals_baseline_mask = vitals.VSBLFL == 'Y'
    vitals_baseline = vitals[vitals_baseline_mask]

    # harmonise units in place before checking consistency
    convert_height_to_cm(vitals_baseline)
    convert_weight_to_kg(vitals_baseline)
    convert_temp_to_c(vitals_baseline)

    check_units_consistency(vitals_baseline, 'VSTESTCD', 'VSSTRESU')

    do_drop_useless = vitals_included is None
    if do_drop_useless:
        vitals_included = vitals_baseline['VSTESTCD'].unique()

    # mean per patient and test, pivoted to one row per patient
    columns = ['RPT', 'VSTESTCD', 'VSSTRESC', 'VSDT_PC']
    vitals_grouped = vitals_baseline.loc[:, columns].groupby('RPT')
    s = vitals_grouped.apply(get_vitals)
    pre_vitals = s.reset_index().pivot(index='RPT', columns='VSTESTCD', values='VSSTRESC')

    if 'WEIGHT' in pre_vitals.columns and 'HEIGHT' in pre_vitals.columns:
        # HEIGHT is squared and the ratio scaled by 10000
        # (presumably kg and cm, giving kg/m^2 -- units are not checked here)
        pre_vitals['BMI'] = pre_vitals.WEIGHT / (pre_vitals.HEIGHT * pre_vitals.HEIGHT) * 10000

    if "ECOG" in pre_vitals.columns:
        # map values through ecog_value_map, then make the column ordered categorical
        pre_vitals.replace({'ECOG': ecog_value_map}, inplace=True)
        pre_vitals["ECOG"] = pandas.Categorical(pre_vitals["ECOG"],
                                                categories=["Fully active", "Restricted activity", "No activity"],
                                                ordered=True)

    # Split date into quartiles
    if date_quantiles is None:
        pre_vitals_date, date_quantiles = cut_quantiles(pre_vitals["Date Collected"], 4)
    else:
        pre_vitals_date, _ = cut_quantiles(pre_vitals["Date Collected"], date_quantiles)
    pre_vitals["Date Collected"] = pre_vitals_date

    if do_drop_useless:
        # drop the columns that get_useless_columns flags as True
        vitals_useless = get_useless_columns(pre_vitals)
        pre_vitals.drop(vitals_useless[vitals_useless].index, axis=1, inplace=True)

    return pre_vitals, vitals_included, date_quantiles

In [None]:
# Build the vital-sign features from the training table and keep the returned selections.
pre_vitals, vitals_included, vitals_date_quantiles = read_vitalsign_table(join(base_dir, filenames[5]))
print(pre_vitals.shape)
vitals_included

In [None]:
# Build the vital-sign features for the test table, reusing the training selections.
pre_vitals_test, _, _ = read_vitalsign_table(join(test_base_dir, test_filenames[5]),
                                             vitals_included, vitals_date_quantiles)
print(pre_vitals_test.shape)

## MedHistory

It includes all medical diagnoses patients provided at screening, which covers co-existing conditions patients have.

*Missing data*: ASCENT2 trial did not provide event level medical history table, it only provided summary level pre-specified co-morbidity variables, which have been included in CoreTable. Therefore, there are no event level data for ASCENT2 in MedicalHistory table

In [None]:
def read_medhistory_table(filename, diagnosis_included=None, date_quantiles=None):
    """Build per-patient medical-history features from a MedHistory table.

    Parameters
    ----------
    filename : str
        Path of the gzipped MedHistory CSV, read with ``read_csv``.
    diagnosis_included : optional
        Object whose ``.index`` lists the ``MHDECOD`` terms to use. If None,
        terms occurring more than 20 times plus "CANCER PAIN" and
        "PROSTATE CANCER" are selected, and ``drop_useless`` is applied.
        Otherwise, columns for listed terms absent from this table are added
        with default values so the column set matches the selection.
    date_quantiles : optional
        Passed to ``cut_quantiles`` for binning ``MHSTDT_P``. If None, 4 is
        passed and the quantiles returned by ``cut_quantiles`` are handed back.

    Returns
    -------
    tuple
        ``(pre_comorbidities, diagnosis_included, date_quantiles)``
    """
    comorb = read_csv(filename)

    # merge closely related terms into a single term
    replace = {"INGUINAL HERNIA REPAIR": "INGUINAL HERNIA",
               "CATARACT OPERATION": "CATARACT",
               "ABDOMINAL PAIN LOWER": "ABDOMINAL PAIN",
               "ABDOMINAL PAIN UPPER": "ABDOMINAL PAIN",
               "ALCOHOL ABUSE": "ALCOHOLISM",
               "ALCOHOL DETOXIFICATION": "ALCOHOLISM",
               "ALCOHOLIC": "ALCOHOLISM",
               "ANXIETY DISORDER": "ANXIETY",
               "ANAEMIA MACROCYTIC": "ANAEMIA",
               "ANGINA UNSTABLE": "ANGINA PECTORIS",
               "ARTERIOSCLEROSIS CORONARY ARTERY": "ARTERIOSCLEROSIS",
               "BLADDER CANCER STAGE 0, WITH CANCER IN SITU": "BLADDER CANCER",
               "BLADDER NECK OBSTRUCTION": "BLADDER OBSTRUCTION",
               "BLADDER NECK OPERATION": "BLADDER OPERATION",
               "BLADDER OUTLET OBSTRUCTION": "BLADDER OBSTRUCTION",
               "BRONCHITIS CHRONIC": "BRONCHITIS",
               "BUNDLE BRANCH BLOCK LEFT": "BUNDLE BRANCH BLOCK",
               "BUNDLE BRANCH BLOCK RIGHT": "BUNDLE BRANCH BLOCK",
               "CARDIAC FAILURE CHRONIC": "CARDIAC FAILURE",
               "CARDIAC FAILURE CONGESTIVE": "CARDIAC FAILURE",
               "CENTRAL VENOUS CATHETERISATION": "CATHETERISATION VENOUS",
               "CHOLECYSTITIS ACUTE": "CHOLECYSTITIS",
               "CHOLECYSTITIS CHRONIC": "CHOLECYSTITIS",
               "CIRCUMCISED": "CIRCUMCISION",
               "COLON CANCER STAGE I": "COLON CANCER",
               "COLOSTOMY CLOSURE": "COLOSTOMY",
               "CONJUNCTIVITIS ALLERGIC": "CONJUNCTIVITIS",
               "DIVERTICULUM INTESTINAL": "DIVERTICULUM",
               "DYSPNOEA EXERTIONAL": "DYSPNOEA",
               "GASTRITIS EROSIVE": "GASTRITIS",
               "MALIGNANT MELANOMA IN SITU": "MALIGNANT MELANOMA",
               "PERCUTANEOUS CORONARY INTERVENTION": "CORONARY ANGIOPLASTY",
               "PROSTATE CANCER METASTATIC": "PROSTATE CANCER",
               "SQUAMOUS CELL CARCINOMA OF SKIN": "SQUAMOUS CELL CARCINOMA",
               "TOBACCO ABUSE": "TOBACCO USER",
               "TYPE 2 DIABETES MELLITUS": "DIABETES MELLITUS",}

    # Assign the result instead of replacing in place on an attribute-accessed
    # column, which is not guaranteed to write through to the frame.
    comorb["MHDECOD"] = comorb["MHDECOD"].replace(replace)

    do_drop_useless = diagnosis_included is None
    if do_drop_useless:
        diagnosis = comorb.MHDECOD.value_counts()

        mandatory_diagnosis = pandas.Index(["CANCER PAIN",
                                            "PROSTATE CANCER"])
        diagnosis_included = diagnosis[(diagnosis > 20) | (diagnosis.index.isin(mandatory_diagnosis))]

    mask = comorb.MHDECOD.isin(diagnosis_included.index)
    comorb_grouped = comorb.loc[mask, ['RPT', 'MHDECOD', 'MHSTDT_P']].groupby(['RPT', 'MHDECOD'])

    # number of records per patient and diagnosis
    n_comorbidities = comorb_grouped.size()
    n_comorbidities.name = 'size'
    n_comorbidities.fillna(value=0, inplace=True)

    # smallest MHSTDT_P per patient and diagnosis
    n_date = comorb_grouped['MHSTDT_P'].min()
    n_date.name = 'date'

    s1 = n_comorbidities.reset_index().pivot(index='RPT', columns='MHDECOD', values='size')
    has_comorb = (s1 > 0)
    comorb_dates = n_date.reset_index().pivot(index='RPT', columns='MHDECOD', values='date')
    # Split date into quartiles
    if date_quantiles is None:
        comorb_dates_q, date_quantiles = cut_quantiles(comorb_dates, 4)
    else:
        comorb_dates_q, _ = cut_quantiles(comorb_dates, date_quantiles)

    # Only available for CELGENE, but we want exact date
    if "PROSTATE CANCER" in comorb_dates.columns:
        comorb_dates_q["PROSTATE CANCER"] = safe_anscombe(comorb_dates["PROSTATE CANCER"])

    if "CANCER PAIN" in comorb_dates.columns:
        comorb_dates_q["CANCER PAIN"] = safe_anscombe(comorb_dates["CANCER PAIN"])

    if not do_drop_useless:
        # add columns for requested diagnoses that do not occur in this table
        d = diagnosis_included.index.difference(has_comorb.columns)
        new_comorb = pandas.DataFrame(False, index=has_comorb.index, columns=d)
        has_comorb = safe_concat((has_comorb, new_comorb), axis=1)

        new_dates = pandas.DataFrame("NONE", index=has_comorb.index, columns=d)
        # these two date columns are numeric (see above), so default to 0
        if "CANCER PAIN" in d:
            new_dates["CANCER PAIN"] = pandas.Series(0, index=has_comorb.index, name="CANCER PAIN", dtype=float)
        if "PROSTATE CANCER" in d:
            new_dates["PROSTATE CANCER"] = pandas.Series(0, index=has_comorb.index, name="PROSTATE CANCER", dtype=float)

        comorb_dates_q = safe_concat((comorb_dates_q, new_dates), axis=1)

    pre_comorbidities = safe_concat((has_comorb,
                                     comorb_dates_q.add_suffix(' date')), axis=1)

    # "PROSTATE CANCER" is set to missing for every patient whose id does not
    # start with "CELG"
    is_celgene = pre_comorbidities.index.to_series().str.startswith("CELG")
    pre_comorbidities["PROSTATE CANCER"] = pre_comorbidities["PROSTATE CANCER"].astype(object)
    # `~` is the logical negation of a boolean mask; unary minus is not
    # supported for boolean arrays in current numpy
    pre_comorbidities.loc[~is_celgene, "PROSTATE CANCER"] = None
    pre_comorbidities['N comorbidities'] = safe_anscombe(comorb.groupby('RPT').size())

    if do_drop_useless:
        drop_useless(pre_comorbidities, suffix=["date"], inplace=True)
        # keep only diagnoses whose column survived
        mask = diagnosis_included.index.isin(pre_comorbidities.columns)
        diagnosis_included = diagnosis_included[mask]

    return pre_comorbidities, diagnosis_included, date_quantiles

In [None]:
# Build the medical-history features from the training table and keep the returned selections.
pre_comorbidities, diagnosis_included, diagnosis_date_included = read_medhistory_table(join(base_dir, filenames[4]))
print(pre_comorbidities.shape)
diagnosis_included

In [None]:
# Build the medical-history features for the test table, reusing the training selections.
pre_comorbidities_test, _, _ = read_medhistory_table(join(test_base_dir, test_filenames[4]),
                                                  diagnosis_included,
                                                  diagnosis_date_included)
# training and test must end up with exactly the same columns
assert len(pre_comorbidities.columns.sym_diff(pre_comorbidities_test.columns)) == 0
pre_comorbidities_test.shape

## LesionMeasure

LesionMeasure is a longitudinal data table that includes event level data.

*Missing Data*: StudyID="ASCENT2" did not provide event level lesion data, it only provides summary level lesion location variables, which have been included in CoreTable. Therefore, there are no event level data for ASCENT2 in this table.

In [None]:
def get_count(x):
    """Summarise the number of non-null entries of ``x`` as "0", "1" or "2+".

    If every entry is null (including an empty ``x``), ``x`` itself is
    returned unchanged.
    """
    if pandas.isnull(x).all():
        return x

    n_present = x.notnull().sum()
    if n_present >= 2:
        return "2+"
    return "1" if n_present == 1 else "0"


def get_presence(x):
    """Return whether any entry of ``x`` equals "YES".

    If every entry is null, ``x`` itself is returned unchanged.
    """
    all_missing = pandas.isnull(x).all()
    return x if all_missing else (x == "YES").any()


def aggregate_lesion_measure(frame, column, count=False):
    """Aggregate ``LSSTRESC`` per patient (``RPT``) and ``column`` value.

    With ``count=True`` each cell holds the ``get_count`` label and missing
    cells become "0"; the columns are converted to ordered categoricals.
    Otherwise each cell holds the ``get_presence`` flag and missing cells
    become False. The result has one row per ``RPT`` and one column per
    value of ``column``.
    """
    func, na_value = (get_count, "0") if count else (get_presence, False)

    measure = frame.groupby(["RPT", column])['LSSTRESC'].aggregate(func)
    measure.name = 'value'

    values = measure.reset_index().pivot(index='RPT', columns=column, values='value')
    values.fillna(na_value, inplace=True)
    if not count:
        return values

    return values.apply(lambda col: to_ordered_categorical(col, ["0", "1", "2+"]))


def read_lesionmeasure_table(filename, locations_included=None, date_quantiles=None):
    """Build per-patient baseline lesion features from a LesionMeasure table.

    Parameters
    ----------
    filename : str
        Path of the gzipped LesionMeasure CSV, read with ``read_csv``.
    locations_included : optional
        Object whose ``.index`` lists the lesion locations to use. If None,
        ``LSLOC2`` values occurring more than 10 times at baseline are
        selected and ``drop_useless`` is applied. Otherwise, columns for
        listed locations absent from this table are added with defaults.
    date_quantiles : optional
        Passed to ``cut_quantiles`` for binning ``LSDT_PC``. If None, 4 is
        passed and the quantiles returned by ``cut_quantiles`` are handed back.

    Returns
    -------
    tuple
        ``(pre_lesions, locations_included, date_quantiles)``
    """
    lesion = read_csv(filename)

    # The baseline visit is coded as 1 when VISIT is an integer column and as
    # 'SCREENING' otherwise. Test the column's dtype (not the Series itself)
    # against numpy.integer; the `numpy.int` alias no longer exists.
    if numpy.issubdtype(lesion.VISIT.dtype, numpy.integer):
        lesion_baseline_mask = lesion.VISIT == 1
    else:
        lesion_baseline_mask = lesion.VISIT == 'SCREENING'

    # explicit copy, so the column updates below act on this frame only
    lesion_baseline = lesion.loc[lesion_baseline_mask, :].copy()

    check_units_consistency(lesion_baseline, 'LSLOC', 'LSSTRESU')

    lesion_baseline["LSLOC"] = lesion_baseline["LSLOC"].replace(
        {"LUNG": "LUNGS",
         "OTHER NON-MEASUREABLE DISEASE - BONE LESIONS": "BONE",
         "OTHER NON-MEASUREABLE DISEASE - PLEURAL EFFUSION": "PLEURA"})

    # Fill missing values in LSLOC2 from LSLOC
    lesion_baseline["LSLOC2"] = lesion_baseline["LSLOC2"].fillna(lesion_baseline["LSLOC"])

    do_drop_useless = locations_included is None
    if do_drop_useless:
        locations = lesion_baseline.LSLOC2.value_counts()
        locations_included = locations[locations > 10]

    # LSTESTCD == "LENGTH" is only defined for target lesions
    # LSTESTCD == "PRESENCE" is only defined for non-target lesions
    # check whether target or non-target lesion in respective organ is present
    # NOTE(review): rows are filtered on LSLOC while the selection above and
    # the pivots below use LSLOC2 -- confirm this is intended.
    mask = lesion_baseline.LSLOC.isin(locations_included.index)
    assert mask.any()

    # count number of lesions
    lesion_count = aggregate_lesion_measure(lesion_baseline.loc[mask, ['RPT', 'LSLOC2', 'LSSTRESC', 'LSDT_PC']],
                                            "LSLOC2", count=True)

    # smallest LSDT_PC per patient and location
    n_date = lesion_baseline.loc[mask, ['RPT', 'LSLOC2', 'LSDT_PC']].groupby(['RPT', 'LSLOC2']).min()
    n_date.rename(columns={"LSDT_PC": "date"}, inplace=True)

    dates = n_date.reset_index().pivot(index='RPT', columns='LSLOC2', values='date')
    # Split date into quartiles
    if date_quantiles is None:
        lesion_date, date_quantiles = cut_quantiles(dates, 4)
    else:
        lesion_date, _ = cut_quantiles(dates, date_quantiles)

    if not do_drop_useless:
        # add columns for requested locations that do not occur in this table
        d = locations_included.index.difference(lesion_count.columns)
        new_locations = pandas.DataFrame("0", index=lesion_count.index, columns=d)
        new_locations = new_locations.apply(lambda x: to_ordered_categorical(x, ["0"]))
        lesion_count = safe_concat((lesion_count, new_locations), axis=1)

        new_dates = pandas.DataFrame("NONE", index=lesion_date.index, columns=d)
        new_dates = new_dates.apply(lambda x: to_ordered_categorical(x, ["NONE"]))
        lesion_date = safe_concat((lesion_date, new_dates), axis=1)

    pre_lesions = safe_concat((lesion_count,
                               lesion_date.add_suffix(' date')), axis=1)

    # min/max length of target lesion
    length_mask = lesion_baseline.LSTESTCD == "LENGTH"
    if (lesion_baseline.LSSTRESU.dropna() == "CM").all():
        # convert CM to MM
        values = lesion_baseline.loc[length_mask, "LSSTRESC"].map(lambda x: float(x) * 10.)
        lesion_length = pandas.concat((lesion_baseline.loc[length_mask, "RPT"], values), axis=1)
    else:
        lesion_length = lesion_baseline.loc[length_mask, ["RPT", "LSSTRESN"]]

    length_grouped = lesion_length.groupby('RPT')
    pre_lesions["MIN LESION SIZE"] = length_grouped.min()
    pre_lesions["MAX LESION SIZE"] = length_grouped.max()

    def _fillna(x):
        # numeric columns default to 0, categorical ones to their first category
        if pandas.core.common.is_numeric_dtype(x.dtype):
            return x.fillna(0)
        elif pandas.core.common.is_categorical_dtype(x.dtype):
            return x.fillna(x.cat.categories[0])

        assert x.notnull().all()
        return x

    pre_lesions = pre_lesions.apply(_fillna, axis=0)

    # number of target lesions, binned into 0..4 and ">=5"
    n_target = lesion_baseline.loc[lesion_baseline.LSCAT == "TARGET", :].groupby('RPT').size()
    bins = numpy.concatenate((numpy.arange(6), [float("inf")]))
    labels = ["{0} lesions".format(i)  for i in range(5)] + [">=5 lesions"]
    pre_lesions['N target lesions'] = pandas.cut(n_target, bins, labels=labels,
                                                 right=False, include_lowest=True)
    pre_lesions['N target lesions'] = pre_lesions['N target lesions'].fillna("0 lesions")

    # number of non-target lesions, binned into 0..9 and ">=10"
    n_non_target = lesion_baseline.loc[lesion_baseline.LSCAT == "NON-TARGET", :].groupby('RPT').size()
    bins = numpy.concatenate((numpy.arange(11), [float("inf")]))
    labels = ["{0} lesions".format(i) for i in range(10)] + [">=10 lesions"]
    pre_lesions['N non-target lesions'] = pandas.cut(n_non_target, bins, labels=labels,
                                                     right=False, include_lowest=True)
    pre_lesions['N non-target lesions'] = pre_lesions['N non-target lesions'].fillna("0 lesions")

    if do_drop_useless:
        drop_useless(pre_lesions, suffix=["date"], inplace=True)
        # keep only locations whose column survived
        mask = locations_included.index.isin(pre_lesions.columns)
        locations_included = locations_included[mask]

    return pre_lesions, locations_included, date_quantiles

In [None]:
# Build the lesion features from the training table and keep the returned selections.
pre_lesions, locations_included, location_data_quantiles = read_lesionmeasure_table(
    join(base_dir, filenames[1]))
print(pre_lesions.shape)
locations_included

In [None]:
# Build the lesion features for the test table, reusing the training selections.
pre_lesions_test, _, _ = read_lesionmeasure_table(join(test_base_dir, test_filenames[1]),
                                                  locations_included, location_data_quantiles)
# Training and test lesion tables must have exactly the same columns.
# (The check previously compared the medical-history tables again.)
assert len(pre_lesions.columns.sym_diff(pre_lesions_test.columns)) == 0
pre_lesions_test.shape

## Concatenate Tables

In [None]:
# Prefix every column with the name of its source table, for training ...
pre_meds = pre_meds.add_prefix('PreMed ')
pre_lab = pre_lab.add_prefix('Lab ')
pre_comorbidities = pre_comorbidities.add_prefix('MedHistory ')
pre_vitals = pre_vitals.add_prefix('VitalSign ')
pre_lesions = pre_lesions.add_prefix('LesionMeasure ')

# ... and, with the same prefixes, for test data.
pre_meds_test = pre_meds_test.add_prefix('PreMed ')
pre_lab_test = pre_lab_test.add_prefix('Lab ')
pre_comorbidities_test = pre_comorbidities_test.add_prefix('MedHistory ')
pre_vitals_test = pre_vitals_test.add_prefix('VitalSign ')
pre_lesions_test = pre_lesions_test.add_prefix('LesionMeasure ')

In [None]:
# Join the core table and all derived tables column-wise (axis=1),
# separately for training and test data.
train_all = safe_concat((core,
                         pre_meds,
                         pre_lab,
                         pre_comorbidities,
                         pre_vitals,
                         pre_lesions), axis=1)
test_all = safe_concat((core_test,
                        pre_meds_test,
                        pre_lab_test,
                        pre_comorbidities_test,
                        pre_vitals_test,
                        pre_lesions_test), axis=1)

We have to ensure that categorical variables have the same categories for training and testing data even if some of the categories do not appear in the data.

In [None]:
train_reg = pandas.Categorical(train_all["REGION_C"], ordered=False)
test_reg = pandas.Categorical(test_all["REGION_C"], ordered=False)
new_regions = test_reg.categories.difference(train_reg.categories)

# Encode regions we never saw during training as 'OTHER'
updates = {"REGION_C": {s: "OTHER" for s in new_regions}}

# Remove specific range categories from the test data when they occur there
# (presumably because they never occur in the training data -- TODO confirm).
# Each column is checked for its own category; the second check used to look
# at 'Lab LYM range' while modifying 'Lab SPGRV range'.
for range_col, category in (("Lab LYM range", "elevated"),
                            ("Lab SPGRV range", "normal")):
    if category in test_all[range_col].unique():
        test_all[range_col] = test_all[range_col].cat.remove_categories(category)

test_all.replace(updates, inplace=True)

In [None]:
train_cat = train_all.select_dtypes(include=["object", "category"])
test_cat = test_all.select_dtypes(include=["object", "category"])

columns = train_cat.columns.copy().union(test_cat.columns)

# Give every object/categorical column that exists in both frames the same
# set of categories in train_all and test_all.
for col in columns:
    if col in {'AGEGRP', 'STUDYID'}:
        continue

    if col in test_cat.columns and col in train_cat.columns:
        # test column is plain object but training is categorical:
        # adopt the training categories
        if pandas.core.common.is_categorical_dtype(train_cat[col].dtype) and pandas.core.common.is_object_dtype(test_cat[col].dtype):
            rcat = train_cat[col].cat
            test_all[col] = pandas.Categorical(test_cat[col],
                                               categories=rcat.categories, ordered=rcat.ordered)

        if test_all[col].dtype != train_all[col].dtype:
            raise TypeError("%s: dtype mismatch: %s vs. %s" % (col, test_all[col].dtype, train_all[col].dtype))

        if pandas.core.common.is_categorical_dtype(test_all[col].dtype):
            if test_all[col].cat.ordered != train_all[col].cat.ordered:
                # the message has three placeholders, so the column name must
                # be passed as well (it was missing, causing a TypeError)
                raise ValueError("%s: disagreement on whether category is ordered: "
                                 "%s vs. %s" % (col, test_all[col].cat.ordered, train_all[col].cat.ordered))

            if test_all[col].cat.categories.equals(train_all[col].cat.categories):
                continue

            cats_test = test_all[col].cat.categories
            cats_train = train_all[col].cat.categories
            ordered = test_all[col].cat.ordered
        else:
            if pandas.notnull(test_all[col]).any():
                cats_test = set(test_all[col].dropna().unique())
            else:
                cats_test = set()

            if pandas.notnull(train_all[col]).any():
                cats_train = set(train_all[col].dropna().unique())
            else:
                cats_train = set()

            ordered = False

        no_train_cats = cats_test.difference(cats_train)
        if len(no_train_cats) > 0:
            print("%s: %d categories are not in training data: %s" % (col, len(no_train_cats), no_train_cats))

        # NOTE(review): the merged categories are sorted alphabetically, also
        # for ordered categoricals -- confirm that this order is acceptable.
        cats = sorted(list(cats_train.union(cats_test)))
#        print("%s -> categories=%s, ordered=%s" % (col, cats, ordered))

        test_new = pandas.Categorical(test_all[col].astype("object"),
                                      categories=cats, ordered=ordered)

        train_new = pandas.Categorical(train_all[col].astype("object"),
                                       categories=cats, ordered=ordered)

        test_all[col] = pandas.Series(test_new, index=test_all[col].index, name=test_all[col].name)
        train_all[col] = pandas.Series(train_new, index=train_all[col].index, name=train_all[col].name)

# after harmonisation there must be nothing left to transfer
_, updates = transfer_categories(train_all, test_all)
assert len(updates) == 0

Drop categorical columns in which at most one category has 10 or more samples in the training data.

In [None]:
# A training column is dropped when at most one of its categories
# has 10 or more samples.
drop_cat_columns = [col for col in columns
                    if col in train_cat.columns
                    and (train_cat[col].value_counts() >= 10).sum() <= 1]

print("Dropping %d categorical columns" % len(drop_cat_columns))
train_all.drop(drop_cat_columns, axis=1, inplace=True)

# the test data may lack some of these columns, so only drop those present
test_all.drop(test_all.columns.intersection(drop_cat_columns), axis=1, inplace=True)

# Partition data according to studies

In [None]:
# Stack copies of training and test data into one frame.
train_test_all = safe_concat((train_all.copy(), test_all.copy()))

print(train_all.shape)
print(test_all.shape)
print(train_test_all.shape)

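Count how often features are available from one study, but not the others. The combined training and test data are split into four studies (the code below enumerates four flags per pattern, one of them for `AZ`), so the maximum number of patterns is 16.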

In [None]:
# Index.difference is the explicit form of the deprecated `Index - list`
variables = train_test_all.columns.difference(['STUDYID'])
missing_values = get_missing_values_per_study(train_test_all, variables, only_missing=False)
# if feature has more than 80% missing, we count it as completely missing
missing_values_mask = missing_values > .8

def _mask_to_int(x):
    """Pack an iterable of flags into an int: a truthy flag at position i sets bit i."""
    val = 0
    for i, b in enumerate(x):
        if b:
            val |= 1 << i
    return val

# one integer pattern per feature, and how often each pattern occurs
missing_patterns = missing_values_mask.apply(_mask_to_int, axis=1, reduce=True)
missing_patterns.name = 'pattern'
counts = missing_patterns.value_counts()
counts.name = 'count'

# map every possible pattern id back to its per-study flags
n_studies = missing_values_mask.shape[1]
pattern_map = {}
for p in itertools.product(range(2), repeat=n_studies):
    pattern_map[_mask_to_int(p)] = pandas.Series(p, index=missing_values_mask.columns, dtype=bool)

pattern_masks = pandas.DataFrame(pattern_map).T
count_patterns_full = pandas.concat((counts, pattern_masks), axis=1)
# sort() returns a new frame; keep it, otherwise the sorting has no effect
count_patterns_full = count_patterns_full.sort('count', ascending=False)
count_patterns = count_patterns_full[count_patterns_full.AZ == False]
count_patterns

## Writing Output

In [None]:
def print_stats(df, useless_cols):
    """Print a summary of ``df``.

    Prints the number of entries in ``useless_cols``, the shape of ``df``,
    the percentage of missing cells and the number and percentage of rows
    without any missing value.
    """
    print("%d useless columns" % len(useless_cols))
    print("shape = (%d, %d)" % df.shape)

    missing_per_row = pandas.isnull(df).sum(axis=1)
    n_complete = (missing_per_row == 0).sum()
    pct_missing = 100. * missing_per_row.sum() / numpy.prod(df.shape)
    pct_complete = 100. * n_complete / df.shape[0]

    print("%.2f%% total missing values " % pct_missing)
    print("%d (%.2f%%) complete samples" % (n_complete, pct_complete))

In [None]:
def partition_data(data_all, data_partition, max_missing=0.3, drop_extra_columns=None):
    """Split ``data_all`` into one data set per combination of studies.

    Parameters
    ----------
    data_all : DataFrame
        Data indexed by patient id. Patients are assigned to a study by the
        prefix of their id ('VEN-', 'CELG-' or 'ASC').
    data_partition : DataFrame
        One row per data set to create, one boolean column per study
        ('EFC6546', 'CELGENE', 'ASCENT2'). A truthy cell includes that study.
    max_missing : float
        Columns whose fraction of missing values within a data set exceeds
        this value are dropped from that data set.
    drop_extra_columns : dict, optional
        Maps a data set key (study names joined by '_') to additional
        columns to drop from it.

    Returns
    -------
    dict
        Maps each key to the data set returned by ``drop_useless``. Rows keep
        the order they have in ``data_all``.
    """
    if drop_extra_columns is None:
        drop_extra_columns = {}

    study_prefix = {'EFC6546': 'VEN-', 'CELGENE': 'CELG-', 'ASCENT2': 'ASC'}
    patient_id = data_all.index.to_series()
    cols = data_partition.columns

    datasets = {}
    for i in range(data_partition.shape[0]):
        row = data_partition.iloc[i, :]

        # Boolean row mask instead of a set of labels: selecting with a set
        # yields an arbitrary row order.
        included = numpy.zeros(data_all.shape[0], dtype=bool)
        names = []
        for study, selected in zip(cols, row):
            if selected:
                in_study = patient_id.str.startswith(study_prefix[study])
                included |= in_study.values.astype(bool)
                names.append(study)

        if not included.any():
            continue

        key = "_".join(names)
        print(key)

        part_data_full = data_all.loc[included, :]
        # fraction of missing values per column
        m = pandas.isnull(part_data_full).mean()

        useless_cols = m[m > max_missing].index
        if key in drop_extra_columns:
            useless_cols = useless_cols.union(pandas.Index(drop_extra_columns[key]))

        part_data = drop_useless(part_data_full, useless_cols=useless_cols,
                                 suffix=["date", "range", "duration"])

        print_stats(part_data, part_data_full.columns.difference(part_data.columns))
        print()

        datasets[key] = part_data

    return datasets

In [None]:
# additional columns to drop from individual partitions, keyed by partition name
drop_extra = {"ASCENT2": ["REGION_C"],
              "ASCENT2_CELGENE_EFC6546": ["Lab log TESTO", "Lab log TESTO range", "Lab log TESTO date",
                                          "Lab log PROTUR range"],
              "ASCENT2_CELGENE": ["Lab log TESTO range"],
              "CELGENE": ["Lab PH", "LesionMeasure ADRENAL", "Lab log UROBIL", "Lab log UROBIL date"],
              "CELGENE_EFC6546": ["CCRC", "Lab CCRC", "Lab CCRC date", "Lab CCRC range",
                                  "Lab log PROTUR range", "Lab log TESTO range"],
              "EFC6546": ["Lab UREA", "Lab UREA date", "Lab UREA range", "Lab log UPCR", "Lab log UPCR date"]}

# Take the patterns with the AZ flag set, drop that flag and invert the rest.
# `~` is the logical negation of a boolean frame; unary minus is not supported
# for boolean data in current numpy.
data_frames_new = partition_data(train_all.copy(), ~pattern_masks[pattern_masks.AZ].drop("AZ", axis=1),
                                 drop_extra_columns=drop_extra)

In [None]:
# Write every partition, together with the label rows of its patients,
# to an ARFF file named after the partition key.
for filename, dat in data_frames_new.items():
    writearff(safe_concat((dat, train_q1_labels.loc[dat.index, :], train_q2_labels.loc[dat.index, :]), axis=1),
                   filename + ".arff", index=True)

In [None]:
# Write the complete training data (with both label tables) and the test data.
writearff(safe_concat((train_all, train_q1_labels, train_q2_labels), axis=1), 'dream_train_all.arff', index=True)
writearff(test_all, 'dream_test_all.arff', index=True)

## Diagnostic Plots and Statistics (Optional)

In [None]:
import math
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn
seaborn.set(style='whitegrid')

In [None]:
def concat_train_test(data_train, data_test, add_studyid=True):
    """Stack the columns of one table as they appear in ``train_all`` / ``test_all``.

    The columns named in ``data_train`` are taken from the global
    ``train_all`` and those named in ``data_test`` from the global
    ``test_all``. With ``add_studyid`` the ``STUDYID`` column of the
    respective frame is put in front. The two parts are then concatenated
    row-wise.
    """
    parts = []
    for full, subset in ((train_all, data_train), (test_all, data_test)):
        part = full.loc[:, subset.columns]
        if add_studyid:
            part = safe_concat((full.STUDYID, part), axis=1)
        parts.append(part)

    return safe_concat(tuple(parts))

# Combined training+test frame per source table. Only the core table is
# combined without adding STUDYID.
core_train_test = concat_train_test(core, core_test, add_studyid=False)
meds_train_test = concat_train_test(pre_meds, pre_meds_test)
lab_train_test = concat_train_test(pre_lab, pre_lab_test)
comorbidities_train_test = concat_train_test(pre_comorbidities, pre_comorbidities_test)
vitals_train_test = concat_train_test(pre_vitals, pre_vitals_test)
lesions_train_test = concat_train_test(pre_lesions, pre_lesions_test)

In [None]:
def compare_distributions(core_all, n_cols=5, figsize=(20, 22)):
    """Draw one violin plot per numeric column of `core_all`, grouped by study.

    A new figure is created and filled with a grid of subplots, one per
    numeric column. Rows with a missing value in the column or in
    ``STUDYID`` are ignored; a column without any remaining row is
    reported on stdout and its subplot is left empty.

    Parameters
    ----------
    core_all : pandas.DataFrame
        Table containing a ``STUDYID`` column and the columns to plot.
    n_cols : int
        Number of subplots per row.
    figsize : tuple
        Size of the figure, passed to ``plt.figure``.

    Raises
    ------
    ValueError
        If `core_all` does not contain any numeric column.
    """
    core_numeric = core_all.select_dtypes(include=[numpy.number])
    if core_numeric.shape[1] == 0:
        raise ValueError('DataFrame has no numeric columns')

    # Assign every study a fixed colour, in sorted study order.
    studies = sorted(core_all.STUDYID.unique())
    palette = [seaborn.utils.desaturate(c, .7)
               for c in seaborn.color_palette(n_colors=len(studies))]
    color_map = dict(zip(studies, palette))

    plt.figure(figsize=figsize)
    n_rows = math.ceil(core_numeric.shape[1] / n_cols)
    for i, col in enumerate(core_numeric.columns):
        ax = plt.subplot(n_rows, n_cols, i + 1)
        v = core_all.loc[:, [col, 'STUDYID']].dropna()
        if v.shape[0] == 0:
            print(col + ' is empty')
            continue

        # List the colours in sorted study order rather than order of appearance.
        # NOTE(review): this assumes the groupby-based violinplot draws its groups
        # in sorted key order (pandas groupby default) -- confirm against the
        # installed seaborn version. With appearance order a study could end up
        # with a different colour in each subplot.
        _studies = sorted(v.iloc[:, 1].unique())
        _col = [color_map[c] for c in _studies]

        seaborn.violinplot(v.iloc[:, 0], groupby=v.iloc[:, 1], ax=ax, color=_col)
        ax.set_title(col)
        ax.set_ylabel("")
        ax.set_xlabel("")

In [None]:
# Plot the per-study distributions of the numeric columns of core_train_test and save them as PDF.
compare_distributions(core_train_test)
plt.tight_layout()
plt.savefig('CoreTable-features-plot.pdf')

In [None]:
# Same plot for lab_train_test; a taller figure is requested than the function's default.
compare_distributions(lab_train_test, n_cols=5, figsize=(20, 40))
plt.tight_layout()
plt.savefig('LabValue-features-plot.pdf')

In [None]:
# Same plot for vitals_train_test, using a smaller figure.
compare_distributions(vitals_train_test, figsize=(16, 7))
plt.tight_layout()
plt.savefig('VitalSign-features-plot.pdf')

In [None]:
# Same plot for lesions_train_test, using a smaller figure.
compare_distributions(lesions_train_test, figsize=(16, 6))
plt.tight_layout()
plt.savefig('LesionMeasure-features-plot.pdf')

In [None]:
def missing_values_heatmap(table, variables, only_missing=True, figsize=None):
    """Draw an annotated heatmap of the per-study missing-value summary.

    The summary is obtained from ``get_missing_values_per_study(table,
    variables, only_missing=only_missing)`` and drawn on a new figure.

    Parameters
    ----------
    table, variables, only_missing
        Forwarded unchanged to ``get_missing_values_per_study``.
    figsize : tuple, optional
        Size of the new figure. If omitted, it is derived from the shape
        of the summary: 1.5 units of width per column and 0.5 units of
        height per row.

    Returns
    -------
    The return value of ``seaborn.heatmap``.
    """
    summary = get_missing_values_per_study(table, variables, only_missing=only_missing)

    if figsize is None:
        width = 1.5 * summary.shape[1]
        height = .5 * summary.shape[0]
        figsize = (width, height)

    plt.figure(figsize=figsize)
    heatmap_axes = seaborn.heatmap(summary, annot=True)
    return heatmap_axes

In [None]:
# Summarise missing values for every column except STUDYID.
# Index.difference is the supported spelling of the set difference; the `Index - list`
# operator was deprecated for this purpose and no longer works on string labels.
variables = core_train_test.columns.difference(['STUDYID'])
missing_values_heatmap(core_train_test, variables)
plt.tight_layout()
plt.savefig('CoreTable-missing-values.pdf')

In [None]:
# Summarise missing values for every column except STUDYID.
# Index.difference is the supported spelling of the set difference; the `Index - list`
# operator was deprecated for this purpose and no longer works on string labels.
variables = lab_train_test.columns.difference(['STUDYID'])
missing_values_heatmap(lab_train_test, variables)
plt.tight_layout()
plt.savefig('LabValue-missing-values.pdf')

In [None]:
# Summarise missing values for every column except STUDYID.
# Index.difference is the supported spelling of the set difference; the `Index - list`
# operator was deprecated for this purpose and no longer works on string labels.
variables = comorbidities_train_test.columns.difference(['STUDYID'])
missing_values_heatmap(comorbidities_train_test, variables, figsize=(7, 40))
plt.tight_layout()
plt.savefig('MedHistory-missing-values.pdf')

In [None]:
# Summarise missing values for every column except STUDYID.
# Index.difference is the supported spelling of the set difference; the `Index - list`
# operator was deprecated for this purpose and no longer works on string labels.
variables = lesions_train_test.columns.difference(['STUDYID'])
missing_values_heatmap(lesions_train_test, variables)
plt.tight_layout()
plt.savefig('LesionMeasure-missing-values.pdf')

In [None]:
# Summarise missing values for every column except STUDYID.
# Index.difference is the supported spelling of the set difference; the `Index - list`
# operator was deprecated for this purpose and no longer works on string labels.
variables = train_test_all.columns.difference(['STUDYID'])
missing_values_heatmap(train_test_all, variables, figsize=(9, 90))
plt.tight_layout()
plt.savefig('All-missing-values.pdf')

In [None]:
# Count, for every pair of partitions, the columns that occur in exactly one of the two
# frames, and draw the counts as a heatmap. The diagonal stays zero.
names = sorted(data_frames_new)
n_partitions = len(names)
cross_tab = pandas.DataFrame(numpy.zeros((n_partitions, n_partitions), dtype=int),
                             index=names, columns=names)

for nam1, nam2 in itertools.combinations(names, 2):
    # Plain sets are used instead of Index.sym_diff, which was deprecated and later
    # removed from pandas; only the number of differing labels is needed.
    diff = set(data_frames_new[nam1].columns).symmetric_difference(data_frames_new[nam2].columns)
    # The symmetric difference is independent of argument order, so fill both cells at once.
    cross_tab.loc[nam1, nam2] = cross_tab.loc[nam2, nam1] = len(diff)

seaborn.heatmap(cross_tab, annot=True, fmt="d", square=True)
plt.title('Difference in features between partitions')

plt.tight_layout()
plt.savefig('difference-partitions.pdf')

In [None]:
%load_ext rpy2.ipython
%R suppressMessages(library(VIM))
from pandas.rpy.common import convert_to_r_dataframe

In [None]:
def to_r_dataframe(dat):
    """Convert `dat` to an R data frame with shortened, unique column names.

    Column names longer than 24 characters are replaced by their first 14
    characters, ``..`` and their last 7 characters. If several columns end
    up with the same name, each of them gets a numeric suffix (``.1``,
    ``.2``, ...). Categorical columns are cast to ``object`` before the
    frame is handed to ``convert_to_r_dataframe`` with
    ``strings_as_factors=True``.

    Parameters
    ----------
    dat : pandas.DataFrame
        Frame to convert; it is renamed into a new frame first.

    Returns
    -------
    The return value of ``convert_to_r_dataframe``.

    Raises
    ------
    ValueError
        If the names are still not unique after adding suffixes.
    """
    # Map every original column label to its (possibly shortened) replacement.
    renamed = {name: (name[:14] + ".." + name[-7:] if len(name) > 24 else name)
               for name in dat.columns}

    labels = pandas.Index(renamed.values())
    if not labels.is_unique:
        counts = labels.value_counts()
        clashing = counts[counts > 1].index
        next_suffix = {}
        replacements = {}
        for original, short in renamed.items():
            if short not in clashing:
                continue
            # Number the colliding names consecutively, starting at 1.
            suffix = next_suffix.get(short, 1)
            replacements[original] = short + ".%d" % suffix
            next_suffix[short] = suffix + 1

        renamed.update(replacements)
        labels = pandas.Index(renamed.values())

    if not labels.is_unique:
        raise ValueError('duplicate columns after shortening names')

    converted = dat.rename(columns=renamed)
    categorical_columns = converted.select_dtypes(include=['category']).columns
    for name in categorical_columns:
        converted[name] = converted[name].astype(object, copy=False)

    return convert_to_r_dataframe(converted, strings_as_factors=True)

In [None]:
%%capture --no-stdout
# For every partition: convert the frame with to_r_dataframe, push it and the partition
# name into the R session, and let R write "<filename>.pdf" with the plot of the result of
# aggr(rdat, plot=FALSE). aggr presumably comes from the VIM library loaded above -- confirm.
for filename, dat in data_frames_new.items():
    rdat = to_r_dataframe(dat)
    # Make the Python variables available under the same names in R.
    %Rpush rdat
    %Rpush filename
    %R pdf(paste(filename, "pdf", sep="."), 18, 10); m <- aggr(rdat, plot=FALSE); plot(m, combined=TRUE, sortVars=TRUE, cex.axis=.3, numbers=TRUE, only.miss=TRUE); dev.off();