In [1]:
import random
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from sklearn.model_selection import train_test_split

# --- Notebook configuration ---------------------------------------------------
# `np.random.seed` returns None, so the seed value is stored first and applied
# afterwards; assigning the call's result would leave SEED as None and unusable
# as a `random_state` argument.
SEED = 0
np.random.seed(SEED)  # seed NumPy's global RNG
random.seed(SEED)     # seed the stdlib RNG as well (used by `random.sample`)

DATA = Path('data')  # root folder of the input data
TARGETS = ['participants', 'interventions', 'outcomes']  # label columns
SUBSET = 'Train' # 'Test'
N_TRAIN_DOCS = 4500  # number of documents requested for the train/validation split
SAVE_LOC = Path('data/split')  # output folder for the saved splits

# Create the output folder (and any missing parents) before anything is written.
SAVE_LOC.mkdir(exist_ok=True, parents=True)

!pip install jupyternotify
%load_ext jupyternotify



Error processing line 7 of c:\users\tommy\anaconda3\lib\site-packages\pywin32.pth:

  Traceback (most recent call last):
    File "c:\users\tommy\anaconda3\lib\site.py", line 168, in addpackage
      exec(line)
    File "<string>", line 1, in <module>
  ModuleNotFoundError: No module named 'pywin32_bootstrap'

Remainder of file ignored


<IPython.core.display.Javascript object>

# Load feature set and labels

In [2]:
# Feature sets to load; each name maps to `<DATA>/features/<name>.pkl`.
FEATURES = ['base', 'pubmedFT', 'pos'] # 'rawFT', 'clust' // base not needed?
# Build the paths from DATA with pathlib rather than hard-coded Windows
# separators, so the same cell works on Windows, Linux and macOS.
feat_paths = [DATA / 'features' / f'{feature}.pkl' for feature in FEATURES]
labels_path = DATA / 'raw' / 'labels.pkl'

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
print(f"Loading feature set from {', '.join(map(str, feat_paths))}")
# The feature frames are joined column-wise.
feats_in = pd.concat([pd.read_pickle(path) for path in feat_paths], axis=1)

print(f'Loading labels from {labels_path}')
labels_in = pd.read_pickle(labels_path)

# data_mem = sum(sys.getsizeof(i) for i in [X,y]) # slow command
# print(f'Loaded {data_mem / (10**9)} GB')

# Join features and labels column-wise. 'Word' is dropped from the features
# first; presumably the labels frame carries its own copy -- TODO confirm.
data = pd.concat([feats_in.drop('Word',axis=1), labels_in],axis=1)

# Fail fast if two sources contributed a column with the same name.
assert not data.columns.duplicated().any()

print('Data load complete.')

Loading feature set from data\features\base.pkl, data\features\pubmedFT.pkl, data\features\pos.pkl
Loading labels from data\raw\labels.pkl
Data load complete.


# Train/Test split

Withhold part of the data for evaluation. TODO: move this split into a standalone script.

In [3]:
%%notify

# Every column that is not a label is treated as a feature.
feats = [col for col in data.columns if col not in TARGETS]
targets = TARGETS
hold_size = 10 # 493
k_folds = 5 # only simulated for now

# Split on document ids (index level 'doc') so that all rows of a document
# end up in the same subset.
doc_ids = list(data.index.unique(level='doc'))

# Pass the seed explicitly so the split is tied to SEED. If SEED is None,
# scikit-learn falls back to NumPy's global RNG, which is what the call did
# before the argument was added.
train_val_idx, hold_idx = train_test_split(
    doc_ids,
    train_size=N_TRAIN_DOCS,
    test_size=hold_size,
    random_state=SEED,
)

word = data  # alias of the full frame; not used by the cells shown in this notebook
test_hold = data.loc[(hold_idx, slice(None)),:] # slow; comment when testing
train_val = data.loc[(train_val_idx, slice(None)),:]

<IPython.core.display.Javascript object>

One-hot encode ("hotcode") any categorical (non-numeric) columns. Note that this may not be the preferred format for all models (for example, Keras can generate sparser embeddings for many-level categorical variables), so skip this step if you do not want a hotcoded dataset.

In [4]:
%%notify

def hotcode(df):
    """One-hot encode the categorical columns of ``df``.

    Columns are split into numeric ones, which are kept unchanged, and
    non-numeric ones, which are expanded with ``pd.get_dummies``. The
    ``'Word'`` column is neither encoded nor carried over into the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding numeric and/or categorical columns.

    Returns
    -------
    pandas.DataFrame
        The dummy columns followed by the unchanged numeric columns.

    Raises
    ------
    ValueError
        If ``df`` contains no categorical columns to encode.
    """
    # Public dtype check instead of the private ``df._get_numeric_data()``.
    num_cols = [col for col, dtype in df.dtypes.items() if is_numeric_dtype(dtype)]
    numeric = set(num_cols)
    # Keep a list in column order: recent pandas versions reject a ``set`` as
    # an indexer, and set ordering would make the dummy column order vary
    # from run to run.
    cat_cols = [col for col in df.columns if col not in numeric and col != 'Word']

    print(cat_cols)
    if not cat_cols:
        # Raised explicitly so callers that catch ValueError for the
        # "nothing to encode" case keep working regardless of pandas version.
        raise ValueError('No categorical columns to hotcode')
    dummies = pd.get_dummies(df[cat_cols])

    print('hotcode complete.')
    # TODO: assert that every resulting column has a numeric dtype.

    return pd.concat([dummies, df[num_cols]], axis=1)

# Try to one-hot encode the training frame; `hotcode` signals "nothing to
# encode" with a ValueError, in which case `train_val` is left as it is.
print('hotcoding categorical columns...')
try:
    encoded = hotcode(train_val)
except ValueError:
    print('No categorical values found in data')
else:
    train_val = encoded

hotcoding categorical columns...
set()
No categorical values found in data


<IPython.core.display.Javascript object>

Sample a subset of the indices to downsample the data. This is useful if you need to work on a computer with less memory or want to train faster on a smaller dataset.

In [5]:
n_docs = 1000  # number of documents in the downsampled training set

# Sample only from documents that are present in `train_val`. Drawing from all
# of `data` can pick held-out documents, which do not exist in `train_val` and
# would shrink the sample or fail the lookup below.
train_doc_ids = train_val.index.unique('doc').tolist()
ds = random.sample(train_doc_ids, n_docs)
train_ds = train_val.loc[(ds,slice(None)),:]

Make the required directories and save the data.

In [6]:
# Write each split into SAVE_LOC, keyed by its file name.
# NOTE(review): the downsampled file is written without a '.parquet' suffix,
# unlike the other two -- confirm whether that is intended.
outputs = {
    'train.parquet': train_val,
    'test.parquet': test_hold,
    f'train_{n_docs}': train_ds,
}
for filename, frame in outputs.items():
    frame.to_parquet(SAVE_LOC / filename)

In [7]:
# Display the train/validation frame as the cell result (pandas truncates the rendering).
train_val

Unnamed: 0_level_0,Unnamed: 1_level_0,PM_0,PM_1,PM_2,PM_3,PM_4,PM_5,PM_6,PM_7,PM_8,PM_9,...,POS_LAG_1_VBP,POS_LAG_1_VBZ,POS_LAG_1_WDT,POS_LAG_1_WP,POS_LAG_1_WP$,POS_LAG_1_WRB,Word,interventions,outcomes,participants
doc,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10037531,0,0.130401,-0.372222,0.161583,0.161873,0.136234,-0.548734,0.277584,0.688972,-0.602796,-0.772061,...,0,0,0,0,0,0,Xylitol,1,0,0
10037531,1,0.148498,-0.097264,-0.081713,-0.206262,-0.312260,0.011514,-0.457917,-0.309768,0.487952,0.159454,...,0,0,0,0,0,0,for,0,0,0
10037531,2,-0.061460,-0.099788,0.186533,-0.349889,-0.202952,0.240505,-0.242104,0.124544,0.709853,-0.171734,...,0,0,0,0,0,0,prevention,0,0,0
10037531,3,0.104165,-0.057073,-0.080237,-0.144472,-0.223653,-0.031189,-0.470893,-0.171195,0.424745,0.082970,...,0,0,0,0,0,0,of,0,0,0
10037531,4,0.055679,0.417647,-0.434107,0.320950,-0.081468,0.450244,-0.026610,0.483254,0.738914,0.309706,...,0,0,0,0,0,0,acute,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9989713,289,-0.482327,-0.020039,0.253585,-0.096272,0.212909,0.478674,0.159902,0.234866,1.577489,-0.230714,...,0,0,0,0,0,0,randomised,0,0,0
9989713,290,0.272813,0.066905,-0.238704,-0.165099,0.000402,0.109383,-0.587040,-0.276197,0.278902,-0.087827,...,0,0,0,0,0,0,",",0,0,0
9989713,291,-0.116538,0.463041,-0.158457,0.337768,-0.209759,0.854119,-0.224265,0.123816,0.603451,0.353388,...,0,0,0,0,0,0,clinical,0,0,0
9989713,292,-1.272327,-0.116690,0.162349,-0.350024,-0.083843,0.396636,-0.026688,-0.275437,1.399622,-0.381075,...,0,0,0,0,0,0,trials,0,0,0


In [8]:
# Display the held-out frame as the cell result (pandas truncates the rendering).
test_hold

Unnamed: 0_level_0,Unnamed: 1_level_0,PM_0,PM_1,PM_2,PM_3,PM_4,PM_5,PM_6,PM_7,PM_8,PM_9,...,POS_LAG_1_VBP,POS_LAG_1_VBZ,POS_LAG_1_WDT,POS_LAG_1_WP,POS_LAG_1_WP$,POS_LAG_1_WRB,Word,interventions,outcomes,participants
doc,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10706930,0,-0.149054,-0.311682,0.074064,-0.440401,-0.400086,-0.465720,-0.682138,-0.172527,0.178458,0.487644,...,0,0,0,0,0,0,The,0,0,0
10706930,1,-0.530567,-0.082598,-0.357803,0.088315,0.071414,-0.262325,-0.720139,-0.200256,0.318682,-0.439443,...,0,0,0,0,0,0,effects,0,0,0
10706930,2,0.104165,-0.057073,-0.080237,-0.144472,-0.223653,-0.031189,-0.470893,-0.171195,0.424745,0.082970,...,0,0,0,0,0,0,of,0,0,0
10706930,3,0.386100,0.347622,0.148117,-0.308401,0.523525,-0.940647,-0.271191,0.149066,0.439976,-0.074651,...,0,0,0,0,0,0,hormone,1,0,0
10706930,4,-0.169209,-0.394513,0.095485,-0.015367,-0.109848,0.148666,-0.178792,-0.191102,0.613943,0.371670,...,0,0,0,0,0,0,replacement,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8499152,210,0.554219,-0.201609,-0.231630,0.330166,0.240120,-0.253849,-0.585778,0.473615,0.687896,0.606392,...,0,0,0,0,0,0,months,0,0,0
8499152,211,0.112392,-0.066400,-0.163844,-0.156797,-0.017793,0.242825,-0.319322,0.005598,0.443859,0.068058,...,0,0,0,0,0,0,in,0,0,0
8499152,212,-0.030288,0.467081,-0.104541,-0.138336,-0.414619,0.178526,-0.372544,-0.133503,0.468907,-0.649523,...,0,0,0,0,0,0,stage,0,0,0
8499152,213,0.529757,-0.269583,-0.121444,0.148675,0.087328,-0.033680,0.007509,0.054687,0.839631,0.082502,...,0,0,0,0,0,0,II,0,0,0


In [9]:
# Read the saved training split back from SAVE_LOC. Building the path from
# SAVE_LOC instead of repeating it as a literal keeps this check pointed at
# the file the save cell wrote, even if SAVE_LOC is changed.
x = pd.read_parquet(SAVE_LOC / 'train.parquet')
x