In [1]:
import random
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from sklearn.model_selection import train_test_split

# Seed shared by the stochastic steps of this notebook.
# np.random.seed() returns None, so the value is stored first and then used to
# seed both RNG libraries imported above.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

DATA = Path('data')  # root folder of the input data
TARGETS = ['participants', 'interventions', 'outcomes']  # label columns, excluded from the feature list
SUBSET = 'Train' # 'Test'
N_TRAIN_DOCS = 4500  # number of documents kept on the train/validation side of the split
SAVE_LOC = DATA / 'split'  # output folder for the split parquet files

# Create the output folder up front so the save cells can write into it.
SAVE_LOC.mkdir(exist_ok=True, parents=True)

%load_ext jupyternotify

<IPython.core.display.Javascript object>

# Load feature set and labels

In [2]:
FEATURES = ['base', 'pubmedFT', 'pos'] # 'rawFT', 'clust' // base not needed?
# Build the feature paths with pathlib, like labels_path, so the notebook is
# not tied to the Windows path separator.
feat_paths = [DATA / 'features' / f'{feature}.pkl' for feature in FEATURES]
labels_path = DATA / 'raw' / 'labels.pkl'

print(f"Loading feature set from {', '.join(str(path) for path in feat_paths)}")
# NOTE: unpickling can execute arbitrary code; only load files you trust.
feats_in = pd.concat([pd.read_pickle(path) for path in feat_paths], axis=1)

print(f'Loading labels from {labels_path}')
labels_in = pd.read_pickle(labels_path)

# data_mem = sum(sys.getsizeof(i) for i in [X,y]) # slow command
# print(f'Loaded {data_mem / (10**9)} GB')

# 'Word' is dropped from the features before joining them with the labels
# (presumably the labels frame carries its own 'Word' column - confirm).
data = pd.concat([feats_in.drop('Word',axis=1), labels_in],axis=1)

assert not data.columns.duplicated().any(), 'duplicate column names after joining features and labels'

print('Data load complete.')

Loading feature set from data\features\base.pkl, data\features\pubmedFT.pkl, data\features\pos.pkl
Loading labels from data\raw\labels.pkl
Data load complete.


# Train/Test split

Withhold part of the data for evaluation. TODO: move this step into a standalone script.

In [3]:
%%notify

# Every column that is not a label column counts as a feature.
feats = [col for col in data.columns if col not in TARGETS]
targets = TARGETS
hold_size = 10 # 493
k_folds = 5 # only simulated for now

# Split on document ids, not on rows, so all rows of a document land on the same side.
doc_ids = list(data.index.unique(level='doc'))

# random_state=SEED ties the split to the configured seed, so re-running this
# cell reproduces the same partition. If SEED is None scikit-learn falls back
# to the global NumPy RNG, which is what happened before this argument was passed.
train_val_idx, hold_idx = train_test_split(
    doc_ids,
    train_size=N_TRAIN_DOCS,
    test_size=hold_size,
    random_state=SEED,
)

word = data  # alias of the full frame; not used by any cell visible here
test_hold = data.loc[(hold_idx, slice(None)),:] # slow; comment when testing
train_val = data.loc[(train_val_idx, slice(None)),:]

<IPython.core.display.Javascript object>

One-hot encode ("hotcode") any categorical (non-numeric) columns. Note that this may not be the preferred format for all models (for example, Keras can generate sparser embeddings for many-level categorical variables), so skip this step if you do not want a hotcoded dataset.

In [4]:
%%notify

def hotcode(df):
    """One-hot encode the categorical columns of ``df``.

    Columns with a numeric dtype are passed through unchanged. Every other
    column except ``'Word'`` is expanded with ``pd.get_dummies``. The
    ``'Word'`` column is left out of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding numeric and/or categorical columns.

    Returns
    -------
    pandas.DataFrame
        The dummy columns followed by the numeric columns of ``df``.

    Raises
    ------
    ValueError
        If ``df`` has no categorical columns to encode.
    """
    # Lists built in column order keep the output layout deterministic; a set
    # has arbitrary iteration order and recent pandas rejects it as an indexer.
    num_cols = [col for col, dtype in df.dtypes.items() if is_numeric_dtype(dtype)]
    numeric_names = set(num_cols)
    cat_cols = [
        col for col in df.columns
        if col not in numeric_names and col != 'Word'
    ]

    print(cat_cols)
    if not cat_cols:
        # Raised explicitly so callers do not depend on pandas internals
        # failing on an empty selection.
        raise ValueError('No categorical columns to hotcode')

    dummies = pd.get_dummies(df[cat_cols])

    print('hotcode complete.')

    encoded = pd.concat([dummies, df[num_cols]], axis=1)
    assert all(is_numeric_dtype(dtype) for dtype in encoded.dtypes), \
        'hotcoded frame still contains non-numeric columns'

    return encoded

# Encode the train/validation frame; hotcode signals "nothing to encode" with
# a ValueError, in which case train_val is left as it was.
# NOTE(review): only train_val is encoded here, test_hold is not - confirm
# that the two saved files are meant to differ in layout when encoding happens.
print('hotcoding categorical columns...')
try:
    hotcoded = hotcode(train_val)
except ValueError:
    print('No categorical values found in data')
else:
    train_val = hotcoded

hotcoding categorical columns...
set()
No categorical values found in data


<IPython.core.display.Javascript object>

Make the required directories and save the data.

In [6]:
# SAVE_LOC is created in the configuration cell, so the files can be written directly.
for frame, stem in ((train_val, 'train'), (test_hold, 'test')):
    frame.to_parquet(SAVE_LOC / f'{stem}.parquet')

# NOTE(review): train_ds and n_docs are not assigned in any cell visible here;
# on a fresh kernel this line raises NameError - confirm where they come from.
train_ds.to_parquet(SAVE_LOC / f'train_{n_docs}.parquet')

OSError: IOError: . Detail: Python exception: KeyboardInterrupt

In [7]:
# Display the frame that the previous cell writes to train_{n_docs}.parquet.
# NOTE(review): train_ds is not assigned in any cell visible here - confirm its origin.
train_ds

Unnamed: 0_level_0,Unnamed: 1_level_0,PM_0,PM_1,PM_2,PM_3,PM_4,PM_5,PM_6,PM_7,PM_8,PM_9,...,POS_LAG_1_VBP,POS_LAG_1_VBZ,POS_LAG_1_WDT,POS_LAG_1_WP,POS_LAG_1_WP$,POS_LAG_1_WRB,Word,interventions,outcomes,participants
doc,idx,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
10070173,0,-0.130776,0.240341,-0.174982,-0.044375,-0.184811,0.283461,0.187891,0.461987,0.431069,0.425610,...,0,0,0,0,0,0,Comparison,0,0,0
10070173,1,0.104165,-0.057073,-0.080237,-0.144472,-0.223653,-0.031189,-0.470893,-0.171195,0.424745,0.082970,...,0,0,0,0,0,0,of,0,0,0
10070173,2,0.821748,-0.054989,0.397504,0.419562,0.336511,0.135107,-0.003567,-0.068503,0.008196,-0.460025,...,0,0,0,0,0,0,budesonide,1,0,0
10070173,3,1.223036,0.500320,0.429439,0.841295,-0.195195,-0.202918,-0.570242,-0.264314,0.149682,0.299892,...,0,0,0,0,0,0,Turbuhaler,1,0,0
10070173,4,-0.095962,0.351654,-0.179439,0.068773,-0.385094,-0.131106,-0.435841,0.245588,0.681084,0.204600,...,0,0,0,0,0,0,with,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9989713,289,-0.482327,-0.020039,0.253585,-0.096272,0.212909,0.478674,0.159902,0.234866,1.577489,-0.230714,...,0,0,0,0,0,0,randomised,0,0,0
9989713,290,0.272813,0.066905,-0.238704,-0.165099,0.000402,0.109383,-0.587040,-0.276197,0.278902,-0.087827,...,0,0,0,0,0,0,",",0,0,0
9989713,291,-0.116538,0.463041,-0.158457,0.337768,-0.209759,0.854119,-0.224265,0.123816,0.603451,0.353388,...,0,0,0,0,0,0,clinical,0,0,0
9989713,292,-1.272327,-0.116690,0.162349,-0.350024,-0.083843,0.396636,-0.026688,-0.275437,1.399622,-0.381075,...,0,0,0,0,0,0,trials,0,0,0


# Data saving for Clusterings (temporary)

In [None]:
# Read one parquet frame per k in (4, 8, 12). The path is built from DATA with
# pathlib so it does not depend on the Windows path separator.
x = [pd.read_parquet(DATA / 'features' / f'PubMed_{k}_10x300.parquet') for k in [4,8,12]]

In [None]:
# Join the per-k frames side by side into a single frame.
# NOTE(review): x changes from a list to a DataFrame here, so running this
# cell a second time without reloading the list will fail.
x = pd.concat(x, axis=1)

In [None]:
# Add lagged copies of every knn_{k} column, shifted within each document.
lags = [-2,-1,1,2]
for k in [4,8,12]:
    source = x[f'knn_{k}']
    for lag in lags:
        shifted = source.groupby('doc').shift(lag)

        # A positive lag leaves NaN at the start of each document, a negative
        # lag at the end; fill from the following / preceding rows respectively.
        # .bfill()/.ffill() replace the deprecated fillna(method=...).
        # NOTE(review): the fill runs over the whole column, not per document,
        # so a document shorter than |lag| takes values from a neighbouring one.
        filled = shifted.bfill() if lag > 0 else shifted.ffill()

        x[f'knn_{k}_lag_{lag}'] = filled

In [None]:
# Cast every column to int and then to str, so the values are strings rather
# than numbers (presumably so the later get_dummies call treats them as
# categories - confirm).
x = x.astype(int).astype(str)
x.head(1000)

In [None]:
# One-hot encode the frame with pandas.get_dummies.
x = pd.get_dummies(x)

In [None]:
# Write the encoded frame to a parquet file under data/features.
x.to_parquet('data/features/km_4-8-12_10x300.parquet')

# Create indices for K-fold cross-validation

These indices might be best saved to a file (and loaded by the model scripts). If cluster columns are used, the kNN should be loaded and fitted to the features when the models are run; otherwise it will already have seen the validation-set data. For now, I'll train the kNN on a smaller subsample of the text to see whether the method has potential.

In [None]:
For now, I'll 