In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import *
# Let pandas print up to 1000 rows before truncating a frame's repr.
# The fully qualified option name is used because abbreviated keys are
# resolved by pattern matching and fail when more than one option matches.
pd.set_option('display.max_rows', 1000)

## train_test_split

In [161]:
# Toy dataset: ten samples 0..9 with sorted labels (six 0s, then four 1s).
X = np.arange(10)
Y = np.repeat([0, 1], [6, 4])
print('X: ', X)
print('Y: ', Y)

# Split with shuffling disabled, so the original sample order is kept.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    shuffle=False,
)
print('Train data: ', X_train, 'Train label', y_train)
print('Test data: ', X_test, 'Test label', y_test)

X:  [0 1 2 3 4 5 6 7 8 9]
Y:  [0 0 0 0 0 0 1 1 1 1]
Train data:  [0 1 2 3 4 5] Train label [0 0 0 0 0 0]
Test data:  [6 7 8 9] Test label [1 1 1 1]


In [169]:
# Same toy dataset as before: samples 0..9, six 0-labels followed by four 1-labels.
X = np.arange(10)
Y = np.repeat([0, 1], [6, 4])
print('X: ', X)
print('Y: ', Y)

# Shuffle before splitting; the fixed seed makes the permutation repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    shuffle=True,
    random_state=43,
)
print('Train data: ', X_train, 'Train label', y_train)
print('Test data: ', X_test, 'Test label', y_test)

X:  [0 1 2 3 4 5 6 7 8 9]
Y:  [0 0 0 0 0 0 1 1 1 1]
Train data:  [2 5 1 7 0 4] Train label [0 0 0 1 0 0]
Test data:  [3 9 6 8] Test label [0 1 1 1]


In [170]:
# Toy dataset: samples 0..9, six 0-labels followed by four 1-labels.
X = np.arange(10)
Y = np.repeat([0, 1], [6, 4])
print('X: ', X)
print('Y: ', Y)

# Shuffled, seeded split that additionally stratifies on the label array Y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    shuffle=True,
    random_state=43,
    stratify=Y,
)
print('Train data: ', X_train, 'Train label', y_train)
print('Test data: ', X_test, 'Test label', y_test)

X:  [0 1 2 3 4 5 6 7 8 9]
Y:  [0 0 0 0 0 0 1 1 1 1]
Train data:  [7 5 1 9 3 2] Train label [1 0 0 1 0 0]
Test data:  [8 4 0 6] Test label [1 0 0 1]


## ShuffleSplit(BaseShuffleSplit)

In [172]:
# Toy dataset: samples 0..9, six 0-labels followed by four 1-labels.
X = np.arange(10)
Y = np.repeat([0, 1], [6, 4])
print('X: ', X)
print('Y: ', Y)

# Seeded ShuffleSplit; n_splits is left at its default.
ss = ShuffleSplit(test_size=0.33, random_state=43)
print('Number of data split: ', ss.get_n_splits(X))

# Only the first generated split is inspected.
train_index, test_index = next(iter(ss.split(X)))
print("TRAIN index:", train_index, "TEST index:", test_index)

# Materialise each side of the split from its index array.
X_train, y_train = X[train_index], Y[train_index]
X_test, y_test = X[test_index], Y[test_index]
print('Train data: ', X_train, 'Train label', y_train)
print('Test data: ', X_test, 'Test label', y_test)

X:  [0 1 2 3 4 5 6 7 8 9]
Y:  [0 0 0 0 0 0 1 1 1 1]
Number of data split:  10
TRAIN index: [2 5 1 7 0 4] TEST index: [3 9 6 8]
Train data:  [2 5 1 7 0 4] Train label [0 0 0 1 0 0]
Test data:  [3 9 6 8] Test label [0 1 1 1]


## StratifiedShuffleSplit(BaseShuffleSplit)

In [174]:
# Toy dataset: samples 0..9, six 0-labels followed by four 1-labels.
X = np.arange(10)
Y = np.repeat([0, 1], [6, 4])
print('X: ', X)
print('Y: ', Y)

# Seeded StratifiedShuffleSplit; n_splits is left at its default.
sss = StratifiedShuffleSplit(test_size=0.33, random_state=43)
print('Number of data split: ', sss.get_n_splits(X))

# The labels are passed to split() as well; only the first split is inspected.
train_index, test_index = next(iter(sss.split(X, Y)))
print("TRAIN index:", train_index, "TEST index:", test_index)

# Materialise each side of the split from its index array.
X_train, y_train = X[train_index], Y[train_index]
X_test, y_test = X[test_index], Y[test_index]
print('Train data: ', X_train, 'Train label', y_train)
print('Test data: ', X_test, 'Test label', y_test)

X:  [0 1 2 3 4 5 6 7 8 9]
Y:  [0 0 0 0 0 0 1 1 1 1]
Number of data split:  10
TRAIN index: [7 5 1 9 3 2] TEST index: [8 4 0 6]
Train data:  [7 5 1 9 3 2] Train label [1 0 0 1 0 0]
Test data:  [8 4 0 6] Test label [1 0 0 1]


## GroupShuffleSplit(ShuffleSplit)

## LeaveOneOut

In [98]:
# Toy dataset: samples 0..9, five 1-labels followed by five 0-labels.
X = np.arange(10)
Y = np.repeat([1, 0], 5)
print('X: ', X)
print('Y: ', Y)

loo = LeaveOneOut()
print('Number of data split: ', loo.get_n_splits(X))

# Walk over every split and show which samples land on each side.
for train_index, test_index in loo.split(X):
    print("TRAIN index:", train_index, "TEST index:", test_index)
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]
    print('Train data: ', X_train, 'Train label', y_train)
    print('Test data: ', X_test, 'Test label', y_test)

X:  [0 1 2 3 4 5 6 7 8 9]
Y:  [1 1 1 1 1 0 0 0 0 0]
Number of data split:  10
TRAIN index: [1 2 3 4 5 6 7 8 9] TEST index: [0]
Train data:  [1 2 3 4 5 6 7 8 9] Train label [1 1 1 1 0 0 0 0 0]
Test data:  [0] Test label [1]
TRAIN index: [0 2 3 4 5 6 7 8 9] TEST index: [1]
Train data:  [0 2 3 4 5 6 7 8 9] Train label [1 1 1 1 0 0 0 0 0]
Test data:  [1] Test label [1]
TRAIN index: [0 1 3 4 5 6 7 8 9] TEST index: [2]
Train data:  [0 1 3 4 5 6 7 8 9] Train label [1 1 1 1 0 0 0 0 0]
Test data:  [2] Test label [1]
TRAIN index: [0 1 2 4 5 6 7 8 9] TEST index: [3]
Train data:  [0 1 2 4 5 6 7 8 9] Train label [1 1 1 1 0 0 0 0 0]
Test data:  [3] Test label [1]
TRAIN index: [0 1 2 3 5 6 7 8 9] TEST index: [4]
Train data:  [0 1 2 3 5 6 7 8 9] Train label [1 1 1 1 0 0 0 0 0]
Test data:  [4] Test label [1]
TRAIN index: [0 1 2 3 4 6 7 8 9] TEST index: [5]
Train data:  [0 1 2 3 4 6 7 8 9] Train label [1 1 1 1 1 0 0 0 0]
Test data:  [5] Test label [0]
TRAIN index: [0 1 2 3 4 5 7 8 9] TEST index: [6]
Tra

## LeavePOut

In [99]:
# Toy dataset: samples 0..9, five 1-labels followed by five 0-labels.
X = np.arange(10)
Y = np.repeat([1, 0], 5)
print('X: ', X)
print('Y: ', Y)

# Leave-P-out with two samples held out per split.
lpo = LeavePOut(p=2)
print('Number of data split: ', lpo.get_n_splits(X))

# Walk over every split and show which samples land on each side.
for train_index, test_index in lpo.split(X):
    print("TRAIN index:", train_index, "TEST index:", test_index)
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]
    print('Train data: ', X_train, 'Train label', y_train)
    print('Test data: ', X_test, 'Test label', y_test)

X:  [0 1 2 3 4 5 6 7 8 9]
Y:  [1 1 1 1 1 0 0 0 0 0]
Number of data split:  45
TRAIN index: [2 3 4 5 6 7 8 9] TEST index: [0 1]
Train data:  [2 3 4 5 6 7 8 9] Train label [1 1 1 0 0 0 0 0]
Test data:  [0 1] Test label [1 1]
TRAIN index: [1 3 4 5 6 7 8 9] TEST index: [0 2]
Train data:  [1 3 4 5 6 7 8 9] Train label [1 1 1 0 0 0 0 0]
Test data:  [0 2] Test label [1 1]
TRAIN index: [1 2 4 5 6 7 8 9] TEST index: [0 3]
Train data:  [1 2 4 5 6 7 8 9] Train label [1 1 1 0 0 0 0 0]
Test data:  [0 3] Test label [1 1]
TRAIN index: [1 2 3 5 6 7 8 9] TEST index: [0 4]
Train data:  [1 2 3 5 6 7 8 9] Train label [1 1 1 0 0 0 0 0]
Test data:  [0 4] Test label [1 1]
TRAIN index: [1 2 3 4 6 7 8 9] TEST index: [0 5]
Train data:  [1 2 3 4 6 7 8 9] Train label [1 1 1 1 0 0 0 0]
Test data:  [0 5] Test label [1 0]
TRAIN index: [1 2 3 4 5 7 8 9] TEST index: [0 6]
Train data:  [1 2 3 4 5 7 8 9] Train label [1 1 1 1 0 0 0 0]
Test data:  [0 6] Test label [1 0]
TRAIN index: [1 2 3 4 5 6 8 9] TEST index: [0 7]
Tra

## KFold

In [105]:
# Toy dataset: samples 0..14 with three label values, five samples each, in sorted order.
X = np.arange(15)
Y = np.repeat([0, 1, 2], 5)
print('X: ', X)
print('Y: ', Y)

# Five folds, no shuffling.
kf = KFold(n_splits=5)
# Ask `kf` itself for the fold count so this cell does not depend on a
# splitter object left over from a different cell.
print('Number of data split: ', kf.get_n_splits(X))

# Walk over every fold and show which samples land on each side.
for train_index, test_index in kf.split(X):
    print("TRAIN index:", train_index, "TEST index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    print('Train data: ', X_train, 'Train label', y_train)
    print('Test data: ', X_test, 'Test label', y_test)

X:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Y:  [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2]
Number of data split:  5
TRAIN index: [ 3  4  5  6  7  8  9 10 11 12 13 14] TEST index: [0 1 2]
Train data:  [ 3  4  5  6  7  8  9 10 11 12 13 14] Train label [0 0 1 1 1 1 1 2 2 2 2 2]
Test data:  [0 1 2] Test label [0 0 0]
TRAIN index: [ 0  1  2  6  7  8  9 10 11 12 13 14] TEST index: [3 4 5]
Train data:  [ 0  1  2  6  7  8  9 10 11 12 13 14] Train label [0 0 0 1 1 1 1 2 2 2 2 2]
Test data:  [3 4 5] Test label [0 0 1]
TRAIN index: [ 0  1  2  3  4  5  9 10 11 12 13 14] TEST index: [6 7 8]
Train data:  [ 0  1  2  3  4  5  9 10 11 12 13 14] Train label [0 0 0 0 0 1 1 2 2 2 2 2]
Test data:  [6 7 8] Test label [1 1 1]
TRAIN index: [ 0  1  2  3  4  5  6  7  8 12 13 14] TEST index: [ 9 10 11]
Train data:  [ 0  1  2  3  4  5  6  7  8 12 13 14] Train label [0 0 0 0 0 1 1 1 1 2 2 2]
Test data:  [ 9 10 11] Test label [1 2 2]
TRAIN index: [ 0  1  2  3  4  5  6  7  8  9 10 11] TEST index: [12 13 14]
Train data: 

In [106]:
# Toy dataset: samples 0..14 with three label values, five samples each, in sorted order.
X = np.arange(15)
Y = np.repeat([0, 1, 2], 5)
print('X: ', X)
print('Y: ', Y)

# Five folds with the samples shuffled first; the seed keeps the folds repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Ask `kf` itself for the fold count so this cell does not depend on a
# splitter object left over from a different cell.
print('Number of data split: ', kf.get_n_splits(X))

# Walk over every fold and show which samples land on each side.
for train_index, test_index in kf.split(X):
    print("TRAIN index:", train_index, "TEST index:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    print('Train data: ', X_train, 'Train label', y_train)
    print('Test data: ', X_test, 'Test label', y_test)

X:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Y:  [0 0 0 0 0 1 1 1 1 1 2 2 2 2 2]
Number of data split:  5
TRAIN index: [ 1  2  3  4  5  6  7  8 10 12 13 14] TEST index: [ 0  9 11]
Train data:  [ 1  2  3  4  5  6  7  8 10 12 13 14] Train label [0 0 0 0 1 1 1 1 2 2 2 2]
Test data:  [ 0  9 11] Test label [0 1 2]
TRAIN index: [ 0  1  2  3  4  6  7  9 10 11 12 14] TEST index: [ 5  8 13]
Train data:  [ 0  1  2  3  4  6  7  9 10 11 12 14] Train label [0 0 0 0 0 1 1 1 2 2 2 2]
Test data:  [ 5  8 13] Test label [1 1 2]
TRAIN index: [ 0  3  4  5  6  7  8  9 10 11 12 13] TEST index: [ 1  2 14]
Train data:  [ 0  3  4  5  6  7  8  9 10 11 12 13] Train label [0 0 0 1 1 1 1 1 2 2 2 2]
Test data:  [ 1  2 14] Test label [0 0 2]
TRAIN index: [ 0  1  2  3  5  6  8  9 11 12 13 14] TEST index: [ 4  7 10]
Train data:  [ 0  1  2  3  5  6  8  9 11 12 13 14] Train label [0 0 0 0 1 1 1 1 2 2 2 2]
Test data:  [ 4  7 10] Test label [0 1 2]
TRAIN index: [ 0  1  2  4  5  7  8  9 10 11 13 14] TEST index: [ 3  

## GroupKFold

In [125]:
# Toy dataset: 15 samples. Labels cycle 0,1,2 over the first ten samples and
# then follow a fixed 0/1 pattern. Group ids cycle 0..3 over the first ten
# samples; the last five samples all belong to group 3.
X = np.arange(15)
Y = np.concatenate([np.arange(10) % 3, [0, 1, 0, 1, 0]])
groups = np.concatenate([np.arange(10) % 4, np.full(5, 3)])
print('X: ', X)
print('Y: ', Y)
print('Groups: ', groups)

gkf = GroupKFold(n_splits=4)
print('Number of data split: ', gkf.get_n_splits(X))

# Walk over every fold and show samples, labels and group ids on each side.
for train_index, test_index in gkf.split(X, groups=groups):
    print("TRAIN index:", train_index, "TEST index:", test_index)
    X_train, y_train, X_train_group = X[train_index], Y[train_index], groups[train_index]
    X_test, y_test, X_test_group = X[test_index], Y[test_index], groups[test_index]
    print('Train data: ', X_train, '\nTrain label', y_train, '\nTrain group', X_train_group)
    print('Test data: ', X_test, '\nTest label', y_test, '\nTest group', X_test_group)

X:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Y:  [0 1 2 0 1 2 0 1 2 0 0 1 0 1 0]
Groups:  [0 1 2 3 0 1 2 3 0 1 3 3 3 3 3]
Number of data split:  4
TRAIN index: [0 1 2 4 5 6 8 9] TEST index: [ 3  7 10 11 12 13 14]
Train data:  [0 1 2 4 5 6 8 9] 
Train label [0 1 2 1 2 0 2 0] 
Train group [0 1 2 0 1 2 0 1]
Test data:  [ 3  7 10 11 12 13 14] 
Test label [0 1 0 1 0 1 0] 
Test group [3 3 3 3 3 3 3]
TRAIN index: [ 0  2  3  4  6  7  8 10 11 12 13 14] TEST index: [1 5 9]
Train data:  [ 0  2  3  4  6  7  8 10 11 12 13 14] 
Train label [0 2 0 1 0 1 2 0 1 0 1 0] 
Train group [0 2 3 0 2 3 0 3 3 3 3 3]
Test data:  [1 5 9] 
Test label [1 2 0] 
Test group [1 1 1]
TRAIN index: [ 1  2  3  5  6  7  9 10 11 12 13 14] TEST index: [0 4 8]
Train data:  [ 1  2  3  5  6  7  9 10 11 12 13 14] 
Train label [1 2 0 2 0 1 0 0 1 0 1 0] 
Train group [1 2 3 1 2 3 1 3 3 3 3 3]
Test data:  [0 4 8] 
Test label [0 1 2] 
Test group [0 0 0]
TRAIN index: [ 0  1  3  4  5  7  8  9 10 11 12 13 14] TEST index: [2 6]
Train

## LeaveOneGroupOut

In [153]:
# Toy dataset: 15 samples. Labels cycle 0,1,2 over the first ten samples and
# then follow a fixed 0/1 pattern. Group ids cycle 0..3 over the first ten
# samples; the last five samples all belong to group 3.
X = np.arange(15)
Y = np.concatenate([np.arange(10) % 3, [0, 1, 0, 1, 0]])
groups = np.concatenate([np.arange(10) % 4, np.full(5, 3)])
print('X: ', X)
print('Y: ', Y)
print('Groups: ', groups)

logo = LeaveOneGroupOut()
# The split count is derived from the group ids, so they are passed explicitly.
print('Number of data split: ', logo.get_n_splits(groups=groups))

# Walk over every split and show samples, labels and group ids on each side.
for train_index, test_index in logo.split(X, Y, groups=groups):
    print("TRAIN index:", train_index, "TEST index:", test_index)
    X_train, y_train, X_train_group = X[train_index], Y[train_index], groups[train_index]
    X_test, y_test, X_test_group = X[test_index], Y[test_index], groups[test_index]
    print('Train data: ', X_train, '\nTrain label', y_train, '\nTrain group', X_train_group)
    print('Test data: ', X_test, '\nTest label', y_test, '\nTest group', X_test_group)

X:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Y:  [0 1 2 0 1 2 0 1 2 0 0 1 0 1 0]
Groups:  [0 1 2 3 0 1 2 3 0 1 3 3 3 3 3]
Number of data split:  4
TRAIN index: [ 1  2  3  5  6  7  9 10 11 12 13 14] TEST index: [0 4 8]
Train data:  [ 1  2  3  5  6  7  9 10 11 12 13 14] 
Train label [1 2 0 2 0 1 0 0 1 0 1 0] 
Train group [1 2 3 1 2 3 1 3 3 3 3 3]
Test data:  [0 4 8] 
Test label [0 1 2] 
Test group [0 0 0]
TRAIN index: [ 0  2  3  4  6  7  8 10 11 12 13 14] TEST index: [1 5 9]
Train data:  [ 0  2  3  4  6  7  8 10 11 12 13 14] 
Train label [0 2 0 1 0 1 2 0 1 0 1 0] 
Train group [0 2 3 0 2 3 0 3 3 3 3 3]
Test data:  [1 5 9] 
Test label [1 2 0] 
Test group [1 1 1]
TRAIN index: [ 0  1  3  4  5  7  8  9 10 11 12 13 14] TEST index: [2 6]
Train data:  [ 0  1  3  4  5  7  8  9 10 11 12 13 14] 
Train label [0 1 0 1 2 1 2 0 0 1 0 1 0] 
Train group [0 1 3 0 1 3 0 1 3 3 3 3 3]
Test data:  [2 6] 
Test label [2 0] 
Test group [2 2]
TRAIN index: [0 1 2 4 5 6 8 9] TEST index: [ 3  7 10 11 12 13 14]

## LeavePGroupsOut

In [155]:
# Toy dataset: 15 samples. Labels cycle 0,1,2 over the first ten samples and
# then follow a fixed 0/1 pattern. Group ids cycle 0..3 over the first ten
# samples; the last five samples all belong to group 3.
X = np.arange(15)
Y = np.concatenate([np.arange(10) % 3, [0, 1, 0, 1, 0]])
groups = np.concatenate([np.arange(10) % 4, np.full(5, 3)])
print('X: ', X)
print('Y: ', Y)
print('Groups: ', groups)

# Hold out two groups at a time.
lpgo = LeavePGroupsOut(n_groups=2)
print('Number of data split: ', lpgo.get_n_splits(groups=groups))

# Walk over every split and show samples, labels and group ids on each side.
for train_index, test_index in lpgo.split(X, Y, groups=groups):
    print("TRAIN index:", train_index, "TEST index:", test_index)
    X_train, y_train, X_train_group = X[train_index], Y[train_index], groups[train_index]
    X_test, y_test, X_test_group = X[test_index], Y[test_index], groups[test_index]
    print('Train data: ', X_train, '\nTrain label', y_train, '\nTrain group', X_train_group)
    print('Test data: ', X_test, '\nTest label', y_test, '\nTest group', X_test_group)

X:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14]
Y:  [0 1 2 0 1 2 0 1 2 0 0 1 0 1 0]
Groups:  [0 1 2 3 0 1 2 3 0 1 3 3 3 3 3]
Number of data split:  6
TRAIN index: [ 2  3  6  7 10 11 12 13 14] TEST index: [0 1 4 5 8 9]
Train data:  [ 2  3  6  7 10 11 12 13 14] 
Train label [2 0 0 1 0 1 0 1 0] 
Train group [2 3 2 3 3 3 3 3 3]
Test data:  [0 1 4 5 8 9] 
Test label [0 1 1 2 2 0] 
Test group [0 1 0 1 0 1]
TRAIN index: [ 1  3  5  7  9 10 11 12 13 14] TEST index: [0 2 4 6 8]
Train data:  [ 1  3  5  7  9 10 11 12 13 14] 
Train label [1 0 2 1 0 0 1 0 1 0] 
Train group [1 3 1 3 1 3 3 3 3 3]
Test data:  [0 2 4 6 8] 
Test label [0 2 1 0 2] 
Test group [0 2 0 2 0]
TRAIN index: [1 2 5 6 9] TEST index: [ 0  3  4  7  8 10 11 12 13 14]
Train data:  [1 2 5 6 9] 
Train label [1 2 2 0 0] 
Train group [1 2 1 2 1]
Test data:  [ 0  3  4  7  8 10 11 12 13 14] 
Test label [0 0 1 1 2 0 1 0 1 0] 
Test group [0 3 0 3 0 3 3 3 3 3]
TRAIN index: [ 0  3  4  7  8 10 11 12 13 14] TEST index: [1 2 5 6 9]
Train data: 

## StratifiedKFold

In [133]:
# Toy dataset: 12 samples. Labels cycle 0,1,2 over the first ten samples and
# the last two samples are both labelled 2.
X = np.arange(12)
Y = np.concatenate([np.arange(10) % 3, [2, 2]])
print('X: ', X)
print('Y: ', Y)

skf = StratifiedKFold(n_splits=3)
print('Number of data split: ', skf.get_n_splits(X))

# The labels are passed to split() as well; show each fold's two sides.
for train_index, test_index in skf.split(X, Y):
    print("TRAIN index:", train_index, "TEST index:", test_index)
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]
    print('Train data: ', X_train, '\nTrain label', y_train)
    print('Test data: ', X_test, '\nTest label', y_test)

X:  [ 0  1  2  3  4  5  6  7  8  9 10 11]
Y:  [0 1 2 0 1 2 0 1 2 0 2 2]
Number of data split:  3
TRAIN index: [ 4  5  6  7  8  9 10 11] TEST index: [0 1 2 3]
Train data:  [ 4  5  6  7  8  9 10 11] 
Train label [1 2 0 1 2 0 2 2]
Test data:  [0 1 2 3] 
Test label [0 1 2 0]
TRAIN index: [ 0  1  2  3  7  9 10 11] TEST index: [4 5 6 8]
Train data:  [ 0  1  2  3  7  9 10 11] 
Train label [0 1 2 0 1 0 2 2]
Test data:  [4 5 6 8] 
Test label [1 2 0 2]
TRAIN index: [0 1 2 3 4 5 6 8] TEST index: [ 7  9 10 11]
Train data:  [0 1 2 3 4 5 6 8] 
Train label [0 1 2 0 1 2 0 2]
Test data:  [ 7  9 10 11] 
Test label [1 0 2 2]


## TimeSeriesSplit

In [148]:
# Toy dataset: samples 0..9 with labels alternating 0, 1, 0, 1, ...
X = np.arange(10)
Y = np.arange(10) % 2
print('X: ', X)
print('Y: ', Y)

# Five splits with no cap on the training-window size.
tscv = TimeSeriesSplit(n_splits=5, max_train_size=None)
print('Number of data split: ', tscv.get_n_splits(X))

# Walk over every split and show which samples land on each side.
for train_index, test_index in tscv.split(X, Y):
    print("TRAIN index:", train_index, "TEST index:", test_index)
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]
    print('Train data: ', X_train, '\nTrain label', y_train)
    print('Test data: ', X_test, '\nTest label', y_test)

X:  [0 1 2 3 4 5 6 7 8 9]
Y:  [0 1 0 1 0 1 0 1 0 1]
Number of data split:  5
TRAIN index: [0 1 2 3 4] TEST index: [5]
Train data:  [0 1 2 3 4] 
Train label [0 1 0 1 0]
Test data:  [5] 
Test label [1]
TRAIN index: [0 1 2 3 4 5] TEST index: [6]
Train data:  [0 1 2 3 4 5] 
Train label [0 1 0 1 0 1]
Test data:  [6] 
Test label [0]
TRAIN index: [0 1 2 3 4 5 6] TEST index: [7]
Train data:  [0 1 2 3 4 5 6] 
Train label [0 1 0 1 0 1 0]
Test data:  [7] 
Test label [1]
TRAIN index: [0 1 2 3 4 5 6 7] TEST index: [8]
Train data:  [0 1 2 3 4 5 6 7] 
Train label [0 1 0 1 0 1 0 1]
Test data:  [8] 
Test label [0]
TRAIN index: [0 1 2 3 4 5 6 7 8] TEST index: [9]
Train data:  [0 1 2 3 4 5 6 7 8] 
Train label [0 1 0 1 0 1 0 1 0]
Test data:  [9] 
Test label [1]


In [103]:
# NOTE(review): this whole cell is commented out, so nothing below executes.
# It reads a CSV, maps five lab-value columns to level codes using the
# [upper, lower) ranges in `filtered`, prints .info() and writes a labelled CSV.
# Before re-enabling:
#   * `d == np.nan` is always False (NaN never compares equal to anything);
#     a missing-value check such as pd.isna(d) is presumably what was intended.
#   * both file paths are absolute paths on a local machine.
# data = pd.read_csv('/home/david/Downloads/CS1001-301_7196.csv')
# filtered = {
#     'HGB': {'1':[10000,10],'2':[10,8],'3':[8,-100]},
#     'Platelet Counts' : {'1':[100000,75], '2':[75,50],'3':[50,25], '4':[25,-1000]},
#     'WBC Count' : {'1':[10,3],'2':[3,2], '3':[2,1], '4':[1,-10], '3_1':[100000,10]},
#     'Neutrophil Count': {'1':[100,1.5],'2':[1.5,1],'3':[1,0.5],'4':[0.5,-10]},
#     'Lymphocyte Count':{'1':[100,0.8],'2':[0.8,0.5],'3':[0.5,0.2],'4':[0.2, -10]}
# }
# def label_level(d, col):
#     if d == np.nan:
#         return d
#     if col in filtered:
#         cat = filtered[col]
#         for k, v in cat.items():
#             if d >= v[1] and d < v[0]:
#                 return k
#         return np.nan
#     else:
#         return d
# data['HGB_level'] = data.apply(lambda x : label_level(x.HGB, 'HGB'), axis=1)
# data['Platelet_level'] = data.apply(lambda x : label_level(x['Platelet Counts'], 'Platelet Counts'), axis=1)
# data['WBC_level'] = data.apply(lambda x : label_level(x['WBC Count'], 'WBC Count'), axis=1)
# data['Neutrophil_level'] = data.apply(lambda x : label_level(x['Neutrophil Count'], 'Neutrophil Count'), axis=1)
# data['Lymphocyte_level'] = data.apply(lambda x : label_level(x['Lymphocyte Count'], 'Lymphocyte Count'), axis=1)
# data[['HGB','HGB_level','Platelet Counts','Platelet_level','WBC Count','WBC_level','Neutrophil Count','Neutrophil_level','Lymphocyte Count','Lymphocyte_level']].info()
# data.to_csv('/home/david/Downloads/CS1001-301_7196_labeled.csv')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 842 entries, 0 to 841
Data columns (total 10 columns):
HGB                 748 non-null float64
HGB_level           748 non-null object
Platelet Counts     750 non-null float64
Platelet_level      750 non-null object
WBC Count           748 non-null float64
WBC_level           748 non-null object
Neutrophil Count    748 non-null float64
Neutrophil_level    748 non-null object
Lymphocyte Count    748 non-null float64
Lymphocyte_level    748 non-null object
dtypes: float64(5), object(5)
memory usage: 65.9+ KB
