In [1]:
!pip install -r requirements.txt



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Directory holding the input CSVs. The trailing slash matters: file names
# are appended to it by plain string concatenation.
DATA_PATH = "data/"

In [4]:
# Read the 'aortaP' and 'brachP' tables for both splits; the first CSV column is the index.
atrain, atest = (
    pd.read_csv(DATA_PATH + f'aortaP_{split}_data.csv', index_col=0)
    for split in ('train', 'test')
)
btrain, btest = (
    pd.read_csv(DATA_PATH + f'brachP_{split}_data.csv', index_col=0)
    for split in ('train', 'test')
)

# The last column of each training table is taken as the target; the rest are features.
atarget = atrain.iloc[:, -1]
btarget = btrain.iloc[:, -1]
atrain = atrain.iloc[:, :-1]
btrain = btrain.iloc[:, :-1]

# All feature frames are cast to float32.
atrain = atrain.astype(np.float32)
btrain = btrain.astype(np.float32)
atest = atest.astype(np.float32)
btest = btest.astype(np.float32)

In [5]:
# Both training tables must carry the same label for every row,
# so a single shared target can be used from here on.
assert (atarget == btarget).all()
target = atarget

In [6]:
# (rows, columns) of each feature frame, in the order: atrain, atest, btrain, btest.
tuple(frame.shape for frame in (atrain, atest, btrain, btest))

((3499, 336), (875, 336), (3499, 336), (875, 336))

In [7]:
import lightgbm as lgb

In [8]:
def runFeatures(df):
    """Summarise every row of ``df`` with a fixed set of scalar features.

    Each row is treated as one ordered sequence (columns in their existing
    order). The returned frame has one row per input row (same index) and
    the following columns:

    * ``mean``, ``median``, ``stdev`` -- plain statistics over the row.
    * ``max_r20`` / ``min_r20`` -- max / min of a centered 20-point
      triangular-window rolling mean (windows need at least 3 values).
    * ``max_ewm30`` / ``min_ewm30`` -- max / min of the span-30 EWM mean.
    * ``lastewm10`` / ``lastewm30`` -- final value of the span-10 / span-30
      EWM mean, i.e. an average weighted toward the end of the row.
    * ``firstewm10`` / ``firstewm30`` -- the same, computed over the
      reversed row, i.e. weighted toward the start of the row.
    * ``meanabsdiff`` -- mean absolute difference between consecutive values.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame with one sequence per row.

    Returns
    -------
    pandas.DataFrame
        Feature table indexed like ``df``.
    """
    # Transpose once: with one column per sequence, column-wise reductions
    # yield one value per original row.
    series = df.T
    backwards = series[::-1]

    # Each smoothed frame is computed once and shared by the features that use it.
    rolled = series.rolling(20, min_periods=3, center=True, win_type='triang').mean()
    ewm10 = series.ewm(span=10).mean()
    ewm30 = series.ewm(span=30).mean()

    feat = {}
    feat['mean'] = series.mean()
    feat['median'] = series.median()
    feat['stdev'] = series.std()

    feat['max_r20'] = rolled.max()
    feat['min_r20'] = rolled.min()
    feat['max_ewm30'] = ewm30.max()
    feat['min_ewm30'] = ewm30.min()

    # .ffill() replaces the deprecated fillna(method='ffill'); it guarantees the
    # last row holds the most recent available value before it is picked.
    feat['firstewm10'] = backwards.ewm(span=10).mean().ffill().iloc[-1]
    feat['lastewm10'] = ewm10.ffill().iloc[-1]
    feat['firstewm30'] = backwards.ewm(span=30).mean().ffill().iloc[-1]
    feat['lastewm30'] = ewm30.ffill().iloc[-1]

    feat['meanabsdiff'] = series.diff().abs().mean()

    return pd.DataFrame(feat)

In [9]:
def runAllFeatures(atrain, btrain):
    """Build the combined feature table for a pair of signal frames.

    ``runFeatures`` is applied to each frame on its own. The resulting columns
    are prefixed with ``a_`` (first argument) and ``b_`` (second argument) and
    the two tables are concatenated side by side.
    """
    tables = []
    for tag, frame in (('a', atrain), ('b', btrain)):
        feats = runFeatures(frame)
        feats.columns = [f'{tag}_{name}' for name in feats.columns]
        tables.append(feats)
    return pd.concat(tables, axis=1)

In [10]:
# Feature table for the training rows; show its last 8 rows.
tdf = runAllFeatures(atrain=atrain, btrain=btrain)
tdf.tail(n=8)

Unnamed: 0,a_mean,a_median,a_stdev,a_max_r20,a_min_r20,a_max_ewm30,a_min_ewm30,a_firstewm10,a_lastewm10,a_firstewm30,...,b_stdev,b_max_r20,b_min_r20,b_max_ewm30,b_min_ewm30,b_firstewm10,b_lastewm10,b_firstewm30,b_lastewm30,b_meanabsdiff
3491,95.785461,96.193024,10.582276,113.568757,79.868611,110.521947,78.979973,81.883529,80.129814,87.046176,...,13.005561,119.813388,77.576203,116.610765,75.428825,81.4501,78.175649,90.936297,78.850223,2.94644
3492,91.398071,91.765991,6.816,100.866398,80.167324,100.033894,79.544778,82.392845,80.850808,87.706613,...,12.493203,113.507155,76.46427,111.846512,73.831055,77.892712,79.198529,85.359482,79.96723,3.205593
3493,104.798164,103.567177,17.08507,128.527304,65.767959,127.876834,63.465934,68.013147,79.01806,77.68188,...,19.629419,134.284995,63.312244,130.86816,59.856607,67.183561,78.537929,84.692676,79.26462,3.248784
3494,101.027969,101.815971,6.963031,112.262706,82.330051,110.611216,78.800478,83.265197,93.828022,88.199102,...,10.545702,121.618725,80.38705,120.038513,79.121662,81.627378,90.07736,88.861085,90.300931,3.515811
3495,106.158798,106.004547,13.691897,127.073255,81.599095,125.912171,80.030341,82.357834,85.091342,89.226353,...,15.439773,130.161855,77.862243,128.85857,74.754324,80.148845,83.960711,90.913693,85.395695,3.309429
3496,96.395981,97.170319,7.505884,106.59387,80.269318,105.907235,76.350769,81.183748,83.952113,86.168668,...,11.818682,118.261009,76.992445,115.90658,75.600597,78.562577,79.70113,86.368741,81.381965,3.142799
3497,95.903717,94.296356,11.65923,114.38754,78.323137,113.852085,76.224052,80.576456,78.240534,86.461969,...,14.87372,121.398666,76.535616,119.257061,73.337807,78.237501,77.051683,91.111958,78.175587,3.347089
3498,89.505211,90.046722,8.729355,102.940285,74.566757,101.932975,72.818278,76.464661,74.784074,81.926921,...,12.877909,113.170721,71.420284,111.39411,71.078721,74.406511,71.268536,83.932905,72.327795,3.184667


In [11]:
# LightGBM hyper-parameters, unpacked as keyword arguments into LGBMClassifier.
lgb_params = dict(
    n_estimators=200,
    num_leaves=20,
    learning_rate=0.05,
    colsample_bynode=0.9,
    subsample=0.9,
    subsample_freq=1,
    reg_lambda=0.1,
    linear_lambda=0.01,
    linear_tree=True,
    extra_trees=True,
    min_child_weight=0.01,
)

In [12]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [13]:
# Repeated stratified 5-fold CV: 8 differently shuffled splits x 5 folds = 40
# fitted pipelines. Every fitted pipeline is kept in `models` for later
# averaging; held-out probabilities are collected in `yps` and averaged per
# sample into `yp`.
yps = []
models = []
x, y = tdf, target  # loop-invariant: features and labels are the same for every fold
for i in range(8):
    # Only the fold shuffling depends on i; the hyper-parameters are shared.
    folds = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=i).split(x, y))
    for train_fold, test_fold in folds:
        model = Pipeline([('scaler', StandardScaler()),
                          ('lgb', lgb.LGBMClassifier(**lgb_params, verbose=-1))])
        x_train, x_test = x.iloc[train_fold], x.iloc[test_fold]
        y_train, y_test = y.iloc[train_fold], y.iloc[test_fold]
        model.fit(x_train, y_train)
        # Label the probability columns with the fitted class labels, so that
        # idxmax() downstream returns real labels rather than column positions.
        fold_proba = pd.DataFrame(model.predict_proba(x_test),
                                  index=y_test.index,
                                  columns=model.classes_)
        yps.append(fold_proba)
        models.append(model)
        print('.', end='')

# Each sample is held out once per repeat; average those predictions per sample.
yp = pd.concat(yps)
yp = yp.groupby(yp.index).mean()
yp = yp.sort_index()

........................................

In [14]:
# Out-of-fold scores: (one-vs-rest ROC AUC, accuracy of the most probable class).
(
    roc_auc_score(y_true=target, y_score=yp, multi_class='ovr'),
    accuracy_score(y_true=target, y_pred=yp.idxmax(axis=1)),
)

(0.9243741561397906, 0.670477279222635)

In [15]:
# Feature table for the test rows; show its last 8 rows.
test_df = runAllFeatures(atrain=atest, btrain=btest)
test_df.tail(n=8)

Unnamed: 0,a_mean,a_median,a_stdev,a_max_r20,a_min_r20,a_max_ewm30,a_min_ewm30,a_firstewm10,a_lastewm10,a_firstewm30,...,b_stdev,b_max_r20,b_min_r20,b_max_ewm30,b_min_ewm30,b_firstewm10,b_lastewm10,b_firstewm30,b_lastewm30,b_meanabsdiff
867,91.317284,92.056534,6.69265,99.768226,73.582243,99.511895,68.94249,74.561667,81.69415,80.027156,...,9.725534,106.690365,72.371733,106.067368,70.115738,73.554127,80.467477,80.748387,80.669732,3.215725
868,104.84874,105.055389,8.622636,116.453923,82.383796,115.933473,80.927375,83.744011,92.92786,88.89692,...,10.140232,119.780806,78.817953,118.407988,75.710808,81.045929,92.592237,91.311692,93.488625,3.066062
869,108.45488,107.884651,25.630835,148.15261,64.01119,145.963564,60.137051,66.610489,72.02143,78.28059,...,29.835073,151.485146,61.855143,149.796069,58.126743,66.882164,68.87458,89.220933,70.276882,3.377351
870,90.877319,91.424629,6.349154,99.191273,75.49144,98.701298,74.250003,76.698603,83.052961,81.244345,...,9.02656,107.92536,72.540155,105.300233,70.676544,73.963119,80.022313,82.154742,81.574744,3.164873
871,88.117043,88.727112,7.641936,99.397914,65.620121,98.490509,63.854432,67.50558,78.05672,74.291974,...,11.760089,110.415931,63.05349,107.667568,62.250556,65.009412,74.557869,74.872446,75.995625,3.391141
872,87.017677,86.987892,5.05139,95.00442,77.408088,94.93815,73.092743,78.180205,79.935214,81.834595,...,8.055858,102.971634,76.519545,101.476272,74.527946,77.272236,78.532704,82.863404,79.054083,3.169646
873,94.924934,95.159325,6.078005,102.607064,79.004753,102.339752,73.628166,80.281324,85.610199,86.498278,...,10.098614,113.593955,76.847736,112.159712,74.958191,78.183716,85.140308,85.930994,84.992546,3.348258
874,89.371819,90.072044,9.222563,102.212955,71.54353,101.722092,71.260273,72.372268,76.139581,76.852054,...,11.684813,109.556365,69.347095,107.950216,67.908078,70.480696,73.977379,79.885431,74.727673,3.36082


In [16]:
# Predict class probabilities for the test rows with every CV-fitted pipeline.
# Temporaries have their own names so the out-of-fold frame `yp` from the CV
# cell is not overwritten and the scoring cell stays re-runnable.
test_yps = [
    pd.DataFrame(model.predict_proba(test_df),
                 index=atest.index,
                 columns=model.classes_)  # columns carry the fitted class labels
    for model in models
]

# Average the per-model probabilities for each test row.
stacked_test_proba = pd.concat(test_yps)
test_yp = stacked_test_proba.groupby(stacked_test_proba.index).mean().sort_index()

In [17]:
# Map each test row label to its most probable class.
preds = test_yp.idxmax(axis=1).to_dict()

# Build the whole payload in memory and write it in one go. The earlier version
# removed the trailing ",\n" by seeking to tell() - 2, but arithmetic on a
# text-mode tell() value is not a supported offset: it corrupts the file when
# `preds` is empty and when newline translation is active.
# NOTE(review): keys are written unquoted, so the file is not strict JSON.
# The format is kept unchanged here; confirm what the consumer expects.
entries = ',\n'.join(f'    {k}: {v}' for k, v in preds.items())
with open('data/TrueFitAI.json', 'w') as f:
    f.write('{\n' + entries + '\n}\n')


In [18]:
# just the first 10 lines
!cat data/TrueFitAI.json  | head -n 10

{
    0: 5,
    1: 1,
    2: 0,
    3: 5,
    4: 2,
    5: 4,
    6: 1,
    7: 0,
    8: 5,


In [19]:
# just the first 10 lines
!cat data/TrueFitAI.json  | tail -n 10

    866: 4,
    867: 3,
    868: 5,
    869: 5,
    870: 2,
    871: 0,
    872: 3,
    873: 1,
    874: 5
}


In [20]:
# Number of training rows per class, ordered by class label.
(
    target
    .value_counts()
    .sort_index()
)

target
0    582
1    583
2    593
3    582
4    576
5    583
Name: count, dtype: int64

In [21]:
# Number of test rows predicted per class, ordered by class label.
(
    pd.Series(preds)
    .value_counts()
    .sort_index()
)

0    148
1    150
2    156
3    110
4    151
5    160
Name: count, dtype: int64