# Combiom: searching for combinatorial biomarkers

Algorithm to discover *combinatorial biomarkers* using linear and kernel ridge regression, Theil-Sen estimator, and machine learning.

## Importing packages

In [172]:
import itertools as it
import sklearn.linear_model as sk_lm
import sklearn.preprocessing as sk_pr
import sklearn.kernel_ridge as sk_kr
import multiprocessing as mp
import pandas as pd
import csv
import numpy as np
import re
import string as st
import numexpr as ne
import datetime as dt

## Combiom functions

In [164]:
def init_iterators(parameters):

    iterators = {
            'a': list(range(parameters)),
            '1/a': list(range(parameters)),
            'a/b': list(it.permutations(range(parameters), 2)),
            'a/b/c': list(iter((x, y, z) for x, y, z in it.permutations(range(parameters), 3) if z > y)),
            'a*b/c': list(it.combinations(range(parameters), 3)),
            'a*b/c/d': list(iter((a, b, c, d) for a, b, c, d in it.permutations(range(parameters), 4) if b > a and d > c))
    }
    return iterators
        
    
def search(marker_names, marker_data, target_num, target_data, target_names, iterators, iterator_type='all'):

    results, temp = [], []
    queue = iterators.keys() if iterator_type == 'all' else iterator_type
        
    Pool = mp.Pool(mp.cpu_count())

    for it in queue:

        r = Pool.apply_async(__iterate_combinations, args=(marker_names, marker_data,
                                                           target_num, target_data, target_names,
                                                           iterators, it))
        temp.append(r)

    for r in temp:
        results.append(r.get())

    Pool.close()
    Pool.join()

    print('Search was successfully finished')
    return results


def to_dataframe(data):
        '''Data returned by .search() method'''

        # Importing results and adding to a list
        cols = ['Biomarker', 'Target marker',
                'M1', 'M2', 'M3', 'M4',
                'TS', 'R', 'KR', 'MID', 'TID', 'Type']
        dfl = [pd.DataFrame(i, columns=cols) for i in data]

        df = pd.concat(dfl)
        df = df.rename(columns={'TS': 'Theil-Sen Score', 'R': 'Ridge Score', 'KR': 'Kernel Ridge Score',
                                'M1': 'Marker 1', 'M2': 'Marker 2', 'M3': 'Marker 3', 'M4': 'Marker 4'})

        return df


def __iterate_combinations(marker_names, marker_data, target_num, target_data, target_names, iterators, iterator_type):

        output = {'Biomarker': [], 'Target marker': [],
                  'M1': [], 'M2': [], 'M3': [], 'M4': [],
                  'TS': [], 'R': [], 'KR': [],
                  'MID': [], 'TID': [], 'Type': []}

        print('Search:', iterator_type, 'started')
        
        for a, t in it.product(iterators[iterator_type], range(target_num)):

            list_ids = np.array([a]).ravel()
            list_ops = list(re.sub(r"\w", "", iterator_type))

            # Naming a combinatorial marker
            marker_name = __name(marker_names, iterator_type, list_ids)

            # Naming simple markers and padding list with np.nan to make it 4-element
            simple_markers_names = np.pad(marker_names[[list_ids]].astype('object'),
                                          (0, 4-len(list_ids)), mode='constant', constant_values=(0, np.nan))
            
            # Calculating marker
            marker_values = __calc_marker(marker_data, iterator_type, list_ids)

            # Preprocessing data: transforming NaN values into neighbours-mean
            # and normalizing data
            if np.isnan(marker_values).any():
                marker_values = __transform_nan(marker_values)
            marker_values_norm = __normalize(marker_values)
            marker_values_norm = marker_values_norm.reshape(-1, 1)

            # Marker IDs joining into a string
            mid = ', '.join(map(str, list_ids))

            # Naming a target
            target_name = target_names[t]
            target_values = target_data[t].reshape(-1, 1)
            tid = str(t)

            # Regression
            ts_score, ridge_score, kr_score = __regression(marker_values_norm, target_values)

            # Processing results
            if any(z > 0.8 for z in (np.around(ts_score, 1), np.around(ridge_score, 1), np.around(kr_score, 1))):

                for n, v in zip(['Biomarker', 'Target marker',
                                 'M1', 'M2',
                                 'M3', 'M4',
                                 'TS', 'R', 'KR', 'MID', 'TID', 'Type'],
                                [marker_name, target_name,
                                 simple_markers_names[0], simple_markers_names[1],
                                 simple_markers_names[2], simple_markers_names[3],
                                 ts_score, ridge_score, kr_score, mid, tid, iterator_type]):

                    output[n].append(v)

        print('Search:', iterator_type, 'finished')

        return output


def __regression(x, y, regressor_return=False):

    # TheilSen Regression
    ts_y = y.ravel()
    ts = sk_lm.TheilSenRegressor()
    ts.fit(x, ts_y)

    # r squared
    ts_y_pred = ts.predict(x)
    ts_y_mean = np.mean(ts_y)
    ts_ssr = np.sum((ts_y_pred - ts_y_mean) ** 2)
    ts_sst = np.sum((ts_y - ts_y_mean) ** 2)
    #ts_score = np.absolute(ts.score(index_norm, ts_y))
    ts_score = ts_ssr / ts_sst

    # Ridge Regression
    # Normalizating & reshaping
    ridge = sk_lm.Ridge(alpha=0.01, normalize=False)
    ridge.fit(x, y)
    ridge_score = np.absolute(ridge.score(x, y))

    # Kernel Ridge Regression
    kernel_ridge = sk_kr.KernelRidge(kernel='rbf', alpha=0.01)
    kernel_ridge.fit(x, y)
    kr_score = np.absolute(kernel_ridge.score(x, y))

    if regressor_return:
        return (ts, ridge, kernel_ridge)
    else:
        return (ts_score, ridge_score, kr_score)



def __transform_nan(x, strategy='mean'):

        imp = sk_pr.Imputer(missing_values='NaN', strategy=strategy)
        x = imp.fit_transform(x.reshape(-1, 1))

        return x


def __normalize(x, sample=None):

    if sample is None:
        sample = x
    return (x - np.mean(sample)) / np.std(sample)


def __name(marker_names, marker_type, ids):

    data = {}
    for l, p in zip(st.ascii_letters[: ids.size], marker_names[ids]):
        data[l] = p

    rep = dict((re.escape(k), v) for k, v in data.items())
    pattern = re.compile("|".join(rep.keys()))
    text = pattern.sub(lambda m: rep[re.escape(m.group(0))], marker_type)

    return text


def __calc_marker(marker_data, marker_type, ids):

    data = {}

    for l, p in zip(st.ascii_letters[: ids.size], marker_data[ids]):
        data[l] = p

    # Example: a*b/c (type), {a: ..., b: ..., c: ...} (data)
    marker = ne.evaluate(marker_type, data)
        
    return marker


def predict(marker_values, marker_data, markers_names, marker_names, marker_type, target_data, target_names, target_name):

    marker_ids = np.array([np.where(markers_names == n)[0][0] for n in marker_names])

    target_values = target_data[np.where(target_names == target_name)[0][0]].reshape(-1, 1)

    d = {}
    for i, (l, p) in enumerate(zip(st.ascii_letters[: len(marker_values)], marker_values)):
        d[l] = p

    marker_value = ne.evaluate(marker_type, d)

    # Calculating marker
    marker_train_values = __calc_marker(marker_data, marker_type, marker_ids)

    # Preprocessing data: transforming NaN values into neighbours-mean
    # and normalizing data 
    #marker_value = self.__transform_nan(marker_values)
    marker_value_norm = __normalize(marker_value, marker_train_values)
    marker_train_values_norm = __normalize(marker_train_values)
    marker_train_values_norm = marker_train_values_norm.reshape(-1, 1)

    # regression
    ts, ridge, kernel_ridge = __regression(marker_train_values_norm, target_values, regressor_return=True)
    return (10**ts.predict(marker_value_norm),
            10**ridge.predict(marker_value_norm),
            10**kernel_ridge.predict(marker_value_norm))


def retrain(self, marker_names, marker_type, target_name):

    marker_ids = []

    for n in markers_names:
        marker_ids.append(np.where(self.marker_names == n)[0][0])

    target_data = self.target_data[np.where(self.target_names == target_name)[0][0]]

    d = {}
    for i, (l, p) in enumerate(zip(st.ascii_letters[: len(marker_ids)], self.marker_data[[marker_ids]])):
        d[l] = p

    x = ne.evaluate(marker_type, d)

    # Calculating marker
    marker_value = self.__calc_marker(marker_type, marker_ids)

    # Preprocessing data: transforming NaN values into neighbours-mean
    # and normalizing data 
    marker_value = self.__transform_nan(marker_value)
    marker_value_norm = self.__normalize(marker_value)

    # regression
    ts, ridge, kernel_ridge = self.__regression(index_norm, targetdata, regressor_return=True)
    return (ts, ridge, kernel_ridge)


## 1 Biochemical Dataset
### 1.1 Importing and preprocessing

In [91]:
# Parameter names
bio_csv_names = csv.reader(open('./BiochemData/parameters-names.csv'), delimiter=';')
bio_marker_names = np.array(list(bio_csv_names)).flatten()

# Setting shape of data array
bio_volunteers = 10
bio_parameters = 11
bio_points = 8

# Initializing data array with zeros
bio_data = np.zeros(shape=(bio_volunteers, bio_parameters, bio_points))

# Timepoints at which each biochemical parameter was measured (in hours after experiment)
# Null stands for "before experiment", 1 - in an hour after experiment, so on...
# bio_timepoints = np.array([0, 1, 1*24, 2*24, 3*24, 5*24, 7*24, 9*24])

# Reading numeric data from csv-files. One file per participant
# Each file has a shape of N x M
#     where N is the number of parameters and M is the number of timepoints at which they were measured
for i in np.arange(bio_volunteers):
    bio_csv_file = csv.reader(open('./BiochemData/%d.csv' % i), delimiter=';')
    bio_data[i, :, :] = np.array(list(bio_csv_file), np.float64)

# Shape of data array
np.shape(bio_data)

(10, 11, 8)

### 1.2 Target markers

In [92]:
# Reading data for creatine kinase
bio_csv_ck = csv.reader(open('./BiochemData/creatine-kinase.csv'), delimiter=';')
bio_ckdata = np.array(list(bio_csv_ck), np.float64)

# Reading data for aspartate transferase
bio_csv_ast = csv.reader(open('./BiochemData/aspartate-transferase.csv'), delimiter=';')
bio_astdata = np.array(list(bio_csv_ast), np.float64)

# Reading data for myoglobin
bio_csv_mg = csv.reader(open('./BiochemData/myoglobin.csv'), delimiter=';')
bio_mgdata = np.array(list(bio_csv_mg), np.float64)
bio_mgdata[bio_mgdata == 0] = np.nan

imp = sk_pr.Imputer(missing_values='NaN', strategy='mean')
bio_mgdata = imp.fit_transform(bio_mgdata)

# Reading data for albumin
bio_csv_alb = csv.reader(open('./BiochemData/albumin.csv'), delimiter=';')
bio_albdata = np.array(list(bio_csv_alb), np.float64)
bio_albdata[bio_albdata == 0] = np.nan

# Processing creatinekinase and AST data
# Making an array of maximum values and taking a LOG10 element-wisely
bio_ckmax = np.log10([np.max(z) for z in bio_ckdata])
bio_astmax = np.log10([np.max(z) for z in bio_astdata])
bio_mgmax = np.log10([np.max(z) for z in bio_mgdata])

bio_alb = np.log10(bio_albdata[np.arange(bio_albdata.shape[0]), [np.argmax(z) for z in bio_ckdata]])

bio_ckmg = bio_ckmax / np.log10(bio_mgdata[np.arange(bio_mgdata.shape[0]), [np.argmax(z) for z in bio_ckdata]])
bio_ckast = bio_ckmax / np.log10(bio_astdata[np.arange(bio_astdata.shape[0]), [np.argmax(z) for z in bio_ckdata]])
bio_ckalb = bio_ckmax / bio_alb
bio_mgalb = bio_mgmax / bio_alb

# Target markers
bio_target_names = np.array(['Creatinekinase', 'AST', 'Myoglobin',
                             'Creatinekinase / AST', 'Creatinekinase / Myoglobin',
                             'Creatinekinase / Albumin', 'Myoglobin / Albumin'])

bio_target_data = np.vstack((bio_ckmax, bio_astmax, bio_mgmax, bio_ckmg, bio_ckast, bio_ckalb, bio_mgalb))

# Making an array of 1-hour measures
bio_marker_data = bio_data[:,:,1].T

# Shapes of marker arrays
print('Shape of marker_names', np.shape(bio_marker_names))
print('Shape of marker_data', np.shape(bio_marker_data))

Shape of marker_names (11,)
Shape of marker_data (11, 10)


In [93]:
# Shapes of target arrays
print('Shape of target_names', np.shape(bio_target_names))
print('Shape of target_data', np.shape(bio_target_data))

Shape of target_names (7,)
Shape of target_data (7, 10)


### 1.3 Searching

In [97]:
iterators = init_iterators(bio_parameters)

bio_results = search(bio_marker_names, bio_marker_data, bio_target_names.size, bio_target_data, bio_target_names, iterators)

Search: a*b/c started
Search: a/b/c started


  **self._backend_args)
  **self._backend_args)


Search: 1/a started
Search: a*b/c/d started


  **self._backend_args)
  **self._backend_args)


Search: 1/a finished
Search: a started
Search: a finished
Search: a/b started
Search: a/b finished
Search: a*b/c finished
Search: a/b/c finished
Search: a*b/c/d finished
Search was successfully finished


### 1.4 Results. Exporting to Pandas and Pickle

In [98]:
# Creating DataFrame
bio_df = to_dataframe(bio_results)

# Export to Pickle file
bio_df.to_pickle('./biochem_db.pickle')

### 1.5 Exporting to Excel

#### Top combinatorial biochemical markers in each group

In [118]:
biochem_max_group = bio_df.loc[bio_df.groupby(['Target marker'])['Kernel Ridge Score'].idxmax()].reset_index(drop=True)\
                              [['Biomarker', 'Target marker', 'Theil-Sen Score', 'Ridge Score', 'Kernel Ridge Score', 'Type']]
biochem_max_group

Unnamed: 0,Biomarker,Target marker,Theil-Sen Score,Ridge Score,Kernel Ridge Score,Type
0,Urea*Glucose/Uric acid/MCFA,AST,0.847223,0.85064,0.990792,a*b/c/d
1,TAG*Phosphate/Urea/Uric acid,Creatinekinase,0.85297,0.281999,0.991675,a*b/c/d
2,TAG*Chloride/Urea/Bilirubin,Creatinekinase / AST,0.022768,0.012325,0.988465,a*b/c/d
3,TAG*Phosphate/Urea/Uric acid,Creatinekinase / Albumin,0.750298,0.27533,0.991216,a*b/c/d
4,TAG*MCFA/Chloride/Uric acid,Creatinekinase / Myoglobin,0.118217,0.149132,0.981006,a*b/c/d
5,Urea*Glucose/Uric acid/MCFA,Myoglobin,1.275538,0.400837,0.985968,a*b/c/d
6,Urea*Glucose/Uric acid/MCFA,Myoglobin / Albumin,1.125844,0.414401,0.988661,a*b/c/d


In [102]:
# Initialize Pandas Excel writer with XlsxWriter
writer = pd.ExcelWriter('./combinatorial_biochemical_biomarkers.xlsx', engine='xlsxwriter')

# Convert Dataframe to Excel object
biochem_max_group.to_excel(writer, sheet_name='Target Max')

#### Combinatorial biomarkers sorted by the number of target markers they correlate with.

In [103]:
biochem_max_top = bio_df.groupby(('Biomarker'))\
                        .agg({'Target marker': 'count', 'Kernel Ridge Score': 'mean', 'Theil-Sen Score': 'mean', 'Ridge Score': 'mean'})\
                        .sort_values(['Target marker', 'Kernel Ridge Score'], ascending=False)\
                        .head(10)\
                        [['Target marker', 'Kernel Ridge Score', 'Ridge Score', 'Theil-Sen Score']]\
                        .reset_index()
biochem_max_top

Unnamed: 0,Biomarker,Target marker,Kernel Ridge Score,Ridge Score,Theil-Sen Score
0,1/MCFA,7,0.871264,0.307921,0.611971
1,TAG*Phosphate/Lactate/Uric acid,7,0.858034,0.424495,0.90473
2,Phosphate*Chloride/Lactate/Bilirubin,7,0.814122,0.275579,1.082766
3,Lactate*Bilirubin/Creatinine/Urea,7,0.756006,0.271455,1.609417
4,Urea*Chloride/Creatinine/Lactate,7,0.73032,0.324502,0.713174
5,Creatinine*Bilirubin/Lactate/MCFA,7,0.465168,0.126523,1.025088
6,Glucose*Uric acid/Cholesterol/MCFA,6,0.947046,0.169089,0.123946
7,Creatinine/Chloride/Glucose,6,0.936062,0.138021,0.944984
8,Cholesterol*Phosphate/Glucose,6,0.915915,0.237433,0.22308
9,Creatinine*Urea/Cholesterol/MCFA,6,0.900253,0.454116,0.912222


In [104]:
# Convert the dataframe to an XlsxWriter Excel object.
biochem_max_top.to_excel(writer, sheet_name='Overall Max')

#### Combinatorial biomarkers sorted by the number of target markers they correlate with.

In [110]:
biochem_max_type = bio_df.groupby('Type')[['Biomarker', 'Theil-Sen Score', 'Ridge Score', 'Kernel Ridge Score']]\
                          .max().sort_values(by=['Kernel Ridge Score'], ascending=False)
                         
biochem_max_type

Unnamed: 0_level_0,Biomarker,Theil-Sen Score,Ridge Score,Kernel Ridge Score
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a*b/c/d,Uric acid*MCFA/Urea/TAG,8.75207,0.857783,0.991675
a/b/c,Uric acid/Urea/TAG,4.417148,0.839897,0.987554
a/b,Uric acid/TAG,2.851853,0.83225,0.983734
a*b/c,Urea*TAG/Uric acid,6.572136,0.795776,0.970988
1/a,1/Uric acid,2.115066,0.537865,0.967646
a,Uric acid,1.341052,0.547171,0.959395


In [111]:
# Convert the dataframe to an XlsxWriter Excel object.
biochem_max_type.to_excel(writer, sheet_name='Type Max')

# Close the Pandas Excel writer and output the Excel file.
writer.save()

### 1.6 Prediction

In [165]:
# Prediction for Uric acid/Urea/TAG marker
prediction_values = [0.51, 5.01, 1.01]
ts, r, kr = predict(prediction_values, bio_marker_data, bio_marker_names,
                    ['Uric acid', 'Urea', 'TAG'], 'a/b/c', bio_target_data, bio_target_names, 'AST')

# Theil-Sen, Ridge, Kernel Ridge Prediction
print(ts, r, kr)

[ 72.10447832] [[ 71.28966213]] [[ 124.96472383]]


## 2 Physiological Dataset
### 2.1 Importing and preprocessing

In [113]:
# Parameter names
phys_csv_names = csv.reader(open('./PhysData/parameters-names.csv'), delimiter=';')
phys_marker_names = np.array(list(phys_csv_names)).ravel()

# Setting shape of data array
phys_volunteers = 10
phys_parameters = 15
phys_points = 8

# Initializing data array with zeros
phys_data = np.zeros(shape=(phys_volunteers, phys_parameters, phys_points))

# Timepoints at which each biochemical parameter was measured (in hours after experiment)
# 0 stands for "before experiment", 0.05 - shortly after experiment, 1 - in an hour after experiment, so on...
phys_timepoints = np.array([0, 0.05, 1, 1*24, 2*24, 3*24, 5*24, 7*24, 9*24])

# Reading numeric data from csv-files
# Each file has a shape of N x M
#     where N is the number of parameters and M is the number of timepoints at which they were measured
for i in np.arange(phys_volunteers):
    phys_csv_file = csv.reader(open('./PhysData/%d.csv' % i), delimiter=';')
    phys_data[i,:,:] = np.array(list(phys_csv_file), np.float64)

# Mask zeros with NaN
phys_data[phys_data == 0] = np.nan

# Making an array of 1-hour measures
phys_marker_data = phys_data[:,:,1].T

# Shape
np.shape(phys_marker_data)

(15, 10)

In [200]:
# Observations number
np.count_nonzero(bio_data) + np.count_nonzero(phys_data) + np.count_nonzero(bio_target_data)

2150

### <span style="color: red">2.2 Search</span>

In [116]:
# Initialize with 15 parameters
phys_iterators = init_iterators(phys_parameters)

# Search for combinatorial biomarkers
phys_results = search(phys_marker_names, phys_marker_data, bio_target_names.size, bio_target_data, bio_target_names, phys_iterators)

Search: a*b/c started
Search: a/b/c started


  **self._backend_args)
  **self._backend_args)


Search: 1/a started
Search: a*b/c/d started


  **self._backend_args)
  **self._backend_args)


Search: 1/a finished
Search: a started
Search: a finished
Search: a/b started
Search: a/b finished
Search: a*b/c finished
Search: a/b/c finished
Search: a*b/c/d finished
Search was successfully finished


### 2.3 Results. Export to DataFrame and Pickle

In [117]:
# Creating DataFrame
phys_df = to_dataframe(phys_results)

# Export to Pickle file
phys_df.to_pickle('./phys_db.pickle')

### 2.4 Exporting to Excel

#### Top combinatorial physiological markers which correlate with each target marker

In [147]:
phys_max_group = phys_df.loc[phys_df.reset_index(drop=True).groupby(['Target marker'])['Kernel Ridge Score'].idxmax()]\
                            [['Biomarker', 'Target marker', 'Theil-Sen Score', 'Ridge Score', 'Kernel Ridge Score', 'Type']]
phys_max_group

Unnamed: 0,Biomarker,Target marker,Theil-Sen Score,Ridge Score,Kernel Ridge Score,Type
1525,EMG Freq*DIA Blood Pres 1/H-r Length/SYS Blood...,Creatinekinase / Albumin,0.990417,0.111359,0.953692,a*b/c/d
3694,R-Thigh Circ*Isometric Strength/Relax Tone/SYS...,Myoglobin,0.753208,0.440939,0.907135,a*b/c/d
4102,L-Thigh Circ*Contrac Tone/H-r Length/Isometric...,Myoglobin / Albumin,0.969902,0.191944,0.325659,a*b/c/d
3698,R-Thigh Circ*Isometric Strength/Contrac Tone/S...,AST,0.72002,0.422957,0.966523,a*b/c/d
4386,L-Thigh Circ*SYS Blood Pres 2/Max Amp EMG/SYS ...,Creatinekinase / Albumin,1.010077,0.270002,0.425639,a*b/c/d
1526,EMG Freq*DIA Blood Pres 1/H-r Length/SYS Blood...,Creatinekinase,0.650075,0.242771,0.921587,a*b/c/d
1528,EMG Freq*DIA Blood Pres 1/H-r Length/DIA Blood...,Creatinekinase,0.85712,0.032563,0.894136,a*b/c/d


In [148]:
# Initialize Pandas Excel writer with XlsxWriter
writer = pd.ExcelWriter('./combinatorial_physiological_biomarkers.xlsx', engine='xlsxwriter')

# Convert Dataframe to Excel object
phys_max_group.to_excel(writer, sheet_name='Target Max')

#### Combinatorial biomarkers sorted by the number of target markers they correlate with.

In [149]:
phys_max_top = phys_df.groupby(('Biomarker'))\
                      .agg({'Target marker': 'count', 'Kernel Ridge Score': 'mean', 'Theil-Sen Score': 'mean', 'Ridge Score': 'mean'})\
                      .sort_values(['Target marker', 'Kernel Ridge Score'], ascending=False)\
                      .head(10)\
                      [['Target marker', 'Kernel Ridge Score', 'Ridge Score', 'Theil-Sen Score']]\
                      .reset_index()
phys_max_top

Unnamed: 0,Biomarker,Target marker,Kernel Ridge Score,Ridge Score,Theil-Sen Score
0,R-Thigh Circ*DIA Blood Pres 1/H-r Latency/H-r ...,7,0.945168,0.028579,0.164029
1,L-Thigh Circ*Isometric Strength/Contrac Tone/D...,7,0.924334,0.316239,0.553227
2,Relax Tone/Contrac Tone/DIA Blood Pres 1,7,0.918476,0.024037,0.314835
3,H-r Length*Contrac Tone/Relax Tone/SYS Blood P...,7,0.916335,0.036826,0.545868
4,H-r Latency*H-r Amplitude/H-r Length,7,0.914053,0.036425,0.695606
5,Relax Tone*SYS Blood Pres 1/H-r Length/Contrac...,7,0.909458,0.025194,0.720275
6,L-Thigh Circ*DIA Blood Pres 1/H-r Length/DIA B...,7,0.909076,0.025418,0.761615
7,R-Thigh Circ*DIA Blood Pres 2/L-Thigh Circ/Rel...,7,0.905975,0.076496,0.489226
8,H-r Latency*H-r Length/R-Thigh Circ/DIA Blood ...,7,0.903848,0.022907,0.159196
9,R-Thigh Circ/Mean Amp EMG/H-r Length,7,0.899948,0.05264,0.144435


In [150]:
# Convert the dataframe to an XlsxWriter Excel object.
phys_max_top.to_excel(writer, sheet_name='Overall Max')

#### Combinatorial biomarkers sorted by the number of target markers they correlate with

In [151]:
phys_max_type = phys_df.groupby('Type')[['Biomarker', 'Theil-Sen Score', 'Ridge Score', 'Kernel Ridge Score']]\
                       .max().sort_values(by=['Kernel Ridge Score'], ascending=False)
                         
phys_max_type

Unnamed: 0_level_0,Biomarker,Theil-Sen Score,Ridge Score,Kernel Ridge Score
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a*b/c/d,SYS Blood Pres 2*DIA Blood Pres 2/SYS Blood Pr...,9.520427,0.81155,0.997313
a/b/c,SYS Blood Pres 2/SYS Blood Pres 1/DIA Blood Pr...,8.53309,0.83876,0.990329
a*b/c,SYS Blood Pres 1*SYS Blood Pres 2/DIA Blood Pr...,2.16222,0.707595,0.985529
a/b,SYS Blood Pres 2/SYS Blood Pres 1,1.956988,0.586784,0.978674
1/a,1/SYS Blood Pres 1,1.393475,0.214263,0.888615
a,Isometric Strength,0.928265,0.486873,0.876781


In [152]:
# Convert the dataframe to an XlsxWriter Excel object.
phys_max_type.to_excel(writer, sheet_name='Type Max')

# Close the Pandas Excel writer and output the Excel file.
writer.save()