In [3]:
'''
In this notebook I'm hoping to graph out all the layers, ideally to show that all
the trained models focus on a small set of specific features. Identifying which
features are relevant may be left to a separate notebook, or may happen near the end of this one.
'''

import numpy as np, pandas as pd
from pathlib import Path
import statsmodels
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# All inputs and outputs of this analysis live under one directory.
working_dir = Path('/d/data/plasmo/nn_scalar3/output_mod')
# input_file = working_dir / 'saved_model0.h5'   (disabled: no saved model is loaded here)
data_file = working_dir.joinpath('data.tsv')
meta_file = working_dir.joinpath('meta.tsv')
id_file = working_dir.joinpath('prefilter.tsv')
out_file = working_dir.joinpath('predicted_genes.tsv')

# np.loadtxt parses whitespace-delimited numeric text into float arrays.
# `data` is later passed to model.fit as the feature matrix and `meta` as the target.
data, meta = (np.loadtxt(table_path) for table_path in (data_file, meta_file))

In [28]:
#get linear models
# Fit one LassoLars model per cross-validation fold and record, for each fold,
# which features carry the largest absolute coefficients.
from sklearn.linear_model import LassoLarsCV, LassoLars
from sklearn.model_selection import KFold

# Tunable settings for the fold-wise feature ranking.
n_folds = 10
cv_seed = 0                 # fixed seed: the shuffled folds are identical on every run
lasso_alpha = 0.001         # L1 penalty strength passed to LassoLars
min_feature_support = 0.05  # a column's sum must exceed this fraction of the row count
top_fraction = 0.2          # share of the eligible features kept as top hits per fold

kf = KFold(n_splits=n_folds, shuffle=True, random_state=cv_seed)
top_hits = []       # one sorted array of selected feature indices per fold
coefficients = []   # the full coefficient vector of each fold's model

# Features are eligible only when their column sum exceeds 5% of the number of rows.
# NOTE(review): this reads as "present in >5% of samples" only if data is a
# non-negative indicator/count matrix -- confirm against how data.tsv is produced.
features_to_use_mask = np.sum(data, axis=0) > (data.shape[0] * min_feature_support)
n_features_to_use = np.sum(features_to_use_mask)
n_top_hits = int(n_features_to_use * top_fraction)

for train_index, test_index in kf.split(data):
    # The held-out part of each split is not used; the folds only vary the training set.
    model = LassoLars(alpha=lasso_alpha)
    model.fit(data[train_index], meta[train_index])

    # Zero out coefficients of ineligible features so they cannot rank highly.
    abs_coefs = np.abs(model.coef_) * features_to_use_mask
    sorted_index = np.argsort(abs_coefs)

    # Take the n_top_hits largest entries. Reversing and slicing from the front
    # avoids the `arr[-0:]` trap, which returns the whole array when n_top_hits is 0.
    good_idx = sorted_index[::-1][:n_top_hits]
    # A zero entry is either a masked-out feature or one the Lasso dropped; such
    # features are not hits and must not pad the list when few coefficients are non-zero.
    good_idx = good_idx[abs_coefs[good_idx] > 0]

    top_hits.append(np.sort(good_idx))
    coefficients.append(model.coef_)



In [34]:
# Keep only the feature indices that made the top-hit list of every fold
# (top_hits holds one index array per cross-validation split).
common_inds = np.intersect1d(top_hits[0], top_hits[1])
for fold_hits in top_hits[2:]:
    common_inds = np.intersect1d(common_inds, fold_hits)
# np.intersect1d already returns a fresh sorted array; sorting it in place keeps
# the explicit ordering guarantee without changing the values.
common_inds.sort()

# Per-feature mean coefficient across all fold models.
average_coef = np.mean(np.asarray(coefficients), axis=0)

# average_impact = np.average([x[common_inds] for x in coefficients], axis = 0)

# Report: number of shared hits, number of eligible features, then each fold's hits.
print(common_inds.size)
print(n_features_to_use)
print(*top_hits, sep='\n')

3
88
[ 33  57  73  75  81  85  89  93  97 145 174 234 238 239 240 283 292]
[ 75  81  89  93  98 114 147 159 218 220 234 238 239 240 287 292 296]
[ 24  67  75  89  93  98 154 218 220 234 236 238 239 240 278 292 296]
[ 33  75  81  85  89 135 147 209 220 234 238 239 267 270 283 292 296]
[ 59  69  89  93  96 107 135 145 159 203 234 238 239 240 267 270 292]
[ 14  25  81  89  98 147 220 234 235 238 239 245 282 286 289 293 296]
[ 63  75  93 107 114 147 154 174 218 220 234 238 239 240 273 292 296]
[ 26  59  85  93  98 107 114 147 218 220 234 236 238 239 240 292 296]
[ 81  86  89 107 147 159 203 209 220 234 238 239 240 270 287 292 296]
[ 24  75  81  85  86  97  98 147 159 220 234 238 239 240 287 292 296]


In [32]:
# Map the shared feature indices to their identifiers and summarise the mean
# coefficient (signed and absolute) of each one, indexed by feature position.
id_df = pd.read_csv(id_file, sep='\t')
shared_coefs = average_coef[common_inds]
prediction_df = pd.DataFrame(
    {
        'idx': common_inds,
        # Label-based lookup: relies on id_df keeping read_csv's default 0..n-1 index.
        'id': id_df.loc[common_inds, 'id'],
        'average_impact': shared_coefs,
        'abs_impact': np.absolute(shared_coefs),
    }
).set_index('idx')

In [33]:
# Show the table of features shared by every fold as this cell's output.
prediction_df

Unnamed: 0_level_0,id,average_impact,abs_impact
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
234,Pf3D7_13_v3:1695000,0.250963,0.250963
238,Pf3D7_13_v3:1725000,-0.504432,0.504432
239,Pf3D7_13_v3:1725000,1.030264,1.030264
