In [1]:
'''
Goal of this notebook: graph out all the layers of the trained models, hopefully
showing that every trained model focuses on the same few specific features.
Identifying which features are relevant may be left to a separate notebook, or it
may simply happen near the end of this one.
'''

import tensorflow as tf
import numpy as np, pandas as pd
from pathlib import Path
from plotnine import *
import plotnine
from multiprocessing import Pool
import statsmodels
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Every input and output of this notebook lives under a single run directory.
working_dir = Path('/d/data/neis/neis_nn/output_7_newest')
# input_file = working_dir / 'saved_model0.h5'
data_file = working_dir.joinpath('data.tsv')  # loaded with np.loadtxt into `data`
meta_file = working_dir.joinpath('meta.tsv')  # loaded with np.loadtxt into `meta`
id_file = working_dir.joinpath('prefilter.tsv')  # tab-separated table with an 'id' column
out_file = working_dir.joinpath('predicted_genes_full.tsv')  # final ranked table is written here

In [2]:
n_models = 5

# The trained replicates are stored as saved_model0.h5 ... saved_model{n_models - 1}.h5
# inside working_dir; build the paths first, then load each one.
model_paths = [working_dir / 'saved_model{0}.h5'.format(n) for n in range(n_models)]
models = [tf.keras.models.load_model(path) for path in model_paths]

# model1 = tf.keras.models.load_model(input_file)
# `data` is what the models predict on below; `meta` is presumably the matching
# target values -- TODO confirm against the training notebook.
data = np.loadtxt(data_file)
meta = np.loadtxt(meta_file)

# Print the architecture of the first replicate.
models[0].summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 32)                7936      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                528       
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 8,481
Trainable params: 8,481
Non-trainable params: 0
_________________________________________________________________


In [3]:
# #get linear models
# from sklearn.linear_model import LassoLarsCV, LassoLars
# model = LassoLars(alpha=0.001)
# model.fit(data, meta.reshape((meta.shape[0],)))

# abs_coefs = np.abs(model.coef_) * (np.sum(data, axis=0) > (data.shape[0] * 0.05))
# sorted_index = np.argsort(abs_coefs)
# good_idx = sorted_index[(-1 * int(np.count_nonzero(abs_coefs) * 0.2)):]

# id_df = pd.read_csv(id_file, sep='\t')
# prediction_df = pd.DataFrame({'idx': good_idx,
#                               'id': id_df.loc[good_idx, 'id'],
#                               'average_impact': model.coef_[good_idx],
#                               'abs_impact': abs_coefs[good_idx]})
# prediction_df.set_index('idx', inplace=True)

In [3]:
# get_weights() returns the first model's parameters as a flat list of arrays;
# print the shape of each array.
weights = models[0].get_weights()
weight_shapes = [layer_array.shape for layer_array in weights]
print(weight_shapes)

[(247, 32), (32,), (32, 16), (16,), (16, 1), (1,)]


In [5]:
# def reshapeForPlot(matrix):
#     #makes a df out of the matrix where each line is a point
#     width, height = matrix.shape
#     x_list = [x for x in range(width) for y in range(height)]
#     y_list = [y for x in range(width) for y in range(height)]
#     z_list = [matrix[x,y] for x in range(width) for y in range(height)]
#     return pd.DataFrame({'features': x_list, 'neurons':y_list, 'val': z_list})

# for x in range(0, len(weights), 2):
#     layer_weights = reshapeForPlot(weights[x])
#     bias_weights = pd.DataFrame({'neurons': range(weights[x+1].shape[0]), 'vals':weights[x+1]})
    
#     bias_text_y_delta = (np.max(weights[x+1]) - np.min(weights[x+1])) / 100
#     bias_text_x_delta = weights[x+1].shape[0] / 60
    
#     layer_plot = (ggplot(layer_weights, aes('features', 'neurons', fill='val'))
#                 + geom_tile(aes(width=.95, height=.95))
#                 )
#     bias_plot = (ggplot(bias_weights, aes('neurons', 'vals'))
#              + geom_point()
#              + geom_text(aes(label='neurons'), size=5, nudge_x=0.1, nudge_y=-0.01))
    
#     layer_plot.draw()
#     layer_plot.save(working_dir / 'layer_{0}_weights.pdf'.format(int(np.ceil(x/2))), dpi=300, width=8, height=6, units='in')

#     bias_plot.draw()
#     bias_plot.save(working_dir / 'layer_{0}_bias.pdf'.format(int(np.ceil(x/2))), dpi=300, width=8, height=6, units='in')

In [4]:
#okay we're just doing the prediction here
#so the highest numbers in impact will be the selected vals

def makeModSamples(sample):
    """Build one knocked-out copy of `sample` per feature, plus a presence mask.

    Row ``i`` of the first returned matrix is a copy of `sample` in which
    feature ``i`` has been set to 0 -- but only when that feature was > 0 to
    begin with; otherwise the row is an unmodified copy. Row ``i`` of the second
    matrix is all ones when feature ``i`` was > 0 and all zeros otherwise.

    Parameters
    ----------
    sample : 1-D numpy array of length n_features (not modified).

    Returns
    -------
    numpy array of shape (2, n_features, n_features): ``[knocked_out, mask]``.
    """
    n_features = sample.shape[0]
    present = np.flatnonzero(sample > 0)

    # One copy of the sample per feature, then zero the diagonal entry of every
    # row whose feature is present.
    knocked_out = np.tile(sample, (n_features, 1))
    knocked_out[present, present] = 0

    # Flag (across the whole row) the rows that were actually modified.
    mask = np.zeros_like(knocked_out)
    mask[present, :] = 1

    return np.array([knocked_out, mask])  # 3-D: (2, n_features, n_features)
    

def getPredictions(model, samples=None, min_fraction=0.05):
    """Score every feature by how much knocking it out changes the prediction.

    For each sample, and for each feature that is present (value > 0) in that
    sample, the feature is set to 0 and the model is re-run. The impact of the
    feature in that sample is ``original prediction - knocked-out prediction``
    (signed, not absolute). Impacts are summed per feature and divided by the
    feature's column sum, and features whose column sum is not above
    ``int(n_samples * min_fraction)`` are given a score of 0.

    Differences from the earlier implementation:
    * Presence is taken from the input itself. Previously it was inferred from
      "perturbed prediction != 0", which silently dropped samples whose
      perturbed prediction was exactly 0.
    * A feature with a column sum of 0 now scores 0 instead of NaN (0 / 0).
      NaN would float to the top of a descending argsort.
    * Only rows for features that are present are sent to the model, instead of
      materialising an (n_samples, 2, n_features, n_features) array.

    Parameters
    ----------
    model : object exposing ``predict(2-D array)`` that returns exactly one
        value per input row (e.g. an array of shape (n_rows, 1)).
    samples : 2-D array of shape (n_samples, n_features), optional.
        Defaults to the notebook-level ``data`` array.
    min_fraction : float, optional
        Representation threshold as a fraction of n_samples (default 5%).

    Returns
    -------
    1-D float array of length n_features with the average signed impact.

    Raises
    ------
    ValueError
        If the model does not return exactly one prediction per sample.
    """
    if samples is None:
        samples = data  # notebook-level feature matrix
    samples = np.asarray(samples)
    n_samples, n_features = samples.shape

    original_preds = np.asarray(model.predict(samples), dtype=float).reshape(-1)
    if original_preds.shape[0] != n_samples:
        raise ValueError('model.predict must return exactly one value per sample')

    total_diffs = np.zeros(n_features, dtype=float)
    for sample, original_pred in zip(samples, original_preds):
        present_idx = np.flatnonzero(sample > 0)
        if present_idx.size == 0:
            continue  # nothing to knock out in this sample

        # One copy of the sample per present feature, each with that feature zeroed.
        knocked_out = np.tile(sample, (present_idx.size, 1))
        knocked_out[np.arange(present_idx.size), present_idx] = 0
        knocked_preds = np.asarray(model.predict(knocked_out), dtype=float).reshape(-1)

        # present_idx holds unique indices, so in-place fancy addition is safe.
        total_diffs[present_idx] += original_pred - knocked_preds

    # Column sums; for 0/1 input this is the number of samples carrying the feature.
    counts = np.sum(samples, axis=0)
    safe_counts = np.where(counts != 0, counts, 1)  # avoid 0 / 0 -> NaN
    avg_diffs = total_diffs / safe_counts

    # We only trust features that are represented in more than `min_fraction`
    # of all samples; everything else is zeroed out.
    bound = int(n_samples * min_fraction)
    return avg_diffs * (counts > bound)

In [7]:
# #this is an alternative method where we try to emulate the mistake we made before
# #by just adding up the cases were a feature = 1

# def makeModSamples(sample):
#     n_features = sample.shape[0]
#     base = np.tile(sample, (n_features, 1)) #the same one repeated
#     msk = np.zeros_like(base)
#     for x in range(n_features):
#         if base[x][x] > 0:
#             base[x][x] = 0
#             msk[x,:] = 1

#     return np.array([base, msk]) #returns a 3D array
    

# def getPredictions(model):
    
#     def apply_fn(arr): #receives a 3D array generated by makeModSamples and reduces to two, then predict
#         preds = model.predict(arr[0])
#         return preds.reshape(-1,) * arr[1,:,0]

#     #here we are only counting ones above bound
#     def my_relu(n, bound):
#         if n > bound:
#             return 1
#         else:
#             return 0
    
#     #turn 1s to 0s
# #     mod = np.apply_along_axis(makeModSamples, 1, data)
# #     mod_preds = np.array([apply_fn(x) for x in mod]) #result should be a 2D array of samples x features
#     original_preds = model.predict(data)
#     counts = np.sum(data, axis=0)
    
# #     print(original_preds, mod_preds)
# #     diffs = np.absolute(((mod_preds - original_preds) * (mod_preds > 0)))
#     #given that we have modded values, I'm going to try non-abs values.
# #     diffs = (original_preds - mod_preds) * data
# #     total_diffs = np.sum(diffs, axis=0)
# #     avg_diffs = total_diffs / counts

#     n_samples = data.shape[0]
#     bound = n_samples * 0.05 #that point being 5% of all samples
# #     bound = 5
#     count_modifier = np.sum(data, axis=0) > bound

#     sums = np.sum((original_preds * data), axis=0) / counts * count_modifier
    
# #     sums = original_preds * (mod_pre)
    
#     return sums

# # m = models[0]
# # s = getPredictions(m)
# # print(s.shape)

In [8]:
# data = np.loadtxt(data_file)
# pred_res = getPredictions(model1)
# print(np.argsort(pred_res)[:-10:-1])
# # np.absolute((np.array([[1,2,3,4], [5,6,7,8]]) - np.array([9,9,9,9])))

In [9]:
# print(np.argsort(pred_res)[:-50:-1])

In [5]:
# One impact vector (one score per feature) for each trained model, in model order.
impact_lists = [getPredictions(trained_model) for trained_model in models]

In [6]:
# Rank the features of each model by the magnitude of their impact, largest first.
abs_lists = [np.abs(impact) for impact in impact_lists]
ind_lists = [np.flip(np.argsort(magnitude)) for magnitude in abs_lists]

In [13]:
# Keep the top 25% of the features that have a non-zero impact in model 0; the
# same cut-off (n_hits) is then applied to every model's ranking.
# NOTE(review): the comment here used to say "top 20%" while the code has
# always used 0.25 -- confirm which fraction is intended.
top_fraction = 0.25
n_hits = int(np.count_nonzero(impact_lists[0]) * top_fraction)
# Sort each model's selected feature indices so the lists are easy to compare.
top_hits = [np.sort(ranked[:n_hits]) for ranked in ind_lists]

# #in this case we're saying top 20% of all hits, and more than 5% representation
# n_min = data.shape[0] * 0.05
# features_to_use = np.argwhere(np.sum(data, axis=0) > n_min).reshape(-1,)
# top_hits_filtered = [np.intersect1d(hits, features_to_use) for hits in top_hits]

In [14]:
# Keep only the feature indices that appear in the top hits of every model.
# Folding from the first list (instead of hard-coding top_hits[0] and
# top_hits[1]) also works when only a single model is loaded.
common_inds = top_hits[0]
for model_hits in top_hits[1:]:
    common_inds = np.intersect1d(common_inds, model_hits)
# np.intersect1d already returns sorted values; the explicit sort keeps the
# result ordered in the single-model case too.
common_inds = np.sort(common_inds)

###DELETE AFTER
# common_inds = [np.sort(x[:np.count_nonzero(impact_lists[0])]) for x in ind_lists][0]
###
# ind_list_sorted = np.matrix([np.sort(x) for x in ind_list], dtype=np.int32)
# print(str(ind_list_sorted))

# Mean impact across models, for the features every model agrees on.
common_impacts = np.array([impacts[common_inds] for impacts in impact_lists])
average_impact = common_impacts.mean(axis=0)

# Quick sanity numbers: how many features survived, the per-model cut-off, and
# the per-feature column sums of the input data.
print(len(common_inds))
print(n_hits)
print(np.sum(data, axis=0))

3
50
[ 61.   6.  69.  44.   7.   7.   4.  16.   7.  37.  12.   8.  29.   3.
  16. 230.   3. 183.  89.  34.  56.  23.  56. 183. 203.  63.  53.  59.
  56.  62.  50.  59. 153.  58. 196.  57.  70.  58.  57. 316.   3.  35.
  57. 217.  58. 100.   4.  63.  64.  68.  63.  61.  59.  87. 162.  58.
  33. 195.  33. 165.   6.  59.  42.   6.  38.  13.  56. 170.  61.  33.
 114.  56.  77.  60.  53.  58.  85. 154. 269.   2.  54.  45.  44. 193.
  52.  68.  54. 288.  56.  56.  61.  58.  35.  58.  61.  58. 287.  57.
   3. 117.   6.  73.  32.   7.  25.  59. 217.   4.  16.   8.  20.   8.
  60.  48.  53. 204. 130.  88.  43. 205.  82.  54.  57. 242.  60.  68.
  32.  57. 153.  58.  59.  53.  40.  42.  14. 133.  48.  59.  63.  35.
  81.  55.  55.  53.  56.  36.  34.  53. 176.  54.  54.  84.  54.  54.
  84.  71.  74.  57.  56.  70.  56.  87.  58.  80.  16.  34.  25.  24.
   6.  41.  33. 222.   3.  40.  52.   3. 189.  51.  53. 203.  57.  58.
  66.  80.  23.  48.  46.  47.  35.  40.  98. 157.  34.   5.  90.  55.
 

In [15]:
id_df = pd.read_csv(id_file, sep='\t')

# Assemble all columns first, then build the frame in one go: the shared
# summary columns followed by one impact column per model.
prediction_columns = {
    'idx': common_inds,
    'id': id_df.loc[common_inds, 'id'],
    'average_impact': average_impact,
    'abs_impact': np.absolute(average_impact),
}
for model_number in range(n_models):
    column_name = 'model_{0}'.format(model_number)
    prediction_columns[column_name] = impact_lists[model_number][common_inds]

# Index the table by feature index.
prediction_df = pd.DataFrame(prediction_columns).set_index('idx')

In [16]:
# def doTTest(idx):
#     test_df = pd.DataFrame({'x': data[:,idx], 'y': meta})
#     lm = 
    
#     return test_df

In [17]:
# sum(doTTest(5)['x'])

In [18]:
# Display the per-feature impact table (indexed by feature idx).
prediction_df

Unnamed: 0_level_0,id,average_impact,abs_impact,model_0,model_1,model_2,model_3,model_4
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
82,NC_002946.2_ChrI:745000,0.017741,0.017741,0.019833,0.020898,0.014598,0.021749,0.011626
84,NC_002946.2_ChrI:747000,0.018935,0.018935,0.018917,0.019155,0.021768,0.017831,0.017004
130,NC_002946.2_ChrI:1230000,0.014278,0.014278,0.00649,0.009648,0.018942,0.016301,0.02001


In [18]:
# #search for genes
# def getChrCode(name):
#     try:
#         return re.search('_([0-9]+)_', name).group(1)
#     except:
#         return None

# def makeReq(id):
#     chr, start = id.split(':')
#     end = int(start) + 5000
#     if getChrCode(chr):
#         url = url_template.format(getChrCode(chr), start, end)
#         res = requests.get(url)
#         try:
# #             res_list = res.json()['response']['recordset']['records'][0]['fields'][1]['value']

#             res_list = ['{0}:{1}'.format(x['id'].split('/')[0], str(x['fields'][1]['value'])) for x in res.json()['response']['recordset']['records']]
#             res_str = ','.join(res_list)
#             print(res_str)
#             return res_str
#         except Exception as e:
#             print('error at this one ' + str(e) + ' ' + str(res.json()))
#             return None

#     else:
#         print('bad chr ' + chr)
    
# import requests, re, json
# url_template = 'https://plasmodb.org/plasmo/webservices/GeneQuestions/GenesByLocation.json?\
# organismSinglePick=Plasmodium falciparum 3D7&\
# chromosomeOptional={0}&\
# start_point={1}&\
# end_point={2}&\
# o-fields=gene_product,gene_name'

# results = prediction_df['id'].map(makeReq)

# # makeReq('Pf3D7_13_v3:1725000')

In [19]:
# #This is the by-genes mode
# def makeGeneReq(id):
#     gene_id = id.split(':')[0]
#     if gene_id.endswith('_UTR'):
#         gene_id = gene_id[:-4]
        
#     url = gene_url_template.format(gene_id)
#     res = requests.get(url)
#     try:
# #             res_list = res.json()['response']['recordset']['records'][0]['fields'][1]['value']

#         res_list = ['{0}:{1}'.format(id, str(x['fields'][0]['value'])) for x in res.json()['response']['recordset']['records']]
#         res_str = ','.join(res_list)
#         print(res_str)
#         return res_str
#     except Exception as e:
#         print('error at this one ' + str(e) + ' ' + str(res.json()))
#         return None


# import requests, re, json
# gene_url_template = 'https://plasmodb.org/plasmo/webservices/GeneQuestions/GeneByLocusTag.json?\
# ds_gene_ids_data={0}&\
# o-fields=gene_product'
# results = prediction_df['id'].map(makeGeneReq)

In [20]:
# prediction_df['genes'] = results
# Attach each selected feature's column sum over `data`, looked up by the
# feature index stored in the frame's index.
feature_totals = np.sum(data, axis=0)
selected_features = prediction_df.index.to_numpy().astype(int)
prediction_df['counts'] = feature_totals[selected_features]

# Strongest average effect first.
prediction_df = prediction_df.sort_values(by='abs_impact', ascending=False)
print(prediction_df)

                           id  average_impact    abs_impact   model_0  \
idx                                                                     
178  NC_002946.2_ChrI:1570000    6.282706e-02  6.282706e-02  0.111023   
122  NC_002946.2_ChrI:1122000    5.413979e-02  5.413979e-02  0.081052   
232  NC_002946.2_ChrI:1950000   -4.933258e-02  4.933258e-02 -0.010399   
17    NC_002946.2_ChrI:211000    4.247289e-02  4.247289e-02 -0.000768   
182  NC_002946.2_ChrI:1596000    4.153582e-02  4.153582e-02  0.033582   
..                        ...             ...           ...       ...   
166  NC_002946.2_ChrI:1469000    2.698521e-04  2.698521e-04 -0.000989   
85    NC_002946.2_ChrI:654000    1.455065e-04  1.455065e-04 -0.014168   
158  NC_002946.2_ChrI:1459000   -1.007392e-04  1.007392e-04 -0.001024   
248  NC_002946.2_ChrI:2035000   -3.063040e-05  3.063040e-05  0.000209   
80    NC_002946.2_ChrI:515000   -9.752553e-08  9.752553e-08 -0.000438   

      model_1   model_2   model_3   model_4  count

In [21]:
# Write the ranked table to out_file as tab-separated text; index=True keeps the
# 'idx' feature index as the first column.
prediction_df.to_csv(out_file, sep='\t', index=True)

In [22]:
# sum1 = weights[0].sum(axis=1)
# print(sum1.shape)
# sum_df = pd.DataFrame({'sum':sum1})
# sum_file = working_dir / 'sums.tsv'
# sum_df.to_csv(sum_file, sep='\t')

In [23]:
# import sys
# class fakemodel():
#     def __init__(self):
#         pass
#     def predict(self, arr):
#         return np.sum(arr, axis=1).reshape(-1, 1)
# data = np.array([[1,0,1,0], [0,0,0,1], [1,1,1,0]])
# z = getPredictions(fakemodel())
# for x in z:
#     print(x)

In [24]:
# Inspect the selected feature indices: one sorted index array per model.
top_hits

[array([  0,   2,   3,   9,  13,  14,  15,  16,  17,  18,  19,  20,  21,
         22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,
         35,  36,  37,  38,  39,  41,  43,  44,  45,  46,  47,  48,  49,
         50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,
         64,  65,  66,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  87,  88,  89,  90,  91,  92,
         93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
        106, 107, 108, 111, 113, 115, 116, 118, 122, 124, 125, 126, 127,
        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
        142, 143, 144, 145, 146, 147, 148, 149, 151, 152, 153, 154, 155,
        156, 157, 158, 159, 160, 161, 162, 163, 164, 166, 167, 168, 169,
        170, 171, 172, 173, 175, 178, 179, 180, 182, 184, 185, 187, 188,
        189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201,
        202, 203, 205, 206, 207, 208, 209, 210, 211