In [1]:
'''
In this notebook I'm hoping to graph out all the layers, hopefully to show that all
the trained models focus on a couple of specific features. Identifying the relevant
features may be left to a different notebook, or it may just happen near the end.
'''

import os
from multiprocessing import Pool
from pathlib import Path

import tensorflow as tf
import numpy as np, pandas as pd
from plotnine import *
import plotnine
import statsmodels
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# Directory holding the saved models and the input/output tables. Defaults to the
# original location; set the PLASMO_WORKING_DIR environment variable to point the
# notebook at a different directory without editing this cell.
working_dir = Path(os.environ.get('PLASMO_WORKING_DIR', '/d/data/plasmo/training_data/output_new'))
# input_file = working_dir / 'saved_model0.h5'
data_file = working_dir / 'data.tsv'  # numeric matrix read with np.loadtxt
meta_file = working_dir / 'meta.tsv'  # numeric table read with np.loadtxt
id_file = working_dir / 'prefilter.tsv'  # tab-separated table with an 'id' column
out_file = working_dir / 'predicted_genes_full.tsv'  # destination of the final table

In [2]:
# Number of saved models to load (saved_model0.h5 ... saved_model9.h5).
n_models = 10

# model1 = tf.keras.models.load_model(input_file)
# Read the numeric input tables, then load every saved Keras model.
data = np.loadtxt(data_file)
meta = np.loadtxt(meta_file)
models = [
    tf.keras.models.load_model(working_dir / 'saved_model{0}.h5'.format(n))
    for n in range(n_models)
]
# Show the architecture of the first model.
models[0].summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 64)                6976      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                1040      
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 8,033
Trainable params: 8,033
Non-trainable params: 0
_________________________________________________________________


In [None]:
# #get linear models
# from sklearn.linear_model import LassoLarsCV, LassoLars
# model = LassoLars(alpha=0.001)
# model.fit(data, meta.reshape((meta.shape[0],)))

# abs_coefs = np.abs(model.coef_) * (np.sum(data, axis=0) > (data.shape[0] * 0.05))
# sorted_index = np.argsort(abs_coefs)
# good_idx = sorted_index[(-1 * int(np.count_nonzero(abs_coefs) * 0.2)):]

# id_df = pd.read_csv(id_file, sep='\t')
# prediction_df = pd.DataFrame({'idx': good_idx,
#                               'id': id_df.loc[good_idx, 'id'],
#                               'average_impact': model.coef_[good_idx],
#                               'abs_impact': abs_coefs[good_idx]})
# prediction_df.set_index('idx', inplace=True)

In [3]:
# Pull the weight arrays out of the first model and show the shape of each one.
weights = models[0].get_weights()
print([layer_array.shape for layer_array in weights])

[(108, 64), (64,), (64, 16), (16,), (16, 1), (1,)]


In [None]:
# def reshapeForPlot(matrix):
#     #makes a df out of the matrix where each line is a point
#     width, height = matrix.shape
#     x_list = [x for x in range(width) for y in range(height)]
#     y_list = [y for x in range(width) for y in range(height)]
#     z_list = [matrix[x,y] for x in range(width) for y in range(height)]
#     return pd.DataFrame({'features': x_list, 'neurons':y_list, 'val': z_list})

# for x in range(0, len(weights), 2):
#     layer_weights = reshapeForPlot(weights[x])
#     bias_weights = pd.DataFrame({'neurons': range(weights[x+1].shape[0]), 'vals':weights[x+1]})
    
#     bias_text_y_delta = (np.max(weights[x+1]) - np.min(weights[x+1])) / 100
#     bias_text_x_delta = weights[x+1].shape[0] / 60
    
#     layer_plot = (ggplot(layer_weights, aes('features', 'neurons', fill='val'))
#                 + geom_tile(aes(width=.95, height=.95))
#                 )
#     bias_plot = (ggplot(bias_weights, aes('neurons', 'vals'))
#              + geom_point()
#              + geom_text(aes(label='neurons'), size=5, nudge_x=0.1, nudge_y=-0.01))
    
#     layer_plot.draw()
#     layer_plot.save(working_dir / 'layer_{0}_weights.pdf'.format(int(np.ceil(x/2))), dpi=300, width=8, height=6, units='in')

#     bias_plot.draw()
#     bias_plot.save(working_dir / 'layer_{0}_bias.pdf'.format(int(np.ceil(x/2))), dpi=300, width=8, height=6, units='in')

In [4]:
#okay we're just doing the prediction here
#so the highest numbers in impact will be the selected vals

def makeModSamples(sample):
    """Build the single-feature knock-out copies of one sample.

    Row ``i`` of the first returned matrix is ``sample`` with feature ``i`` set
    to 0 when that feature is positive; rows belonging to non-positive features
    are plain copies of the sample. Row ``i`` of the second matrix is all ones
    when feature ``i`` was knocked out and all zeros otherwise.

    Returns a 3D array of shape (2, n_features, n_features): [copies, mask].
    """
    n_features = sample.shape[0]
    knocked_out = np.tile(sample, (n_features, 1))  # one copy of the sample per feature
    present = np.flatnonzero(sample > 0)  # features that are set in this sample
    knocked_out[present, present] = 0  # zero feature i in row i
    row_mask = np.zeros_like(knocked_out)
    row_mask[present, :] = 1  # flag the rows where something was changed
    return np.array([knocked_out, row_mask])
    

def getPredictions(model, samples=None, min_fraction=0.05):
    """Score every feature by how much knocking it out changes the model output.

    For each sample and each feature that is positive in that sample, the
    feature is set to 0 and the model is re-run. The impact of a feature is the
    summed difference ``original_prediction - knocked_out_prediction`` divided
    by the column sum of that feature.

    Args:
        model: object with a Keras-style ``predict(array)`` method returning one
            value per input row.
        samples: 2D array (samples x features). Defaults to the notebook-level
            ``data`` matrix, which is what the one-argument call has always used.
        min_fraction: features whose column sum is not above
            ``int(n_samples * min_fraction)`` get an impact of 0.

    Returns:
        1D float array with one impact value per feature.
    """
    if samples is None:
        samples = data  # notebook-level feature matrix
    samples = np.asarray(samples)
    n_samples = samples.shape[0]

    # (row, col) pairs where a feature is set, i.e. where a knock-out happens.
    # Tracking them explicitly (instead of testing the knocked-out prediction
    # for being non-zero) keeps knock-outs whose prediction is exactly 0.
    rows, cols = np.nonzero(samples > 0)

    original_preds = np.asarray(model.predict(samples), dtype=float).reshape(-1)
    diffs = np.zeros(samples.shape, dtype=float)
    if rows.size:
        # One modified copy of the owning sample per knock-out, scored in a
        # single predict call rather than one call per sample.
        knocked_out = samples[rows]  # integer-array indexing returns a copy
        knocked_out[np.arange(rows.size), cols] = 0
        mod_preds = np.asarray(model.predict(knocked_out), dtype=float).reshape(-1)
        # given that we have modded values, use signed (non-absolute) differences
        diffs[rows, cols] = original_preds[rows] - mod_preds

    total_diffs = diffs.sum(axis=0)
    counts = samples.sum(axis=0)
    # Features with a zero column sum get 0 instead of the NaN that 0/0 produces.
    avg_diffs = np.divide(total_diffs, counts,
                          out=np.zeros_like(total_diffs), where=counts != 0)

    # we care about counts up to a point: by default 5% of all samples
    bound = int(n_samples * min_fraction)
    return np.where(counts > bound, avg_diffs, 0.0)

In [None]:
# #this is an alternative method where we try to emulate the mistake we made before
# #by just adding up the cases where a feature = 1

# def makeModSamples(sample):
#     n_features = sample.shape[0]
#     base = np.tile(sample, (n_features, 1)) #the same one repeated
#     msk = np.zeros_like(base)
#     for x in range(n_features):
#         if base[x][x] > 0:
#             base[x][x] = 0
#             msk[x,:] = 1

#     return np.array([base, msk]) #returns a 3D array
    

# def getPredictions(model):
    
#     def apply_fn(arr): #receives a 3D array generated by makeModSamples and reduces to two, then predict
#         preds = model.predict(arr[0])
#         return preds.reshape(-1,) * arr[1,:,0]

#     #here we are only counting ones above bound
#     def my_relu(n, bound):
#         if n > bound:
#             return 1
#         else:
#             return 0
    
#     #turn 1s to 0s
# #     mod = np.apply_along_axis(makeModSamples, 1, data)
# #     mod_preds = np.array([apply_fn(x) for x in mod]) #result should be a 2D array of samples x features
#     original_preds = model.predict(data)
#     counts = np.sum(data, axis=0)
    
# #     print(original_preds, mod_preds)
# #     diffs = np.absolute(((mod_preds - original_preds) * (mod_preds > 0)))
#     #given that we have modded values, I'm going to try non-abs values.
# #     diffs = (original_preds - mod_preds) * data
# #     total_diffs = np.sum(diffs, axis=0)
# #     avg_diffs = total_diffs / counts

#     n_samples = data.shape[0]
#     bound = n_samples * 0.05 #that point being 5% of all samples
# #     bound = 5
#     count_modifier = np.sum(data, axis=0) > bound

#     sums = np.sum((original_preds * data), axis=0) / counts * count_modifier
    
# #     sums = original_preds * (mod_pre)
    
#     return sums

# # m = models[0]
# # s = getPredictions(m)
# # print(s.shape)

In [None]:
# data = np.loadtxt(data_file)
# pred_res = getPredictions(model1)
# print(np.argsort(pred_res)[:-10:-1])
# # np.absolute((np.array([[1,2,3,4], [5,6,7,8]]) - np.array([9,9,9,9])))

In [None]:
# print(np.argsort(pred_res)[:-50:-1])

In [5]:
# Per-feature impact scores, one array per loaded model.
impact_lists = [getPredictions(m) for m in models]

In [6]:
# Impact magnitudes, and for every model the feature indices ordered from the
# largest magnitude to the smallest.
abs_lists = list(map(np.absolute, impact_lists))
ind_lists = [np.argsort(magnitudes)[::-1] for magnitudes in abs_lists]

In [7]:
# we take top 20% of the features that have a non-zero impact in the first model
n_hits = int(0.2 * np.count_nonzero(impact_lists[0]))
# Per model: the n_hits highest-ranked feature indices, sorted ascending.
top_hits = [np.sort(ranked[:n_hits]) for ranked in ind_lists]

# #in this case we're saying top 20% of all hits, and more than 5% representation
# n_min = data.shape[0] * 0.05
# features_to_use = np.argwhere(np.sum(data, axis=0) > n_min).reshape(-1,)
# top_hits_filtered = [np.intersect1d(hits, features_to_use) for hits in top_hits]

In [13]:
# We're looking for the features that are top hits in every one of the models.
# Seeding with the first model's hits means a single loaded model works too.
common_inds = top_hits[0]
for hits in top_hits[1:]:
    common_inds = np.intersect1d(common_inds, hits)
common_inds = np.sort(common_inds)

###DELETE AFTER, this part skips the intersection and lists everything
# common_inds = [np.sort(x[:np.count_nonzero(impact_lists[0])]) for x in ind_lists][0]
###
# ind_list_sorted = np.matrix([np.sort(x) for x in ind_list], dtype=np.int32)
# print(str(ind_list_sorted))

# Impact of each shared feature, averaged over the models.
average_impact = np.average([x[common_inds] for x in impact_lists], axis = 0)
print(len(common_inds))
print(n_hits)
print(np.sum(data, axis=0))

7
14
[  2.  23.  96.   6.  14.   6.   8.  80.  28.   2.   4.   4.  37.   4.
  14.   4.   4.   3.   9.  54.   7.  13.  19.  39.  17.  10.   5.   7.
   8.   5.  42.   5.  27.  48.  22.  58.   8.  17. 113.  17.  95.  21.
  23.  19.  19.  20.   8.   5.   5.  21.   2.  17. 110.   3. 135. 103.
  28.  35.   4.  39. 132.   5.  79.  24.  67.   7.   8.  42.   4.  12.
  17.  17.   6.  27.   5.  67.   5.  33.   4.   9.  26.  11.   7.  23.
  23.   8.  78.  77.  26.   4.  11.   7.  23.  94.  24.   5.  99.   5.
   8.  18.   7.   7.   3.   3.   5.  18.  10.  53.]


In [14]:
# Table of the shared top features: their 'id' from the prefilter table, the
# impact averaged over the models, and one impact column per model.
id_df = pd.read_csv(id_file, sep='\t')
prediction_df = pd.DataFrame(dict(
    idx=common_inds,
    id=id_df.loc[common_inds, 'id'],
    average_impact=average_impact,
    abs_impact=np.absolute(average_impact),
))
for x in range(n_models):
    prediction_df['model_{0}'.format(x)] = impact_lists[x][common_inds]
prediction_df = prediction_df.set_index('idx')

In [15]:
# def doTTest(idx):
#     test_df = pd.DataFrame({'x': data[:,idx], 'y': meta})
#     lm = 
    
#     return test_df

In [16]:
# sum(doTTest(5)['x'])

In [17]:
prediction_df

Unnamed: 0_level_0,id,average_impact,abs_impact,model_0,model_1,model_2,model_3,model_4,model_5,model_6,model_7,model_8,model_9
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
24,Pf3D7_05_v3:425000,0.539482,0.539482,0.453976,0.542418,0.578322,0.672632,0.491159,0.469516,0.564211,0.598318,0.512342,0.511928
28,Pf3D7_06_v3:635000,0.556075,0.556075,0.465491,0.637069,0.469721,0.620607,0.562875,0.546396,0.557477,0.57099,0.56277,0.567351
79,Pf3D7_13_v3:805000,0.542593,0.542593,0.466728,0.599847,0.501042,0.516853,0.500853,0.581882,0.563581,0.560515,0.515381,0.619251
85,Pf3D7_13_v3:1715000,0.409438,0.409438,0.370718,0.379489,0.308174,0.492559,0.416885,0.432492,0.470018,0.301428,0.424113,0.498509
88,Pf3D7_13_v3:1725000,0.486912,0.486912,0.560584,0.509911,0.385374,0.464293,0.443939,0.501288,0.487371,0.514259,0.481353,0.52075
98,Pf3D7_14_v3:765000,0.420293,0.420293,0.389437,0.355969,0.338384,0.360646,0.466625,0.456611,0.513232,0.415833,0.466709,0.439482
99,Pf3D7_14_v3:1235000,0.299846,0.299846,0.27791,0.319918,0.31444,0.34203,0.223985,0.337257,0.33439,0.227442,0.279752,0.341333


In [18]:
#search for genes
def getChrCode(name):
    """Return the digits found between two underscores in ``name``.

    For example 'Pf3D7_05_v3' gives '05'. Returns None when ``name`` is not a
    string or contains no such '_<digits>_' group. Only those two cases are
    treated as "no code"; any other error is allowed to surface instead of
    being hidden by a bare ``except``.
    """
    if not isinstance(name, str):
        return None
    match = re.search('_([0-9]+)_', name)
    return match.group(1) if match else None

def makeReq(id, window=5000, timeout=30):
    """Query the gene-location web service for the window starting at a feature id.

    Args:
        id: feature id of the form '<chromosome name>:<start>'.
        window: number added to ``start`` to obtain the end point of the query.
        timeout: seconds to wait for the web service before giving up.

    Returns:
        A comma-separated string with one '<record id>:<second field value>'
        entry per returned record (also printed), or None when the chromosome
        code cannot be parsed, the request fails, or the response does not have
        the expected layout.
    """
    chrom, start = id.split(':')
    end = int(start) + window
    chr_code = getChrCode(chrom)
    if not chr_code:
        print('bad chr ' + chrom)
        return None

    url = url_template.format(chr_code, start, end)
    try:
        # A timeout keeps one unresponsive request from hanging the whole lookup.
        res = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print('error at this one ' + str(e))
        return None

    try:
        records = res.json()['response']['recordset']['records']
        res_list = ['{0}:{1}'.format(x['id'].split('/')[0], str(x['fields'][1]['value'])) for x in records]
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        # Report the raw body: decoding it as JSON again here would raise a
        # second time whenever the body is not valid JSON.
        print('error at this one ' + str(e) + ' ' + res.text)
        return None

    res_str = ','.join(res_list)
    print(res_str)
    return res_str
    
import requests, re, json
# Query template for the PlasmoDB GenesByLocation web service. makeReq fills the
# placeholders: {0} chromosome code, {1} start point, {2} end point. The trailing
# backslashes continue the string literal, so the template is one single line.
url_template = 'https://plasmodb.org/plasmo/webservices/GeneQuestions/GenesByLocation.json?\
organismSinglePick=Plasmodium falciparum 3D7&\
chromosomeOptional={0}&\
start_point={1}&\
end_point={2}&\
o-fields=gene_product,gene_name'

# Look up the genes for every feature id in the table (makeReq is called once per row).
results = prediction_df['id'].map(makeReq)

# # makeReq('Pf3D7_13_v3:1725000')

PF3D7_0510100:None
PF3D7_0615400:None
PF3D7_1319400:None,PF3D7_1319500:None
PF3D7_1343400:RAD5
PF3D7_1343700:Kelch13
PF3D7_1418100:LISP1
PF3D7_1431400:SRA,PF3D7_1431500:MAPK1


In [None]:
# #This is the by-genes mode
# def makeGeneReq(id):
#     gene_id = id.split(':')[0]
#     if gene_id.endswith('_UTR'):
#         gene_id = gene_id[:-4]
        
#     url = gene_url_template.format(gene_id)
#     res = requests.get(url)
#     try:
# #             res_list = res.json()['response']['recordset']['records'][0]['fields'][1]['value']

#         res_list = ['{0}:{1}'.format(id, str(x['fields'][0]['value'])) for x in res.json()['response']['recordset']['records']]
#         res_str = ','.join(res_list)
#         print(res_str)
#         return res_str
#     except Exception as e:
#         print('error at this one ' + str(e) + ' ' + str(res.json()))
#         return None


# import requests, re, json
# gene_url_template = 'https://plasmodb.org/plasmo/webservices/GeneQuestions/GeneByLocusTag.json?\
# ds_gene_ids_data={0}&\
# o-fields=gene_product'
# results = prediction_df['id'].map(makeGeneReq)

In [19]:
# Attach the looked-up genes and the column sums of the data matrix for the
# selected features, then order the table by absolute impact, largest first.
prediction_df['genes'] = results
prediction_df['counts'] = data.sum(axis=0)[list(map(int, prediction_df.index.to_numpy()))]
prediction_df = prediction_df.sort_values(by='abs_impact', ascending=False)
print(prediction_df)

                      id  average_impact  abs_impact   model_0   model_1  \
idx                                                                        
28    Pf3D7_06_v3:635000        0.556075    0.556075  0.465491  0.637069   
79    Pf3D7_13_v3:805000        0.542593    0.542593  0.466728  0.599847   
24    Pf3D7_05_v3:425000        0.539482    0.539482  0.453976  0.542418   
88   Pf3D7_13_v3:1725000        0.486912    0.486912  0.560584  0.509911   
98    Pf3D7_14_v3:765000        0.420293    0.420293  0.389437  0.355969   
85   Pf3D7_13_v3:1715000        0.409438    0.409438  0.370718  0.379489   
99   Pf3D7_14_v3:1235000        0.299846    0.299846  0.277910  0.319918   

      model_2   model_3   model_4   model_5   model_6   model_7   model_8  \
idx                                                                         
28   0.469721  0.620607  0.562875  0.546396  0.557477  0.570990  0.562770   
79   0.501042  0.516853  0.500853  0.581882  0.563581  0.560515  0.515381   
24   0.

In [20]:
# Write the final table to out_file as tab-separated text, keeping the 'idx' index column.
prediction_df.to_csv(out_file, sep='\t', index=True)

In [None]:
# sum1 = weights[0].sum(axis=1)
# print(sum1.shape)
# sum_df = pd.DataFrame({'sum':sum1})
# sum_file = working_dir / 'sums.tsv'
# sum_df.to_csv(sum_file, sep='\t')

In [None]:
# import sys
# class fakemodel():
#     def __init__(self):
#         pass
#     def predict(self, arr):
#         return np.sum(arr, axis=1).reshape(-1, 1)
# data = np.array([[1,0,1,0], [0,0,0,1], [1,1,1,0]])
# z = getPredictions(fakemodel())
# for x in z:
#     print(x)