In [None]:
'''
In this one I'm hoping to graph out all the layers, hopefully to prove that all
the trained models focus on a couple of specific features. We'll leave identifying
the relevant features to maybe a different one? Or maybe it'll just happen near the end.
'''

import tensorflow as tf
import numpy as np, pandas as pd
from pathlib import Path
from multiprocessing import Pool
import statsmodels
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import plotly.express as px

# Directory that holds the saved models, the input tables and this notebook's outputs.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
working_dir = Path('/mnt/d/data/nn_scalar3/output_mod')
# input_file = working_dir / 'saved_model0.h5'
data_file = working_dir / 'data.tsv'  # numeric table, read with np.loadtxt
meta_file = working_dir / 'meta.tsv'  # numeric table, read with np.loadtxt
id_file = working_dir / 'prefilter.tsv'  # tab-separated table that provides the 'id' column
out_file = working_dir / 'predicted_genes_full_redo.tsv'  # destination of the final prediction table

In [None]:
# Load the n_models saved networks (saved_model0.h5, saved_model1.h5, ...).
n_models = 10
models = []
for model_idx in range(n_models):
    model_path = working_dir / 'saved_model{0}.h5'.format(model_idx)
    models.append(tf.keras.models.load_model(model_path))

# model1 = tf.keras.models.load_model(input_file)
# Input matrix and metadata, both parsed as plain numeric text.
data = np.loadtxt(data_file)
meta = np.loadtxt(meta_file)

# Print the layer summary of the first loaded model.
models[0].summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


2025-11-07 13:49:21.748128: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2025-11-07 13:49:21.753312: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 3686400000 Hz
2025-11-07 13:49:21.754537: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x577cb77f3bd0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2025-11-07 13:49:21.754568: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version


In [None]:
# Parameter arrays of the first model, followed by the shape of each one.
weights = models[0].get_weights()
print(list(map(np.shape, weights)))

[(298, 32), (32,), (32, 16), (16,), (16, 1), (1,)]


In [None]:
#okay we're just doing the prediction here
#so the highest numbers in impact will be the selected vals

def makeModSamples(sample):
    """Build the single-feature knock-out batch for one sample.

    Row ``i`` of the first returned plane is a copy of ``sample`` in which
    feature ``i`` has been set to 0, provided that feature was > 0; otherwise
    the row is an unmodified copy. Row ``i`` of the second plane (the mask) is
    all ones when feature ``i`` was knocked out and all zeros otherwise.

    Parameters
    ----------
    sample : 1-D numpy array of length n_features.

    Returns
    -------
    numpy array of shape (2, n_features, n_features): ``[modified, mask]``.
    """
    n_features = sample.shape[0]
    knocked_out = np.tile(sample, (n_features, 1))  # every row starts as the sample itself

    # Row i carries feature i on its diagonal; only positive features are knocked out.
    present = np.flatnonzero(np.diagonal(knocked_out) > 0)
    knocked_out[present, present] = 0

    row_mask = np.zeros_like(knocked_out)
    row_mask[present, :] = 1

    return np.array([knocked_out, row_mask])  # 3-D: (2, n_features, n_features)
    

def getPredictions(model, samples=None, min_fraction=0.05):
    """Score every feature by how much knocking it out changes the prediction.

    For each sample and each feature whose value is > 0, the feature is set
    to 0 and the model is asked to predict again. The score of a feature is
    ``sum(original_prediction - knocked_out_prediction) / column_sum``, where
    the sum runs over the samples in which the feature was knocked out and
    ``column_sum`` is the column sum of ``samples`` for that feature.
    Features whose column sum is not above ``int(n_samples * min_fraction)``
    get a score of 0.

    Parameters
    ----------
    model : object with a ``predict(array)`` method returning one value per row.
    samples : 2-D array (n_samples, n_features), optional.
        Defaults to the notebook-level ``data`` matrix.
    min_fraction : float, optional
        Fraction of the sample count a feature's column sum must exceed to be
        scored (default 0.05, i.e. 5%).

    Returns
    -------
    1-D float array of length n_features.

    Notes
    -----
    * The knock-out mask is taken from ``samples > 0`` directly. It used to be
      recovered from "knocked-out prediction != 0", which silently dropped
      every knocked-out sample whose prediction happened to be exactly 0.
    * A feature that is absent from every sample now scores 0 instead of NaN
      (previously 0 / 0).
    """
    if samples is None:
        samples = data  # notebook-level matrix loaded from data_file
    samples = np.asarray(samples)
    n_samples, n_features = samples.shape
    diag = np.arange(n_features)

    original_preds = np.asarray(model.predict(samples)).reshape(-1)
    present = samples > 0  # True where a feature can be knocked out

    # diffs[s, f] = original - knocked-out prediction for sample s, feature f
    diffs = np.zeros((n_samples, n_features), dtype=float)
    for row, sample in enumerate(samples):
        if not present[row].any():
            continue  # nothing to knock out, the whole row stays 0
        batch = np.tile(sample, (n_features, 1))
        # Row f of the batch is the sample with feature f zeroed (if present).
        batch[diag, diag] = np.where(present[row], 0, sample)
        knockout_preds = np.asarray(model.predict(batch)).reshape(-1)
        diffs[row] = np.where(present[row], original_preds[row] - knockout_preds, 0.0)

    total_diffs = diffs.sum(axis=0)
    counts = np.sum(samples, axis=0)
    # Guarded division: features with a zero column sum keep an average of 0.
    avg_diffs = np.divide(
        total_diffs, counts,
        out=np.zeros_like(total_diffs),
        where=counts != 0,
    )

    # Only features represented above the threshold keep their score.
    bound = int(n_samples * min_fraction)
    return np.where(counts > bound, avg_diffs, 0.0)

In [None]:
# # Now we don't have to run this every time, since we saved the output in the next cell
# impact_lists = []
# for m in models:
#     impact_lists.append(getPredictions(m))

In [None]:
# # load this!
# np.save(
#     working_dir / 'impact_scores_full',
#     np.array(impact_lists)
# )

In [None]:
# Impact scores (one row per model) are cached on disk so they do not have to
# be recomputed on every run; they are rebuilt only when the cache is missing.
impact_cache = working_dir / 'impact_scores_full.npy'
if impact_cache.exists():
    impact_lists = np.load(impact_cache)
else:
    impact_lists = np.array([getPredictions(m) for m in models])
    np.save(impact_cache, impact_lists)

In [None]:
# Magnitude of every impact score, one array per model.
abs_lists = [np.absolute(scores) for scores in impact_lists]
# Feature indices per model, ordered from the largest magnitude to the smallest.
ind_lists = [np.flip(np.argsort(magnitudes)) for magnitudes in abs_lists]

In [None]:
# n_hits = int(np.count_nonzero(impact_lists[0]) * 0.2) #we take top 20%
# top_hits = [np.sort(x[:n_hits]) for x in ind_lists]

# #in this case we're saying top 20% of all hits, and more than 5% representation
# n_min = data.shape[0] * 0.05
# features_to_use = np.argwhere(np.sum(data, axis=0) > n_min).reshape(-1,)
# top_hits_filtered = [np.intersect1d(hits, features_to_use) for hits in top_hits]

In [None]:
# #We're looking for an intersection between three models
# common_inds = np.intersect1d(top_hits[0], top_hits[1])
# for x in top_hits[2:]:
#     common_inds = np.intersect1d(common_inds, x)
# common_inds = np.sort(common_inds)

##DELETE AFTER, this part skips the intersection and lists everything
# Take as many top-ranked features of the first model as it has non-zero
# scores, and keep their indices in ascending order.
n_nonzero = np.count_nonzero(impact_lists[0])
common_inds = np.sort(ind_lists[0][:n_nonzero])
##
# ind_list_sorted = np.matrix([np.sort(x) for x in ind_list], dtype=np.int32)
# print(str(ind_list_sorted))
# Mean impact of the selected features across all models.
per_model_selected = [scores[common_inds] for scores in impact_lists]
average_impact = np.average(per_model_selected, axis = 0)


In [None]:
# total_impact: impact of each feature summed over all models,
# tabulated next to the feature ids read from id_file.
total_impact = np.sum(np.array(impact_lists), axis=0)
id_df = pd.read_csv(id_file, sep='\t')
prediction_df = pd.DataFrame({'id': id_df['id']})
prediction_df['total_impact'] = total_impact
prediction_df['abs_impact'] = np.absolute(total_impact)


In [None]:
prediction_df

Unnamed: 0,id,total_impact,abs_impact
0,Pf3D7_01_v3:105000,-0.724425,0.724425
1,Pf3D7_01_v3:180000,0.000000,0.000000
2,Pf3D7_01_v3:195000,0.000000,0.000000
3,Pf3D7_01_v3:280000,0.000000,0.000000
4,Pf3D7_01_v3:315000,0.000000,0.000000
...,...,...,...
293,Pf3D7_14_v3:2480000,-1.388514,1.388514
294,Pf3D7_14_v3:2530000,0.000000,0.000000
295,Pf3D7_14_v3:2685000,0.000000,0.000000
296,Pf3D7_14_v3:3035000,0.684280,0.684280


In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np

In [None]:
# we don't need to search for genes here, just need to graph the total impact
n_chr = 14
sig_list = [1725000, 2240000, 1170000, 420000, 1695000]
# create plotly manhattan plot of abs_impact vs id from prediction_df
# ids look like 'Pf3D7_01_v3:105000': chromosome between the underscores,
# position after the colon.
id_text = prediction_df['id'].str
prediction_df['position'] = id_text.split(':').str[-1].astype('int64')
prediction_df['chr'] = id_text.split('_').str[1].astype('int64')
prediction_df['is_sig'] = prediction_df['position'].isin(sig_list)

# One subplot column per chromosome, in order of first appearance.
chromosomes = prediction_df['chr'].unique()
n_subplots = len(chromosomes)
fig = make_subplots(
    rows=1, cols=n_subplots,
    subplot_titles=[f"Chr {chrom}" for chrom in chromosomes],
    specs=[[{"secondary_y": False} for _ in range(n_subplots)]],
    horizontal_spacing=0.05
)

# px.scatter(prediction_df, x='position', y='abs_impact', color='is_sig', hover_data=['total_impact'])


In [None]:
#search for genes
def getChrCode(name):
    """Return the first run of digits enclosed by underscores in ``name``.

    For example ``'Pf3D7_01_v3'`` yields ``'01'`` (as a string, leading zero
    kept). Returns None when ``name`` is not a string or contains no such run.

    The lookup is an explicit match check rather than a bare ``except`` so
    that unrelated problems (a missing ``re`` import, a keyboard interrupt)
    are no longer silently reported as "no chromosome".
    """
    if not isinstance(name, str):
        return None
    found = re.search('_([0-9]+)_', name)
    return found.group(1) if found else None

def makeReq(id, window=5000, timeout=30):
    """Look up the genes located in the window that starts at a feature id.

    ``id`` has the form ``'<contig>:<start>'`` (e.g. ``'Pf3D7_13_v3:1725000'``).
    The chromosome code extracted from the contig name, the start and
    ``start + window`` are substituted into the notebook-level
    ``url_template`` and the resulting URL is fetched.

    Parameters
    ----------
    id : str
        Feature id, ``'<contig>:<start>'``.
    window : int, optional
        Length added to ``start`` to obtain the end point (default 5000).
    timeout : float, optional
        Seconds to wait for the web service before giving up (default 30).
        Without a timeout a stalled connection would block the whole run.

    Returns
    -------
    str or None
        Comma-separated ``'<record id>:<value of second field>'`` entries, one
        per returned record (an empty string when there are no records), or
        None when the chromosome code cannot be extracted, the request fails,
        or the response does not have the expected structure.
    """
    contig, start = id.split(':')
    end = int(start) + window

    chr_code = getChrCode(contig)
    if not chr_code:
        print('bad chr ' + contig)
        return None

    url = url_template.format(chr_code, start, end)
    try:
        res = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        # A single failed lookup must not abort the lookups for all other ids.
        print('error at this one ' + str(e) + ' ' + url)
        return None

    try:
#         res_list = res.json()['response']['recordset']['records'][0]['fields'][1]['value']
        records = res.json()['response']['recordset']['records']
        res_list = [
            '{0}:{1}'.format(rec['id'].split('/')[0], str(rec['fields'][1]['value']))
            for rec in records
        ]
        res_str = ','.join(res_list)
        print(res_str)
        return res_str
    except Exception as e:
        # Report the raw body: calling res.json() again here could raise a
        # second time when the body is not valid JSON.
        print('error at this one ' + str(e) + ' ' + res.text[:500])
        return None
    
import requests, re, json
# Query URL used by makeReq. The trailing backslashes sit inside the string
# literal, so the lines are joined into a single URL without newlines.
# Placeholders: {0} chromosome code, {1} start point, {2} end point.
url_template = 'https://plasmodb.org/plasmo/webservices/GeneQuestions/GenesByLocation.json?\
organismSinglePick=Plasmodium falciparum 3D7&\
chromosomeOptional={0}&\
start_point={1}&\
end_point={2}&\
o-fields=gene_product,gene_name'

# One web request per row of prediction_df; the result is a Series of
# comma-separated gene strings (or None) aligned with prediction_df's index.
results = prediction_df['id'].map(makeReq)

# # makeReq('Pf3D7_13_v3:1725000')

PF3D7_0510100:None
PF3D7_0615400:None
PF3D7_1319400:None,PF3D7_1319500:None
PF3D7_1343400:RAD5
PF3D7_1343700:Kelch13
PF3D7_1418100:LISP1
PF3D7_1431400:SRA,PF3D7_1431500:MAPK1


In [None]:
# #This is the by-genes mode
# def makeGeneReq(id):
#     gene_id = id.split(':')[0]
#     if gene_id.endswith('_UTR'):
#         gene_id = gene_id[:-4]
        
#     url = gene_url_template.format(gene_id)
#     res = requests.get(url)
#     try:
# #             res_list = res.json()['response']['recordset']['records'][0]['fields'][1]['value']

#         res_list = ['{0}:{1}'.format(id, str(x['fields'][0]['value'])) for x in res.json()['response']['recordset']['records']]
#         res_str = ','.join(res_list)
#         print(res_str)
#         return res_str
#     except Exception as e:
#         print('error at this one ' + str(e) + ' ' + str(res.json()))
#         return None


# import requests, re, json
# gene_url_template = 'https://plasmodb.org/plasmo/webservices/GeneQuestions/GeneByLocusTag.json?\
# ds_gene_ids_data={0}&\
# o-fields=gene_product'
# results = prediction_df['id'].map(makeGeneReq)

In [None]:
# Attach the gene annotations and the per-feature column sums of `data`,
# then order the table by absolute impact, largest first.
prediction_df['genes'] = results
feature_totals = np.sum(data, axis=0)
row_positions = [int(label) for label in prediction_df.index.to_numpy()]
prediction_df['counts'] = feature_totals[row_positions]
prediction_df = prediction_df.sort_values('abs_impact', ascending=False)
print(prediction_df)

                      id  average_impact  abs_impact   model_0   model_1  \
idx                                                                        
28    Pf3D7_06_v3:635000        0.556075    0.556075  0.465491  0.637069   
79    Pf3D7_13_v3:805000        0.542593    0.542593  0.466728  0.599847   
24    Pf3D7_05_v3:425000        0.539482    0.539482  0.453976  0.542418   
88   Pf3D7_13_v3:1725000        0.486912    0.486912  0.560584  0.509911   
98    Pf3D7_14_v3:765000        0.420293    0.420293  0.389437  0.355969   
85   Pf3D7_13_v3:1715000        0.409438    0.409438  0.370718  0.379489   
99   Pf3D7_14_v3:1235000        0.299846    0.299846  0.277910  0.319918   

      model_2   model_3   model_4   model_5   model_6   model_7   model_8  \
idx                                                                         
28   0.469721  0.620607  0.562875  0.546396  0.557477  0.570990  0.562770   
79   0.501042  0.516853  0.500853  0.581882  0.563581  0.560515  0.515381   
24   0.

In [None]:
# Write the prediction table to out_file as tab-separated text, index included.
prediction_df.to_csv(out_file, sep='\t', index=True)

In [None]:
# sum1 = weights[0].sum(axis=1)
# print(sum1.shape)
# sum_df = pd.DataFrame({'sum':sum1})
# sum_file = working_dir / 'sums.tsv'
# sum_df.to_csv(sum_file, sep='\t')

In [None]:
# import sys
# class fakemodel():
#     def __init__(self):
#         pass
#     def predict(self, arr):
#         return np.sum(arr, axis=1).reshape(-1, 1)
# data = np.array([[1,0,1,0], [0,0,0,1], [1,1,1,0]])
# z = getPredictions(fakemodel())
# for x in z:
#     print(x)