In [28]:
# impact scores are to be calculated in separate notebooks, impact_score_original and impact_score_neis
# and imported here

import tensorflow as tf
import numpy as np, pandas as pd
from pathlib import Path
from multiprocessing import Pool
import statsmodels
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np


In [29]:
# Plasmodium: read the pre-computed impact scores and assemble the per-segment table.
plasmo_dir = Path('/mnt/d/data/nn_scalar3/output_mod')
impact_raw = np.abs(np.load(plasmo_dir / 'impact_scores_full.npy'))
id_df = pd.read_csv(plasmo_dir / 'prefilter.tsv', sep='\t')

# segment id -> label shown next to the highlighted segments in the figure
sig_map = {
    "Pf3D7_13_v3:1725000": 'PfS4-kelch13',
    "Pf3D7_14_v3:2240000": 'PfS5-6PGD',
    "Pf3D7_07_v3:1170000": 'PfS2-Unknown',
    "Pf3D7_07_v3:420000": 'PfS1-MED14',
    "Pf3D7_13_v3:1695000": 'PfS3-PMT',
}

# Reduce the absolute scores along axis 0, leaving one value per remaining entry.
# NOTE(review): axis 0 is presumably the model/replicate axis -- confirm against
# the notebook that writes impact_scores_full.npy.
mean_impact = impact_raw.mean(axis=0)
min_impact = impact_raw.min(axis=0)

# Ids have the form "Pf3D7_13_v3:1725000": the second "_"-separated field is the
# chromosome number and the text after the last ":" is the position.
prediction_df = pd.DataFrame({'id': id_df['id']}).assign(
    chr=lambda df: df['id'].map(lambda seg_id: int(seg_id.split('_')[1])),
    position=lambda df: df['id'].map(lambda seg_id: int(seg_id.rsplit(':', 1)[-1])),
    mean_impact=mean_impact,
    min_impact=min_impact,
    is_sig=lambda df: df['id'].isin(list(sig_map)),
)

In [30]:
def _get_text_position(label):
    if label == 'PfS1-MED14':
        return 'top center'
    if label == 'NgS1-purE':
        return 'top left'
    # if label == 'Unknown':
    #     return 'bottom left'
    return 'bottom center'

# Plasmodium figure: one subplot column per chromosome;
# row 1 shows mean impact, row 2 shows min impact.

# Column widths are proportional to the position span covered on each chromosome.
# groupby sorts by 'chr', so width_df.index is also the left-to-right column
# order used by the trace loop further down.
width_df = prediction_df.groupby('chr').agg({'position': ['max', 'min']})
width_df.columns = ['max_pos', 'min_pos']
width_df['total'] = width_df.max_pos - width_df.min_pos
width_df['frac'] = width_df['total'] / width_df['total'].sum()

n_subplots = len(width_df)

# A row is "max" when it carries the largest mean impact among rows sharing its id.
# groupby/transform computes this in one pass instead of re-filtering the whole
# frame once per row.
prediction_df['is_max'] = (
    prediction_df['mean_impact']
    == prediction_df.groupby('id')['mean_impact'].transform('max')
)

# assign colors to markers: red only for significant segments at their max
is_highlight = prediction_df['is_sig'] & prediction_df['is_max']
prediction_df['marker_color'] = np.where(is_highlight, 'red', 'blue')

# add text labels for the highlighted (red) ones; everything else gets ''
prediction_df['text_label'] = prediction_df['id'].map(sig_map).where(is_highlight, '')
prediction_df['text_position'] = prediction_df.text_label.apply(_get_text_position)

fig = make_subplots(
    rows=2, cols=n_subplots,
    column_widths=width_df['frac'].to_list(),
    # titles follow the sorted groupby order so they always sit above the
    # column that actually holds that chromosome's traces
    subplot_titles=[f"{chrom}" for chrom in width_df.index],
    x_title='genomic position',
    specs=[[{"secondary_y": False} for _col in range(n_subplots)] for _row in range(2)],
    horizontal_spacing=0.0005,
    shared_yaxes=True,
)

# Legend entries are emitted from a single subplot so each name appears once.
# NOTE(review): the index is hard-coded; with fewer than 13 chromosomes no
# legend entry is produced at all.
legend_subplot_idx = 12

# `chrom` (not `chr`) so the builtin chr() is not shadowed in the kernel namespace
for i, (chrom, rows) in enumerate(prediction_df.groupby('chr')):
    sig = rows.loc[rows['is_sig'] & rows['is_max']]
    non_sig = rows.loc[~rows['is_sig']]
    show_in_legend = (i == legend_subplot_idx)

    # row 1: mean impact
    fig.add_trace(
        go.Scatter(
            x=non_sig['position'],
            y=non_sig['mean_impact'],
            mode='markers',
            showlegend=show_in_legend,
            name='genomic segment, 5kb',
            marker=dict(color='blue', size=3),
        ), row=1, col=i+1,
    )
    fig.add_trace(
        go.Scatter(
            x=sig['position'],
            y=sig['mean_impact'],
            mode='markers+text',
            name='high-impact segment',
            showlegend=show_in_legend,
            text=list(sig.text_label.values),
            textposition=list(sig.text_position.values),
            textfont=dict(size=8),
            marker=dict(color='red', size=3),
        ), row=1, col=i+1,
    )
    # row 2: min impact (never shown in the legend)
    fig.add_trace(
        go.Scatter(
            x=non_sig['position'],
            y=non_sig['min_impact'],
            mode='markers',
            name='variant',
            showlegend=False,
            marker=dict(color='blue', size=3),
        ), row=2, col=i+1,
    )
    fig.add_trace(
        go.Scatter(
            x=sig['position'],
            y=sig['min_impact'],
            mode='markers',
            showlegend=False,
            marker=dict(color='red', size=3),
        ), row=2, col=i+1,
    )

# hide y axis on every column but the first and fix x axis tick labels
for r in [1, 2]:
    for c in range(1, n_subplots+1):
        if c > 1:
            fig.update_yaxes(visible=False, row=r, col=c)
        fig.update_xaxes(tickmode='linear', tick0=500000, dtick=500000, tickfont=dict(
            size=8,
        ), tickangle=90, row=r, col=c)

# draws the blue background behind every other column.
# Axes are numbered row by row, so the subplot at (r, c) uses axis number
# (r - 1) * n_subplots + c. Deriving the number from (r, c) keeps the same
# columns shaded in both rows even when n_subplots is odd.
shaded_axes = [
    (r - 1) * n_subplots + c
    for r in (1, 2)
    for c in range(1, n_subplots + 1, 2)
]
fig.update_layout(shapes=[
    dict(
        type="rect",
        xref=f"x{'' if j == 1 else j} domain",
        yref=f"y{'' if j == 1 else j} domain",
        x0=0, y0=0, x1=1, y1=1,
        fillcolor="blue",
        opacity=0.1,
        layer="below",
        line_width=0,
    ) for j in shaded_axes
])

fig.update_yaxes(title='mean impact', row=1, col=1)
fig.update_yaxes(title='min impact', row=2, col=1)
fig.update_layout(template='simple_white', legend_title_text='Legend', font_family='Arial')
fig.update_annotations(font_size=12)

In [31]:
# number of distinct segment ids in the table (a missing id, if any, counts once)
prediction_df['id'].nunique(dropna=False)

285

In [32]:
# Neisseria: read the per-model impact scores and assemble the per-segment table.
d_path = Path('/mnt/d/data/popnet_paper/')
impact_df = pd.read_csv(
    d_path / 'neis_impact_score.tsv', sep='\t'
).sort_values(by='idx')

# absolute impact, one column per model
impact_raw = np.absolute(impact_df[
    ['model_0', 'model_1', 'model_2', 'model_3', 'model_4']
].values)
# The Plasmodium prefilter.tsv is deliberately not re-read into `id_df` here:
# nothing below uses it, and it made this cell depend on `plasmo_dir` from an
# earlier cell.

# segment position -> label shown next to the highlighted segments in the figure
sig_map = {
    745000: 'NgS1-purE',
    747000: 'NgS2',
    1849000: 'NgS4-C39',
    1599000: 'NgS3-IS110',
}

# reduce across the model columns (axis 1): one value per segment row
mean_impact = np.mean(impact_raw, axis=1)
min_impact = np.min(impact_raw, axis=1)

# The id/position Series keep impact_df's (sorted) index. The numpy arrays come
# from the same sorted frame and are attached by position, so rows stay aligned.
prediction_df = pd.DataFrame({
    'id': impact_df['id'],
    'chr': 1,  # every segment is placed on a single chromosome
    'position': impact_df['id'].apply(lambda x: int(x.split(':')[-1])),
    'mean_impact': mean_impact,
    'min_impact': min_impact,
})
prediction_df['is_sig'] = prediction_df['position'].isin(sig_map.keys())

In [33]:
# copy of plasmo cell above
# adjusted for the neis figure: a single column, row 1 = mean impact, row 2 = min impact

# A row is "max" when it carries the largest mean impact among rows sharing its
# position. groupby/transform computes this in one pass instead of re-filtering
# the whole frame once per row.
prediction_df['is_max'] = (
    prediction_df['mean_impact']
    == prediction_df.groupby('position')['mean_impact'].transform('max')
)

# assign colors to markers: red only for significant segments at their max
is_highlight = prediction_df['is_sig'] & prediction_df['is_max']
prediction_df['marker_color'] = np.where(is_highlight, 'red', 'blue')

# add text labels for the highlighted (red) ones; everything else gets ''
prediction_df['text_label'] = prediction_df['position'].map(sig_map).where(is_highlight, '')
prediction_df['text_position'] = prediction_df.text_label.apply(_get_text_position)

fig = make_subplots(
    rows=2, cols=1,
    x_title='genomic position',
)
rows = prediction_df
sig = rows.loc[rows['is_sig'] & rows['is_max']]
non_sig = rows.loc[~rows['is_sig']]

# row 1: mean impact
fig.add_trace(
    go.Scatter(
        x=non_sig['position'],
        y=non_sig['mean_impact'],
        mode='markers',
        name='genomic segment, 1kb',
        marker=dict(color='blue', size=3),
    ), row=1, col=1,
)
fig.add_trace(
    go.Scatter(
        x=sig['position'],
        y=sig['mean_impact'],
        mode='markers+text',
        name='high-impact segment',
        text=list(sig.text_label.values),
        textposition=list(sig.text_position.values),
        textfont=dict(size=10),
        marker=dict(color='red', size=3),
    ), row=1, col=1,
)
# row 2: min impact (kept out of the legend)
fig.add_trace(
    go.Scatter(
        x=non_sig['position'],
        y=non_sig['min_impact'],
        showlegend=False,
        mode='markers',
        marker=dict(color='blue', size=3),
    ), row=2, col=1,
)
fig.add_trace(
    go.Scatter(
        x=sig['position'],
        y=sig['min_impact'],
        mode='markers',
        showlegend=False,
        marker=dict(color='red', size=3),
    ), row=2, col=1,
)

# fix x axis tick labels on both rows (no axis is hidden in this figure)
for r in [1, 2]:
    fig.update_xaxes(autorangeoptions=dict(minallowed=0), tickmode='linear', tick0=50000, dtick=50000, tickfont=dict(
        size=8,
    ), tickangle=90, row=r, col=1)


fig.update_yaxes(title='mean impact', autorangeoptions=dict(maxallowed=0.09), row=1, col=1)
fig.update_yaxes(title='min impact', row=2, col=1)
fig.update_layout(template='simple_white', legend_title_text='Legend', font_family='Arial')
fig.update_annotations(font_size=12)