# DMS-MaPseq
**Note: first run of this cell takes a while**


In [2]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../../src')
from util import *
from config import *
import plots
import ipynbname
from study_gen import study

import plotly.graph_objects as go
from plotly.subplots import make_subplots

Reading study from df.feather...
Done reading study from df.feather.


In [29]:
## Change sample and family ##
# `sample` and `family` are the values the rest of this cell compares against
# the 'sample' and 'family' columns of study.df to pick the constructs to plot.
sample = 'IDX-41_S10_L001'
family = 'cb1'
##############################

def compute_wilson_interval(p, n, z = 1.96):
    """Return the Wilson score interval for an observed proportion.

    Args:
        p: Observed proportion (e.g. a per-position mutation rate).
        n: Number of observations the proportion is based on.
        z: Standard-normal quantile that sets the confidence level; the
            default 1.96 corresponds to a two-sided ~95% interval.

    Returns:
        A ``(lower_bound, upper_bound)`` tuple. When ``n`` is a scalar equal
        to 0 the interval is ``(0.0, 1.0)``.
    """
    # Zero observations carry no information. (0, 1) is the limit of the
    # Wilson formula as n -> 0, and returning it avoids a ZeroDivisionError
    # (Python ints) or a silent (nan, nan) with warnings (numpy ints).
    # The guard is restricted to scalars so array-valued `n` behaves as before.
    if np.ndim(n) == 0 and n == 0:
        return (0.0, 1.0)

    z_squared = z * z
    denominator = 1 + z_squared / n
    centre_adjusted_probability = p + z_squared / (2 * n)
    half_width = z * np.sqrt((p * (1 - p) + z_squared / (4 * n)) / n)

    lower_bound = (centre_adjusted_probability - half_width) / denominator
    upper_bound = (centre_adjusted_probability + half_width) / denominator
    return (lower_bound, upper_bound)


def mutation_identity_at_each_position(study, sample, construct, section='full'):
    """Build one bar trace per base with the mutation rate at each position.

    The first row of ``study.df`` matching ``sample``, ``construct`` and
    ``section`` is used. Each position's bar carries an asymmetric error bar
    taken from the Wilson interval of its mutation rate, with the row's
    ``num_aligned`` as the number of observations.

    Args:
        study: Object exposing a ``df`` DataFrame with the columns 'sample',
            'construct', 'section', 'sequence', 'mut_rates' and 'num_aligned'.
        sample: Value to match in the 'sample' column.
        construct: Value to match in the 'construct' column.
        section: Value to match in the 'section' column.

    Returns:
        dict with
            'fig':  list of four ``go.Bar`` traces, in A, C, G, T order;
            'data': DataFrame indexed by the characters of the sequence, one
                    column per base, NaN where the position holds another base.

    Raises:
        IndexError: if no row of ``study.df`` matches the selection.
    """
    # NOTE: the arguments are used as given. An earlier version overwrote them
    # with the first row of study.df, so every call plotted the same row.
    rows = study.df
    matching = rows[(rows['sample'] == sample) & (rows['construct'] == construct) & (rows['section'] == section)]
    data = matching.iloc[0]

    sequence = list(data['sequence'])
    mut_rates = list(data['mut_rates'])
    num_aligned = data['num_aligned']

    # One interval per position, computed once. The bounds are kept in locals
    # rather than written back into the row extracted from study.df.
    intervals = [compute_wilson_interval(p, num_aligned) for p in mut_rates]
    per_position = {
        'mut_rates': mut_rates,
        'err_min': [lower for lower, _ in intervals],
        'err_max': [upper for _, upper in intervals],
    }

    df = pd.DataFrame(index=sequence)
    df_err_min = pd.DataFrame(index=sequence)
    df_err_max = pd.DataFrame(index=sequence)
    color_map = {'A':'red','C':'blue','G':'yellow','T':'green'}
    positions = np.arange(len(sequence))

    stacked_bar = []
    for base in ['A','C','G','T']:
        # Keep a value only where the sequence holds `base`; NaN elsewhere so
        # each position is drawn by exactly one of the four traces.
        for frame, key in zip([df, df_err_min, df_err_max], ['mut_rates', 'err_min', 'err_max']):
            frame[base] = [value if b == base else np.nan for value, b in zip(per_position[key], sequence)]

        stacked_bar.append(
            go.Bar(
                x=positions,
                y=list(df[base]),
                marker_color=color_map[base],
                # Error bars are given as distances from the bar height.
                error_y=dict(
                    type='data',
                    symmetric=False,
                    array=list(df_err_max[base] - df[base]),
                    arrayminus=list(df[base] - df_err_min[base]),
                ),
                showlegend=False,
            )
        )
    return {'fig':stacked_bar, 'data':df}

# Boolean mask for the rows of the chosen sample/family in the 'full' section;
# reused for the construct list and for the data exported with the plot.
in_selection = (study.df['sample']==sample) & (study.df['family']==family) & (study.df['section']=='full')
unique_constructs = study.df[in_selection]['construct'].unique()
n_constructs = len(unique_constructs)

# One subplot row per construct.
subplot_titles = ['Mutation identity at each position - {}'.format(cst) for cst in unique_constructs]
fig = make_subplots(
    rows=n_constructs,
    cols=1,
    vertical_spacing=0.2/n_constructs,
    subplot_titles=subplot_titles,
)

for row, construct in enumerate(unique_constructs, start=1):
    muts_identity = mutation_identity_at_each_position(study, sample, construct)

    for bar_trace in muts_identity['fig']:
        fig.add_trace(bar_trace, row=row, col=1)

    # Label every x tick with the index of the returned data frame.
    tick_labels = list(muts_identity['data'].index)
    fig.update_xaxes(
        tickangle=0,
        tickvals=np.arange(len(muts_identity['data'].index)),
        ticktext=tick_labels,
        tickfont={'size':8},
        row=row,
        col=1,
    )

# Only the first four traces get a legend entry, named A, C, G, T in order.
for trace, name in zip(fig.data[:4], ['A','C','G','T']):
    trace.update(showlegend=True, name=name)

fig.update_yaxes(title='Mutation fraction')
fig.update_layout(barmode='stack', height=500*n_constructs, width=1500)
save_plotly_fig(ipynbname.path(), '[B] Mutation identity at each position/{}/{}'.format(sample, family), fig)

exported_columns = ['sample','construct','mod_bases_A','mod_bases_C','mod_bases_G','mod_bases_T','num_aligned']
plot = {
    'fig': fig,
    'data': study.df[in_selection][exported_columns],
}
plot['fig'].show()

### # Mutations per read
- histogram
- x-axis = # mutations
- y-axis = # reads

Questions:
- cross-check the average mutation rate against the rates in sample.csv


In [None]:

# One histogram subplot per sample found in study.df.
unique_samples = study.df['sample'].unique()
fig = make_subplots(rows=len(unique_samples), cols=1, vertical_spacing=0.4/len(unique_samples),
                    subplot_titles=['Number of mutations per read - {}'.format(sample_name) for sample_name in unique_samples])

# The loop variable is deliberately not called `sample`: a module-level `for`
# rebinds its variable, which would overwrite the notebook-wide `sample`
# selection used by the other cells.
for row, sample_name in enumerate(unique_samples, start=1):
    fig.add_trace(plots.mutations_per_read(study, sample_name), row=row, col=1)

# These calls address every axis of the figure, so one call after the loop
# is enough.
fig.update_yaxes(title='Count')
fig.update_xaxes(dtick=10)

fig.update_layout(autosize=True, height=10000, title='Number of mutation per read across samples')

save_plotly_fig(ipynbname.path(), '[A] Mutations per read', fig)
plot = {
    'fig':fig,
    'data':study.df[study.df['section']=='full'][['sample','construct','num_of_mutations']]
    }
plot['fig'].show()


### Mutation identity at each position
- stacked bar graph (ACUG coloring)
- x-axis = position (number/base/both??)
- y-axis = # reads


In [7]:
## Change sample and family ##
sample = 'IDX-41_S10_L001'
family = 'cb1'
##############################

# Rows of the chosen sample/family in the 'full' section; they provide the
# construct list, the read counts shown in the titles and the exported data.
selected_rows = study.df[
    (study.df['sample']==sample)
    & (study.df['family']==family)
    & (study.df['section']=='full')
]
unique_constructs = selected_rows['construct'].unique()
n_constructs = len(unique_constructs)

# Titles pair each unique construct with the 'num_aligned' values of the
# selected rows, in row order.
subplot_titles = [
    'Mutation identity at each position - {} - {} reads'.format(cst, reads)
    for cst, reads in zip(unique_constructs, selected_rows['num_aligned'])
]
fig = make_subplots(
    rows=n_constructs,
    cols=1,
    vertical_spacing=0.2/n_constructs,
    subplot_titles=subplot_titles,
)

for row, construct in enumerate(unique_constructs, start=1):
    muts_identity = plots.mutation_identity_at_each_position(study, sample, construct)

    for bar_trace in muts_identity['fig']:
        fig.add_trace(bar_trace, row=row, col=1)

    # Label every x tick with the index of the returned data frame.
    tick_labels = list(muts_identity['data'].index)
    fig.update_xaxes(
        tickangle=0,
        tickvals=np.arange(len(muts_identity['data'].index)),
        ticktext=tick_labels,
        tickfont={'size':8},
        row=row,
        col=1,
    )

# Only the first four traces get a legend entry, named A, C, G, T in order.
for trace, name in zip(fig.data[:4], ['A','C','G','T']):
    trace.update(showlegend=True, name=name)

fig.update_yaxes(title='Mutation fraction')
fig.update_layout(barmode='stack', height=500*n_constructs, width=1500)
save_plotly_fig(ipynbname.path(), '[B] Mutation identity at each position/{}/{}'.format(sample, family), fig)

exported_columns = ['sample','construct','mod_bases_A','mod_bases_C','mod_bases_G','mod_bases_T','num_aligned']
plot = {
    'fig': fig,
    'data': selected_rows[exported_columns],
}
plot['fig'].show()

### Mutation fraction at each position 
- bar graph (ACUG coloring)
- x-axis = position (number/base/both??)
- y-axis = mutation fraction


In [4]:
## Change sample and family ##
sample = '01_02_S23_reads'
family = 'hp1'
##############################

# The construct list is taken from every section of the chosen sample/family;
# the section filter is applied only to the data exported with the plot.
in_family = (study.df['sample']==sample) & (study.df['family']==family)
unique_constructs = study.df[in_family]['construct'].unique()
n_constructs = len(unique_constructs)

subplot_titles = ['Mutation fraction at each position - {}'.format(cst) for cst in unique_constructs]
fig = make_subplots(
    rows=n_constructs,
    cols=1,
    vertical_spacing=0.2/n_constructs,
    subplot_titles=subplot_titles,
)

for row, construct in enumerate(unique_constructs, start=1):
    muts_identity = plots.mutation_fraction_at_each_position(study, sample, construct)

    for bar_trace in muts_identity['fig']:
        fig.add_trace(bar_trace, row=row, col=1)

    # Label every x tick with the index of the returned data frame.
    tick_labels = list(muts_identity['data'].index)
    fig.update_xaxes(
        tickangle=0,
        tickvals=np.arange(len(muts_identity['data'].index)),
        ticktext=tick_labels,
        tickfont={'size':8},
        row=row,
        col=1,
    )

# Only the first four traces get a legend entry, named A, C, G, T in order.
for trace, name in zip(fig.data[:4], ['A','C','G','T']):
    trace.update(showlegend=True, name=name)

fig.update_yaxes(title='Mutation fraction')
fig.update_layout(barmode='stack', height=500*n_constructs, width=1500)
save_plotly_fig(ipynbname.path(), '[C] Mutation fraction at each position/{}/{}'.format(sample, family), fig)

exported_columns = ['sample','construct','mut_rates','num_aligned']
plot = {
    'fig': fig,
    'data': study.df[in_family & (study.df['section']=='full')][exported_columns],
}
plot['fig'].show()


### Read coverage per position
- bar graph
- x-axis = position (number/base/both??)
- y-axis = coverage fraction

In [5]:
## Edit sample and family here ##
sample = '01_02_S23_reads'
family = 'hp1'
#################################

# The construct list is taken from every section of the chosen sample/family;
# the section filter is applied only to the data exported with the plot.
in_family = (study.df['sample']==sample) & (study.df['family']==family)
unique_constructs = study.df[in_family]['construct'].unique()
n_constructs = len(unique_constructs)

subplot_titles = ['Read coverage per position - {}'.format(cst) for cst in unique_constructs]
fig = make_subplots(
    rows=n_constructs,
    cols=1,
    vertical_spacing=0.2/n_constructs,
    subplot_titles=subplot_titles,
)

for row, construct in enumerate(unique_constructs, start=1):
    read_coverage = plots.read_coverage_per_position(study, sample, construct)

    for bar_trace in read_coverage['fig']:
        fig.add_trace(bar_trace, row=row, col=1)

# print a legend for each section
# The section names come from the result for the last construct in the loop;
# they are applied, in order, to the first traces of the figure.
section_names = read_coverage['data']['section']
for trace, name in zip(fig.data[:len(section_names)], section_names):
    trace.update(showlegend=True, name=name)

fig.update_yaxes(title='Read coverage')
fig.update_layout(barmode='stack', height=500*n_constructs, width=1300)
save_plotly_fig(ipynbname.path(), '[D] Read coverage per position/{}/{}'.format(sample, family), fig)

exported_columns = ['sample','construct','cov_bases']
plot = {
    'fig': fig,
    'data': study.df[in_family & (study.df['section']=='full')][exported_columns],
}
plot['fig'].show()