## Get started
### Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.colors import LogNorm
from os.path import exists, dirname
import os, sys
import numpy as np
import seaborn as sns
import json
path = os.path.dirname((os.path.abspath('')))
print(path)
sys.path.append(path)
from dreem_nap.study import Study
import yaml

### Configuration
- Load config file and check what's in it
- Configure the notebook

In [None]:
# Read the YAML configuration stored one directory above the notebook
with open('../config.yml', 'r') as config_file:
    cfg = yaml.safe_load(config_file)

# Print every setting, padding each key with underscores up to 30 characters
for key, value in cfg.items():
    print(key, '_' * (30 - len(key)), value)

# Higher DPI gives sharper figures but slower plotting
mpl.rcParams['figure.dpi'] = cfg['mpl_rcParams_figure_dpi']


### Load data
Create and load dataframe of a single study as a demo

In [None]:
path = os.path.dirname((os.path.abspath('')))
print(path)
sys.path.append(path)
from dreem_nap.study import Study

# Describe the demo study: five samples, one Na concentration per sample
salt = Study.from_dict(dict(
    name='salt',
    description='Change the Na concentration',
    samples=['A6', 'B6', 'C6', 'D6', 'E6'],
    label='Na quantity [M]',
    conditions=[0.15, 0.3, 0.6, 1.0, 1.2],
))

# Fill the study from the local files, using the thresholds of the config file
salt.load_df_from_local_files(
    path_to_data=cfg['path_to_data'],
    min_cov_bases=cfg['min_cov_bases'],
)

# Preview the first rows of the loaded dataframe
salt._df.head()

In [None]:
#! usr/bin/env python3

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.colors import LogNorm
from os.path import exists, dirname
import os, sys
import numpy as np
import seaborn as sns
import json

path = '/Users/ymdt/src/dreem_nap/'

sys.path.append(path)
from dreem_nap.manipulator import Manipulator

from dreem_nap.study import Study, util
import yaml
import pickle


#mpl.use('agg')

# Load the configuration file located directly under `path`
with open(f'{path}config.yml', 'r') as config_file:
    cfg = yaml.safe_load(config_file)

# Higher DPI gives sharper figures but slower plotting
mpl.rcParams['figure.dpi'] = cfg['mpl_rcParams_figure_dpi']

# --- Hyper-parameters: which study to load and how to filter it ---
studies = Study.load_studies(cfg['path_to_studies'])
study = Study.from_dict(studies['3UTR_v_5UTR'].__dict__)
study.load_df_from_local_files(
    path_to_data=cfg['path_to_data'],
    min_cov_bases=cfg['min_cov_bases'],
    filter_by='sample',
)
study._df.head()

In [None]:
# Dendrogram settings
samp = [470, 472]
index = list(range(19, 42))  # positions 19..41
base_type = ['A', 'C']
metric = 'euclidean'
mpl.use('agg')  # non-interactive backend: figures are only written to disk

out_dir = '../data/figs/2022-09-14/Lauren/dendrogram'

for s in samp:
    # One dendrogram per index selection: the numeric range above and a sequence motif
    for i in [list(index), 'CACAGTCGAAAGACTGTG']:
        out = study.plot.dendrogram(samp=s, index=i, metric=metric, base_type=base_type,
                                    figsize=(10, 100), dpi=300, p=100)
        out.fig.patch.set_facecolor('white')

        plt.savefig(f'{out_dir}/{s}_{i}_{metric}.png')

        # Store the leaf labels next to the figure, one label per line.
        # The `with` block closes the file; no explicit close() is needed.
        with open(f'{out_dir}/{s}_{i}_{metric}.txt', 'w') as f:
            f.writelines(item + '\n' for item in out.labels)

        # Release the (very tall) figure so figures do not pile up in memory
        plt.close(out.fig)



In [None]:
# Mutation histogram of one (sample, construct) pair, called through two entry points.
# NOTE(review): other cells use `study.plot.mut_histogram`; confirm that the direct
# `study.mut_histogram` call is still supported by the installed dreem_nap version.
study.plot.mut_histogram(470,'3114-O-flank_1=hp7-DB')
study.mut_histogram(470,'3114-O-flank_1=hp7-DB')

In [None]:
import seaborn as sns

# Mutation rates of the selected samples/bases/positions across constructs
data = study.mani.get_col_across_constructs(samp=[470,472], base_type=['A','C'], index=list(range(19,42)), col='mut_rates')

# Order the rows like the dendrogram leaves, reversed.
# `list.reverse()` works in place and returns None, so a reversed copy is used instead;
# this also leaves `out.labels` untouched for later cells.
# NOTE(review): assumes the entries of `out.labels` are index labels of `data` — confirm.
data = data.reindex(out.labels[::-1])

plt.figure(figsize=(10,100))
sns.heatmap(data)

In [None]:
# Show every dendrogram label without its first six characters
[l[6:] for l in out.labels]

In [None]:
import os, sys
path = '/Users/ymdt/src/dreem_nap/'

sys.path.append(path)
from dreem_nap.manipulator import Manipulator

from dreem_nap.study import Study, util

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Create a study from the sample sheet: one condition (DMS concentration) per sample
samples_csv = pd.read_csv('~/src/data/Jordan/samples.csv')

dms = Study.from_dict({'name': 'dms',
                       'description': 'Change the DMS concentration',
                       'samples': list(samples_csv['sample']),
                       # The conditions come from the DMS_conc_mM column, so the label
                       # describes a DMS concentration in mM (was a copy-pasted Na label).
                       'label': 'DMS concentration [mM]',
                       'conditions': list(samples_csv['DMS_conc_mM'])})

# Load data
dms.load_df_from_local_files(path_to_data='/Users/ymdt/src/data/Jordan',
                             min_cov_bases=1000)

In [None]:

df = dms.get_df()
# Ten rows with the largest `worst_cov_bases`, highest first
df.sort_values('worst_cov_bases', ascending=False).iloc[:10]

In [None]:
# Mutation histogram of sample JC1 for one construct
dms.plot.mut_histogram('JC1','299-O-flank_1=hp11')

# Same sample/construct: fetch coverage, mutation rates and the poisson_* columns for A and C bases
dms.mani.get_SCC(samp='JC1', construct='299-O-flank_1=hp11', base_type=['A','C'], cols=['cov_bases','mut_rates','poisson_min','poisson_max', 'poisson_low', 'poisson_high'])

In [None]:


        

import numpy as np 
import matplotlib.pyplot as plt 
import scipy.stats
import pandas as pd

# Sample sizes to evaluate
N = [500,1000,2000,3000,4000,5000, 7000, 10000]
# 16 probabilities evenly spaced from 0 to 0.15, rounded to two decimals
P = [round(a,2) for a in np.linspace(0,0.15,16)]

def compute_conf_interval(p,n, cl=0.95):
    """Chi-square (exact) Poisson confidence interval for a rate ``p`` seen over ``n`` trials.

    The expected count is ``n*p``; the bounds on the count are divided by ``n`` so the
    interval is expressed on the same scale as ``p``.

    Args:
        p: observed rate (e.g. mutation fraction).
        n: number of observations.
        cl: confidence level of the two-sided interval. Defaults to 0.95.

    Returns:
        np.ndarray: ``[lower, upper]`` bounds of the interval.
    """
    # `cl` is a confidence level, so each tail holds (1 - cl) / 2 of the probability.
    # Using cl/2 directly as the tail probability produced a ~5% interval for cl=0.95.
    alpha = 1 - cl
    count = n * p
    # With a zero count the lower chi-square quantile has 0 degrees of freedom; the bound is 0
    lower = 0.0 if p == 0 else 0.5 * scipy.stats.chi2.ppf(alpha / 2, df=2 * count) / n
    upper = 0.5 * scipy.stats.chi2.ppf(1 - alpha / 2, df=2 * (count + 1)) / n
    return np.array([lower, upper])

# Tabulate, for every sample size n (rows) and probability p (columns),
# the value p together with the bounds of its confidence interval.
res = pd.DataFrame(columns=P)
for p in P:
    for n in N:
        # Compute the interval once per (p, n) pair instead of once per bound
        lower, upper = compute_conf_interval(p, n)
        res.loc[n, p] = p
        res.loc[n, str(p)+'_min'] = lower
        res.loc[n, str(p)+'_max'] = upper

# After the transpose: one row per p / p_min / p_max, one column per n
res = res.T
# NOTE(review): nothing is drawn in this cell before the legend and axis labels are set.
plt.legend(['N='+str(a) for a in N])
plt.xlabel('Probability p')
plt.ylabel('Probability p with confidence interval')


In [None]:
import numpy as np 
import matplotlib.pyplot as plt 
  
# Toy data for a grouped bar chart
X = ['Group A','Group B','Group C','Group D']
Ygirls = [10,20,20,40]
Zboys = [20,30,25,30]

# One slot per group on the x axis
X_axis = np.arange(len(X))

# Two bars of width 0.4 per group, shifted left/right of the slot centre
plt.bar(X_axis - 0.2, Ygirls, width=0.4, label='Girls')
plt.bar(X_axis + 0.2, Zboys, width=0.4, label='Boys')

plt.xticks(X_axis, X)
plt.title("Number of Students in each group")
plt.xlabel("Groups")
plt.ylabel("Number of Students")
plt.legend()
plt.show()

In [None]:
import matplotlib

# Global font settings for the figures below.
# 'normal' is not a font family (it is a style/weight value) and only triggers
# "font family not found" warnings; 'sans-serif' is a valid generic family.
font = {'family' : 'sans-serif',
        'weight' : 'bold',
        'size'   : 22}

matplotlib.rc('font', **font)
        

import numpy as np 
import matplotlib.pyplot as plt 
import scipy.stats
import pandas as pd

# Sample sizes to evaluate
N = [1000, 2000, 4000, 8000, 16000]
# Probabilities 0.00, 0.01, ..., 0.09
P = np.array(list(range(10)))/100

def compute_conf_interval(p,n, cl=0.05):
    """Chi-square Poisson interval for a rate ``p`` observed over ``n`` trials.

    Note that, despite its name, ``cl`` is used as the total tail probability:
    ``cl/2`` is placed in each tail, so the default 0.05 yields a 95% interval.

    Args:
        p: observed rate.
        n: number of observations.
        cl: total probability left outside the interval. Defaults to 0.05.

    Returns:
        np.ndarray: ``[lower, upper]`` bounds, on the same scale as ``p``.
    """
    tail = cl / 2
    chi2 = scipy.stats.chi2
    upper = 0.5 * chi2.ppf(1 - tail, df=2 * (p * n + 1)) / n
    if p == 0:
        # Zero count: the lower bound is pinned to 0
        lower = 0
    else:
        lower = 0.5 * chi2.ppf(tail, df=2 * n * p) / n
    return np.array([lower, upper])

# Width of the confidence interval for every (sample size, probability) pair
data = np.zeros((len(N), len(P)))

# Table with, per sample size n (rows), the value p and the bounds of its interval
res = pd.DataFrame(columns=P)
for i, n in enumerate(N):
    for j, p in enumerate(P):
        # Compute the interval once per pair and reuse both bounds
        lower, upper = compute_conf_interval(p, n)
        res.loc[n, p] = p
        res.loc[n, str(p)+'_min'] = lower
        res.loc[n, str(p)+'_max'] = upper
        data[i, j] = upper - lower

# --- Figure 1: interval width, one group of bars per probability ---
X_axis = np.arange(len(P))

fig = plt.figure(figsize =(15, 7))
for i in range(len(N)):
    plt.bar(X_axis - 0.35+0.17*i, data[i], 0.17, label = 'N='+str(N[i]))

plt.xticks(X_axis, P)
plt.xlabel("Probability of mutation p")
plt.ylabel("Confidence interval width")
plt.title("Confidence interval width for different N")
plt.legend()
plt.tight_layout()

# --- Figure 2: p with its confidence interval drawn as error bars ---
# No explicit plt.figure() here: pandas opens its own figure when `figsize` is given,
# so an extra call would only leave an empty figure behind.
res = res.T
values = res.loc[P].astype(float)                                       # rows: p, columns: n
lower_bounds = res.loc[[str(p)+'_min' for p in P]].astype(float).values
upper_bounds = res.loc[[str(p)+'_max' for p in P]].astype(float).values
# Error bars are distances from the bar height, not absolute bounds
lower_err = values.values - lower_bounds
upper_err = upper_bounds - values.values
# pandas expects asymmetric errors shaped (n_columns, 2, n_rows)
yerr = np.array([lower_err.T, upper_err.T]).transpose(1,0,2)
values.plot.bar(figsize=(20,10), yerr=yerr, width=0.9)
plt.legend(['N='+str(a) for a in N])
plt.xlabel('Probability p')
plt.ylabel('Probability p with confidence interval')
plt.title("Confidence interval width for different N")
plt.tight_layout()


In [None]:
# Extract mutation rates, sequence, structure and coverage of one (sample, construct)
# for A/C bases at positions 40..49, then write the result to example.csv
df = study.mani.get_SCC(samp=470,
                        construct='3114-O-flank_1=hp7-DB', 
                        cols=['mut_rates','sequence','structure','cov_bases'],
                        base_type=['A','C'], 
                        index=list(range(40,50))) 
df.to_csv('example.csv')

In [None]:
from dreem_nap import util
sub_lib = 'ART5'
mpl.use('agg') # use this to avoid display issues

# The dataframe does not change inside the loops, so fetch it once
df = study.get_df()
for samp in study.samples:
    for construct in study.constructs:
        selection = df[(df['samp']==samp)&(df['construct']==construct)]
        # Skip (sample, construct) pairs that have no row instead of failing on .iloc[0]
        if selection.empty:
            continue
        # Plot only the constructs that belong to the requested sub-library
        if selection['sub-library'].iloc[0] == sub_lib:
            study.plot.mut_histogram(samp=samp,construct=construct)

In [None]:
# deltaG plot for sample 472 over positions 19..41, with a linear model (a*x+b) passed as a string
study.plot.deltaG_sample(samp=472, 
                         structure='structure', 
                         deltaG='deltaG_min', 
                         max_mutation=0.15, 
                         models=['lambda x,a,b: a*x+b'], 
                         index=list(range(19,42)))



In [None]:
# Distinct values of the 'flank' column
study._df['flank'].unique()

In [None]:
# Mutation histogram of cluster 0 over positions 19..79, with base_paired=True
# evaluated against the 'structure' column
study.plot.mut_histogram(samp=470,
                         construct='3114-O-flank_1=hp7-DB',
                         cluster=0,
                         index=list(range(19,80)),
                         base_paired=True,
                         structure='structure')

In [None]:

# NOTE(review): `salt` is created with samples A6..E6, while sample 470 is used with
# `study` in the other cells — confirm which study this call is meant for.
salt.plot.mut_histogram(samp=470, construct='3114-O-flank_1=hp7-DB', base_paired=False, structure='structure')#, base_type=['A','C'])#index=list(range(19,42)))


In [None]:
# Scratch reproduction of a plotting function body: set its arguments by hand
base_type = list('ACTG')
figsize=(10,5)
colors = {'A':'r','C':'b','G':'y','T':'g'}
samp=470
construct='3114-O-flank_1=hp7-DB'
index='CACAGTCGAAAGACTGTG'

# At cell level locals() is the notebook's global namespace. Work on a copy so that
# dropping 'df' does not delete the global `df`, and do not fail when `df` is undefined.
args = dict(locals())
args.pop('df', None)

fig = plt.figure(figsize=figsize)
ax = plt.axes()
df_hist = pd.DataFrame()




In [None]:
min_base_cov = 1000
studies = Study.load_studies(cfg['path_to_studies'])
# Build the study straight from the class, as the other cells do, instead of
# instantiating a throwaway Study() first
study = Study.from_dict(studies['all'].__dict__)
study.load_df_from_local_files(path_to_data= cfg['path_to_data'], min_cov_bases =min_base_cov, filter_by='sample')

# One deltaG figure per sample (positions 19..41, A/C bases, cluster 0), fitted with a
# sigmoid-shaped model passed as a string
for s in study.samples:
    study.plot.deltaG_sample(samp=s, index = list(range(19,42)), base_type=['A','C'], structure='structure',deltaG='deltaG_min',figsize=(25, 7), grid=True, cluster=0, models=['lambda x, a, b, c: a+2*b*(np.exp(c*x)/(1+np.exp(c*x)))'])
    util.save_fig(f"/Users/ymdt/src/data/figs/Lauren/date/deltaG sample/min_base_cov = {min_base_cov}/index 19-41/{s}.png")
    # Close the saved figure so open figures do not accumulate over the loop
    plt.close()


In [None]:
# For several coverage thresholds: count the rows kept per sample and export
# one mutation histogram per (sample, construct)
print('base_coverage | # constructs 470 | # constructs 472')
for min_base_cov in [5000, 4000, 3000, 2000,  1000, 500 ]:
    # Reload the study with the current threshold
    stu = Study.from_dict(studies['3UTR_v_5UTR'].__dict__)
    stu.load_df_from_local_files(path_to_data= cfg['path_to_data'], min_cov_bases = min_base_cov, filter_by='sample')
    df = stu.get_df()
    print(min_base_cov,sum(df.samp==470), sum(df.samp==472))
    for s in df.samp.unique():
        for c in df[df['samp']==s].construct.unique():
            stu.plot.mut_histogram(samp=s, construct=c, plot_type='index', index = 'CACAGTCGAAAGACTGTG', figsize=(25, 7), grid=True, cluster=0)
            util.save_fig(f"/Users/ymdt/src/data/figs/Lauren/date/mutation histograms/{min_base_cov}/{s}/{c}.png")
            # Close the saved figure so open figures do not accumulate over the loops
            plt.close()


In [None]:
# Sigmoid-shaped model a + 2*b*exp(c*x)/(1+exp(c*x)); the lambda is not bound to a name,
# so this cell only displays the function object
lambda x, a, b, c: a+2*b*(np.exp(c*x)/(1+np.exp(c*x)))

In [None]:
# NOTE(review): `s` and `c` are not defined in this cell; they are whatever values the
# loops of a previously executed cell left behind — set them explicitly for reproducibility.
study.mani.get_SCC(samp=s, construct=c, cols=['mut_rates'], index = 'CACAGTCGAAAGACTGTG', base_type=['A','C'], structure='structure',  cluster=0)

In [None]:
# Plot the mutation histogram over positions 19..41 and keep the plot output
out = study.plot.mut_histogram(samp=470, construct='3114-O-flank_1=hp7-DB', plot_type='index', index = list(range(19,42)), figsize=(25, 7), grid=True)
# Write the plotted data to disk. DataFrame.to_csv returns None when given a path,
# so wrapping it in print() only printed "None".
out.data.to_csv('hi_lauren.csv')

In [None]:
# Gather the mutation rates (positions 19..41) of sample 470, one block per construct.
# The pieces are collected in a list and concatenated once, rather than growing a
# DataFrame with pd.concat at every iteration.
rows = []
for c in study.constructs:
    rows.append(pd.DataFrame(study.mani.get_SCC(samp=470, construct=c, cols=['mut_rates'], index=list(range(19,42))).T))
stack = pd.concat(rows) if rows else pd.DataFrame()
stack.index = study.constructs
stack

## Make plots
Plot this study with different plots. Check out the list of plots in the plot module

### Mutation histogram

In [None]:
# Mutation rate along the sequence of construct 7695 in sample A6
salt.mut_histogram(
    samp='A6',
    construct='7695',
    plot_type='index',
    figsize=(28,4),
)

### DeltaG
DeltaG vs mutation rate for all constructs of a sample

In [None]:
# deltaG plot for sample A6, restricted to A and C bases, with roi_range='all'
salt.deltaG(samp='A6',bases_type=['A','C'], roi_range='all')  # currently bugged due to DREEM, work in progress

### DeltaG_basewise
Plot the mutation rate of each paired-predicted base of the ROI for each construct of a sample, w.r.t the deltaG estimation

In [None]:
# Base-wise deltaG plot for sample A6 at positions 94..97
salt.deltaG_basewise(samp='A6', roi_range=[94,95,96,97])

###  Heatmap
Here is a heatmap of the minimum base coverage across all samples. `column` can be any column that contains a single scalar value per row.

In [None]:
# Heatmap of the 'min_cov_bases' column
salt.heatmap(column='min_cov_bases') 

### Mutation rate vs 1-base_pairing
Plot a mutation rate histogram, a 1-base_pairing probability histogram, and a scatter plot fitting the mutation rate vs 1-base_pairing. 

In [None]:
# currently bugged due to DREEM, work in progress
# Sample A6, construct 7695
salt.mut_rate_vs_base_non_pairing_prob(samp='A6', construct='7695') # shows the mutation rate vs base non-pairing probability

### Base coverage
Plot the base coverage of a specific (sample, construct)

In [None]:
# Base coverage of construct 9572 in sample A6
salt.base_coverage(samp='A6', construct='9572')

### Base coverage for all constructs
Plot the base-coverage of the worst-covered base of the Region of Interest, for each construct. 

In [None]:
# Called with default arguments
salt.base_coverage_ROI_for_all_constructs()


In [None]:
# Called with default arguments
salt.random_9_base_coverage()

In [None]:
# NOTE(review): these are bare calls, not `salt.<name>()` method calls, and none of the
# names is defined or imported in the part of the notebook visible above this cell.
# Presumably this is a checklist of available plots rather than runnable code — confirm.
random_9_base_coverage()
sample_coverage_distribution()
valid_construct_per_sample()
sliding_window_r2_gini()
study_base()
study_sample()
base_wise_mut_vs_prob()
correlation_n_samples()


In [None]:
# Construct 9572, structure='full', at four positions of interest
salt.study_base(construct='9572', structure='full',
                roi_range=[40,63,78,94])

### You can load every study from a file using Study.load_studies()

In [None]:
# Read every study definition, then load the data of each one except 'all_samples'
studies = Study.load_studies(cfg['path_to_studies'])
for study in studies.values():
    if study.name == 'all_samples':
        continue
    study.load_df_from_local_files(
        path_to_data=cfg['path_to_data'],
        min_cov_bases=cfg['min_cov_bases'],
    )

# Preview one of the loaded studies
studies['temperature'].df.head()

### Studies can be called from the dictionary using their name

In [None]:
# Histogram of construct 9572 for the first sample of the 'temperature' study
studies['temperature'].mut_histogram(studies['temperature'].samples[0], '9572', 'index')

In [None]:
# Save one mutation histogram per (study, sample, construct), skipping 'all_samples'
for study in studies.values():
    if study.name == 'all_samples':
        continue
    for s in study.samples:
        for construct in study.constructs:
            study.mut_histogram(s, construct, 'index')
            target = f"data/figs/date/mutation histogram/{study.name}/{s}/{construct}.png"
            util.save_fig(target)
            # Close the figure once it is saved
            plt.close()


In [None]:
class TestStudy(Study):
    """Study subclass carrying an experimental version of ``mut_histogram``."""

    def mut_histogram(self, samp:str, construct:str, plot_type:str, figsize=(35,7))->"matplotlib.axes.Axes":
        """Plot the mutation rate of a specific (sample, construct).

        Args:
            samp: sample of interest.
            construct: construct of interest.
            plot_type: 'index' or 'partition'.
                - 'index' uses bases numbers as index and the original construct bases as colors.
                - 'partition' uses original sequence bases as index and the partition of mutated bases as colors.
            figsize: size of the figure. Defaults to (35,7).

        Raises:
            ValueError: plot_type is neither 'index' nor 'partition'.

        Returns:
            The axes returned by the pandas stacked bar plot.
        """

        # ValueError is a subclass of Exception, so existing `except Exception` callers still work
        if plot_type not in ['index','partition']:
            raise ValueError(f"{plot_type} must be 'index' or 'partition', please check this argument")

        df_use = self.df.set_index(['samp','construct'])

        if plot_type == 'index':  # Plot the mutation rate for each base along the sequence

            mut_per_base = pd.DataFrame({'mut_rates': df_use['mut_rates'].loc[samp, construct]
                                        ,'base':list(df_use['sequence'].loc[samp, construct])})\
                                        .reset_index()\
                                        .set_index(['base', 'index'])
            df_hist = pd.DataFrame()
            df_hist.index = mut_per_base.reset_index()['index']

            bases_present = set(mut_per_base.index.get_level_values('base'))
            for base in ['A','C','G','T']:
                # Start from an all-NaN column, then fill the positions holding this base
                df_hist[base] = pd.Series(dtype=float)
                # A base that is absent from the sequence keeps its NaN column
                # instead of raising a KeyError on the .loc lookup
                if base in bases_present:
                    df_hist[base] = mut_per_base.loc[base]

            ax = df_hist.plot.bar(stacked=True, color=['r','b','y','g'],  figsize=figsize)
            plt.title(f"sample {samp}, construct {construct}")

        else: # 'partition': plot the partition of mutations for each base along the sequence
            df_hist = pd.DataFrame()
            for base in ['A','C','G','T']:
                # The first element of both arrays is skipped before dividing
                df_hist[f"mod_bases_{base}"]  = np.array(df_use[f"mod_bases_{base}"].loc[samp, construct][1:])/df_use['info_bases'].loc[samp, construct][1:]

            df_hist.index = list(df_use['sequence'].loc[samp,construct])

            ax = df_hist.plot.bar(stacked=True, color=['r','b','y','g'], figsize=figsize)

        return ax


# Load configuration
# NOTE(review): this cell reads 'config.yml' from the working directory, while the
# configuration cell near the top of the notebook reads '../config.yml' — confirm the path.
with open('config.yml', 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
# Print every setting, padding each key with underscores up to 30 characters
for k,v in cfg.items():
    print(k,(30-len(k))*'_',v)

mpl.rcParams['figure.dpi'] = cfg['mpl_rcParams_figure_dpi'] # the highest the resolution, the slowest the plotting

# Create a study
# NOTE(review): the study dictionaries in the other cells use the key 'label' where this
# one uses 'title' — confirm which key from_dict expects.
salt = TestStudy().from_dict({'name': 'salt',
                         'description': 'Change the Na concentration', 
                         'samples': ['A6', 'B6', 'C6', 'D6', 'E6'], 
                         'title': 'Na quantity [M]', 
                         'conditions': [0.15, 0.3, 0.6, 1.0, 1.2]})

# Load data
salt.load_df_from_local_files(path_to_data= cfg['path_to_data'], 
                              min_cov_bases= cfg['min_cov_bases'])

# Show the dataframe
salt.df.head()

In [None]:
import pickle
from os import listdir

# The listing is not the last expression of the cell, so its result is not displayed
listdir('../data/DEMULTIPLEXED/')

# SECURITY: pickle.load can execute arbitrary code; only open pickles from a trusted source
with open(f"../data/DEMULTIPLEXED/A4/mh.p",'rb') as f:
    pick =pickle.load(f)
    # List the attributes of the object stored under key '9572'
    print(dir(pick['9572']))


In [None]:
# One 'index' histogram of construct 9572 per sample of the salt study
for s in salt.samples:
    salt.mut_histogram(s, '9572', 'index')

In [7]:
import plotly.graph_objects as go
import numpy as np
 
# Axis samples: x every 2 units, y every 3 units, both over [0, 50)
feature_x = np.arange(0, 50, 2)
feature_y = np.arange(0, 50, 3)

# 2-D grid spanned by the two axes
X, Y = np.meshgrid(feature_x, feature_y)

# Surface to draw
Z = np.cos(X / 2) + np.sin(Y / 4)

# Contour plot of Z over the grid
contour = go.Contour(x=feature_x, y=feature_y, z=Z)
fig = go.Figure(data=contour)

fig.show()



In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import os, sys
import numpy as np
from tqdm.auto import tqdm

sys.path.append('/Users/ymdt/src/dreem_nap/')
from dreem_nap import manipulator 
from dreem_nap.study import Study, util
from dreem_nap.util import *
from itertools import cycle
import plotly.express as px
from scipy.optimize import curve_fit
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from dreem_nap.manipulator import Fit

# Create a study from the sample sheet: one condition (DMS concentration) per sample
samples_csv = pd.read_csv('~/src/data/Jordan/samples.csv')

dms = Study.from_dict({'name': 'dms',
                       'description': 'Change the DMS concentration',
                       'samples': list(samples_csv['sample']),
                       # The conditions come from the DMS_conc_mM column, so the label
                       # describes a DMS concentration in mM (was a copy-pasted Na label).
                       'label': 'DMS concentration [mM]',
                       'conditions': list(samples_csv['DMS_conc_mM'])})

# Load data
dms.load_df_from_local_files(path_to_data='/Users/ymdt/src/data/Jordan',
                             min_cov_bases=1000,
                             filter_by='sample')

# Preview the first three rows
dms._df.head(3)


  from .autonotebook import tqdm as notebook_tqdm
100%|[32m██████████[0m| 410/410 [00:00<00:00, 18588.96construct filtered/s, sample:JC100]
100%|[32m██████████[0m| 409/409 [00:00<00:00, 22927.05construct filtered/s, sample:JC5]
100%|[32m██████████[0m| 409/409 [00:00<00:00, 23834.25construct filtered/s, sample:JC1]
100%|[32m██████████[0m| 410/410 [00:00<00:00, 24092.36construct filtered/s, sample:JC0]


Unnamed: 0,samp,construct,sequence,structure,data_type,num_reads,num_aligned,num_of_mutations,mut_bases,info_bases,...,mod_bases_A,mod_bases_C,mod_bases_G,mod_bases_T,cluster,ROI_start,ROI_stop,mut_rates,worst_cov_bases,min_cov_bases
0,JC100,410-O-flank_1=hp14,GAGCCTTATGATTTCCCGCGCATATGAGGATCACCCATATGCTCCG...,((((....((......((.(((((((.((....)))))))))..))...,DMS,3971,3930,"[563, 936, 956, 632, 387, 168, 76, 34, 8, 6, 0...","[0.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0, 5.0, 2.0, ...","[2772.0, 3537.0, 3615.0, 3670.0, 3674.0, 3675....",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 5.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,169,"[0.0, 0.0, 0.0, 0.00027247956403269756, 0.0005...",2668.0,1000
1,JC100,366-O-flank_1=hp13,GAGCCTTATGATTTCCCGCGCATATGAGGATCACCCATATGCTCTG...,...................(((((((.((....))))))))).(((...,DMS,3156,3121,"[404, 667, 700, 564, 318, 191, 86, 31, 10, 12,...","[0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[2223.0, 2890.0, 2941.0, 2973.0, 2976.0, 2976....",...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,169,"[0.0, 0.0, 0.0, 0.0013454423141607804, 0.00033...",2110.0,1000
2,JC100,45-O-flank_1=hp2,GAGCCTTATGATTTCCCGCGCATATGAGGATCACCCATATGCTCGA...,................((.(((((((.((....))))))))).))....,DMS,6621,6554,"[1132, 1665, 1490, 1047, 524, 216, 69, 25, 18,...","[0.0, 0.0, 0.0, 1.0, 4.0, 1.0, 0.0, 5.0, 2.0, ...","[4599.0, 6025.0, 6161.0, 6245.0, 6248.0, 6248....",...,"[0.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,169,"[0.0, 0.0, 0.0, 0.0001601281024819856, 0.00064...",4410.0,1000


In [36]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import os, sys
import numpy as np
from tqdm.auto import tqdm

sys.path.append('/Users/ymdt/src/dreem_nap/')
from dreem_nap import manipulator 
from dreem_nap.study import Study, util
from dreem_nap.util import *
from itertools import cycle
import plotly.express as px
from scipy.optimize import curve_fit
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from dreem_nap.manipulator import Fit, Manipulator
import plotly

# Create a study from the sample sheet: one condition (DMS concentration) per sample
samples_csv = pd.read_csv('~/src/data/Jordan/samples.csv')

dms = Study.from_dict({'name': 'dms',
                       'description': 'Change the DMS concentration',
                       'samples': list(samples_csv['sample']),
                       # The conditions come from the DMS_conc_mM column, so the label
                       # describes a DMS concentration in mM (was a copy-pasted Na label).
                       'label': 'DMS concentration [mM]',
                       'conditions': list(samples_csv['DMS_conc_mM'])})

# Load data
dms.load_df_from_local_files(path_to_data='/Users/ymdt/src/data/Jordan',
                             min_cov_bases=1000,
                             filter_by='sample')

# Not the last expression of the cell, so this preview is not displayed
dms._df.head(3)

def mut_histogram(df, samp:str, construct:str, cluster:int=0, index='all', base_type:List[str]=['A','C','G','T'], base_paired:bool=None, structure:str=None, show_ci:bool=True, figsize:Tuple[int]=(35,7), title_fontsize=40, xticks_fontsize=10, yticks_fontsize=30, **kwargs)->OutputPlot:
    """Plot the per-position mutation fraction of one sample-construct-cluster as a plotly bar chart.

    The figure is written to ``pop_avg.html`` and opened in the browser.

    Args:
        df (pd.DataFrame): Dataframe holding the study data.
        samp (str): Sample of your sample-construct-cluster.
        construct (str): Construct of your sample-construct-cluster.
        cluster (int, optional): Cluster of your sample-construct-cluster. Defaults to 0.
        index (_type_, optional): Indexes to plot. Defaults to ``'all'``.
        base_type (List[str], optional): Bases type to plot. Defaults to ``['A','C','G','T']``.
        base_paired (bool, optional): Base-pairing prediction to plot. Defaults to None.
        structure (str, optional): Structure to use for base_paired filtering. Defaults to None.
        show_ci (bool, optional): Show confidence interval on the histogram. Defaults to True.
        figsize (Tuple[int], optional): Currently unused by the plotly figure. Defaults to (35,7).
        title_fontsize (int, optional): Currently unused. Defaults to 40.
        xticks_fontsize (int, optional): Currently unused. Defaults to 10.
        yticks_fontsize (int, optional): Currently unused. Defaults to 30.
        **kwargs: Currently unused.

    Returns:
        OutputPlot: Built from the selected data series and the plotly figure.
    """

    # Kept as the very first statement so that locals() holds nothing but the call
    # arguments (presumably what SubDF.from_locals expects — TODO confirm).
    mh = Manipulator(df).get_series(df, SubDF.from_locals(locals()))

    seqs = list(mh.sequence)
    n_bases = len(seqs)
    # One x coordinate per base, so that x, y, colors and tick labels all line up
    xaxis_coordinates = list(range(n_bases))

    # Mutation fraction per position; positions that cannot be computed are shown as 0
    mut_y = []
    for pos in range(n_bases):
        try:
            mut_frac = mh.mut_bases[pos] / mh.info_bases[pos]
        except (ZeroDivisionError, IndexError):
            mut_frac = 0.0
        mut_y.append(mut_frac)

    # Dot-bracket string, or blanks when no structure is available
    if mh.structure is not None:
        db = list(mh.structure)
        # '(' and ')' mark paired bases; '.' marks an unpaired base
        paired = [symbol in ('(', ')') for symbol in db]
    else:
        db = [" "] * n_bases
        paired = [None] * n_bases

    cmap = {"A": "red", "T": "green", "G": "orange", "C": "blue"}  # Color map
    # Color every bar by the base found at that same position
    colors = [cmap[base] for base in seqs]

    hover_attr = pd.DataFrame({'mut_rate':mut_y,
                                'base':seqs,
                                'paired':paired})
    mut_trace = go.Bar(
            x=xaxis_coordinates,
            y=mut_y,
            text=hover_attr,
            marker=dict(color=colors),
            showlegend=False,
            hovertemplate = ''.join(["<b>"+ha+": %{text["+str(i)+"]}<br>" for i, ha in enumerate(hover_attr)]),
        )

    if show_ci:
        mut_trace.update(
           error_y=dict(
                type='data',
                symmetric=False,
                array=mh.poisson_high,
                arrayminus=mh.poisson_low
                )
        )

    mut_fig_layout = go.Layout(
            title=mh.construct,
            xaxis=dict(title="Position"),
            yaxis=dict(title="Fraction", range=[0, 0.1]),
            plot_bgcolor="white"

    )
    mut_fig = go.Figure(data=mut_trace, layout=mut_fig_layout)
    mut_fig.update_yaxes(
            gridcolor='lightgray',
            linewidth=1,
            linecolor='black',
            mirror=True
    )
    mut_fig.update_xaxes(
            linewidth=1,
            linecolor='black',
            mirror=True
    )
    # Tick label = base letter on top of its structure symbol
    mut_fig.update_xaxes(
            tickvals=xaxis_coordinates,
            ticktext=["%s<br>%s" % (x, y) for (x, y) in zip(seqs, db)],
            tickangle=0
    )

    plotly.offline.plot(
            mut_fig, filename= "pop_avg.html", auto_open=True,
    )
    return OutputPlot(mh, mut_fig)
# Build the histogram for one sample/construct; print() shows the repr of the returned object
print(mut_histogram(dms._df, samp='JC100',construct='410-O-flank_1=hp14'))

100%|[32m██████████[0m| 410/410 [00:00<00:00, 20265.68construct filtered/s, sample:JC100]
100%|[32m██████████[0m| 409/409 [00:00<00:00, 20407.20construct filtered/s, sample:JC5]
100%|[32m██████████[0m| 409/409 [00:00<00:00, 21574.17construct filtered/s, sample:JC1]
100%|[32m██████████[0m| 410/410 [00:00<00:00, 20763.63construct filtered/s, sample:JC0]


<dreem_nap.util.OutputPlot object at 0x2abcb4a60>


In [85]:
# DMS_conc_mM of every row belonging to construct 410-O-flank_1=hp14
dms._df[dms._df.construct=='410-O-flank_1=hp14']['DMS_conc_mM']


0      104.0
262     52.0
581     10.4
909      0.0
Name: DMS_conc_mM, dtype: float64