## Get started
### Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import os, sys
import numpy as np
from tqdm.auto import tqdm

sys.path.append('/Users/ymdt/src/dreem_nap/')
from dreem_nap.manipulator import Manipulator
from dreem_nap.study import Study, util

# Create a study
# NOTE(review): the paths in this cell are hard-coded to one user's machine;
# adjust them before running the notebook elsewhere.
samples_csv = pd.read_csv('~/src/data/Jordan/samples.csv')

dms = Study.from_dict({'name': 'dms',
                       'description': 'Change the DMS concentration',
                       'samples': list(samples_csv['sample']),
                       # The conditions are read from the 'DMS_conc_mM' column, so the
                       # label names DMS in mM (it previously read 'Na quantity [M]').
                       'label': 'DMS concentration [mM]',
                       'conditions': list(samples_csv['DMS_conc_mM'])})

# Load data
dms.load_df_from_local_files(path_to_data='/Users/ymdt/src/data/Jordan',
                             min_cov_bases=1000,
                             filter_by='sample')

# Last expression of the cell: rendered as the cell output.
dms.get_df().head()

  from .autonotebook import tqdm as notebook_tqdm
100%|[32m██████████[0m| 410/410 [00:00<00:00, 20802.81construct filtered/s, sample:JC100]
100%|[32m██████████[0m| 409/409 [00:00<00:00, 22287.52construct filtered/s, sample:JC5]
100%|[32m██████████[0m| 409/409 [00:00<00:00, 22524.56construct filtered/s, sample:JC1]
100%|[32m██████████[0m| 410/410 [00:00<00:00, 22632.23construct filtered/s, sample:JC0]


Unnamed: 0,samp,construct,sequence,structure,data_type,num_reads,num_aligned,num_of_mutations,mut_bases,info_bases,...,mod_bases_A,mod_bases_C,mod_bases_G,mod_bases_T,cluster,ROI_start,ROI_stop,mut_rates,worst_cov_bases,min_cov_bases
0,JC100,410-O-flank_1=hp14,GAGCCTTATGATTTCCCGCGCATATGAGGATCACCCATATGCTCCG...,((((....((......((.(((((((.((....)))))))))..))...,DMS,3971,3930,"[563, 936, 956, 632, 387, 168, 76, 34, 8, 6, 0...","[0.0, 0.0, 0.0, 1.0, 2.0, 1.0, 0.0, 5.0, 2.0, ...","[2772.0, 3537.0, 3615.0, 3670.0, 3674.0, 3675....",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 5.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,169,"[0.0, 0.0, 0.0, 0.00027247956403269756, 0.0005...",2668.0,1000
1,JC100,366-O-flank_1=hp13,GAGCCTTATGATTTCCCGCGCATATGAGGATCACCCATATGCTCTG...,...................(((((((.((....))))))))).(((...,DMS,3156,3121,"[404, 667, 700, 564, 318, 191, 86, 31, 10, 12,...","[0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...","[2223.0, 2890.0, 2941.0, 2973.0, 2976.0, 2976....",...,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,169,"[0.0, 0.0, 0.0, 0.0013454423141607804, 0.00033...",2110.0,1000
2,JC100,45-O-flank_1=hp2,GAGCCTTATGATTTCCCGCGCATATGAGGATCACCCATATGCTCGA...,................((.(((((((.((....))))))))).))....,DMS,6621,6554,"[1132, 1665, 1490, 1047, 524, 216, 69, 25, 18,...","[0.0, 0.0, 0.0, 1.0, 4.0, 1.0, 0.0, 5.0, 2.0, ...","[4599.0, 6025.0, 6161.0, 6245.0, 6248.0, 6248....",...,"[0.0, 0.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 1.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, ...","[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,169,"[0.0, 0.0, 0.0, 0.0001601281024819856, 0.00064...",4410.0,1000
3,JC100,91-CC-flank_1=hp4,GAGCCTTATGATTTCCCGCGCATATGAGGATCACCCATATGCTCGC...,...................(((((((.((....)))))))))..((...,DMS,2494,2472,"[344, 557, 618, 410, 243, 100, 63, 17, 12, 6, ...","[0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 1.0, 2.0, 0.0, ...","[1764.0, 2246.0, 2294.0, 2318.0, 2321.0, 2321....",...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,169,"[0.0, 0.0, 0.0, 0.0012942191544434857, 0.0, 0....",1697.0,1000
4,JC100,308-O-flank_1=hp11,GAGCCTTATGATTTCCCGCGCATATGAGGATCACCCATATGCTCGC...,...................(((((((.((....)))))))))..((...,DMS,2600,2571,"[526, 705, 574, 332, 158, 47, 30, 3, 3, 1, 0, ...","[0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, ...","[1889.0, 2406.0, 2452.0, 2484.0, 2486.0, 2487....",...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, ...","[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0,0,169,"[0.0, 0.0, 0.0, 0.00040257648953301127, 0.0004...",1735.0,1000


## Make plots
Visualize this study with the available plot types. See the plot module for the full list of plots.

### Mutation histogram

In [2]:
from dreem_nap.manipulator import Fit, Manipulator, SubDF, OutputPlot
import plotly.graph_objects as go
import plotly

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from dreem_nap.manipulator import Fit, Manipulator


# Base coverage for two constructs of sample 'JC100'.
# NOTE(review): this section is titled "Mutation histogram" but the call below
# is `base_coverage` — confirm which plot was intended here.
dms.base_coverage(samp='JC100', 
                  constructs=['366-O-flank_1=hp13','45-O-flank_1=hp2'],
                  savefile='base_coverage.png')


<dreem_nap.util.OutputPlot at 0x17c906ce0>

### DeltaG
DeltaG vs mutation rate for all constructs of a sample

In [None]:
# NOTE(review): `salt` is only created in a later cell of this notebook (the one
# that defines `TestStudy`); run that cell first.
salt.deltaG(samp='A6',bases_type=['A','C'], roi_range='all')  # currently bugged due to DREEM, work in progress

### DeltaG_basewise
Plot the mutation rate of each paired-predicted base of the ROI for each construct of a sample, w.r.t the deltaG estimation

In [None]:
# NOTE(review): `salt` is only created in a later cell of this notebook (the one
# that defines `TestStudy`); run that cell first.
salt.deltaG_basewise(samp='A6', roi_range=[94,95,96,97])

###  Heatmap
Here is a heatmap of the minimum base coverage across all samples. The `column` argument can be any column that contains a single scalar value.

In [None]:
# NOTE(review): `salt` is only created in a later cell of this notebook (the one
# that defines `TestStudy`); run that cell first.
salt.heatmap(column='min_cov_bases') 

### Mutation rate vs 1-base_pairing
Plot a mutation rate histogram, a 1-base_pairing probability histogram, and a scatter plot fitting the mutation rate vs 1-base_pairing. 

In [None]:
# currently bugged due to DREEM, work in progress
# NOTE(review): `salt` is only created in a later cell of this notebook (the one
# that defines `TestStudy`); run that cell first.
salt.mut_rate_vs_base_non_pairing_prob(samp='A6', construct='7695') # shows the mutation rate vs base non-pairing probability

### Base coverage
Plot the base coverage of a specific (sample, construct)

In [None]:
# NOTE(review): `salt` is only created in a later cell of this notebook (the one
# that defines `TestStudy`); run that cell first.
salt.base_coverage(samp='A6', construct='9572')

### Base coverage for all constructs
Plot the base-coverage of the worst-covered base of the Region of Interest, for each construct. 

In [None]:
# NOTE(review): `salt` is only created in a later cell of this notebook (the one
# that defines `TestStudy`); run that cell first.
salt.base_coverage_ROI_for_all_constructs()


In [None]:
# NOTE(review): `salt` is only created in a later cell of this notebook (the one
# that defines `TestStudy`); run that cell first.
salt.random_9_base_coverage()

In [None]:
# NOTE(review): these functions are called bare (no Study object) and none of
# them is defined or explicitly imported in the cells above, so this cell would
# raise NameError on a fresh kernel unless a star-import such as
# `from dreem_nap.util import *` (used in later cells) provides them.
# Presumably this is a checklist of plots still to demo — confirm.
random_9_base_coverage()
sample_coverage_distribution()
valid_construct_per_sample()
sliding_window_r2_gini()
study_base()
study_sample()
base_wise_mut_vs_prob()
correlation_n_samples()


In [None]:
# NOTE(review): `salt` is only created in a later cell of this notebook (the one
# that defines `TestStudy`); run that cell first.
salt.study_base(construct='9572', structure='full',
                roi_range=[40,63,78,94])

### You can load every study from a file using Study.load_studies()

In [None]:
# Load every study described in the studies file, then read the local data of
# each one except the study named 'all_samples'.
# NOTE(review): `cfg` is only loaded from config.yml in a later cell of this
# notebook; run that cell first.
studies = Study.load_studies(cfg['path_to_studies'])
for study in studies.values():
    if study.name == 'all_samples':
        continue
    study.load_df_from_local_files(
        path_to_data=cfg['path_to_data'],
        min_cov_bases=cfg['min_cov_bases'],
    )

# Last expression of the cell: rendered as the cell output.
studies['temperature'].df.head()

### Studies can be called from the dictionary using their name

In [None]:
# Mutation histogram ('index' plot type) of construct '9572' for the first
# sample of the 'temperature' study. Requires `studies` from the cell above.
studies['temperature'].mut_histogram(studies['temperature'].samples[0], '9572', 'index')

In [None]:
# Save one 'index' mutation histogram per (study, sample, construct),
# skipping the study named 'all_samples'.
for study in studies.values():
    if study.name == 'all_samples':
        continue
    for s in study.samples:
        for construct in study.constructs:
            study.mut_histogram(s, construct, 'index')
            util.save_fig(f"data/figs/date/mutation histogram/{study.name}/{s}/{construct}.png")
            # Close the current figure before drawing the next one.
            plt.close()


In [None]:
class TestStudy(Study):
    """Study subclass used in this notebook to prototype ``mut_histogram``."""

    def mut_histogram(self, samp:str, construct:str, plot_type:str, figsize=(35,7))->"plt.Axes":
        """Plot per-base mutation data of a specific (sample, construct) as stacked bars.

        Args:
            samp: sample of interest.
            construct: construct of interest.
            plot_type: 'index' or 'partition'.
                - 'index' uses bases numbers as index and the original construct bases as colors.
                - 'partition' uses original sequence bases as index and the partition of mutated bases as colors.
            figsize: figure size forwarded to ``DataFrame.plot.bar``.

        Returns:
            The matplotlib Axes returned by ``DataFrame.plot.bar``.

        Raises:
            ValueError: if ``plot_type`` is neither 'index' nor 'partition', or if
                'mut_rates' and 'sequence' have different lengths ('index' plot).
        """
        bases = ['A', 'C', 'G', 'T']
        colors = ['r', 'b', 'y', 'g']

        if plot_type not in ('index', 'partition'):
            raise ValueError(f"{plot_type} must be 'index' or 'partition', please check this argument")

        df_use = self.df.set_index(['samp', 'construct'])

        def cell(column):
            # Value stored in `column` for the requested (sample, construct) row.
            return df_use[column].loc[samp, construct]

        if plot_type == 'index':  # Plot the mutation rate for each base along the sequence
            sequence = np.array(list(cell('sequence')), dtype=str)
            mut_rates = np.asarray(cell('mut_rates'), dtype=float)
            if len(mut_rates) != len(sequence):
                raise ValueError(
                    f"mut_rates has {len(mut_rates)} entries but sequence has {len(sequence)} bases")

            # One column per base: the rate where the sequence holds that base,
            # NaN elsewhere. A base absent from the sequence gives an all-NaN
            # column instead of a KeyError.
            df_hist = pd.DataFrame(
                {base: np.where(sequence == base, mut_rates, np.nan) for base in bases},
                index=pd.RangeIndex(len(sequence), name='index'),
            )

            ax = df_hist.plot.bar(stacked=True, color=colors, figsize=figsize)
            ax.set_title(f"sample {samp}, construct {construct}")

        else:  # 'partition': plot the partition of mutations for each base along the sequence
            # NOTE(review): the first entry of mod_bases_* / info_bases is dropped
            # while `sequence` is used in full — presumably those arrays carry one
            # extra leading entry; otherwise the index assignment below fails on a
            # length mismatch. Confirm against the data format.
            info_bases = cell('info_bases')[1:]
            df_hist = pd.DataFrame({
                f"mod_bases_{base}": np.array(cell(f"mod_bases_{base}")[1:]) / info_bases
                for base in bases
            })
            df_hist.index = list(cell('sequence'))

            ax = df_hist.plot.bar(stacked=True, color=colors, figsize=figsize)

        return ax


# Load configuration
# `yaml.safe_load` below needs PyYAML, which no earlier cell imports.
import yaml

with open('config.yml', 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)
# Print each setting as "key ____ value", padding the key to 30 characters.
for k,v in cfg.items():
    print(k,(30-len(k))*'_',v)

mpl.rcParams['figure.dpi'] = cfg['mpl_rcParams_figure_dpi'] # the higher the resolution, the slower the plotting

# Create a study
# `from_dict` is called on the class (as with `Study.from_dict` in the first
# cell) rather than on a throwaway `TestStudy()` instance.
# NOTE(review): this dict uses the key 'title' while the `dms` study cells use
# 'label' — confirm which key the installed dreem_nap version expects.
salt = TestStudy.from_dict({'name': 'salt',
                            'description': 'Change the Na concentration',
                            'samples': ['A6', 'B6', 'C6', 'D6', 'E6'],
                            'title': 'Na quantity [M]',
                            'conditions': [0.15, 0.3, 0.6, 1.0, 1.2]})

# Load data
salt.load_df_from_local_files(path_to_data= cfg['path_to_data'],
                              min_cov_bases= cfg['min_cov_bases'])

# Show the dataframe
salt.df.head()

In [None]:
import pickle
from os import listdir

# List the contents of the demultiplexed data directory. The result is not
# displayed because this is not the last expression of the cell.
listdir('../data/DEMULTIPLEXED/')

# NOTE(review): pickle.load can execute arbitrary code from the file; only open
# pickles from a trusted source.
with open("../data/DEMULTIPLEXED/A4/mh.p", 'rb') as f:
    pick = pickle.load(f)

# Show the attribute names of the object stored under key '9572'.
print(dir(pick['9572']))


In [None]:
# Draw the 'index' mutation histogram of construct '9572' for every sample of
# the `salt` study (created in the TestStudy cell above).
for s in salt.samples:
    salt.mut_histogram(s, '9572', 'index')

In [None]:
import plotly.graph_objects as go
import numpy as np
 
# Sample points along each axis; Z is evaluated on the grid they span.
feature_x = np.arange(0, 50, 2)
feature_y = np.arange(0, 50, 3)

# 2-D coordinate grids built from the two axes
X, Y = np.meshgrid(feature_x, feature_y)

# Values to contour
Z = np.cos(X / 2) + np.sin(Y / 4)

# Build the contour figure and render it
fig = go.Figure(
    data=go.Contour(x=feature_x, y=feature_y, z=Z)
)

fig.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import os, sys
import numpy as np
from tqdm.auto import tqdm

sys.path.append('/Users/ymdt/src/dreem_nap/')
from dreem_nap import manipulator 
from dreem_nap.study import Study, util
from dreem_nap.util import *
from itertools import cycle
import plotly.express as px
from scipy.optimize import curve_fit
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from dreem_nap.manipulator import Fit

# Create a study
# NOTE(review): the paths in this cell are hard-coded to one user's machine;
# adjust them before running the notebook elsewhere.
samples_csv = pd.read_csv('~/src/data/Jordan/samples.csv')

dms = Study.from_dict({'name': 'dms',
                       'description': 'Change the DMS concentration',
                       'samples': list(samples_csv['sample']),
                       # The conditions are read from the 'DMS_conc_mM' column, so the
                       # label names DMS in mM (it previously read 'Na quantity [M]').
                       'label': 'DMS concentration [mM]',
                       'conditions': list(samples_csv['DMS_conc_mM'])})

# Load data
dms.load_df_from_local_files(path_to_data='/Users/ymdt/src/data/Jordan',
                             min_cov_bases=1000,
                             filter_by='sample')

# Last expression of the cell: rendered as the cell output.
dms._df.head(3)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import os, sys
import numpy as np
from tqdm.auto import tqdm

sys.path.append('/Users/ymdt/src/dreem_nap/')
from dreem_nap import manipulator 
from dreem_nap.study import Study, util
from dreem_nap.util import *
from itertools import cycle
import plotly.express as px
from scipy.optimize import curve_fit
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from dreem_nap.manipulator import Fit, Manipulator
import plotly

# Create a study
# NOTE(review): the paths in this cell are hard-coded to one user's machine;
# adjust them before running the notebook elsewhere.
samples_csv = pd.read_csv('~/src/data/Jordan/samples.csv')

dms = Study.from_dict({'name': 'dms',
                       'description': 'Change the DMS concentration',
                       'samples': list(samples_csv['sample']),
                       # The conditions are read from the 'DMS_conc_mM' column, so the
                       # label names DMS in mM (it previously read 'Na quantity [M]').
                       'label': 'DMS concentration [mM]',
                       'conditions': list(samples_csv['DMS_conc_mM'])})

# Load data
dms.load_df_from_local_files(path_to_data='/Users/ymdt/src/data/Jordan',
                             min_cov_bases=1000,
                             filter_by='sample')

# Not the last expression of the cell, so this preview is not displayed.
dms._df.head(3)

# Print the object returned by the mutation histogram of one (sample, construct).
print(dms.mutation_histogram(samp='JC100',construct='410-O-flank_1=hp14',show_ci=False))

In [None]:
# 'DMS_conc_mM' values of every row whose construct is '410-O-flank_1=hp14',
# selected with a single .loc call instead of chained indexing.
dms._df.loc[dms._df['construct'] == '410-O-flank_1=hp14', 'DMS_conc_mM']
