# Read me
This template is meant to be a starter for your customized DREEM output data analysis.

- To install this library, please check the installation on the [Git repo](https://github.com/yvesmartindestaillades/NAP).
- To learn how to use this library, please get through the [tutorial](tutorial.ipynb).


# Turner overthrown

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.colors import LogNorm
from os.path import exists, dirname
import os, sys
import numpy as np
import seaborn as sns
import json
from dreem_nap import data_wrangler, data_manip, database, plot, utils
from dreem_nap.study import Study


# Step 1: Data wrangling
### Step 1.1: Define your study and some basics about your project

In [194]:
# Set your root folder for the database (at the moment, keep Yves)
folder = 'Yves'

path_to_data = '../data'

# Pull the firebase
# Firebase credentials file
firebase_credentials_file = f"{path_to_data}/credentials_firebase.json"
with open(firebase_credentials_file) as file:
    firebase_credentials = json.load(file)
database.connect(firebase_credentials)

# Select your study
study_name = 'all samples' 

## Set your base coverage high-pass filter value
min_bases_cov = 1000 

# Set the resolution for the plots
mpl.rcParams['figure.dpi'] = 600 # the highest the resolution, the slowest the plotting
mpl.rcParams["figure.figsize"] = [25,7]
#plt.rcParams["figure.autolayout"] = True

# Depending on the study you select, you'll get a series of samples. You can also create new studies using this dictionary.
# Here's an example.

# Your studies under the shape of a dataframe
df_studies = data_wrangler.load_studies( f"{path_to_data}/samples.csv")
temp = df_studies.to_dict(orient='index')

# Your studies under the shape of a dictionary of Study
studies = {study: Study().from_dict(temp[study])  for study in temp}
print(f"Here are the available studies: {studies.keys()}")

# Load the study that you want
study = studies[study_name]
print(f"Here is your study {study.to_dict()}" )


Couldn't initiate connection to Firebase. Connection might be already initiated.
Here are the available studies: dict_keys(['180 mM DMS', '3.1 DMS', '3.2 DMS', '60 mM DMS', 'PEG1K', 'PEG3350', 'PEG8K', 'RNA titration', 'TO_DO_2', 'all samples', 'magnesium', 'replicates 1', 'replicates 2', 'salt', 'spermidine', 'spermine', 'temperature'])
Here is your study {'name': 'all samples', 'description': 'Simply all of the valid samples', 'samples': ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B9', 'B10', 'C1', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1', 'E2', 'E3', 'E5', 'E6', 'E7', 'E8', 'E9', 'F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'G1', 'G2', 'G3', 'G4', 'G5', 'G6', 'G7', 'G8', 'G9', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'H7', 'H8', 'H9'], 'title': nan, 'conditions': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0

### Step 1.2: Process new pickle files and push them to Firebase
- Select which samples you want to push to Firebase.
To plot automatically arrays of samples, see [tutorial](tutorial.ipynb), section 3.2.
- Process samples and push them to Firebase.

In [195]:
## Pickle files to process and to push to Firebase
pickles_list = []# study.samples # Can be samples if you want to process the samples from your study

pickles = {pickle:  f"{path_to_data}/DREEM/{pickle}/mutation_histos.p" for pickle in pickles_list}

# Indicate the location of your RNA structure file
RNAstructureFile =  f"{path_to_data}/RNAstructureFile.csv"

# Default location for your local database (JSON file)
json_file =  f"{path_to_data}/db.json"

# If the user gives some new pickles files, push them to the firebase, then pull the entire firebase
if len(pickles):
    data_wrangler.push_samples_to_firebase(pickles = pickles,
                        RNAstructureFile = RNAstructureFile,
                        firebase_credentials = firebase_credentials,
                        min_bases_cov = min_bases_cov, 
                        folder=folder)

### Step 1.3: Pull the data from the Firebase and clean/reformat it.
`df` is used for the analysis. Each of the construct have above 1000 reads for each sample.     
`df_full` is used for quality quality analysis. It has all construct above 1000 valid reads for each sample individually.

In [196]:

def clean_dataset(df_database:pd.DataFrame, studies:dict[Study], verbose:bool = True):
    """Process the content of the Firebase into Pandas dataframes.

    Args:
        df_database (pd.Dataframe): the dataframe downloaded from the Firebase.
        studies (dict[Study]): the studies you want in your dataframe. Format is {name:Study} 
        verbose: print relevant information

    Returns:
        pd.DataFrame: A subset of df_database, in which every construct had a good-enough reading quality for each sample.
        pd.DataFrame: The same content as df_database, with an additional 'samples_covered' column, corresponding to the amount of samples for containing this construct.
    """

    # Only keep desired pickle files
    samples = []
    for s in pd.DataFrame.from_dict({study: studies[study].to_dict()  for study in studies}, orient='index')['samples']:
        samples += s
    df_full = df_database[df_database['samp'].isin(samples)]

    # Only keep desired pickle files
    df_full = df_database[df_database['samp'].isin(samples)]

    # Check how many samples reach 1000 reads on each base for a given construct
    df_full['samples_covered'] = pd.Series(dtype=int)
    for construct in df_full.groupby('construct'):
        df_full['samples_covered'].loc[construct[1].index] = construct[1]['full_sequence'].count()

    # Only keep constructs that reach 1000 reads in every sample    
     # Check how many samples reach 1000 reads on each base for a given construct
     
    df = pd.DataFrame()

    for study in studies:
      df_loc = df_full[df_full['samp'].isin(studies[study].samples)].set_index('construct')
    # print(df_loc.groupby('construct').count()['samp'])
      df_loc = df_loc[df_loc.groupby('construct').count()['samp'] == len(studies[study].samples)]
      df = pd.concat((df, df_loc))
    df = df.reset_index() 
    df = df.loc[df.astype(str).drop_duplicates().index]


    number_of_void_dropped = (data_manip.get_construct_attribute(df, 'roi_deltaG' )=='void').apply(int).sum()
    if verbose: print(f"{number_of_void_dropped} constructs were dropped because deltaG was 'void'")
    df = df[df['roi_deltaG'] != 'void']

    df = df.astype(dtype={'samp': str, 'construct':int, 'roi_sequence':str, 'full_sequence':str, 'roi_start_index':int,
    'roi_end_index':int, 'roi_deltaG':float, 'full_deltaG':float,
    'roi_structure_comparison':str, 'full_structure':str, 'data_type':str,
    'num_reads':int, 'num_aligned':int, 'num_of_mutations':object, 'mut_bases':object,
    'info_bases':object, 'del_bases':object, 'ins_bases':object, 'cov_bases':object, 'start':int, 'end':int,
    'mod_bases_A':object, 'mod_bases_C':object, 'mod_bases_G':object, 'mod_bases_T':object,
    'skips_low_mapq':int, 'skips_short_read':int, 'skips_too_many_muts':int,
    'cov_bases_roi':int, 'cov_bases_sec_half':int, 'samples_covered':int,
    'sub-library':str, 'flank':str})

    return df, df_full


#df_database = database.load(study=study, folder=folder)

#data_wrangler.dump_dict_json(JSONFileDict=json_file, df=df_database)
df_database = data_wrangler.json_load(json_file)

# Clean and reformat the dataset
df, df_full = clean_dataset(df_database=df_database,
                                             studies=studies)
print(f"{df.groupby('construct')['samples_covered'].count().count()} constructs have more than {min_bases_cov} reads for each base of their ROI on each sample")


Load from dict-type JSON file
Done!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_full['samples_covered'] = pd.Series(dtype=int)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_full['samples_covered'].loc[construct[1].index] = construct[1]['full_sequence'].count()
  df_loc = df_loc[df_loc.groupby('construct').count()['samp'] == len(studies[study].samples)]


31 constructs were dropped because deltaG was 'void'
66 constructs have more than 1000 reads for each base of their ROI on each sample


In [183]:
#df_out = df_out.drop_duplicates(keep=False)
#for i in df_out.groupby(['samp','construct']):
#    print(i)
#print(df_out.reset_index())#['samples_per_study_covered'])

Unnamed: 0,construct,samp,cov_bases,cov_bases_roi,cov_bases_sec_half,data_type,del_bases,end,flank,full_deltaG,...,roi_end_index,roi_sequence,roi_start_index,roi_structure_comparison,skips_low_mapq,skips_short_read,skips_too_many_muts,start,sub-library,samples_covered
0,167,B9,"[0.0, 1897.0, 1887.0, 1912.0, 1913.0, 1842.0, ...",2234,1297,DMS,"[0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, ...",170,flank_1,-38.6,...,109,CCACCUGUAUAUAUCGGGUCCGAUAUAUACAGGUGG,73,000000000000000000000000000000000000,6379,0,0,1,structured PUM2 hairpin variants,
1,185,B9,"[0.0, 16043.0, 15672.0, 16063.0, 16107.0, 1588...",13905,9596,DMS,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, ...",170,flank_3_partial,-63.0,...,114,UACAAAUCAGUGUAUAUAUGCCCCCAUAUAUACACUGAUUUGUA,70,00000000000000000000000000000000000000000000,9767,0,0,1,structured PUM2 hairpin variants,
2,373,B9,"[0.0, 22991.0, 23015.0, 23007.0, 23057.0, 2298...",3961,38,DMS,"[0.0, 0.0, 0.0, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0, ...",170,flank_1,-14.8,...,101,UGUAUAUAGUAAUAUAUACA,81,10000000000000000001,7387,0,1,1,structured PUM2 hairpin variants,
3,381,B9,"[0.0, 20544.0, 20567.0, 20568.0, 20580.0, 2044...",18446,42,DMS,"[0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, ...",170,flank_2,-19.6,...,103,GCUCGUGAGGAUCACCCAACCGC,80,00111001010000111100011,16301,0,1,1,mismatched MS2 hairpin variants,
4,389,B9,"[0.0, 43598.0, 43075.0, 43735.0, 43720.0, 4320...",43051,1135,DMS,"[0.0, 0.0, 0.0, 0.0, 0.0, 9.0, 0.0, 3.0, 0.0, ...",170,flank_3,-67.5,...,105,UGUUUAUAUCACCCACACAUGUUUAUAU,77,1110011110000000001111001110,3332,0,0,1,functional sites with multiple PUM Binding Sites,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19767,11217,H7,"[0.0, 2946.0, 2941.0, 2950.0, 2949.0, 2907.0, ...",1193,45,DMS,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",170,flank_1,-16.6,...,101,UGUAUAUAUAAUUAUAUACA,81,10000000000000000001,2240,0,0,1,structured PUM2 hairpin variants,
19771,11513,H7,"[0.0, 1992.0, 1980.0, 1986.0, 1990.0, 1935.0, ...",1246,87,DMS,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",170,flank_1,-13.0,...,105,UGUCAUAUAUUUUUUUUUUGUCAUAUAU,77,0111000000000000000000000000,3666,0,0,1,functional sites with multiple PUM Binding Sites,
19774,11775,H7,"[0.0, 4149.0, 4071.0, 4225.0, 4148.0, 4047.0, ...",4096,2666,DMS,"[0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 1.0, 0.0, ...",170,flank_3,-75.4,...,103,GAAUAUGAGGAUCACCCAUUGAC,80,10000000000000000000001,2108,0,0,1,mismatched MS2 hairpin variants,
19776,12079,H7,"[0.0, 38949.0, 38995.0, 38860.0, 38990.0, 3861...",4039,26,DMS,"[0.0, 0.0, 0.0, 0.0, 0.0, 8.0, 0.0, 6.0, 0.0, ...",170,flank_1,-17.8,...,113,UGUAUAUAUUUUCUUUUCUUUUCUUUUCUUUUCUUUAUAUACA,70,1000000000000000000000000000000000000000001,19914,0,0,1,structured PUM2 hairpin variants,


In [138]:
df_temp = pd.DataFrame
for study in studies:
    df_loc = df_full[df_full['samp'].isin(studies[study].samples)]
    df_loc['samples_covered'] =  df_full[df_full['samp'].isin(studies[study].samples)].groupby('construct').count()['samp']
    print(df_loc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_loc['samples_covered'] =  df_full[df_full['samp'].isin(studies[study].samples)].groupby('construct').count()['samp']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_loc['samples_covered'] =  df_full[df_full['samp'].isin(studies[study].samples)].groupby('construct').count()['samp']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

      samp construct                                          cov_bases  \
4046    B9        28  [0.0, 19585.0, 19559.0, 19635.0, 19653.0, 1855...   
4047    B9       108  [0.0, 3033.0, 3014.0, 3007.0, 3038.0, 2915.0, ...   
4048    B9       167  [0.0, 1897.0, 1887.0, 1912.0, 1913.0, 1842.0, ...   
4049    B9       174  [0.0, 15.0, 14.0, 14.0, 13.0, 13.0, 15.0, 15.0...   
4050    B9       179  [0.0, 48459.0, 48456.0, 48444.0, 48522.0, 4806...   
...    ...       ...                                                ...   
13395   G9     12281  [0.0, 5851.0, 5845.0, 5831.0, 5849.0, 5735.0, ...   
13396   G9     12306  [0.0, 19240.0, 19196.0, 19226.0, 19249.0, 1904...   
13397   G9     12361  [0.0, 41647.0, 41566.0, 41604.0, 41682.0, 4077...   
13398   G9     12419  [0.0, 6860.0, 6811.0, 6860.0, 6841.0, 6688.0, ...   
13399   G9     12460  [0.0, 6601.0, 6600.0, 6599.0, 6613.0, 6501.0, ...   

      cov_bases_roi cov_bases_sec_half data_type  \
4046          15313               1108       DM

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_loc['samples_covered'] =  df_full[df_full['samp'].isin(studies[study].samples)].groupby('construct').count()['samp']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_loc['samples_covered'] =  df_full[df_full['samp'].isin(studies[study].samples)].groupby('construct').count()['samp']


      samp construct                                          cov_bases  \
859     A4        28  [0.0, 8993.0, 8997.0, 8999.0, 9016.0, 8746.0, ...   
860     A4       108  [0.0, 5041.0, 5010.0, 4955.0, 5050.0, 4767.0, ...   
861     A4       167  [0.0, 4462.0, 4460.0, 4480.0, 4493.0, 4324.0, ...   
862     A4       179  [0.0, 41206.0, 41216.0, 41210.0, 41257.0, 4053...   
863     A4       185  [0.0, 30498.0, 29972.0, 30605.0, 30639.0, 3016...   
...    ...       ...                                                ...   
13986   H3     12108  [0.0, 19858.0, 19851.0, 19799.0, 19882.0, 1966...   
13987   H3     12281  [0.0, 2577.0, 2582.0, 2573.0, 2580.0, 2534.0, ...   
13988   H3     12306  [0.0, 16828.0, 16836.0, 16845.0, 16849.0, 1665...   
13989   H3     12361  [0.0, 12795.0, 12802.0, 12840.0, 12854.0, 1247...   
13990   H3     12419  [0.0, 2558.0, 2572.0, 2595.0, 2575.0, 2473.0, ...   

      cov_bases_roi cov_bases_sec_half data_type  \
859            2888                479       DM

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_loc['samples_covered'] =  df_full[df_full['samp'].isin(studies[study].samples)].groupby('construct').count()['samp']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_loc['samples_covered'] =  df_full[df_full['samp'].isin(studies[study].samples)].groupby('construct').count()['samp']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.htm

      samp construct                                          cov_bases  \
1499    A7        28  [0.0, 15978.0, 15975.0, 16035.0, 16030.0, 1545...   
1500    A7       108  [0.0, 2906.0, 2896.0, 2876.0, 2909.0, 2825.0, ...   
1501    A7       167  [0.0, 2136.0, 2131.0, 2149.0, 2141.0, 2100.0, ...   
1502    A7       179  [0.0, 38767.0, 38781.0, 38773.0, 38830.0, 3855...   
1503    A7       185  [0.0, 16966.0, 16499.0, 17018.0, 17050.0, 1676...   
...    ...       ...                                                ...   
14555   H6     12281  [0.0, 2218.0, 2216.0, 2219.0, 2220.0, 2181.0, ...   
14556   H6     12306  [0.0, 20850.0, 20834.0, 20882.0, 20874.0, 2064...   
14557   H6     12361  [0.0, 20205.0, 20213.0, 20237.0, 20213.0, 1968...   
14558   H6     12419  [0.0, 4232.0, 4257.0, 4273.0, 4248.0, 4161.0, ...   
14559   H6     12460  [0.0, 3535.0, 3546.0, 3551.0, 3548.0, 3481.0, ...   

      cov_bases_roi cov_bases_sec_half data_type  \
1499          10017                582       DM

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_loc['samples_covered'] =  df_full[df_full['samp'].isin(studies[study].samples)].groupby('construct').count()['samp']


# Step 2: Data quality analysis

It's always hard to realize that you were analysing noise. Here, we'll get through a series a plot to check the data sanity.

### Get the list of samples and constructs:

In [None]:
print(f"samples are: {study.samples}")
print(f"constructs are: {df.construct.unique()}")

### Explore the data
`data_manip.get_roi_info()` gives information about the Region of Interest (ROI) of a (sample, construct) pair.

In [None]:
samp, construct = data_manip.rand_sample_construct(df)

data_manip.get_roi_info(df=df,
                        samp='D7',
                        construct= 381,
                        bases= ['A','G','T', 'C'], #bases you want 
                        structure='full', # which structure prediction, 'full' or 'roi' 
                        overlay=(10, 5) # extend/shrink the roi, default is 0
                       # roi_index= [80, 110]
                        )#.xs((True, '0'),level=('paired','roi_structure_comparison'))    #additional filters

### (sample, construct)-specific base coverage plot

In [None]:
samp, construct = data_manip.rand_sample_construct(df)
plot.base_coverage(df, samp, construct)

### Plot the base coverage per construct distribution

In [None]:
plot.base_coverage_ROI_for_all_constructs(df=df_full)

### Sanity-check construct-wise base coverage plots
Plot randomly picked sequences to check the quality of the data.

In [None]:
plot.random_9_base_coverage(df=df)

### Heatmap of the roi part coverage

In [None]:
plot.heatmap(df = df, 
             column="cov_bases_roi")

### Heatmap of the second half coverage

In [None]:
plot.heatmap(df = df, 
                column="cov_bases_sec_half")

# Step 3: Data analysis
In this part, we know that we read good data, and we want to visualize it through different plots.

### Analysis parameters

Define a limited amount of constructs if that's useful to you

In [None]:
# Display the plots on this notebook? Not recommended if numerous plots
show_plots = True

# Constructs used
a_few_constructs = df.construct.unique()[:3].tolist()
first_construct = df.construct.unique()[0].tolist()
constructs_per_name = {
    'all_construct': df.construct.unique().tolist(),
    str(a_few_constructs) : a_few_constructs,
    str(first_construct): [first_construct]
}

# Select constructs here
constructs_name = str(a_few_constructs)

# Define what you will analyse
constructs = constructs_per_name[constructs_name]

### Histogram of mutation (sample-construct)

`plot.mut_histogam(df, sample, construct, plot_type, index, normalize)` plots the mutation rate base-wise for a given construct of a given sample as a barplot. 
Arguments:
- `plot_type` :
    - `'sequence'` : each bar is colored w.r.t to the base of the original sequence.
    - `'partition'` : each bar shows the partition of into which bases this base mutates.
- `index`:
    - `'index'`: each base is identified with its position number
    - `'base'`: each base is identified with its type (A, C, G, T)

In [None]:
constructs = constructs # Define this beforehand

for construct in constructs:
    for samp in study.samples:
        plot.mut_histogram(df=df,
                           samp=samp,
                           construct=construct,
                           plot_type='index') # Sequence (show the base and their index) or partition (show what the base mutates to)
        utils.save_fig(path=f"data/figs/date/{study.name}/mut_per_base/sequence/{construct}/", 
                    title=f"base_per_base_sequence_{samp}_{construct}")
        plt.close(not show_plots)

### DeltaG plots

In [None]:
for samp in study.samples:
    plot.deltaG(df=df, sample=samp)

    utils.save_fig(path=f"data/figs/date/{study}/deltaG/", 
             title=f"deltaG_{samp}")

    plt.close(not show_plots)

### Samples correlation
Only plot the correlation construct-by-construct

In [None]:
for construct in [constructs[0]]:
        df_global_corr = plot.correlation_n_samples(df, study, [construct])
        plt.title(f"Study: {study.name}")
        utils.save_fig(path=f"data/figs/date/correlation/{study.name}", 
                      title=f"correlation_{study.name}_{construct}")
        plt.close(not show_plots)
        print(construct, end=' ')

### Heatmap of the R value + heatmap of the slope for the previous correlation dataset

In [None]:
for plt_type in ['r_value', 'slope']:
    pivot = df_global_corr.pivot("sample_0","sample_1", plt_type).astype(float)
    f, ax = plt.subplots(figsize=(28, 10))
    sns.heatmap(pivot, annot=False, linewidths=0, ax=ax)#, norm=LinNorm())
    plt.title(plt_type)
    utils.save_fig(path=f"data/figs/date/global_correlation/{study.name}", 
                    title=f"correlation_{plt_type}_{study.name}_all_constructs")
    plt.close(not show_plots)

### Plot the correlation + heatmap of the R value + heatmap of the slope construct by construct

In [None]:
show_plots = False
for _, study in studies:
    samples = study.samples
    for construct in df.construct.unique():
        construct_name = construct
        df_global_corr = plot.correlation_n_samples(df, study, construct)
        plt.title(f"Correlation between samples for study: {study.name}, constructs: {construct_name}")
        utils.save_fig(path=f"data/figs/date/correlation/{study.name}/{construct_name}", 
                        title=f"correlation_fit_{study.name}_{construct_name}")
        plt.close(not show_plots)

        for plt_type in ['r_value', 'slope']:
            pivot = df_global_corr.pivot("sample_0","sample_1", plt_type).astype(float)
            f, ax = plt.subplots(figsize=(28, 10))
            sns.heatmap(pivot, annot=False, linewidths=0, ax=ax)#, norm=LinNorm())
            plt.title(f"{plt_type} of the correlation between samples for study: {study.name}, construct: {construct_name}")
            utils.save_fig(path=f"data/figs/date/correlation/{study.name}/{construct_name}", 
                            title=f"correlation_{plt_type}_{study.name}_{construct_name}")
            plt.close(not show_plots)

### Plot the correlation + heatmap of the R value + heatmap of the slope for a bunch of constructs altogether

In [None]:
heat_map = True

constructs = constructs # this has to be defined somewhere

for _, study in studies.iterrows():
    df_global_corr = plot.correlation_n_samples(df, study, constructs)
    utils.save_fig(path=f"data/figs/date/global_correlation/{study.name}", 
                    title=f"correlation_{study.name}_all_constructs")
    plt.close(not show_plots)

    if heat_map:
        for plt_type in ['r_value', 'slope']:
            pivot = df_global_corr.pivot("sample_0","sample_1", plt_type).astype(float)
            f, ax = plt.subplots(figsize=(28, 10))
            sns.heatmap(pivot, annot=False, linewidths=0, ax=ax, norm=LogNorm())
            utils.save_fig(path=f"data/figs/date/global_correlation/{study.name}", 
                            title=f"correlation_{plt_type}_{study.name}_all_constructs")
            plt.close(not show_plots)

### Temperature vs reactivity plots

In [None]:
plot.study_sample(df, study, scale_x='log',structure='full', overlay=5, figsize=(16,8))
utils.save_fig(path=f"{path_to_data}/figs/date/study_behavior", 
            title=f"{study.name}.png")
plt.close(not show_plots)

In [None]:

for _, study in studies.iterrows():
    study =  Study().from_dict(study.to_dict())
    study.models = ['lambda x,a: np.log(x)+a']
    plot.study_sample(df, study, scale_x='log',structure='full', overlay=5, figsize=(16,8))
    utils.save_fig(path=f"{path_to_data}/figs/date/study_behavior", 
                title=f"{study.name}.png")
    plt.close()

### Base-wise mutation along a study

In [None]:
for construct in [df.construct.unique()[0]]:
    for stu in studies.iterrows():
        study=Study().from_dict(stu[1].to_dict())
        if study.name == 'all samples':
            continue
        plot.study_base(df=df,
                study=Study().from_dict(studies.loc['temperature']),
                construct=9572, 
                scale_x='log', # can be log or lin
                bases=['A','C'],  # bsaes you want in A, C, G, T
                structure='full', # sequence for structure prediction. full or roi. 
                #overlay = 10,  # expand/shrink the roi. Can't expand roi if structure = 'roi'
                base_index=list(range(40,120))) # select your favorite bases. Can't expand roi if structure = 'roi' 
                
        utils.save_fig(path= f"{path_to_data}/figs/date/Base-wise mutation along a study/{study.name}", 
                       title=f"{construct}_{study.name}.png")
        plt.close()

### Save columns to a csv file

In [None]:
data_manip.columns_to_csv(df=df,
                   samples=samples,
                   columns=['sample', 'construct','full_sequence','roi_sequence','mut_bases','info_bases'],
                   title=f"seq_and_reactivity_{study}",
                   path='data/figs/date/{study}')

### Save construct vs deltaG 

In [None]:
data_manip.deltaG_vs_construct_to_csv(df=df,    
                                 title=f"deltaG_vs_construct.csv", 
                                 path = f"data/figs/date/{study}", 
                                 samples=samples)