# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from os.path import exists
import os
from nap import *

# Data wrangling

### Resources used

Here, we will set the user name (for now, we'll call you Yves, it's a nice name). This is the name of your main folder in the database. Check it yourself [on the database!](https://console.firebase.google.com/u/0/project/dreem-542b7/database/dreem-542b7-default-rtdb/data)

The **tubes** that you choose here will be pulled from the database. Every tube corresponds to a physical tube, also known as an "experiment" during the wet lab part.  

The **constructs** are specific RNA sequences. They are referred to by their name, such as 8594 or 9572, and each tube has the same series of constructs.

A **study** is a group of tubes that are relevant to be studied together. For example, they are all replicates, or the salt concentration was increased along the tubes, etc.

The **pickles** are a dictionary of the tube's names and their respective path+title.

In [None]:
# --- EDIT THIS ZONE ---------------------------------------------------------
user = 'Yves'               # handed to the database loader as `user`
constructs = [8594, 9572]   # constructs plotted one by one in the analysis cells
study = 'tutorial'          # must be one of the keys of `tubes_per_study` below
show_plots = True           # read by the plotting cells when deciding to close figures
# --- END OF EDIT ZONE -------------------------------------------------------

# Local JSON copy of the database.
json_file = 'data/db.json'

# Coverage threshold passed to the cleaning step and to the quality plots.
min_bases_cov = 1000

# Tubes that belong to each study, keyed by study name.
tubes_per_study = {
    'tutorial': ['A6', 'D6'],
    'replicates': ['C5', 'A4', 'F4', 'A6', 'A7'],
    'salt': ['A6', 'B6', 'C6', 'D6', 'E6'],
    'temperature': ['D7', 'E7', 'F7', 'G7', 'H7', 'A8', 'B8', 'C8'],
    'magnesium': ['F6', 'G6', 'H6', 'A7', 'B7', 'C7'],
    '60 mM DMS kinestics': ['D8', 'E8', 'F8', 'G8', 'H8', 'A9'],
}

# Tubes of the selected study; raises KeyError if `study` is not a known key.
tubes = tubes_per_study[study]

### Load the data

In [None]:
# Reuse the local JSON copy of the database when it exists; otherwise pull the
# selected tubes from firebase and cache them at `json_file`.
if exists(json_file):
    df_rough = data_wrangler.load_dict_json(json_file)
else:
    # Create the folder that will actually hold `json_file` (derived from the
    # path itself so the two cannot drift apart). An empty dirname means the
    # file lives in the working directory and nothing has to be created.
    cache_dir = os.path.dirname(json_file)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    df_rough = data_wrangler.load_data_from_firebase(tubes=tubes, user=user)
    data_wrangler.dump_dict_json(JSONFileDict=json_file, df=df_rough)

If everything is normal, so far, a json file was downloaded as `data/db.json`. Now, we'll extract two dataframes from this file, `df_full` for data quality analysis and `df` for data analysis. Check out the difference below.

### Clean and reformat the dataset. 
`df` is used for the analysis. Each construct has more than 1000 reads in every tube.     
`df_full` is used for data quality analysis. It has all constructs above 1000 valid reads for each tube individually.

In [None]:
# Build both working dataframes from the raw data in one call:
# `df` feeds the analysis cells, `df_full` feeds the data-quality plots.
df, df_full = turner_overthrow.clean_dataset(
    df_rough=df_rough,
    tubes=tubes,
    min_bases_cov=min_bases_cov,
)

# Data quality analysis

It's always hard to realize that you were analysing noise. Here, we'll go through a series of plots to check the sanity of the data.

### Show the tube's quantity of valid structures (good indicator of the tube's quality)

In [None]:
# Quality plot: drawn from `df_full`, with the same coverage threshold as the
# cleaning step.
plot.valid_construct_per_tube(df=df_full, min_bases_cov=min_bases_cov)

### Show the tube coverage distribution

In [None]:
# Quality plot: uses `df_full`, the dataframe reserved for data-quality checks.
plot.tube_coverage_distribution(df=df_full)

### Plot the base coverage per construct distribution

In [None]:
# Quality plot: drawn from `df_full`, with the same coverage threshold as the
# cleaning step.
plot.base_coverage_for_all_constructs(
    df=df_full,
    min_bases_cov=min_bases_cov,
)

### Sanity-check construct-wise base coverage plots
Plot randomly picked sequences to check the quality of the data.

In [None]:
# Sanity check on the analysis dataframe `df` (not `df_full`).
plot.random_base_coverage_plot_wise(
    df=df,
    min_bases_cov=min_bases_cov,
)

### Heatmap of the var part coverage

In [None]:
# Heatmap of the "cov_bases_var" column of the analysis dataframe.
plot.heatmap(df=df, column="cov_bases_var")

### Heatmap of the second half coverage

In [None]:
# Heatmap of the "cov_bases_sec_half" column of the analysis dataframe.
plot.heatmap(df=df, column="cov_bases_sec_half")

# NOT SO FAST

These plots only featured 2 tubes. How about adding a few more up there, and plotting the quality plots again?

# Data Analysis

### Explore the data
`utils.get_var_info(df=df, tube=tube, construct=construct)` gives information about the variable part of the given construct of a given tube.

In [None]:
# get_var_info works on a single (tube, construct) pair: take the first of each.
tube = tubes[0]
construct = constructs[0]
var_info = utils.get_var_info(df=df, tube=tube, construct=construct)
# Cross-section on two index levels: paired == True and
# var_structure_comparison == '0'.
var_info.xs((True, '0'), level=('paired', 'var_structure_comparison'))

### Mutation sequence-wise

`plot.mutation_rate(df, tube, construct, plot_type, index, normalize)` plots the mutation rate base-wise for a given construct of a given tube as a barplot. 
Arguments:
- `plot_type` :
    - `'sequence'` : each bar is colored according to the base of the original sequence.
    - `'partition'` : each bar shows the partition of the bases into which this base mutates.
- `index`:
    - `'index'`: each base is identified with its position number
    - `'base'`: each base is identified with its type (A, C, G, T)

Partition type - non normalized

In [None]:
# One partition-type bar plot per (tube, construct) pair, saved under
# data/figs/date/<study>/mut_per_base/partition/<construct>/.
for tube in tubes:
    for construct in constructs:
        plot.mutation_rate(df=df,
                           tube=tube,
                           construct=construct,
                           plot_type='partition',
                           index='base')
        plot.save_fig(path=f"data/figs/date/{study}/mut_per_base/partition/{construct}/",
                      title=f"base_per_base_partition_{tube}_{construct}")
        # Close the current figure once saved unless the user wants to see it.
        # plt.close() must not be handed a bool: it would be read as a figure
        # number (True -> figure 1, False -> figure 0).
        if not show_plots:
            plt.close()

Sequence type

In [None]:
# One sequence-type bar plot per (tube, construct) pair, saved under
# data/figs/date/<study>/mut_per_base/sequence/<construct>/.
for tube in tubes:
    for construct in constructs:
        plot.mutation_rate(df=df,
                           tube=tube,
                           construct=construct,
                           plot_type='sequence',
                           index='index')
        plot.save_fig(path=f"data/figs/date/{study}/mut_per_base/sequence/{construct}/",
                      title=f"base_per_base_sequence_{tube}_{construct}")
        # Close the current figure once saved unless the user wants to see it.
        # plt.close() must not be handed a bool: it would be read as a figure
        # number (True -> figure 1, False -> figure 0).
        if not show_plots:
            plt.close()

### DeltaG plots

In [None]:
# One deltaG plot per tube, saved under data/figs/date/<study>/deltaG/.
for tube in tubes:
    plot.deltaG(df=df, tube=tube)

    plot.save_fig(path=f"data/figs/date/{study}/deltaG/",
                  title=f"deltaG_{tube}")

    # Close the current figure once saved unless the user wants to see it.
    # plt.close() must not be handed a bool: it would be read as a figure
    # number (True -> figure 1, False -> figure 0).
    if not show_plots:
        plt.close()

### Tubes comparison

In [None]:
# One comparison plot per construct across all tubes of the study, saved under
# data/figs/date/comparison/<study>.
for construct in constructs:
    plot.compare_n_tubes(df, tubes, construct)
    plot.save_fig(path=f"data/figs/date/comparison/{study}",
                  title=f"comparison_{study}_{construct}")
    # Close the current figure once saved unless the user wants to see it.
    # plt.close() must not be handed a bool: it would be read as a figure
    # number (True -> figure 1, False -> figure 0).
    if not show_plots:
        plt.close()
    # Progress indicator: print each construct as soon as it is done.
    print(construct, end=' ')

### Save columns to a csv file

In [None]:
# Export a few descriptive columns for the tubes of the current study.
# `title` and `path` are f-strings so that {study} is replaced by the study
# name instead of ending up literally in the file name and folder.
utils.columns_to_csv(df=df,
                     tubes=tubes,
                     columns=['tube', 'construct', 'full_sequence', 'var_sequence', 'mut_bases', 'info_bases'],
                     title=f'about_{study}',
                     path=f'data/figs/date/{study}'
                     )

### Save construct vs deltaG 

In [None]:
def deltaG_vs_construct_to_csv(df, title, path, tubes):
    """Write the construct / deltaG table of the first tube to a CSV file.

    Only the rows whose 'tube' equals ``tubes[0]`` are exported; the other
    entries of ``tubes`` are not read.

    Args:
        df: dataframe with at least the columns 'tube', 'construct',
            'var_deltaG' and 'full_deltaG'.
        title: name of the CSV file, extension included.
        path: folder handed to ``utils.make_path``; the file is written under
            the path that call returns.
        tubes: sequence of tube names, of which only the first one is used.

    Raises:
        IndexError: if ``tubes`` is empty.
    """
    full_path = utils.make_path(path)
    first_tube_rows = df[df['tube'] == tubes[0]]
    table = first_tube_rows[['construct', 'var_deltaG', 'full_deltaG']]
    # drop=True discards the old index in one step and, unlike dropping an
    # 'index' column after the fact, also works when the index carries a name.
    table.reset_index(drop=True).to_csv(f"{full_path}/{title}")
    
# Export the construct / deltaG table into data/figs/date.
deltaG_vs_construct_to_csv(
    df=df,
    title="deltaG_vs_construct.csv",
    path="data/figs/date",
    tubes=tubes,
)

# Process pickles, push and pull to the database (#TODO: add security features to keep them from destroying everything)


# 