# One-pagers
## Load data

In [35]:
# Reload edited modules automatically before each cell execution.
%load_ext autoreload
%autoreload 2
import sys
# Make the project sources under ../../src importable (relative to the working directory).
sys.path.append('../../src')
import util
# NOTE(review): wildcard import; a later cell uses `np` without importing it,
# so it presumably comes from here -- confirm and prefer explicit imports.
from config import *
import plots
import ipynbname
from study_gen import study
df = study.df # pandas dataframe

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Create a one-pager for a construct

In [48]:
def get_avg_pearson_score_for_construct_in_family(study, sample, construct, section):
    """Average the heatmap scores of ``construct`` against the other rows of its family.

    The family is derived from the construct name, the replicate heatmap data
    for that family is requested from ``plots`` (with ``sample`` passed as both
    replicates), and the column belonging to ``construct`` is averaged over
    every row whose index label does not contain the construct name.
    Presumably the values are pairwise Pearson correlations -- confirm against
    ``plots.sample_replicates_heatmap_per_family``.

    Args:
        study: Study object forwarded to the plotting helper.
        sample: Sample name; used for both entries of the replicate pair.
        construct: Construct name; must be a column of the heatmap data.
        section: Section name forwarded to the plotting helper (e.g. 'MS2').

    Returns:
        str: The mean score formatted with two decimals ('nan' when no other
        row is left to average over).
    """
    family = util.family_from_construct(construct)
    data = plots.sample_replicates_heatmap_per_family(study, [sample, sample], family, section)['data']
    # regex=False: the construct name is a literal identifier. With the default
    # regex matching, names containing metacharacters ('+', '.', '(', ...) would
    # fail to match themselves or match unrelated rows, corrupting the average.
    is_self = data.index.str.contains(construct, regex=False)
    score = data[construct].loc[~is_self].mean()
    return '{:.2f}'.format(score)

In [63]:
%reload_ext autoreload
###############
# Parameters
###############
sample = 'lauren470_S1'
construct = '3042-O-flank_1=hp1-DB'
family = util.family_from_construct(construct)
###############

# Get data: exactly one ROI row is expected for this sample/construct pair.
# The query result keeps its own name so that `data` is only ever the single
# selected row and re-running part of this cell stays well-defined.
matches = study.get_df(sample=sample, section='ROI', construct=construct)
assert len(matches) <= 1, 'More than one sequence found for {} - {}'.format(sample, construct)
assert len(matches) > 0, 'No sequence found for {} - {}'.format(sample, construct)
data = matches.iloc[0]

# Construct info
sequence = data['sequence']
# Named `construct_type` rather than `type` so the builtin type() is not
# shadowed for the rest of the kernel session.
construct_type = 'Canonical base pair'
# Percentage of positions in the sequence that are 'G' or 'C'.
gc_content = '{:.2f}%'.format(100*np.array([base in ['G','C'] for base in sequence]).mean())
print('Construct info \n ------------')
print('Construct: {}'.format(construct))
print('Sequence: {}'.format(sequence))
print('Type: {}'.format(construct_type))
print('GC content: {}'.format(gc_content))
print('')

# Sample info
library = 'TODO' # should be in samples.csv
exp_env = data['exp_env']
# The environment descriptor lives in a different column depending on exp_env.
exp_env_name = 'cell_line' if exp_env == 'in_vivo' else 'buffer'
exp_env_var = data['cell_line'] if exp_env == 'in_vivo' else data['buffer']
DMS = data['DMS_conc_mM']
reaction_time = data['inc_time_tot_secs']
deltaG = data['deltaG']
print('Library: {}'.format(library))
if exp_env == 'in_vivo':
    print('Cell line: {}'.format(exp_env_var))
else:
    print('Buffer: {}'.format(exp_env_var))
print('DMS concentration: {} mM'.format(DMS))
print('Reaction time: {} secs'.format(reaction_time))
print('DeltaG: {}'.format(deltaG))
print('')

# Quality control
num_reads = data['num_aligned']
pearson_R_5_hp = get_avg_pearson_score_for_construct_in_family(study, sample, construct, 'MS2')
pearson_R_3_hp = get_avg_pearson_score_for_construct_in_family(study, sample, construct, 'LAH')
print('Number of reads: {}'.format(num_reads))
print('Pearson R 5 hp: {}'.format(pearson_R_5_hp))
print('Pearson R 3 hp: {}'.format(pearson_R_3_hp))
print('')

## Plots
plots.read_coverage_per_position(study, sample, construct)['fig'].show()
plots.mutation_fraction_at_each_position(study, sample, construct)['fig'].show()
plots.mutation_identity_at_each_position(study, sample, construct)['fig'].show()
plots.mutation_per_read_per_construct(study, sample, construct)['fig'].show()


Construct info 
 -----------------
Construct: 3042-O-flank_1=hp1-DB
Sequence: AAGATATTCGAAAGAATATCTT
Type: Canonical base pair
GC content: 22.73%

Library: TODO
Cell line: HEK293T
DMS concentration: 211.0 mM
Reaction time: 180 secs
DeltaG: -5.9

Number of reads: 1533
Pearson R 5 hp: 0.99
Pearson R 3 hp: 0.97

