# Figure 3b: Co 0-->1-->0 Timepoint Graph
This notebook also covers the Co 1-->0 condition (S10b).
## Import Modules

In [1]:
import os
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from turtles.turtles_utils import *

## Set Filepaths

In [3]:
#: Overall directory for TdT project data and analysis. Defaults to the original
#: local path; set the TDT_DIR environment variable to point the notebook at a
#: different copy of the data without editing this cell.
tdt_dir = os.environ.get('TDT_DIR') or '/mnt/c/Users/jonst/Box Sync/All_dNTP_data/'
# Every path below is built by plain string concatenation, so make sure the
# base directory always ends with a path separator (no-op for the default).
tdt_dir = os.path.join(tdt_dir, '')

#: Subdirectory for NGS run
data_dir = tdt_dir + 'Data_and_Results/All_Data_Filtered_Trimmed/Co_010_2_full/'

#: Filename prefixes for 0 control conditions
zero_control_conds = ['Mg_1_FLD0313-250126970',
                      'Mg_2_FLD0314-250134935',
                      'Mg_3_FLD0315-250124990']

#: Filename prefixes for 1 control conditions
one_control_conds = ['Co_1_FLD0316-250137920',
                     'Co_2_FLD0317-250127955',
                     'Co_3_FLD0318-250125993']

#: suffix of R1 fastq file to read in each directory
filename_end = 'trimmed.fq'

#: Output directory for everything this notebook saves (created on demand)
figure_dir = tdt_dir + 'Data_and_Results/Cobalt_010/Co_010_Final_for_Josh/'
# figure_dir ends with '/', so creating it directly targets the same directory
# that os.path.dirname(figure_dir) would name.
os.makedirs(figure_dir, exist_ok=True)

## Load Sequences

In [4]:
# Load the sequences for every sample under data_dir. The result is used below
# as a mapping keyed by sample name (its .keys() are the condition names).
# NOTE(review): the meaning/units of cutoff=5.8 are defined inside
# turtles_utils.read_seqs and are not visible from this notebook — confirm.
seqs_dict = read_seqs(data_dir, filename_end=filename_end, cutoff=5.8)

Loading 010_1_FLD0321-250134934
Read 357058 sequences...

Loading 010_2_FLD0322-250141898
Read 421736 sequences...

Loading 010_3_FLD0323-250127951
Read 364749 sequences...

Loading 010_4_FLD0329-250129958
Read 387757 sequences...

Loading 010_5_FLD0330-250129959
Read 394940 sequences...

Loading 010_6_FLD0331-250138918
Read 461729 sequences...

Loading 10_1_FLD0324-250132935
Read 365783 sequences...

Loading 10_2_FLD0325-250132937
Read 364481 sequences...

Loading 10_3_FLD0326-250131949
Read 461218 sequences...

Loading 10_4_FLD0332-250124988
Read 350123 sequences...

Loading 10_5_FLD0333-250126968
Read 422405 sequences...

Loading 10_6_FLD0334-250137919
Read 388863 sequences...

Loading Co_1_FLD0316-250137920
Read 360681 sequences...

Loading Co_2_FLD0317-250127955
Read 339347 sequences...

Loading Co_3_FLD0318-250125993
Read 375438 sequences...

Loading Mg_1_FLD0313-250126970
Read 367678 sequences...

Loading Mg_2_FLD0314-250134935
Read 403585 sequences...

Loading Mg_3_FLD0315-2501

## Choose Legend Labels for Plotting
### Condition Labels

In [5]:
#: Legend label for each sample-name prefix (the text before the first '_').
#: Matching on the prefix alone, rather than searching the whole name, keeps the
#: digits in the rest of the name (e.g. an ID such as 'FLD0310') from being
#: mistaken for the '010' / '10' condition codes.
prefix_labels = {
    '010': '010 (20,40 min)',
    '10': '10 (40 min)',
    'Co': '1 Control',
    'Mg': '0 Control',
    'NoTdT': 'No TdT Control',
}

condition_dict = {}
# `conditions` is reused by the replicate-label cell, so keep it defined here.
conditions = seqs_dict.keys()
for condition in conditions:
    prefix = condition.split('_', 1)[0]
    if prefix in prefix_labels:
        condition_dict[condition] = prefix_labels[prefix]
    else:
        # Unrecognized samples are reported and left out of condition_dict.
        print('Error: {} not recognized.'.format(condition))

### Replicate Labels

In [6]:
# Replicate label = the first digit found after the first three characters of
# the sample name (e.g. 'Co_1_FLD...' -> '1'). Skipping three characters keeps
# the leading '010' / '10_' condition code from being read as the replicate.
# Only a single character is kept, so replicate numbers >= 10 would be
# truncated to their first digit.
# NOTE(review): for names without a replicate field (e.g. 'NoTdT_FLD0327-...')
# this picks up the first digit of the sample ID instead — confirm that is intended.
_digit_run = re.compile("[0-9]+")  # compiled once, reused for every sample

rep_dict = {}
for condition in conditions:
    name_tail = condition[3:]
    digit_match = _digit_run.search(name_tail)
    if digit_match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(
            'No replicate number found in condition name {!r}'.format(condition))
    rep_dict[condition] = name_tail[digit_match.start()]

## Bin Sequences, Calculate dNTP Frequency, and Transform into Aitchison Space
Note: Binning can take a while.

In [7]:
# Bin every sample's sequences into num_bins=1000 bins and count bases per bin.
# This is the slow step of the notebook; it prints one "<sample> processed"
# line per sample as it goes.
# NOTE(review): the name suggests sequences are length-normalized before
# binning — confirm against turtles_utils.get_norm_len_base_counts.
counts_dict = get_norm_len_base_counts(seqs_dict, num_bins=1000)

010_1_FLD0321-250134934 processed

010_2_FLD0322-250141898 processed

010_3_FLD0323-250127951 processed

010_4_FLD0329-250129958 processed

010_5_FLD0330-250129959 processed

010_6_FLD0331-250138918 processed

10_1_FLD0324-250132935 processed

10_2_FLD0325-250132937 processed

10_3_FLD0326-250131949 processed

10_4_FLD0332-250124988 processed

10_5_FLD0333-250126968 processed

10_6_FLD0334-250137919 processed

Co_1_FLD0316-250137920 processed

Co_2_FLD0317-250127955 processed

Co_3_FLD0318-250125993 processed

Mg_1_FLD0313-250126970 processed

Mg_2_FLD0314-250134935 processed

Mg_3_FLD0315-250124990 processed

NoTdT_FLD0327-250138921 processed



In [8]:
# Turn the binned base counts into base frequencies.
# NOTE(review): presumably per-bin fractions of A/C/G/T for each sample —
# confirm against turtles_utils.calc_norm_len_base_pcts.
pcts_dict = calc_norm_len_base_pcts(counts_dict)

In [9]:
# Transform the base frequencies into Aitchison space for distance calculations.
# NOTE(review): the variable name suggests a centered log-ratio (CLR) transform;
# the exact return structure is defined in turtles_utils — confirm.
clr_data = calc_aitchison_distance(pcts_dict)

## Calculate and Plot Timepoint Results

### Convert Data into Long-Form pandas DataFrame

In [10]:
# Assemble the long-form table: one row per (sample, bin), carrying the
# condition / replicate labels and the distances from the 0 and 1 controls.
data = generate_aitch_df(
    pcts_dict,
    clr_data,
    condition_dict,
    rep_dict,
    zero_control_conds,
    one_control_conds,
)
# Preview the first five rows
data.head(5)

Unnamed: 0,Directory,Condition,Replicate,Bin Number,Aitch Dist (from 0),Aitch Dist (from 1),A % Aitch,C % Aitch,G % Aitch,T % Aitch,...,G % Aitch Diff from 0,T % Aitch Diff from 0,A % Aitch Diff from 1,C % Aitch Diff from 1,G % Aitch Diff from 1,T % Aitch Diff from 1,A %,C %,G %,T %
0,010_1_FLD0321-250134934,"010 (20,40 min)",1,1,0.03065,0.577022,0.463098,-0.41951,0.637786,-0.681373,...,0.003612,0.011224,0.493027,0.181931,0.220314,0.090782,0.342118,0.141535,0.407419,0.108928
1,010_1_FLD0321-250134934,"010 (20,40 min)",1,2,0.03065,0.577022,0.463098,-0.41951,0.637786,-0.681373,...,0.003612,0.011224,0.493027,0.181931,0.220314,0.090782,0.342118,0.141535,0.407419,0.108928
2,010_1_FLD0321-250134934,"010 (20,40 min)",1,3,0.03065,0.577022,0.463098,-0.41951,0.637786,-0.681373,...,0.003612,0.011224,0.493027,0.181931,0.220314,0.090782,0.342118,0.141535,0.407419,0.108928
3,010_1_FLD0321-250134934,"010 (20,40 min)",1,4,0.03065,0.577022,0.463098,-0.41951,0.637786,-0.681373,...,0.003612,0.011224,0.493027,0.181931,0.220314,0.090782,0.342118,0.141535,0.407419,0.108928
4,010_1_FLD0321-250134934,"010 (20,40 min)",1,5,0.03065,0.577022,0.463098,-0.41951,0.637786,-0.681373,...,0.003612,0.011224,0.493027,0.181931,0.220314,0.090782,0.342118,0.141535,0.407419,0.108928


In [11]:
# Add the 'Aitch Fraction' and 'Signal' columns, computed relative to the
# 0 and 1 control samples. `data` is rebound to the result, so re-running this
# cell feeds the already-augmented frame back into calc_signal.
data = calc_signal(
    data,
    zero_control_conds,
    one_control_conds,
)
# Preview the first five rows
data.head(5)

Unnamed: 0,Directory,Condition,Replicate,Bin Number,Aitch Dist (from 0),Aitch Dist (from 1),A % Aitch,C % Aitch,G % Aitch,T % Aitch,...,A % Aitch Diff from 1,C % Aitch Diff from 1,G % Aitch Diff from 1,T % Aitch Diff from 1,A %,C %,G %,T %,Aitch Fraction,Signal
0,010_1_FLD0321-250134934,"010 (20,40 min)",1,1,0.03065,0.577022,0.463098,-0.41951,0.637786,-0.681373,...,0.493027,0.181931,0.220314,0.090782,0.342118,0.141535,0.407419,0.108928,0.050438,0.020502
1,010_1_FLD0321-250134934,"010 (20,40 min)",1,2,0.03065,0.577022,0.463098,-0.41951,0.637786,-0.681373,...,0.493027,0.181931,0.220314,0.090782,0.342118,0.141535,0.407419,0.108928,0.050438,0.020502
2,010_1_FLD0321-250134934,"010 (20,40 min)",1,3,0.03065,0.577022,0.463098,-0.41951,0.637786,-0.681373,...,0.493027,0.181931,0.220314,0.090782,0.342118,0.141535,0.407419,0.108928,0.050438,0.020502
3,010_1_FLD0321-250134934,"010 (20,40 min)",1,4,0.03065,0.577022,0.463098,-0.41951,0.637786,-0.681373,...,0.493027,0.181931,0.220314,0.090782,0.342118,0.141535,0.407419,0.108928,0.050438,0.020502
4,010_1_FLD0321-250134934,"010 (20,40 min)",1,5,0.03065,0.577022,0.463098,-0.41951,0.637786,-0.681373,...,0.493027,0.181931,0.220314,0.090782,0.342118,0.141535,0.407419,0.108928,0.050438,0.020502


In [12]:
# Wide table of Signal: one row per bin, one column per condition. All samples
# sharing a condition label are averaged together (pivot_table's default
# aggfunc is the mean).
# Passing values='Signal' pivots only the column we need. Pivoting every column
# and selecting 'Signal' afterwards is slower and relies on pandas tolerating
# the non-numeric columns (e.g. 'Directory'), which newer versions may reject.
signal_df = data.pivot_table(values='Signal', index='Bin Number', columns='Condition')

In [13]:
# Preview the first rows of the per-condition signal table before exporting it
signal_df.head()

Condition,0 Control,"010 (20,40 min)",1 Control,10 (40 min),No TdT Control
Bin Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2.457525e-18,0.063496,1.0,0.939421,0.51381
2,2.457525e-18,0.063496,1.0,0.939421,0.51381
3,2.457525e-18,0.063496,1.0,0.939421,0.51381
4,2.457525e-18,0.063496,1.0,0.939421,0.51381
5,2.457525e-18,0.063496,1.0,0.939421,0.51381


In [14]:
# Export the per-bin, per-condition signal table next to the generated figures
signal_csv_path = os.path.join(figure_dir, 'signal_df.csv')
signal_df.to_csv(signal_csv_path)

## Rate Data

In [15]:
# Sequence-length distribution for every sample. With max_len=200 the pivoted
# table below has Length rows running from 0 through 200.
len_dists = get_length_dists(seqs_dict, max_len=200)

In [16]:
# Long-form length table with the condition / replicate labels attached; the
# next cell relies on its 'Length', 'Condition' and 'Count' columns.
len_df = generate_length_df(len_dists, condition_dict, rep_dict)

In [17]:
# Wide table of sequence counts: one row per length, one column per condition,
# summed over all samples that share a condition label.
# values='Count' pivots only the column we need instead of summing every column
# (including string labels) and discarding the rest. The string 'sum' replaces
# the Python builtin, which newer pandas versions warn about as an aggfunc.
rates_df = len_df.pivot_table(values='Count', index='Length', columns='Condition', aggfunc='sum')

In [18]:
# Preview the shortest lengths in the per-condition count table
rates_df.head()

Condition,0 Control,"010 (20,40 min)",1 Control,10 (40 min),No TdT Control
Length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,66,264,59,230,20
1,341,1302,330,1240,85
2,487,1757,404,1731,125
3,759,2514,551,2290,215
4,1121,3642,771,3219,120


In [19]:
# Preview the longest lengths (up to max_len) in the per-condition count table
rates_df.tail()

Condition,0 Control,"010 (20,40 min)",1 Control,10 (40 min),No TdT Control
Length,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
196,0,0,0,0,0
197,0,0,0,0,0
198,0,0,0,0,0
199,0,0,0,0,0
200,0,0,0,0,0


In [20]:
# Export the per-length, per-condition count table next to the generated figures
rates_csv_path = os.path.join(figure_dir, 'rates_df.csv')
rates_df.to_csv(rates_csv_path)