# 01__preprocess_mpranalyze_quantify

in this notebook, i take the tidy-formatted counts and re-shape them to be in the input format needed to run MPRAnalyze. i also add some additional controls to serve as negative controls in the cell-type comparison model. counterintuitively, these negative controls are sampled from our positive controls: the null hypothesis is that their activities should not be too different between hESCs and mESCs, since it's the CMV promoter. there are 4 "tiles" of the CMV promoter, and i sample 13 barcodes from each tile 100 times, to create a total of 400 "negative" controls (...from our positive controls). [note: negative controls in the quantification model are just random sequences, expected to have no activity].

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import sys

from scipy.stats import spearmanr

# import utils
sys.path.append("../../../utils")
from plotting_utils import *

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
mpl.rcParams['figure.autolayout'] = False

In [2]:
sns.set(**PAPER_PRESET)
fontsize = PAPER_FONTSIZE

In [3]:
np.random.seed(2019)

## functions

In [4]:
def get_barc_id(row):
    str_split = row.tile_id.split(".")
    return str_split[-1]

In [5]:
def dna_status(row, barc_thresh, perc_barc_thresh, ctrl_elems):
    samp_cols = [x for x in row.index if "samp:" in x]
    vals = row[samp_cols]
    if row.element not in ctrl_elems:
        tot_barcs = len(vals)
        n_barcs_above_thresh = len([x for x in vals if x >= barc_thresh])
        perc_barcs_above_thresh = n_barcs_above_thresh / tot_barcs
        if perc_barcs_above_thresh >= perc_barc_thresh:
            return "good"
        else:
            return "bad"
    else:
        return "good"

In [6]:
def get_ctrl_status(row):
    if pd.isnull(row.tile_type):
        return False
    elif row.tile_type == "RANDOM":
        return True
    else:
        return False

## variables

In [7]:
counts_dir = "../../../data/02__mpra/01__counts"

In [8]:
HUES64_data_f = "%s/HUES64__all_counts.txt" % counts_dir

In [9]:
mESC_data_f = "%s/mESC__all_counts.txt" % counts_dir

In [10]:
index_f = "../../../data/01__design/01__index/TWIST_pool4_v8_final.txt.gz"

## 1. import data

In [11]:
HUES64_data = pd.read_table(HUES64_data_f, sep="\t")
HUES64_data.head()

Unnamed: 0,barcode,dna_1,rep_1,rep_2
0,GTTCGACTATG,246,84.0,280.0
1,ATAACGTGTAA,120,48.0,19.0
2,TGAGCGGGTTC,1368,413.0,481.0
3,AAACAACGTCC,32,14.0,
4,CCTTACGTCAC,1184,1093.0,615.0


In [12]:
mESC_data = pd.read_table(mESC_data_f, sep="\t")
mESC_data.head()

Unnamed: 0,barcode,dna_1,rep_1
0,GTTCGACTATG,246,85.0
1,ATAACGTGTAA,120,
2,TGAGCGGGTTC,1368,379.0
3,AAACAACGTCC,32,2.0
4,CCTTACGTCAC,1184,400.0


In [13]:
index = pd.read_table(index_f, sep="\t")
index_elem = index[["element", "tile_type"]].drop_duplicates()

## 2. merge data w/ index

In [14]:
HUES64_data.columns = ["barcode", "dna_1", "HUES64_rep1", "HUES64_rep2"]
mESC_data.columns = ["barcode", "dna_1", "mESC_rep1"]

In [15]:
all_counts = HUES64_data.merge(mESC_data, on=["barcode", "dna_1"])
all_counts.head()

Unnamed: 0,barcode,dna_1,HUES64_rep1,HUES64_rep2,mESC_rep1
0,GTTCGACTATG,246,84.0,280.0,85.0
1,ATAACGTGTAA,120,48.0,19.0,
2,TGAGCGGGTTC,1368,413.0,481.0,379.0
3,AAACAACGTCC,32,14.0,,2.0
4,CCTTACGTCAC,1184,1093.0,615.0,400.0


In [16]:
df = all_counts.merge(index[["barcode", "element", "tile_type", "tile_id"]], on="barcode")
df["barc_id"] = df.apply(get_barc_id, axis=1).astype(int)
df.head()

Unnamed: 0,barcode,dna_1,HUES64_rep1,HUES64_rep2,mESC_rep1,element,tile_type,tile_id,barc_id
0,GTTCGACTATG,246,84.0,280.0,85.0,CCCCGGACGGGGATGGTCAGCGGCTGCGGCCGTCTGGCACGCGAAC...,WILDTYPE,10590.1.0.0.9,9
1,ATAACGTGTAA,120,48.0,19.0,,GGAGGCTGGTGGGGGCCAGATGTGCTAAAGAGATCCAGATGTGAGA...,WILDTYPE_SINGLE_DELETION,5.5.0.76.5,5
2,TGAGCGGGTTC,1368,413.0,481.0,379.0,TCTGGGGCGGCATCAGTTTACAAGTTTGTCTTAAGATGCCGTGCGG...,RANDOM,596.3,3
3,AAACAACGTCC,32,14.0,,2.0,GGGGCGCGGCGGATTTACGATCCAGTTCACCCCGGCAGGAAACGTT...,WILDTYPE,8091.1.0.0.3,3
4,CCTTACGTCAC,1184,1093.0,615.0,400.0,TCCTTTACGTACACCCACGCTTTATAGTTTACAAAGCGATTTCAAC...,FLIPPED_SINGLE_DELETION,2.21.1.72.1,1


## 3. create separate dfs for dna & rna counts

In [17]:
dna_counts = df[["element", "barcode", "tile_type", "barc_id", "dna_1"]]
rna_counts = df[["element", "barcode", "tile_type", "barc_id", "HUES64_rep1", "HUES64_rep2", "mESC_rep1"]]
rna_counts.head()

Unnamed: 0,element,barcode,tile_type,barc_id,HUES64_rep1,HUES64_rep2,mESC_rep1
0,CCCCGGACGGGGATGGTCAGCGGCTGCGGCCGTCTGGCACGCGAAC...,GTTCGACTATG,WILDTYPE,9,84.0,280.0,85.0
1,GGAGGCTGGTGGGGGCCAGATGTGCTAAAGAGATCCAGATGTGAGA...,ATAACGTGTAA,WILDTYPE_SINGLE_DELETION,5,48.0,19.0,
2,TCTGGGGCGGCATCAGTTTACAAGTTTGTCTTAAGATGCCGTGCGG...,TGAGCGGGTTC,RANDOM,3,413.0,481.0,379.0
3,GGGGCGCGGCGGATTTACGATCCAGTTCACCCCGGCAGGAAACGTT...,AAACAACGTCC,WILDTYPE,3,14.0,,2.0
4,TCCTTTACGTACACCCACGCTTTATAGTTTACAAAGCGATTTCAAC...,CCTTACGTCAC,FLIPPED_SINGLE_DELETION,1,1093.0,615.0,400.0


In [18]:
dna_counts = dna_counts.sort_values(by=["element", "barc_id"])
rna_counts = rna_counts.sort_values(by=["element", "barc_id"])

## 4. sum up all dna & rna counts, across all elements, for library depth correction

In [19]:
dna_counts_elem = dna_counts.groupby(["element", "tile_type"])["dna_1"].agg("sum").reset_index()
dna_counts_elem = dna_counts_elem[["element", "dna_1"]]
dna_counts_elem.set_index("element", inplace=True)
dna_counts_elem.head()

Unnamed: 0_level_0,dna_1
element,Unnamed: 1_level_1
AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCCGGAAGTCCCTCCGCGGTGACGAGCACGGCGGAAGTGGGTTCAATGCAGCTCCCCGAAGAACTGTCTCACTCCCGCTCGCCTGACTTCTGGATGGGAGG,6356
AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAGGGGAAGTTTCAAAGGGTGTGCCGGGGACCGGGGAAGAGTCTCATTCTCATGAGTCAGCGGATCCGGCCCAGTGTGACTTCACTGCTTCCCCAGAAGAG,992
AAAAAAAAAAAAGAGGAGAAATAGATTGTTACCTTATATTATTTAAACTTTCAAATGTGCTAGGGTTCCTGGAATTTGGAGAGGGAACCGAAAGGGTTTTATGGTTCTTGGGAGACAGCAGAGCACAAAGAGCCAGGGGGTGGA,1052
AAAAAAAAAAAAGTGGGGGTGGATCGGGCGTGCCGGTAGGGAACCCGCGGCAGGGGGCGGCTCTGCTCCCCAGCAGGGCGTGGGCCGGGCGAGGTCCTCCGCGCAGCGCACGGTGCCAACAATAGGCTGTTGTGGAAGGAGGCA,282
AAAAAAAAAACCGGCAAAATGTCCTTTTCCTTGTTTTGAAAAGACTGGAAAATTCATCCCTGCAACCTTCCCTCCCATTTCACTGGTCAGAGTAAAAATTGGAAGTAGGAAAATTAGTACCACCACATCCTTTGAGTCAGAGAC,4922


In [20]:
rna_counts_elem = rna_counts.groupby(["element", "tile_type"])["HUES64_rep1", "HUES64_rep2", "mESC_rep1"].agg("sum").reset_index()
rna_counts_elem = rna_counts_elem[["element", "HUES64_rep1", "HUES64_rep2", "mESC_rep1"]]
rna_counts_elem.set_index("element", inplace=True)
rna_counts_elem.head()

Unnamed: 0_level_0,HUES64_rep1,HUES64_rep2,mESC_rep1
element,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCCGGAAGTCCCTCCGCGGTGACGAGCACGGCGGAAGTGGGTTCAATGCAGCTCCCCGAAGAACTGTCTCACTCCCGCTCGCCTGACTTCTGGATGGGAGG,18480.0,17053.0,16712.0
AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAGGGGAAGTTTCAAAGGGTGTGCCGGGGACCGGGGAAGAGTCTCATTCTCATGAGTCAGCGGATCCGGCCCAGTGTGACTTCACTGCTTCCCCAGAAGAG,338.0,824.0,469.0
AAAAAAAAAAAAGAGGAGAAATAGATTGTTACCTTATATTATTTAAACTTTCAAATGTGCTAGGGTTCCTGGAATTTGGAGAGGGAACCGAAAGGGTTTTATGGTTCTTGGGAGACAGCAGAGCACAAAGAGCCAGGGGGTGGA,288.0,282.0,348.0
AAAAAAAAAAAAGTGGGGGTGGATCGGGCGTGCCGGTAGGGAACCCGCGGCAGGGGGCGGCTCTGCTCCCCAGCAGGGCGTGGGCCGGGCGAGGTCCTCCGCGCAGCGCACGGTGCCAACAATAGGCTGTTGTGGAAGGAGGCA,138.0,125.0,356.0
AAAAAAAAAACCGGCAAAATGTCCTTTTCCTTGTTTTGAAAAGACTGGAAAATTCATCCCTGCAACCTTCCCTCCCATTTCACTGGTCAGAGTAAAAATTGGAAGTAGGAAAATTAGTACCACCACATCCTTTGAGTCAGAGAC,1754.0,1261.0,1132.0


## 5. create annotation file for library depth correction

In [21]:
# col annotations for depth estimation
dna_depth_anns = {"dna_1": {"sample": "1", "condition": "dna"}}
rna_depth_anns = {"HUES64_rep1": {"sample": "1", "condition": "HUES64"}, 
                  "HUES64_rep2": {"sample": "2", "condition": "HUES64"},
                  "mESC_rep1": {"sample": "1", "condition": "mESC"}}

dna_depth_anns = pd.DataFrame.from_dict(dna_depth_anns).T
rna_depth_anns = pd.DataFrame.from_dict(rna_depth_anns).T
rna_depth_anns

Unnamed: 0,condition,sample
HUES64_rep1,HUES64,1
HUES64_rep2,HUES64,2
mESC_rep1,mESC,1


## 6. write library depth correction files

In [22]:
# write depth estimation files
mpranalyze_dir = "%s/mpranalyze_files" % counts_dir
!mkdir -p $mpranalyze_dir

dna_depth_anns.to_csv("%s/dna_col_ann.for_depth_estimation.mpranalyze.txt" % mpranalyze_dir, sep="\t")
rna_depth_anns.to_csv("%s/rna_col_ann.for_depth_estimation.mpranalyze.txt" % mpranalyze_dir, sep="\t")

dna_counts_elem.to_csv("%s/dna_counts.for_depth_estimation.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
rna_counts_elem.to_csv("%s/rna_counts.for_depth_estimation.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)

## 7. to run MPRAnalyze, get data in pivot format (opposite of tidy format)

In [23]:
# first filter to the TSSs we care about quantifying
tss_elems = list(index[index["name"].str.contains("EVO_TSS")]["element"].unique())
ctrl_elems = list(index[index["tile_type"] == "RANDOM"]["element"].unique())
pos_ctrl_elems = list(index[index["tile_type"] == "CONTROL"]["element"].unique())
print(len(tss_elems))
print(len(ctrl_elems))
print(len(pos_ctrl_elems))
good_elems = tss_elems + ctrl_elems
len(good_elems)

13533
1619
4


15152

In [24]:
dna_counts_filt = dna_counts[dna_counts["element"].isin(good_elems)]
rna_counts_filt = rna_counts[rna_counts["element"].isin(good_elems)]
len(rna_counts_filt)

154949

In [25]:
dna_counts_pos_ctrls = dna_counts[dna_counts["element"].isin(pos_ctrl_elems)]
rna_counts_pos_ctrls = rna_counts[rna_counts["element"].isin(pos_ctrl_elems)]
len(rna_counts_pos_ctrls)

237

In [26]:
dna_counts_filt = dna_counts_filt.melt(id_vars=["element", "barcode", "tile_type", "barc_id"])
rna_counts_filt = rna_counts_filt.melt(id_vars=["element", "barcode", "tile_type", "barc_id"])
rna_counts_filt.head()

Unnamed: 0,element,barcode,tile_type,barc_id,variable,value
0,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,CCGACTTAGAC,WILDTYPE,1,HUES64_rep1,2086.0
1,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,CACATCGGTGG,WILDTYPE,2,HUES64_rep1,2797.0
2,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,AACATAGGTGG,WILDTYPE,3,HUES64_rep1,1481.0
3,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,GTATCGGTTCC,WILDTYPE,4,HUES64_rep1,158.0
4,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,CTCGATACATG,WILDTYPE,6,HUES64_rep1,725.0


In [27]:
dna_counts_pos_ctrls = dna_counts_pos_ctrls.melt(id_vars=["element", "barcode", "tile_type", "barc_id"])
rna_counts_pos_ctrls = rna_counts_pos_ctrls.melt(id_vars=["element", "barcode", "tile_type", "barc_id"])
rna_counts_pos_ctrls.head()

Unnamed: 0,element,barcode,tile_type,barc_id,variable,value
0,AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCG...,CAATAGTTCCG,CONTROL,1,HUES64_rep1,3765.0
1,AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCG...,AACGTTCGCTA,CONTROL,2,HUES64_rep1,3079.0
2,AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCG...,AACTCACACGA,CONTROL,3,HUES64_rep1,4890.0
3,AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCG...,TTGCGATACCG,CONTROL,4,HUES64_rep1,1648.0
4,AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCG...,CATTCATTGGG,CONTROL,5,HUES64_rep1,10118.0


In [28]:
dna_counts_filt["samp_id"] = "samp:" + dna_counts_filt["variable"] + "__barc:" + dna_counts_filt["barc_id"].astype(str)
rna_counts_filt["samp_id"] = "samp:" + rna_counts_filt["variable"] + "__barc:" + rna_counts_filt["barc_id"].astype(str)
rna_counts_filt.sample(5)

Unnamed: 0,element,barcode,tile_type,barc_id,variable,value,samp_id
109223,GGGCGCACGCGTTCGGGTCGGAGGGAAGGCGGCAATTTCACCCTCG...,CTATATCGGGC,WILDTYPE,6,HUES64_rep1,137.0,samp:HUES64_rep1__barc:6
262197,GGCTTGGAGCCCGCGCGAAATCCACCCTTCTGTCCTTGTTAGGTCT...,GACCACGTTAG,WILDTYPE,9,HUES64_rep2,390.0,samp:HUES64_rep2__barc:9
70595,CTCGGGGAGGGGGAGAAGGGGTGCCGAAAGGGGGCGCGGGTCCGGG...,GCTATCGTCAA,WILDTYPE,5,HUES64_rep1,1510.0,samp:HUES64_rep1__barc:5
270320,GTCACAGGTTTTGTCCACCCGTTCACGATCGTCATCATGGGGCTCT...,TGCCGGCACCA,RANDOM,1,HUES64_rep2,233.0,samp:HUES64_rep2__barc:1
179353,AGGGCAGATGTGCTTACCGCTAGTCCCAGCCGGTGGGATAAAAGGT...,CGACTGGGGTT,WILDTYPE,9,HUES64_rep2,193.0,samp:HUES64_rep2__barc:9


In [29]:
dna_counts_pos_ctrls["samp_id"] = "samp:" + dna_counts_pos_ctrls["variable"] + "__barc:" + dna_counts_pos_ctrls["barc_id"].astype(str)
rna_counts_pos_ctrls["samp_id"] = "samp:" + rna_counts_pos_ctrls["variable"] + "__barc:" + rna_counts_pos_ctrls["barc_id"].astype(str)
rna_counts_pos_ctrls.sample(5)

Unnamed: 0,element,barcode,tile_type,barc_id,variable,value,samp_id
485,AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCG...,CTATCTCGCGC,CONTROL,12,mESC_rep1,3182.0,samp:mESC_rep1__barc:12
395,CTGGCATTATGCCCAGTACATGACCTTATGGGACTTTCCTACTTGG...,GTGCGCTGTTA,CONTROL,42,HUES64_rep2,3675.0,samp:HUES64_rep2__barc:42
245,AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCG...,TATTCTCGTAT,CONTROL,9,HUES64_rep2,3551.0,samp:HUES64_rep2__barc:9
664,TGGATAGCGGTTTGACTCACGGGGATTTCCAAGTCTCCACCCCATT...,AACTCGTGCAA,CONTROL,14,mESC_rep1,494.0,samp:mESC_rep1__barc:14
640,CTGGCATTATGCCCAGTACATGACCTTATGGGACTTTCCTACTTGG...,CGACTTCTGTA,CONTROL,50,mESC_rep1,184.0,samp:mESC_rep1__barc:50


In [30]:
dna_counts_piv = dna_counts_filt.pivot(index="element", columns="samp_id", values="value").reset_index()
rna_counts_piv = rna_counts_filt.pivot(index="element", columns="samp_id", values="value").reset_index()
rna_counts_piv.head()

samp_id,element,samp:HUES64_rep1__barc:1,samp:HUES64_rep1__barc:10,samp:HUES64_rep1__barc:11,samp:HUES64_rep1__barc:12,samp:HUES64_rep1__barc:13,samp:HUES64_rep1__barc:2,samp:HUES64_rep1__barc:3,samp:HUES64_rep1__barc:4,samp:HUES64_rep1__barc:5,...,samp:mESC_rep1__barc:12,samp:mESC_rep1__barc:13,samp:mESC_rep1__barc:2,samp:mESC_rep1__barc:3,samp:mESC_rep1__barc:4,samp:mESC_rep1__barc:5,samp:mESC_rep1__barc:6,samp:mESC_rep1__barc:7,samp:mESC_rep1__barc:8,samp:mESC_rep1__barc:9
0,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,2086.0,622.0,14.0,6770.0,2330.0,2797.0,1481.0,158.0,,...,6298.0,1606.0,2441.0,1465.0,145.0,,199.0,824.0,133.0,884.0
1,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,4.0,,,,3.0,12.0,54.0,47.0,61.0,...,,2.0,22.0,,,113.0,10.0,,29.0,185.0
2,AAAAAAAAAAAAGAGGAGAAATAGATTGTTACCTTATATTATTTAA...,10.0,83.0,28.0,,2.0,86.0,6.0,5.0,,...,,1.0,,88.0,,3.0,,,18.0,
3,AAAAAAAAAAAAGTGGGGGTGGATCGGGCGTGCCGGTAGGGAACCC...,,12.0,,,20.0,,25.0,,21.0,...,1.0,,,16.0,,60.0,106.0,,,
4,AAAAAAAAAACCGGCAAAATGTCCTTTTCCTTGTTTTGAAAAGACT...,55.0,6.0,246.0,241.0,201.0,123.0,22.0,1.0,192.0,...,226.0,64.0,130.0,15.0,,260.0,88.0,7.0,81.0,131.0


In [31]:
dna_counts_pos_ctrl_piv = dna_counts_pos_ctrls.pivot(index="element", columns="samp_id", values="value").reset_index()
rna_counts_pos_ctrl_piv = rna_counts_pos_ctrls.pivot(index="element", columns="samp_id", values="value").reset_index()
rna_counts_pos_ctrl_piv.head()

samp_id,element,samp:HUES64_rep1__barc:1,samp:HUES64_rep1__barc:10,samp:HUES64_rep1__barc:11,samp:HUES64_rep1__barc:12,samp:HUES64_rep1__barc:13,samp:HUES64_rep1__barc:14,samp:HUES64_rep1__barc:15,samp:HUES64_rep1__barc:16,samp:HUES64_rep1__barc:17,...,samp:mESC_rep1__barc:55,samp:mESC_rep1__barc:56,samp:mESC_rep1__barc:57,samp:mESC_rep1__barc:58,samp:mESC_rep1__barc:59,samp:mESC_rep1__barc:6,samp:mESC_rep1__barc:60,samp:mESC_rep1__barc:7,samp:mESC_rep1__barc:8,samp:mESC_rep1__barc:9
0,AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCG...,3765.0,688.0,2249.0,2579.0,2047.0,2647.0,8680.0,962.0,4.0,...,402.0,1118.0,127.0,17.0,125.0,524.0,1195.0,2088.0,6413.0,2696.0
1,CCATTGACGTCAATGGGTGGAGTATTTACGGTAAACTGCCCACTTG...,176.0,54.0,77.0,52.0,484.0,134.0,285.0,64.0,1076.0,...,932.0,664.0,4.0,109.0,82.0,22.0,404.0,384.0,162.0,53.0
2,CTGGCATTATGCCCAGTACATGACCTTATGGGACTTTCCTACTTGG...,670.0,307.0,,200.0,26.0,519.0,808.0,449.0,1043.0,...,1137.0,199.0,93.0,1044.0,1038.0,813.0,256.0,1.0,887.0,
3,TGGATAGCGGTTTGACTCACGGGGATTTCCAAGTCTCCACCCCATT...,457.0,247.0,1055.0,1404.0,1868.0,533.0,52.0,55.0,918.0,...,949.0,1153.0,407.0,309.0,126.0,1151.0,752.0,1332.0,1101.0,952.0


In [32]:
dna_counts_piv.fillna(0, inplace=True)
rna_counts_piv.fillna(0, inplace=True)
rna_counts_piv.head()

samp_id,element,samp:HUES64_rep1__barc:1,samp:HUES64_rep1__barc:10,samp:HUES64_rep1__barc:11,samp:HUES64_rep1__barc:12,samp:HUES64_rep1__barc:13,samp:HUES64_rep1__barc:2,samp:HUES64_rep1__barc:3,samp:HUES64_rep1__barc:4,samp:HUES64_rep1__barc:5,...,samp:mESC_rep1__barc:12,samp:mESC_rep1__barc:13,samp:mESC_rep1__barc:2,samp:mESC_rep1__barc:3,samp:mESC_rep1__barc:4,samp:mESC_rep1__barc:5,samp:mESC_rep1__barc:6,samp:mESC_rep1__barc:7,samp:mESC_rep1__barc:8,samp:mESC_rep1__barc:9
0,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,2086.0,622.0,14.0,6770.0,2330.0,2797.0,1481.0,158.0,0.0,...,6298.0,1606.0,2441.0,1465.0,145.0,0.0,199.0,824.0,133.0,884.0
1,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,4.0,0.0,0.0,0.0,3.0,12.0,54.0,47.0,61.0,...,0.0,2.0,22.0,0.0,0.0,113.0,10.0,0.0,29.0,185.0
2,AAAAAAAAAAAAGAGGAGAAATAGATTGTTACCTTATATTATTTAA...,10.0,83.0,28.0,0.0,2.0,86.0,6.0,5.0,0.0,...,0.0,1.0,0.0,88.0,0.0,3.0,0.0,0.0,18.0,0.0
3,AAAAAAAAAAAAGTGGGGGTGGATCGGGCGTGCCGGTAGGGAACCC...,0.0,12.0,0.0,0.0,20.0,0.0,25.0,0.0,21.0,...,1.0,0.0,0.0,16.0,0.0,60.0,106.0,0.0,0.0,0.0
4,AAAAAAAAAACCGGCAAAATGTCCTTTTCCTTGTTTTGAAAAGACT...,55.0,6.0,246.0,241.0,201.0,123.0,22.0,1.0,192.0,...,226.0,64.0,130.0,15.0,0.0,260.0,88.0,7.0,81.0,131.0


In [33]:
dna_counts_pos_ctrl_piv.fillna(0, inplace=True)
rna_counts_pos_ctrl_piv.fillna(0, inplace=True)

## 8. filter: remove any elements that don't have >=50% of barcodes with DNA counts >= 10

In [34]:
dna_counts_piv["dna_status"] = dna_counts_piv.apply(dna_status, barc_thresh=10, perc_barc_thresh=0.5, 
                                                    ctrl_elems=ctrl_elems, axis=1)
dna_counts_pos_ctrl_piv["dna_status"] = dna_counts_pos_ctrl_piv.apply(dna_status, barc_thresh=10, 
                                                                      perc_barc_thresh=0.5, 
                                                                      ctrl_elems=ctrl_elems, axis=1)
dna_counts_piv.dna_status.value_counts()

good    13479
bad      1549
Name: dna_status, dtype: int64

In [35]:
good_dna_elems = list(dna_counts_piv[dna_counts_piv["dna_status"] == "good"]["element"])
good_pos_ctrl_dna_elems = list(dna_counts_pos_ctrl_piv[dna_counts_pos_ctrl_piv["dna_status"] == "good"]["element"])

In [36]:
dna_counts_piv_filt = dna_counts_piv[dna_counts_piv["element"].isin(good_dna_elems)]
dna_counts_piv_filt.drop("dna_status", axis=1, inplace=True)
rna_counts_piv_filt = rna_counts_piv[rna_counts_piv["element"].isin(good_dna_elems)]
print(len(dna_counts_piv_filt))
print(len(rna_counts_piv_filt))

13479
13479


In [37]:
dna_counts_pos_ctrl_piv_filt = dna_counts_pos_ctrl_piv[dna_counts_pos_ctrl_piv["element"].isin(good_pos_ctrl_dna_elems)]
dna_counts_pos_ctrl_piv_filt.drop("dna_status", axis=1, inplace=True)
rna_counts_pos_ctrl_piv_filt = rna_counts_pos_ctrl_piv[rna_counts_pos_ctrl_piv["element"].isin(good_pos_ctrl_dna_elems)]
print(len(dna_counts_pos_ctrl_piv_filt))
print(len(rna_counts_pos_ctrl_piv_filt))

4
4


## 9. add new negative controls -- which are sampled from positive controls -- for MPRAnalyze comparison b/w cell types

In [38]:
barc_ids = list(range(1, 61))
n_samps = 100
elems = list(dna_counts_pos_ctrl_piv_filt["element"])
elems

['AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG',
 'CCATTGACGTCAATGGGTGGAGTATTTACGGTAAACTGCCCACTTGGCAGTACATCAAGTGTATCATATGCCAAGTACGCCCCCTATTGACGTCAATGACGGTAAATGGCCCGCCTGGCATTATGCCCAGTACATGACCTTATG',
 'CTGGCATTATGCCCAGTACATGACCTTATGGGACTTTCCTACTTGGCAGTACATCTACGTATTAGTCATCGCTATTACCATGGTGATGCGGTTTTGGCAGTACATCAATGGGCGTGGATAGCGGTTTGACTCACGGGGATTTCC',
 'TGGATAGCGGTTTGACTCACGGGGATTTCCAAGTCTCCACCCCATTGACGTCAATGGGAGTTTGTTTTGGCACCAAAATCAACGGGACTTTCCAAAATGTCGTAACAACTCCGCCCCATTGACGCAAATGGGCGGTAGGCGTGT']

In [39]:
rep_map_cols = list(set([x.split("__")[0] for x in list(rna_counts_pos_ctrl_piv_filt.columns) if x != "element"]))
rep_map_cols

['samp:mESC_rep1', 'samp:HUES64_rep1', 'samp:HUES64_rep2']

In [40]:
neg_ctrl_dna_counts = pd.DataFrame()
neg_ctrl_rna_counts = pd.DataFrame()

for i, elem in enumerate(elems):
    elem_dna_data = dna_counts_pos_ctrl_piv_filt[dna_counts_pos_ctrl_piv_filt["element"] == elem]
    elem_rna_data = rna_counts_pos_ctrl_piv_filt[rna_counts_pos_ctrl_piv_filt["element"] == elem]
    
    for j in range(n_samps):
        barcs_sampled = np.random.choice(barc_ids, size=13)
        
        dna_cols_sampled = ["element"]
        dna_cols_sampled.extend(["samp:dna_1__barc:%s" % x for x in barcs_sampled])
        new_dna_cols = ["element"]
        new_dna_cols.extend(["samp:dna_1__barc:%s" % x for x in range(1, 14)])
        
        rna_cols_sampled = ["element"]
        new_rna_cols = ["element"]
        for rep in rep_map_cols:
            rna_cols_sampled.extend(["%s__barc:%s" % (rep, x) for x in barcs_sampled])
            new_rna_cols.extend(["%s__barc:%s" % (rep, x) for x in range(1, 14)])
        
        # subsample dataframe w/ columns we just defined
        elem_dna_data_sampled = elem_dna_data[dna_cols_sampled]
        elem_rna_data_sampled = elem_rna_data[rna_cols_sampled]   
        
        # rename columns
        elem_dna_data_sampled.columns = new_dna_cols
        elem_rna_data_sampled.columns = new_rna_cols
                
        # rename element with element + samp #
        elem_dna_data_sampled["element"] = elem + "__samp%s" % (j+1)
        elem_rna_data_sampled["element"] = elem + "__samp%s" % (j+1)
        
        # for error checking -- print out the barcode that should be barcode 1"
#         print("error checking for %s__samp%s: %s" % (elem, j+1, barcs_sampled[0]))

        
        # append
        neg_ctrl_dna_counts = neg_ctrl_dna_counts.append(elem_dna_data_sampled)
        neg_ctrl_rna_counts = neg_ctrl_rna_counts.append(elem_rna_data_sampled)

## 10. get negative control IDs [negative controls only for quantification]

In [41]:
dna_counts_piv_filt = dna_counts_piv_filt.append(neg_ctrl_dna_counts)
rna_counts_piv_filt = rna_counts_piv_filt.append(neg_ctrl_rna_counts)
print(len(dna_counts_piv_filt))
print(len(rna_counts_piv_filt))

13879
13879


In [42]:
ctrls = rna_counts_piv_filt[["element"]].merge(index_elem[["element", "tile_type"]], on="element", how="left")
print(len(ctrls))
ctrls.head()

13879


Unnamed: 0,element,tile_type
0,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,WILDTYPE
1,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,WILDTYPE
2,AAAAAAAAAAAAGAGGAGAAATAGATTGTTACCTTATATTATTTAA...,WILDTYPE
3,AAAAAAAAAACCGGCAAAATGTCCTTTTCCTTGTTTTGAAAAGACT...,WILDTYPE
4,AAAAAAAAAGGCCACGCTCAAAACCCCAGACTAGTTTTCCTCACCA...,WILDTYPE


In [43]:
ctrls["ctrl_status"] = ctrls.apply(get_ctrl_status, axis=1)
ctrls.sample(5)

Unnamed: 0,element,tile_type,ctrl_status
64,AAAAGGAACGGAAGTACAAGAGTGACTCTGCTATTATTCTACTCCA...,WILDTYPE,False
4310,CCCCCCCTCCCCCGCCTACGATGATTGATCCTCACACCTGAGTGAC...,WILDTYPE,False
9041,GGCCCGCCCAGGCAGCCGCTGTCGCCGTACCGCCCCTTCGCTCCCT...,WILDTYPE,False
5376,CGCGGGGCGGAGGCGGGGCTGCTCCCCAGGGGCGGGAGGAACCCGC...,WILDTYPE,False
312,AACATGGGGCTTCCTCTTTCACACATATACAGCATTTGGTATGTGC...,WILDTYPE,False


## 11. create overall annotation file

In [44]:
dna_cols = [x for x in dna_counts_piv_filt.columns if "samp:" in x]
rna_cols = [x for x in rna_counts_piv_filt.columns if "samp:" in x]

In [45]:
dna_col_ann = {}
rna_col_ann = {}
for cols, ann in zip([dna_cols, rna_cols], [dna_col_ann, rna_col_ann]):
    for col in cols:
        samp = col.split("__")[0].split("_")[-1]
        cond = col.split(":")[1].split("_")[0]
        barc = col.split(":")[-1]
        ann[col] = {"sample": samp, "condition": cond, "barcode": barc}

dna_col_ann = pd.DataFrame.from_dict(dna_col_ann, orient="index")
rna_col_ann = pd.DataFrame.from_dict(rna_col_ann, orient="index")
rna_col_ann.sample(5)

Unnamed: 0,sample,condition,barcode
samp:HUES64_rep1__barc:7,rep1,HUES64,7
samp:HUES64_rep1__barc:6,rep1,HUES64,6
samp:mESC_rep1__barc:7,rep1,mESC,7
samp:mESC_rep1__barc:3,rep1,mESC,3
samp:mESC_rep1__barc:11,rep1,mESC,11


## 12. write final files [for quantification analysis]

In [46]:
dna_counts_piv_filt.set_index("element", inplace=True)
rna_counts_piv_filt.set_index("element", inplace=True)

In [47]:
# write final files
dna_col_ann.to_csv("%s/dna_col_ann.mpranalyze.for_quantification.txt" % mpranalyze_dir, sep="\t")
rna_col_ann.to_csv("%s/rna_col_ann.mpranalyze.for_quantification.txt" % mpranalyze_dir, sep="\t")

ctrls = ctrls[["element", "ctrl_status"]]
ctrls.to_csv("%s/ctrl_status.mpranalyze.for_quantification.txt" % mpranalyze_dir, sep="\t", index=False)

dna_counts_piv_filt.to_csv("%s/dna_counts.mpranalyze.for_quantification.txt" % mpranalyze_dir, sep="\t", index=True)
rna_counts_piv_filt.to_csv("%s/rna_counts.mpranalyze.for_quantification.txt" % mpranalyze_dir, sep="\t", index=True)