# 04__preprocess_mpranalyze_compare

in this notebook, i re-shape the counts data to run MPRAnalyze comparison mode. importantly, i also include the negative controls for comparison mode that I made in the previous notebook (01). i only set MPRAnalyze comparison mode to run on the TSSs that we are interested in: that is, the MAXIMUM tile for each orthologous TSS pair.

In [1]:
import warnings
warnings.filterwarnings('ignore')

import itertools
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
import sys

from scipy.stats import spearmanr

# import utils
sys.path.append("../../../utils")
from plotting_utils import *

%matplotlib inline
%config InlineBackend.figure_format = 'svg'
mpl.rcParams['figure.autolayout'] = False

In [2]:
sns.set(**PAPER_PRESET)
fontsize = PAPER_FONTSIZE

In [3]:
np.random.seed(2019)

## functions

In [4]:
def ctrl_status(row):
    if "CONTROL" in row.comp_id:
        return True
    else:
        return False

## variables

In [5]:
mpranalyze_dir = "../../../data/02__mpra/01__counts/mpranalyze_files"

In [6]:
dna_counts_f = "%s/dna_counts.mpranalyze.for_quantification.txt" % mpranalyze_dir
rna_counts_f = "%s/rna_counts.mpranalyze.for_quantification.txt" % mpranalyze_dir

In [7]:
data_dir = "../../../data/02__mpra/02__activs"

In [8]:
human_max_f = "%s/human_TSS_vals.max_tile.txt" % data_dir
mouse_max_f = "%s/mouse_TSS_vals.max_tile.txt" % data_dir

In [9]:
tss_map_f = "../../../data/01__design/00__mpra_list/mpra_tss.with_ids.UPDATED.txt"

## 1. import data

In [10]:
dna_counts = pd.read_table(dna_counts_f)
dna_counts.head()

Unnamed: 0,element,samp:dna_1__barc:1,samp:dna_1__barc:10,samp:dna_1__barc:11,samp:dna_1__barc:12,samp:dna_1__barc:13,samp:dna_1__barc:2,samp:dna_1__barc:3,samp:dna_1__barc:4,samp:dna_1__barc:5,samp:dna_1__barc:6,samp:dna_1__barc:7,samp:dna_1__barc:8,samp:dna_1__barc:9
0,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,614.0,126.0,94.0,2024.0,968.0,806.0,592.0,78.0,0.0,224.0,478.0,32.0,320.0
1,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,64.0,12.0,52.0,0.0,16.0,94.0,44.0,128.0,178.0,44.0,0.0,20.0,340.0
2,AAAAAAAAAAAAGAGGAGAAATAGATTGTTACCTTATATTATTTAA...,218.0,54.0,170.0,22.0,66.0,182.0,116.0,8.0,28.0,0.0,72.0,116.0,0.0
3,AAAAAAAAAACCGGCAAAATGTCCTTTTCCTTGTTTTGAAAAGACT...,460.0,346.0,626.0,448.0,324.0,502.0,86.0,162.0,210.0,414.0,352.0,524.0,468.0
4,AAAAAAAAAGGCCACGCTCAAAACCCCAGACTAGTTTTCCTCACCA...,596.0,242.0,456.0,180.0,0.0,828.0,226.0,42.0,302.0,446.0,750.0,540.0,650.0


In [11]:
rna_counts = pd.read_table(rna_counts_f)
rna_counts.head()

Unnamed: 0,element,samp:HUES64_rep1__barc:1,samp:HUES64_rep1__barc:10,samp:HUES64_rep1__barc:11,samp:HUES64_rep1__barc:12,samp:HUES64_rep1__barc:13,samp:HUES64_rep1__barc:2,samp:HUES64_rep1__barc:3,samp:HUES64_rep1__barc:4,samp:HUES64_rep1__barc:5,...,samp:mESC_rep1__barc:12,samp:mESC_rep1__barc:13,samp:mESC_rep1__barc:2,samp:mESC_rep1__barc:3,samp:mESC_rep1__barc:4,samp:mESC_rep1__barc:5,samp:mESC_rep1__barc:6,samp:mESC_rep1__barc:7,samp:mESC_rep1__barc:8,samp:mESC_rep1__barc:9
0,AAAAAAAAAAAAAAAAACCCTGCAGAGAGCCTGCAAAGTCACTGCC...,2086.0,622.0,14.0,6770.0,2330.0,2797.0,1481.0,158.0,0.0,...,6298.0,1606.0,2441.0,1465.0,145.0,0.0,199.0,824.0,133.0,884.0
1,AAAAAAAAAAAAAAAGAAAAGAAAAGAAAAAAAAGAAAGGATTGAG...,4.0,0.0,0.0,0.0,3.0,12.0,54.0,47.0,61.0,...,0.0,2.0,22.0,0.0,0.0,113.0,10.0,0.0,29.0,185.0
2,AAAAAAAAAAAAGAGGAGAAATAGATTGTTACCTTATATTATTTAA...,10.0,83.0,28.0,0.0,2.0,86.0,6.0,5.0,0.0,...,0.0,1.0,0.0,88.0,0.0,3.0,0.0,0.0,18.0,0.0
3,AAAAAAAAAACCGGCAAAATGTCCTTTTCCTTGTTTTGAAAAGACT...,55.0,6.0,246.0,241.0,201.0,123.0,22.0,1.0,192.0,...,226.0,64.0,130.0,15.0,0.0,260.0,88.0,7.0,81.0,131.0
4,AAAAAAAAAGGCCACGCTCAAAACCCCAGACTAGTTTTCCTCACCA...,311.0,46.0,243.0,28.0,0.0,689.0,186.0,0.0,503.0,...,22.0,0.0,461.0,12.0,12.0,167.0,511.0,263.0,403.0,255.0


In [12]:
human_max = pd.read_table(human_max_f)
mouse_max = pd.read_table(mouse_max_f)
human_max.head()

Unnamed: 0,element,tss_id,biotype_hg19,tss_tile_num,HUES64,HUES64_log,HUES64_padj,HUES64_sig,cleaner_biotype
0,GCCCTCTCGCCCCGCCCTTGCCCAGGCAGCCCCCGGTCGCGACGGC...,h.999,div_pc,tile2,0.588526,-0.230235,0.01848145,sig,mRNA
1,GGGCGGGACGGAGACTCTGGGCTCAAGGCTCCTGGAAATGGGCGGG...,h.998,div_pc,tile2,0.885958,-0.052587,1.044316e-06,sig,mRNA
2,AAAAGGCAGTGCTTGATTCAATTCAACATTCACTGCGCCACTTACC...,h.997,div_pc,tile2,0.423129,-0.373527,0.2996315,not sig,mRNA
3,CGGAGGGGCGGGGCAAGAGTGGGAGGAGACCCTGCGCGCGGCCGCC...,h.996,div_pc,tile2,2.452145,0.389546,5.019212e-74,sig,mRNA
4,AGGGTGGTGCGTGGTCTACGGCGAGCGGAGTGGGGCGGGGTCGCGC...,h.995,div_pc,tile1,0.994045,-0.002594,6.400385e-09,sig,mRNA


In [13]:
tss_map = pd.read_table(tss_map_f, index_col=0)
tss_map.head()

Unnamed: 0,chr_tss_hg19,start_tss_hg19,end_tss_hg19,strand_tss_hg19,cage_id_hg19,biotype_hg19,name_peak_hg19,seq_orth,cage_orth,chr_tss_mm9,...,avg_exp_hg19,avg_exp_mm9,stem_exp_hg19,stem_exp_mm9,har,orig_species,lift_species,oligo_type,hg19_id,mm9_id
0,chr1,2984981,2984982,-,"chr1:2984976..2984989,-",div_lnc,ENSG00000177133.6,True,True,chr4,...,0.1,0.43,0.0277778,0.0175,False,human,mouse,human__divergent:mouse__divergent,h.0,m.0
1,chr1,2985002,2985003,-,"chr1:2984997..2985037,-",div_lnc,ENSG00000177133.6,True,True,chr4,...,0.29,0.43,0.0311111,0.0175,False,human,mouse,human__divergent:mouse__divergent,h.1,m.0
2,chr1,2985430,2985431,-,"chr1:2985420..2985438,-",div_lnc,ENSG00000177133.6,True,True,chr4,...,2.54,0.85,0.0711111,0.0,False,human,mouse,human__divergent:mouse__divergent,h.2,m.1
3,chr1,8086552,8086553,+,"chr1:8086546..8086571,+",div_lnc,ENSG00000238290.1,True,True,chr4,...,0.3,0.14,0.104444,0.0,False,human,mouse,human__divergent:mouse__antisense,h.3,m.2
4,chr1,26498322,26498323,-,"chr1:26498321..26498327,-",antisense_upep,ENSG00000236782.1,True,True,chr4,...,0.19,0.25,0.03,0.0,False,human,mouse,human__antisense:mouse__protein_coding,h.4,m.3


## 2. remove any sequences in TSS map that we removed at initial MPRAnalyze (low counts)

In [14]:
# filter out any elements we removed at initial steps (low dna counts)
human_max = human_max[human_max["element"].isin(dna_counts["element"])]
mouse_max = mouse_max[mouse_max["element"].isin(dna_counts["element"])]

## 3. get positive ctrl dna/rna counts

In [15]:
dna_counts_ctrl = dna_counts[dna_counts["element"].str.contains("samp")]
print(len(dna_counts_ctrl))
rna_counts_ctrl = rna_counts[rna_counts["element"].str.contains("samp")]
print(len(rna_counts_ctrl))

400
400


# first make files needed for seq. comparison (native and cis effects)

## 1. merge max. ortholog pairs w/ counts

In [16]:
dna_counts_human_max = human_max[["element", "tss_id"]].merge(dna_counts, on="element")
dna_counts_mouse_max = mouse_max[["element", "tss_id"]].merge(dna_counts, on="element")
dna_counts_human_max.head()

Unnamed: 0,element,tss_id,samp:dna_1__barc:1,samp:dna_1__barc:10,samp:dna_1__barc:11,samp:dna_1__barc:12,samp:dna_1__barc:13,samp:dna_1__barc:2,samp:dna_1__barc:3,samp:dna_1__barc:4,samp:dna_1__barc:5,samp:dna_1__barc:6,samp:dna_1__barc:7,samp:dna_1__barc:8,samp:dna_1__barc:9
0,GCCCTCTCGCCCCGCCCTTGCCCAGGCAGCCCCCGGTCGCGACGGC...,h.999,10.0,36.0,0.0,0.0,64.0,0.0,0.0,38.0,58.0,0.0,66.0,0.0,164.0
1,GGGCGGGACGGAGACTCTGGGCTCAAGGCTCCTGGAAATGGGCGGG...,h.998,342.0,110.0,0.0,24.0,88.0,8.0,0.0,0.0,322.0,336.0,336.0,10.0,18.0
2,AAAAGGCAGTGCTTGATTCAATTCAACATTCACTGCGCCACTTACC...,h.997,74.0,454.0,150.0,298.0,626.0,350.0,222.0,42.0,472.0,120.0,486.0,76.0,482.0
3,CGGAGGGGCGGGGCAAGAGTGGGAGGAGACCCTGCGCGCGGCCGCC...,h.996,788.0,0.0,920.0,0.0,172.0,1020.0,2314.0,722.0,438.0,198.0,24.0,140.0,0.0
4,AGGGTGGTGCGTGGTCTACGGCGAGCGGAGTGGGGCGGGGTCGCGC...,h.995,0.0,0.0,18.0,184.0,202.0,0.0,0.0,196.0,0.0,0.0,58.0,42.0,110.0


In [17]:
print(len(dna_counts_human_max))
print(len(dna_counts_mouse_max))

3344
3260


In [18]:
rna_counts_human_max = human_max[["element", "tss_id"]].merge(rna_counts, on="element")
rna_counts_mouse_max = mouse_max[["element", "tss_id"]].merge(rna_counts, on="element")
rna_counts_human_max.head()

Unnamed: 0,element,tss_id,samp:HUES64_rep1__barc:1,samp:HUES64_rep1__barc:10,samp:HUES64_rep1__barc:11,samp:HUES64_rep1__barc:12,samp:HUES64_rep1__barc:13,samp:HUES64_rep1__barc:2,samp:HUES64_rep1__barc:3,samp:HUES64_rep1__barc:4,...,samp:mESC_rep1__barc:12,samp:mESC_rep1__barc:13,samp:mESC_rep1__barc:2,samp:mESC_rep1__barc:3,samp:mESC_rep1__barc:4,samp:mESC_rep1__barc:5,samp:mESC_rep1__barc:6,samp:mESC_rep1__barc:7,samp:mESC_rep1__barc:8,samp:mESC_rep1__barc:9
0,GCCCTCTCGCCCCGCCCTTGCCCAGGCAGCCCCCGGTCGCGACGGC...,h.999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,50.0
1,GGGCGGGACGGAGACTCTGGGCTCAAGGCTCCTGGAAATGGGCGGG...,h.998,369.0,132.0,0.0,14.0,45.0,5.0,0.0,0.0,...,10.0,25.0,101.0,0.0,0.0,398.0,393.0,330.0,2.0,2.0
2,AAAAGGCAGTGCTTGATTCAATTCAACATTCACTGCGCCACTTACC...,h.997,3.0,520.0,154.0,119.0,74.0,91.0,72.0,0.0,...,204.0,326.0,100.0,297.0,13.0,276.0,50.0,173.0,0.0,109.0
3,CGGAGGGGCGGGGCAAGAGTGGGAGGAGACCCTGCGCGCGGCCGCC...,h.996,2443.0,0.0,5498.0,0.0,84.0,3029.0,8235.0,3203.0,...,0.0,401.0,4645.0,15133.0,3796.0,4297.0,238.0,36.0,950.0,0.0
4,AGGGTGGTGCGTGGTCTACGGCGAGCGGAGTGGGGCGGGGTCGCGC...,h.995,0.0,0.0,1.0,688.0,593.0,0.0,0.0,335.0,...,276.0,769.0,0.0,0.0,730.0,0.0,0.0,0.0,278.0,233.0


In [19]:
print(len(rna_counts_human_max))
print(len(rna_counts_mouse_max))

3344
3260


## 2. merge human/mouse counts into 1 dataframe

In [20]:
tss_map_mpra = tss_map.merge(rna_counts_human_max, left_on="hg19_id", 
                             right_on="tss_id").merge(rna_counts_mouse_max, left_on="mm9_id", right_on="tss_id",
                                                      suffixes=("__seq:human", "__seq:mouse"))
tss_map_mpra.drop_duplicates(inplace=True)
tss_map_mpra.head(5)

Unnamed: 0,chr_tss_hg19,start_tss_hg19,end_tss_hg19,strand_tss_hg19,cage_id_hg19,biotype_hg19,name_peak_hg19,seq_orth,cage_orth,chr_tss_mm9,...,samp:mESC_rep1__barc:12__seq:mouse,samp:mESC_rep1__barc:13__seq:mouse,samp:mESC_rep1__barc:2__seq:mouse,samp:mESC_rep1__barc:3__seq:mouse,samp:mESC_rep1__barc:4__seq:mouse,samp:mESC_rep1__barc:5__seq:mouse,samp:mESC_rep1__barc:6__seq:mouse,samp:mESC_rep1__barc:7__seq:mouse,samp:mESC_rep1__barc:8__seq:mouse,samp:mESC_rep1__barc:9__seq:mouse
0,chr1,2985430,2985431,-,"chr1:2985420..2985438,-",div_lnc,ENSG00000177133.6,True,True,chr4,...,805.0,0.0,271.0,321.0,88.0,160.0,185.0,93.0,162.0,777.0
1,chr1,8086552,8086553,+,"chr1:8086546..8086571,+",div_lnc,ENSG00000238290.1,True,True,chr4,...,0.0,1946.0,219.0,2102.0,93.0,262.0,351.0,104.0,3.0,0.0
2,chr1,26498322,26498323,-,"chr1:26498321..26498327,-",antisense_upep,ENSG00000236782.1,True,True,chr4,...,60.0,440.0,134.0,138.0,1097.0,4.0,80.0,1096.0,332.0,48.0
3,chr1,65533428,65533429,-,"chr1:65533390..65533443,-",intergenic,ENSG00000231485.1,True,True,chr4,...,1315.0,166.0,139.0,4139.0,0.0,43.0,2631.0,2327.0,2732.0,2064.0
4,chr1,65533462,65533463,-,"chr1:65533457..65533465,-",intergenic,ENSG00000231485.1,True,True,chr4,...,1315.0,166.0,139.0,4139.0,0.0,43.0,2631.0,2327.0,2732.0,2064.0


In [21]:
tss_map_dna = tss_map.merge(dna_counts_human_max, left_on="hg19_id", 
                            right_on="tss_id").merge(dna_counts_mouse_max, left_on="mm9_id", right_on="tss_id",
                                                     suffixes=("__seq:human", "__seq:mouse"))
tss_map_dna.drop_duplicates(inplace=True)
tss_map_dna.head(5)

Unnamed: 0,chr_tss_hg19,start_tss_hg19,end_tss_hg19,strand_tss_hg19,cage_id_hg19,biotype_hg19,name_peak_hg19,seq_orth,cage_orth,chr_tss_mm9,...,samp:dna_1__barc:12__seq:mouse,samp:dna_1__barc:13__seq:mouse,samp:dna_1__barc:2__seq:mouse,samp:dna_1__barc:3__seq:mouse,samp:dna_1__barc:4__seq:mouse,samp:dna_1__barc:5__seq:mouse,samp:dna_1__barc:6__seq:mouse,samp:dna_1__barc:7__seq:mouse,samp:dna_1__barc:8__seq:mouse,samp:dna_1__barc:9__seq:mouse
0,chr1,2985430,2985431,-,"chr1:2985420..2985438,-",div_lnc,ENSG00000177133.6,True,True,chr4,...,2336.0,126.0,1098.0,1518.0,226.0,176.0,150.0,372.0,550.0,2208.0
1,chr1,8086552,8086553,+,"chr1:8086546..8086571,+",div_lnc,ENSG00000238290.1,True,True,chr4,...,0.0,1212.0,200.0,2244.0,138.0,590.0,422.0,206.0,74.0,38.0
2,chr1,26498322,26498323,-,"chr1:26498321..26498327,-",antisense_upep,ENSG00000236782.1,True,True,chr4,...,442.0,296.0,288.0,330.0,1914.0,228.0,446.0,1158.0,358.0,190.0
3,chr1,65533428,65533429,-,"chr1:65533390..65533443,-",intergenic,ENSG00000231485.1,True,True,chr4,...,1096.0,296.0,166.0,1798.0,0.0,54.0,780.0,1336.0,1494.0,2028.0
4,chr1,65533462,65533463,-,"chr1:65533457..65533465,-",intergenic,ENSG00000231485.1,True,True,chr4,...,1096.0,296.0,166.0,1798.0,0.0,54.0,780.0,1336.0,1494.0,2028.0


## 3. assign each pair an ID

In [22]:
HUES64_rna_cols = [x for x in tss_map_mpra.columns if "samp:HUES64" in x]
mESC_rna_cols = [x for x in tss_map_mpra.columns if "samp:mESC" in x]
all_dna_cols = [x for x in tss_map_dna.columns if "samp:dna" in x]

human_cols = ["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9"]
human_cols.extend(HUES64_rna_cols)

mouse_cols = ["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9"]
mouse_cols.extend(mESC_rna_cols)

dna_cols = ["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9"]
dna_cols.extend(all_dna_cols)

tss_map_mpra_human = tss_map_mpra[human_cols]
tss_map_mpra_mouse = tss_map_mpra[mouse_cols]

tss_map_dna = tss_map_dna[dna_cols]

tss_map_mpra_human.head()

Unnamed: 0,hg19_id,biotype_hg19,mm9_id,biotype_mm9,samp:HUES64_rep1__barc:1__seq:human,samp:HUES64_rep1__barc:10__seq:human,samp:HUES64_rep1__barc:11__seq:human,samp:HUES64_rep1__barc:12__seq:human,samp:HUES64_rep1__barc:13__seq:human,samp:HUES64_rep1__barc:2__seq:human,...,samp:HUES64_rep2__barc:12__seq:mouse,samp:HUES64_rep2__barc:13__seq:mouse,samp:HUES64_rep2__barc:2__seq:mouse,samp:HUES64_rep2__barc:3__seq:mouse,samp:HUES64_rep2__barc:4__seq:mouse,samp:HUES64_rep2__barc:5__seq:mouse,samp:HUES64_rep2__barc:6__seq:mouse,samp:HUES64_rep2__barc:7__seq:mouse,samp:HUES64_rep2__barc:8__seq:mouse,samp:HUES64_rep2__barc:9__seq:mouse
0,h.2,div_lnc,m.1,div_lnc,2442.0,1066.0,1997.0,127.0,633.0,809.0,...,1587.0,24.0,672.0,927.0,145.0,71.0,125.0,189.0,158.0,1279.0
1,h.3,div_lnc,m.2,antisense,0.0,0.0,13.0,0.0,61.0,0.0,...,0.0,824.0,161.0,1770.0,52.0,369.0,150.0,115.0,2.0,15.0
2,h.4,antisense_upep,m.3,protein_coding,1646.0,22.0,178.0,9.0,1414.0,46.0,...,190.0,43.0,144.0,100.0,1006.0,140.0,156.0,1241.0,133.0,126.0
3,h.5,intergenic,m.4,intergenic_upep,1085.0,124.0,978.0,103.0,3397.0,306.0,...,854.0,232.0,149.0,3349.0,0.0,44.0,1090.0,998.0,2193.0,1827.0
4,h.6,intergenic,m.4,intergenic_upep,4454.0,7768.0,2758.0,1874.0,3283.0,3985.0,...,854.0,232.0,149.0,3349.0,0.0,44.0,1090.0,998.0,2193.0,1827.0


In [23]:
tss_map_mpra["comp_id"] = tss_map_mpra["hg19_id"] + "__" + tss_map_mpra["biotype_hg19"] + "__" + tss_map_mpra["mm9_id"] + "__" + tss_map_mpra["biotype_mm9"]
tss_map_mpra_human["comp_id"] = tss_map_mpra_human["hg19_id"] + "__" + tss_map_mpra_human["biotype_hg19"] + "__" + tss_map_mpra_human["mm9_id"] + "__" + tss_map_mpra_human["biotype_mm9"] 
tss_map_mpra_mouse["comp_id"] = tss_map_mpra_mouse["hg19_id"] + "__" + tss_map_mpra_mouse["biotype_hg19"] + "__" + tss_map_mpra_mouse["mm9_id"] + "__" + tss_map_mpra_mouse["biotype_mm9"]
tss_map_dna["comp_id"] = tss_map_dna["hg19_id"] + "__" + tss_map_dna["biotype_hg19"] + "__" + tss_map_dna["mm9_id"] + "__" + tss_map_dna["biotype_mm9"]

tss_map_mpra_human.drop(["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9"], axis=1, inplace=True)
tss_map_mpra_mouse.drop(["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9"], axis=1, inplace=True)
tss_map_dna.drop(["hg19_id", "biotype_hg19", "mm9_id", "biotype_mm9"], axis=1, inplace=True)

human_cols = ["comp_id"]
human_cols.extend(HUES64_rna_cols)

mouse_cols = ["comp_id"]
mouse_cols.extend(mESC_rna_cols)

dna_cols = ["comp_id"]
dna_cols.extend(all_dna_cols)

tss_map_mpra_human = tss_map_mpra_human[human_cols]
tss_map_mpra_mouse = tss_map_mpra_mouse[mouse_cols]
tss_map_dna = tss_map_dna[dna_cols]

tss_map_mpra_human.head()

Unnamed: 0,comp_id,samp:HUES64_rep1__barc:1__seq:human,samp:HUES64_rep1__barc:10__seq:human,samp:HUES64_rep1__barc:11__seq:human,samp:HUES64_rep1__barc:12__seq:human,samp:HUES64_rep1__barc:13__seq:human,samp:HUES64_rep1__barc:2__seq:human,samp:HUES64_rep1__barc:3__seq:human,samp:HUES64_rep1__barc:4__seq:human,samp:HUES64_rep1__barc:5__seq:human,...,samp:HUES64_rep2__barc:12__seq:mouse,samp:HUES64_rep2__barc:13__seq:mouse,samp:HUES64_rep2__barc:2__seq:mouse,samp:HUES64_rep2__barc:3__seq:mouse,samp:HUES64_rep2__barc:4__seq:mouse,samp:HUES64_rep2__barc:5__seq:mouse,samp:HUES64_rep2__barc:6__seq:mouse,samp:HUES64_rep2__barc:7__seq:mouse,samp:HUES64_rep2__barc:8__seq:mouse,samp:HUES64_rep2__barc:9__seq:mouse
0,h.2__div_lnc__m.1__div_lnc,2442.0,1066.0,1997.0,127.0,633.0,809.0,0.0,13.0,102.0,...,1587.0,24.0,672.0,927.0,145.0,71.0,125.0,189.0,158.0,1279.0
1,h.3__div_lnc__m.2__antisense,0.0,0.0,13.0,0.0,61.0,0.0,1.0,0.0,73.0,...,0.0,824.0,161.0,1770.0,52.0,369.0,150.0,115.0,2.0,15.0
2,h.4__antisense_upep__m.3__protein_coding,1646.0,22.0,178.0,9.0,1414.0,46.0,389.0,388.0,171.0,...,190.0,43.0,144.0,100.0,1006.0,140.0,156.0,1241.0,133.0,126.0
3,h.5__intergenic__m.4__intergenic_upep,1085.0,124.0,978.0,103.0,3397.0,306.0,3144.0,707.0,418.0,...,854.0,232.0,149.0,3349.0,0.0,44.0,1090.0,998.0,2193.0,1827.0
4,h.6__intergenic__m.4__intergenic_upep,4454.0,7768.0,2758.0,1874.0,3283.0,3985.0,2814.0,941.0,389.0,...,854.0,232.0,149.0,3349.0,0.0,44.0,1090.0,998.0,2193.0,1827.0


In [24]:
# also add dataframe for native comparisons
native_cols = ["comp_id"]
native_human_cols = [x for x in tss_map_mpra.columns if "HUES64" in x and "human" in x]
native_mouse_cols = [x for x in tss_map_mpra.columns if "mESC" in x and "mouse" in x]
native_cols.extend(native_human_cols)
native_cols.extend(native_mouse_cols)
tss_map_mpra_native = tss_map_mpra[native_cols]
tss_map_mpra_native.head()

Unnamed: 0,comp_id,samp:HUES64_rep1__barc:1__seq:human,samp:HUES64_rep1__barc:10__seq:human,samp:HUES64_rep1__barc:11__seq:human,samp:HUES64_rep1__barc:12__seq:human,samp:HUES64_rep1__barc:13__seq:human,samp:HUES64_rep1__barc:2__seq:human,samp:HUES64_rep1__barc:3__seq:human,samp:HUES64_rep1__barc:4__seq:human,samp:HUES64_rep1__barc:5__seq:human,...,samp:mESC_rep1__barc:12__seq:mouse,samp:mESC_rep1__barc:13__seq:mouse,samp:mESC_rep1__barc:2__seq:mouse,samp:mESC_rep1__barc:3__seq:mouse,samp:mESC_rep1__barc:4__seq:mouse,samp:mESC_rep1__barc:5__seq:mouse,samp:mESC_rep1__barc:6__seq:mouse,samp:mESC_rep1__barc:7__seq:mouse,samp:mESC_rep1__barc:8__seq:mouse,samp:mESC_rep1__barc:9__seq:mouse
0,h.2__div_lnc__m.1__div_lnc,2442.0,1066.0,1997.0,127.0,633.0,809.0,0.0,13.0,102.0,...,805.0,0.0,271.0,321.0,88.0,160.0,185.0,93.0,162.0,777.0
1,h.3__div_lnc__m.2__antisense,0.0,0.0,13.0,0.0,61.0,0.0,1.0,0.0,73.0,...,0.0,1946.0,219.0,2102.0,93.0,262.0,351.0,104.0,3.0,0.0
2,h.4__antisense_upep__m.3__protein_coding,1646.0,22.0,178.0,9.0,1414.0,46.0,389.0,388.0,171.0,...,60.0,440.0,134.0,138.0,1097.0,4.0,80.0,1096.0,332.0,48.0
3,h.5__intergenic__m.4__intergenic_upep,1085.0,124.0,978.0,103.0,3397.0,306.0,3144.0,707.0,418.0,...,1315.0,166.0,139.0,4139.0,0.0,43.0,2631.0,2327.0,2732.0,2064.0
4,h.6__intergenic__m.4__intergenic_upep,4454.0,7768.0,2758.0,1874.0,3283.0,3985.0,2814.0,941.0,389.0,...,1315.0,166.0,139.0,4139.0,0.0,43.0,2631.0,2327.0,2732.0,2064.0


In [25]:
# remove duplicates
tss_map_dna.drop_duplicates(inplace=True)
print(len(tss_map_dna))
print(len(tss_map_dna["comp_id"].unique()))

tss_map_mpra_human.drop_duplicates(inplace=True)
print(len(tss_map_mpra_human))
print(len(tss_map_mpra_human["comp_id"].unique()))

tss_map_mpra_mouse.drop_duplicates(inplace=True)
print(len(tss_map_mpra_mouse))
print(len(tss_map_mpra_mouse["comp_id"].unique()))

tss_map_mpra_native.drop_duplicates(inplace=True)
print(len(tss_map_mpra_native))
print(len(tss_map_mpra_native["comp_id"].unique()))

3279
3279
3279
3279
3279
3279
3279
3279


## 4. pair positive controls together to serve as negative controls
for each down-sampled control element (there are 4), randomly choose 100 pairs to serve as human/mouse

In [26]:
ctrl_ids = rna_counts_ctrl.element.unique()
ctrl_ids[0:5]

array([ 'AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG__samp1',
       'AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG__samp2',
       'AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG__samp3',
       'AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG__samp4',
       'AGTTCCGCTTACATAACTTACGGTAAATGGCCCGCCTGGCTGACCGCCCAACGACCCCCGCCCATTGACGTCAATAATGACGTATGTTCCCATAGTAACGCCAATAGGGACTTTCCATTGACGTCAATGGGTGGAGTATTTACG__samp5'], dtype=object)

In [27]:
ctrl_seqs = set([x.split("__")[0] for x in ctrl_ids])
samp_ids = set([x.split("__")[1] for x in ctrl_ids])

In [28]:
all_samp_id_pairs = list(itertools.combinations(samp_ids, 2))
all_samp_id_pairs_str = ["%s__%s" % (x[0], x[1]) for x in all_samp_id_pairs]
all_samp_id_pairs_str[0:5]

['samp43__samp61',
 'samp43__samp73',
 'samp43__samp37',
 'samp43__samp48',
 'samp43__samp28']

In [29]:
sampled_samp_id_pairs = np.random.choice(all_samp_id_pairs_str, size=100)
sampled_samp_id_pairs[0:5]

array(['samp2__samp19', 'samp41__samp34', 'samp19__samp18',
       'samp83__samp97', 'samp58__samp82'],
      dtype='<U15')

In [30]:
neg_ctrls_dna = pd.DataFrame()
neg_ctrls_human = pd.DataFrame()
neg_ctrls_mouse = pd.DataFrame()
neg_ctrls_native = pd.DataFrame()

for i, seq in enumerate(ctrl_seqs):
    print("ctrl #: %s" % (i+1))
    
    for j, samp_id_pair in enumerate(sampled_samp_id_pairs):
        if j % 50 == 0:
            print("...samp pair #: %s" % (j+1))
            
        samp1 = samp_id_pair.split("__")[0] # arbitrarily call 'human' seq
        samp2 = samp_id_pair.split("__")[1] # arbitrarily call 'mouse' seq
        
        human_elem = "%s__%s" % (seq, samp1)
        mouse_elem = "%s__%s" % (seq, samp2)
        
        human_sub_dna = dna_counts_ctrl[dna_counts_ctrl["element"] == human_elem]
        mouse_sub_dna = dna_counts_ctrl[dna_counts_ctrl["element"] == mouse_elem]
        
        human_sub_rna = rna_counts_ctrl[rna_counts_ctrl["element"] == human_elem]
        mouse_sub_rna = rna_counts_ctrl[rna_counts_ctrl["element"] == mouse_elem]
        
        # re-name columns w/ species name
        human_dna_cols = ["element"]
        mouse_dna_cols = ["element"]
        human_rna_cols = ["element"]
        mouse_rna_cols = ["element"]
        
        human_dna_cols.extend(["%s__seq:human" % x for x in human_sub_dna.columns if x != "element"])
        mouse_dna_cols.extend(["%s__seq:mouse" % x for x in mouse_sub_dna.columns if x != "element"])
        
        human_rna_cols.extend(["%s__seq:human" % x for x in human_sub_rna.columns if x != "element"])
        mouse_rna_cols.extend(["%s__seq:mouse" % x for x in mouse_sub_rna.columns if x != "element"])
        
        human_sub_dna.columns = human_dna_cols
        mouse_sub_dna.columns = mouse_dna_cols
        human_sub_rna.columns = human_rna_cols
        mouse_sub_rna.columns = mouse_rna_cols
        
        # add comp_id to each df
        comp_id = "CONTROL:%s__SAMP_PAIR:%s" % ((i+1), (j+1))
        human_sub_dna["comp_id"] = comp_id
        mouse_sub_dna["comp_id"] = comp_id
        human_sub_rna["comp_id"] = comp_id
        mouse_sub_rna["comp_id"] = comp_id
        
        # merge each df into 1
        human_sub_dna.drop("element", axis=1, inplace=True)
        mouse_sub_dna.drop("element", axis=1, inplace=True)
        human_sub_rna.drop("element", axis=1, inplace=True)
        mouse_sub_rna.drop("element", axis=1, inplace=True)
        
        sub_dna = human_sub_dna.merge(mouse_sub_dna, on="comp_id")
        sub_rna = human_sub_rna.merge(mouse_sub_rna, on="comp_id")
        
        # subset rna appropriately into each negative control bucket
        sub_rna_human_cols = [x for x in sub_rna.columns if x == "comp_id" or "HUES64" in x]
        sub_rna_mouse_cols = [x for x in sub_rna.columns if x == "comp_id" or "mESC" in x]
        sub_rna_native_cols = [x for x in sub_rna.columns if x == "comp_id" or ("HUES64" in x and "human" in x) or ("mESC" in x and "mouse" in x)]
        
        sub_rna_human = sub_rna[sub_rna_human_cols]
        sub_rna_mouse = sub_rna[sub_rna_mouse_cols]
        sub_rna_native = sub_rna[sub_rna_native_cols]
        
        # append
        neg_ctrls_dna = neg_ctrls_dna.append(sub_dna)
        neg_ctrls_human = neg_ctrls_human.append(sub_rna_human)
        neg_ctrls_mouse = neg_ctrls_mouse.append(sub_rna_mouse)
        neg_ctrls_native = neg_ctrls_native.append(sub_rna_native)

ctrl #: 1
...samp pair #: 1
...samp pair #: 51
ctrl #: 2
...samp pair #: 1
...samp pair #: 51
ctrl #: 3
...samp pair #: 1
...samp pair #: 51
ctrl #: 4
...samp pair #: 1
...samp pair #: 51


In [31]:
all_dna = tss_map_dna.append(neg_ctrls_dna)
all_dna.set_index("comp_id", inplace=True)
len(all_dna)

3679

In [32]:
all_rna_human = tss_map_mpra_human.append(neg_ctrls_human)
all_rna_human.set_index("comp_id", inplace=True)
len(all_rna_human)

3679

In [33]:
all_rna_mouse = tss_map_mpra_mouse.append(neg_ctrls_mouse)
all_rna_mouse.set_index("comp_id", inplace=True)
len(all_rna_mouse)

3679

In [34]:
all_rna_native = tss_map_mpra_native.append(neg_ctrls_native)
all_rna_native.set_index("comp_id", inplace=True)
len(all_rna_native)

3679

In [35]:
# also make file w/ everything together to test interactions!
tmp_human = all_rna_human.reset_index()
tmp_mouse = all_rna_mouse.reset_index()
all_rna = tmp_human.merge(tmp_mouse, on="comp_id")
all_cols = all_rna.columns
all_rna.set_index("comp_id", inplace=True)
len(all_rna)

3679

## 5. make annotation files

In [36]:
dna_col_ann = {}
human_col_ann = {}
mouse_col_ann = {}
native_col_ann = {}
all_col_ann = {}

for cols, ann in zip([all_dna_cols, human_cols, mouse_cols, native_cols, all_cols], 
                     [dna_col_ann, human_col_ann, mouse_col_ann, native_col_ann, all_col_ann]):
    for col in cols:
        if col == "comp_id":
            continue
        samp = col.split("__")[0].split("_")[-1][-1]
        cond = col.split(":")[1].split("_")[0]
        barc = col.split(":")[2].split("_")[0]
        seq = col.split(":")[-1]
        ann[col] = {"sample": samp, "condition": cond, "barcode": barc, "seq": seq}

dna_col_ann = pd.DataFrame.from_dict(dna_col_ann, orient="index")
human_col_ann = pd.DataFrame.from_dict(human_col_ann, orient="index")
mouse_col_ann = pd.DataFrame.from_dict(mouse_col_ann, orient="index")
native_col_ann = pd.DataFrame.from_dict(native_col_ann, orient="index")
all_col_ann = pd.DataFrame.from_dict(all_col_ann, orient="index")
native_col_ann.sample(5)

Unnamed: 0,sample,condition,barcode,seq
samp:HUES64_rep1__barc:12__seq:human,1,HUES64,12,human
samp:HUES64_rep2__barc:3__seq:human,2,HUES64,3,human
samp:HUES64_rep2__barc:9__seq:human,2,HUES64,9,human
samp:HUES64_rep1__barc:1__seq:human,1,HUES64,1,human
samp:HUES64_rep1__barc:4__seq:human,1,HUES64,4,human


In [37]:
all_col_ann.sample(5)

Unnamed: 0,sample,condition,barcode,seq
samp:HUES64_rep1__barc:12__seq:human,1,HUES64,12,human
samp:mESC_rep1__barc:7__seq:mouse,1,mESC,7,mouse
samp:mESC_rep1__barc:8__seq:human,1,mESC,8,human
samp:mESC_rep1__barc:6__seq:human,1,mESC,6,human
samp:HUES64_rep2__barc:3__seq:mouse,2,HUES64,3,mouse


## 6. make control ID files

In [38]:
ctrls = all_rna.reset_index()[["comp_id", "samp:HUES64_rep1__barc:10__seq:human"]]
ctrls["ctrl_status"] = ctrls.apply(ctrl_status, axis=1)
ctrls.drop("samp:HUES64_rep1__barc:10__seq:human", axis=1, inplace=True)
ctrls.ctrl_status.value_counts()

False    3279
True      400
Name: ctrl_status, dtype: int64

In [39]:
ctrls.head()

Unnamed: 0,comp_id,ctrl_status
0,h.2__div_lnc__m.1__div_lnc,False
1,h.3__div_lnc__m.2__antisense,False
2,h.4__antisense_upep__m.3__protein_coding,False
3,h.5__intergenic__m.4__intergenic_upep,False
4,h.6__intergenic__m.4__intergenic_upep,False


## 7. write seq comparison files

In [40]:
dna_col_ann.to_csv("%s/dna_col_ann.all_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")
human_col_ann.to_csv("%s/HUES64_col_ann.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")
mouse_col_ann.to_csv("%s/mESC_col_ann.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")
native_col_ann.to_csv("%s/native_col_ann.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")
all_col_ann.to_csv("%s/all_col_ann.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")

ctrls.to_csv("%s/ctrl_status.all_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=False)

all_dna.to_csv("%s/dna_counts.all_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
all_rna_human.to_csv("%s/HUES64_rna_counts.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
all_rna_mouse.to_csv("%s/mESC_rna_counts.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
all_rna_native.to_csv("%s/native_rna_counts.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
all_rna.to_csv("%s/all_rna_counts.seq_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)

# then make files for cell line comparisons (trans effects)

## 1. run trans effects separately for human seqs & mouse seqs, so subset counts dataframe

In [41]:
human_columns = [x for x in all_rna.columns if "seq:human" in x]
mouse_columns = [x for x in all_rna.columns if "seq:mouse" in x]

In [42]:
human_trans = all_rna[human_columns]
mouse_trans = all_rna[mouse_columns]

In [43]:
print(len(human_trans))

3679


In [44]:
print(len(mouse_trans))

3679


## 2. subset annotation dataframe

In [45]:
tmp = all_col_ann.reset_index()
tmp.head()

Unnamed: 0,index,sample,condition,barcode,seq
0,samp:HUES64_rep1__barc:10__seq:human,1,HUES64,10,human
1,samp:HUES64_rep1__barc:10__seq:mouse,1,HUES64,10,mouse
2,samp:HUES64_rep1__barc:11__seq:human,1,HUES64,11,human
3,samp:HUES64_rep1__barc:11__seq:mouse,1,HUES64,11,mouse
4,samp:HUES64_rep1__barc:12__seq:human,1,HUES64,12,human


In [46]:
human_trans_col_ann = tmp[tmp["index"].isin(human_columns)].set_index("index")
del human_trans_col_ann.index.name
human_trans_col_ann.sample(5)

Unnamed: 0,sample,condition,barcode,seq
samp:HUES64_rep2__barc:12__seq:human,2,HUES64,12,human
samp:HUES64_rep2__barc:2__seq:human,2,HUES64,2,human
samp:HUES64_rep2__barc:9__seq:human,2,HUES64,9,human
samp:HUES64_rep2__barc:4__seq:human,2,HUES64,4,human
samp:mESC_rep1__barc:4__seq:human,1,mESC,4,human


In [47]:
mouse_trans_col_ann = tmp[tmp["index"].isin(mouse_columns)].set_index("index")
del mouse_trans_col_ann.index.name
mouse_trans_col_ann.sample(5)

Unnamed: 0,sample,condition,barcode,seq
samp:HUES64_rep2__barc:2__seq:mouse,2,HUES64,2,mouse
samp:HUES64_rep2__barc:6__seq:mouse,2,HUES64,6,mouse
samp:mESC_rep1__barc:11__seq:mouse,1,mESC,11,mouse
samp:HUES64_rep2__barc:7__seq:mouse,2,HUES64,7,mouse
samp:HUES64_rep2__barc:5__seq:mouse,2,HUES64,5,mouse


In [48]:
print(len(human_columns))
print(len(human_trans_col_ann))
print(len(mouse_columns))
print(len(mouse_trans_col_ann))

39
39
39
39


## 3. write cell comparison files

In [49]:
human_trans_col_ann.to_csv("%s/human_col_ann.cell_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")
mouse_trans_col_ann.to_csv("%s/mouse_col_ann.cell_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t")

human_trans.to_csv("%s/human_rna_counts.cell_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)
mouse_trans.to_csv("%s/mouse_rna_counts.cell_comp.mpranalyze.txt" % mpranalyze_dir, sep="\t", index=True)