In [1]:
import pandas as pd

from datapaths import *

pd.set_option("display.max_rows", 6)


In [2]:
def get_group(x: list):
    ccres = ["dELS", "pELS", "PLS", "DNase-H3K4me3"]
    any_ccre = False
    for ccre in ccres:
        if ccre in x[0]:
            any_ccre = True
            break
    any_ctcf = "CTCF" in x[0]
    any_reps = x[1] != ""

    return (
        " & ".join([any_ccre * "cCRE", any_reps * "LINE/LTR", any_ctcf * "CTCF"])
        .replace("&  &", "&")
        .strip()
    )


def get_rep_cats(x: str):
    return " + ".join(sorted(set([y.strip() for y in x.split(",") if y])))

In [3]:
flipon_to_data = (
    pd.read_table(F_FLIPON_TO_DATA_MIRNA).fillna('')
    .replace(regex=r"(@(.+?), )|(@(.+?)$)", value=",")
    .assign(
        ccre_col=lambda x: x["cCRE (+-200bp)"].apply(get_rep_cats),
        rep_col=lambda x: x["LINE/LTR (+-200bp)"].apply(get_rep_cats),
        col=lambda x: (x["ccre_col"] + " + " + x["rep_col"]).str.strip(" +"),
        feature_group=lambda x: x[["ccre_col", "rep_col"]]
        .apply(get_group, axis=1)
        .str.strip("& "),
    )
)

flipon_to_data


Unnamed: 0,Flipon,Coordinates,Gene Name,Gene Strand,Gene Feature,miRNA (+),miRNA (-),miRNA (intersection),cCRE (+-200bp),LINE/LTR (+-200bp),ccre_col,rep_col,col,feature_group
0,g4,chr1:3014794-3014871,4933401J01Rik,+,Distal Intergenic,miR-328 (1),,,,"LINE,LINE,",,LINE,LINE,LINE/LTR
1,g4,chr1:3099888-3099963,Gm26206,+,Promoter (2-3kb),,miR-328 (1),,,"LINE,LTR,",,LINE + LTR,LINE + LTR,LINE/LTR
2,g4,chr1:3535948-3535996,Gm7341,+,Intron,miR-129 (1),,,,"LINE,LINE,",,LINE,LINE,LINE/LTR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31687,z-dna,chrY:1049956-1049970,Gm29650,+,Promoter (1-2kb),miR-455 (1),,,,"LTR,LTR,",,LTR,LTR,LINE/LTR
31688,z-dna,chrY:2227060-2227083,Uba1y-ps2,+,Distal Intergenic,,miR-377 (1),,,,,,,
31689,z-dna,chrY:2495414-2495442,Gm22119,-,Distal Intergenic,,miR-377 (1),,,"LINE,",,LINE,LINE,LINE/LTR


In [4]:
# kwargs = {'LINE/LTR (+-200bp)': pd.read_table(F_FLIPON_TO_DATA_MIRNA).fillna('')['LINE/LTR (+-200bp)']}
# flipon_to_data.assign(**kwargs)[
#     (
#         flipon_to_data["miRNA (+)"].str.contains("miR-203")
#         | flipon_to_data["miRNA (-)"].str.contains("miR-203")
#     )
#     & (flipon_to_data["rep_col"] != "")
# ][['Flipon', 'Coordinates', 'miRNA (+)', 'miRNA (-)', 'Gene Feature', 'Gene Strand', 'LINE/LTR (+-200bp)']]


In [5]:
# flipon_to_data[(flipon_to_data['group']=="g4") & (flipon_to_data['miRNA (+ strand)'].str.contains('miR-22')) & (flipon_to_data['miRNA (- strand)'].str.contains('miR-744'))]

In [6]:
data_of_interest = [
    ('z-dna', 'miR-147', 'Promoter (<=1kb)'),
    ('z-dna', 'miR-210', 'Promoter (<=1kb)'),
    ('z-dna', 'miR-329/362', 'Promoter (<=1kb)'),
    ('z-dna', 'miR-744', 'Promoter (<=1kb)'),
    ('g4', 'miR-532', 'Promoter (<=1kb)'),
    ('g4', 'miR-491', 'Promoter (<=1kb)'),
    ('g4', 'miR-423', 'Promoter (<=1kb)'),
    ('g4', 'miR-296', 'Promoter (<=1kb)'),
    ('g4', 'miR-744', 'Promoter (<=1kb)'),
]

In [7]:
gene_lists = pd.DataFrame()

for group, mirna, annotation in data_of_interest:
    print(group, mirna)
    gene_list = flipon_to_data[
        (flipon_to_data["Flipon"] == group)
        & (
            (flipon_to_data["miRNA (+)"].str.contains(mirna))
            | (flipon_to_data["miRNA (-)"].str.contains(mirna))
        )
        & (flipon_to_data["Gene Feature"] == annotation)
    ]['Gene Name'].unique()

    temp_df = pd.DataFrame({
        'Flipon' : group,
        'miRNA' : mirna,
        'Gene Feature': annotation,
        'Feature group': 'any',
        '# of Genes': len(gene_list),
        'Gene List': [','.join(map(str, sorted(gene_list)))]
    })
    
    gene_lists = pd.concat([gene_lists, temp_df], ignore_index=True)

gene_lists


z-dna miR-147
z-dna miR-210
z-dna miR-329/362
z-dna miR-744
g4 miR-532
g4 miR-491
g4 miR-423
g4 miR-296
g4 miR-744


Unnamed: 0,Flipon,miRNA,Gene Feature,Feature group,# of Genes,Gene List
0,z-dna,miR-147,Promoter (<=1kb),any,117,"1200007C13Rik,5330417C22Rik,6430548M08Rik,A830..."
1,z-dna,miR-210,Promoter (<=1kb),any,326,"1700016J18Rik,1700025G04Rik,1700041B01Rik,1700..."
2,z-dna,miR-329/362,Promoter (<=1kb),any,158,"1700025L06Rik,1700067K01Rik,2010009K17Rik,5033..."
...,...,...,...,...,...,...
6,g4,miR-423,Promoter (<=1kb),any,341,"1200007C13Rik,1700016P03Rik,1700018A04Rik,1700..."
7,g4,miR-296,Promoter (<=1kb),any,273,"2900026A02Rik,4930430O22Rik,4930478K11Rik,4930..."
8,g4,miR-744,Promoter (<=1kb),any,664,"1700030C10Rik,1700039E22Rik,2510009E07Rik,4930..."


In [8]:
for feature_group in sorted(flipon_to_data['feature_group'].unique()):
    for flipon_group in sorted(flipon_to_data['Flipon'].unique()):

        gene_list = flipon_to_data[
            (flipon_to_data["Flipon"] == flipon_group)
            & (flipon_to_data["feature_group"] == feature_group)
            & (flipon_to_data['Gene Feature']=="Promoter (<=1kb)")
        ]
        
        if len(gene_list):
            gene_list=gene_list['Gene Name'].unique()
        else:
            gene_list = list()

        temp_df = pd.DataFrame({
            'Flipon' : flipon_group,
            'miRNA' : 'any',
            'Gene Feature': 'Promoter (<=1kb)',
            'Feature group': feature_group,
            '# of Genes': len(gene_list),
            'Gene List': [','.join(map(str, sorted(gene_list)))]
        })
        
        gene_lists = pd.concat([gene_lists, temp_df], ignore_index=True)

gene_lists

Unnamed: 0,Flipon,miRNA,Gene Feature,Feature group,# of Genes,Gene List
0,z-dna,miR-147,Promoter (<=1kb),any,117,"1200007C13Rik,5330417C22Rik,6430548M08Rik,A830..."
1,z-dna,miR-210,Promoter (<=1kb),any,326,"1700016J18Rik,1700025G04Rik,1700041B01Rik,1700..."
2,z-dna,miR-329/362,Promoter (<=1kb),any,158,"1700025L06Rik,1700067K01Rik,2010009K17Rik,5033..."
...,...,...,...,...,...,...
38,h-dna,any,Promoter (<=1kb),cCRE & LINE/LTR & CTCF,3,"Akain1,Gm28960,Lmo1"
39,sidd,any,Promoter (<=1kb),cCRE & LINE/LTR & CTCF,10,"Col27a1,Gm13429,Gm19989,Gm39463,Gm7527,Lap3,Rg..."
40,z-dna,any,Promoter (<=1kb),cCRE & LINE/LTR & CTCF,15,"2700033N17Rik,Asic1,Bmp3,Gm49890,Gm6623,Il17d,..."


In [9]:
gene_lists.to_csv(D_TABLES / "gene_lists.tsv", sep='\t', index=False)

## Strands

### G4

In [9]:
mir296_gene_list = "BARHL2, COL13A1, DHH, DLX6, DOCK7, TWIST1, HOXD13, HOXA13, HOXA10, ARHGAP22, NEUROD2, DMBT1, SLIT1, MSX1, OTX1, PITX1, NKX2-4, AMN, WNT3A, ODF3, DAB2IP, SHROOM2, EBF4, BMP6, TBX18, TLX3, ETL4, HOXB7, KCTD15, VSX2".split(", ")
mir_dev_gene_list = "SEMA5A, DRAXIN, BARHL2, WNT2B, SERPINE2, CTNND2, DOCK7, IHH, CELSR1, HOXA13, FGF3, IGF1R, HOXA10, DMBX1, SOX6, HOXA6, DACT1, HOXA4, NKX2-9, EMX1, SHROOM3, PAX3, EBF3, WNT9A, EBF4, NPNT, HIC1, OVOL2, TMEFF1, MN1, OLFM1, RIPPLY3, SFRP1, TRNP1, KCTD15, CILK1, CRB2, DHH, HOXD1, PDGFB, TWIST1, ASCL1, NKD1, ARHGAP22, SCUBE2, ARVCF, ERBB3, FRZB, NSD1, SLIT3, NKX2-4, WNT4, NKX2-2, TBX1, NTRK1, FZD2, WNT3A, CDX1, RYK, TDRD7, TDRD5, LBX1, VAX2, BMP6, VANGL1, HEYL, DAB1, BMP1, FEV, ZFP64, PLXNB2, FGFR3".split(", ")

In [10]:
def get_mirna_strands(s: pd.Series):
    mirna_strand: str
    nans = ~s.replace('', None).isna()
    
    if nans['miRNA (+ strand)'] and nans['miRNA (- strand)']:
        mirna_strand = 'both'
    elif nans['miRNA (+ strand)']:
        mirna_strand = '+'
    elif nans['miRNA (- strand)']:
        mirna_strand = '-'

    if mirna_strand == 'both':
        return 'both strands'
    if mirna_strand == s['gene_strand']:
        return 'same strand'
    return 'other strand'


flipon_to_data_gi = flipon_to_data.loc[:,['coordinates', 'group', 'miRNA (+ strand)', 'miRNA (- strand)', 'annotation']]
flipon_to_data_gi[['gene_name', 'gene_strand']] = flipon_to_data['gene_info'].str.split(' ', expand=True).values
flipon_to_data_gi['gene_strand'] = flipon_to_data_gi['gene_strand'].str.replace(')','').str.replace('(','')
flipon_to_data_gi['mirna_strands'] = flipon_to_data_gi[['gene_strand', 'miRNA (+ strand)', 'miRNA (- strand)']].apply(get_mirna_strands, axis=1)

flipon_to_data_gi

  flipon_to_data_gi['gene_strand'] = flipon_to_data_gi['gene_strand'].str.replace(')','').str.replace('(','')


Unnamed: 0,coordinates,group,miRNA (+ strand),miRNA (- strand),annotation,gene_name,gene_strand,mirna_strands
0,chr10:100119606-100119743,sidd,"miR-539, miR-670","miR-143, miR-186, miR-30",Distal Intergenic,Gm22918,+,both strands
1,chr10:100146768-100146986,sidd,"miR-155, miR-374",miR-448,Promoter (2-3kb),Gm25287,+,both strands
2,chr10:100160840-100160994,sidd,"miR-188, miR-204/211","miR-203a, miR-653",Distal Intergenic,Gm25287,+,both strands
...,...,...,...,...,...,...,...,...
31683,chrY:3773694-3773741,h-dna,,miR-185,Promoter (1-2kb),Gm3376,+,other strand
31684,chrY:3865190-3865212,h-dna,miR-122,,Distal Intergenic,Gm8521,+,same strand
31685,chrY:3879642-3879666,h-dna,miR-185,,Distal Intergenic,Gm18177,-,other strand


In [18]:
def get_gene_flipon_strands(s: pd.Series):
    if s['gene_strand'] == s['strand']:
        return 'same strand'
    if s['gene_strand'] != s['strand']:
        return 'other strand'


g4_anno = pd.read_table(
    d_data / "nonB_DNA_ssDNA_enriched" / "mouse_mm10" / "actB_ssDNA_enriched_G4.bed",
    header=None,
    names=["chr", "start", "end", "name", "score", "strand"],
).assign(
    coordinates=lambda x: x["chr"]
    + ":"
    + x["start"].astype(str)
    + "-"
    + x["end"].astype(str)
)

t = flipon_to_data_gi[
    (flipon_to_data_gi["group"] == "g4")
    & (flipon_to_data_gi["annotation"].str.contains("Promoter"))
    & (flipon_to_data_gi["gene_name"].str.upper().isin(mir296_gene_list))
].merge(g4_anno, how='left', on='coordinates').assign(gene_flipon_strand = lambda x: x[['gene_strand', 'strand']].apply(get_gene_flipon_strands, axis=1))

with pd.option_context('display.max_rows', None):
    display(t.replace('+','\'+').assign(gene_list='mir296_gene_list')[['gene_list', 'coordinates', 'annotation', 'gene_name', 'gene_strand', 'strand']])

same, other, both = 0, 0, 0
for idx, group in t.groupby('gene_name'):
    if len(group['gene_flipon_strand'].unique()) == 2:
        both += 1
    else:
        if group['gene_flipon_strand'].values[0] == 'same strand':
            same += 1
        else:
            other += 1

print(f'same: {same}, other: {other}, both: {both}')

Unnamed: 0,gene_list,coordinates,annotation,gene_name,gene_strand,strand
0,mir296_gene_list,chr10:61978558-61978585,Promoter (<=1kb),Col13a1,-,-
1,mir296_gene_list,chr11:22001748-22001784,Promoter (<=1kb),Otx1,-,-
2,mir296_gene_list,chr11:22001916-22001944,Promoter (<=1kb),Otx1,-,-
3,mir296_gene_list,chr11:33201050-33201084,Promoter (2-3kb),Tlx3,-,'+
4,mir296_gene_list,chr11:33203301-33203328,Promoter (<=1kb),Tlx3,-,-
5,mir296_gene_list,chr11:33203462-33203497,Promoter (<=1kb),Tlx3,-,'+
6,mir296_gene_list,chr11:59290855-59290882,Promoter (<=1kb),Wnt3a,-,'+
7,mir296_gene_list,chr11:59290905-59290954,Promoter (<=1kb),Wnt3a,-,'+
8,mir296_gene_list,chr11:96285479-96285505,Promoter (1-2kb),Hoxb7,'+,'+
9,mir296_gene_list,chr11:96287363-96287392,Promoter (<=1kb),Hoxb7,'+,-


same: 10, other: 7, both: 13


In [20]:
t = flipon_to_data_gi[
    (flipon_to_data_gi["group"] == "g4")
    & (flipon_to_data_gi["annotation"].str.contains("Promoter"))
    & (flipon_to_data_gi["gene_name"].str.upper().isin(mir_dev_gene_list))
].merge(g4_anno, how='left', on='coordinates').assign(gene_flipon_strand = lambda x: x[['gene_strand', 'strand']].apply(get_gene_flipon_strands, axis=1))

with pd.option_context('display.max_rows', None):
    display(t.replace('+', '\'+').assign(gene_list='mir_dev_gene_list')[['gene_list', 'coordinates', 'annotation', 'gene_name', 'gene_strand', 'strand']])

same, other, both = 0, 0, 0
for idx, group in t.groupby('gene_name'):
    if len(group['gene_flipon_strand'].unique()) == 2:
        both += 1
    else:
        if group['gene_flipon_strand'].values[0] == 'same strand':
            same += 1
        else:
            other += 1

print(f'same: {same}, other: {other}, both: {both}')

Unnamed: 0,gene_list,coordinates,annotation,gene_name,gene_strand,strand
0,mir_dev_gene_list,chr10:128589791-128589826,Promoter (<=1kb),Erbb3,-,-
1,mir_dev_gene_list,chr10:87493713-87493742,Promoter (<=1kb),Ascl1,-,-
2,mir_dev_gene_list,chr11:102604438-102604463,Promoter (<=1kb),Fzd2,'+,'+
3,mir_dev_gene_list,chr11:102604738-102604771,Promoter (<=1kb),Fzd2,'+,-
4,mir_dev_gene_list,chr11:35121042-35121066,Promoter (<=1kb),Slit3,'+,'+
5,mir_dev_gene_list,chr11:35121348-35121377,Promoter (<=1kb),Slit3,'+,-
6,mir_dev_gene_list,chr11:59290855-59290882,Promoter (<=1kb),Wnt3a,-,'+
7,mir_dev_gene_list,chr11:59290905-59290954,Promoter (<=1kb),Wnt3a,-,'+
8,mir_dev_gene_list,chr11:59307096-59307128,Promoter (<=1kb),Wnt9a,'+,'+
9,mir_dev_gene_list,chr11:59308205-59308232,Promoter (1-2kb),Wnt9a,'+,-


same: 32, other: 15, both: 25


### Other

In [48]:
# 1
flipon_to_data_gi[
    (flipon_to_data_gi["group"] == "g4")
    & (flipon_to_data_gi['annotation'].str.contains('Promoter'))
    & (flipon_to_data_gi["gene_name"].str.upper().isin(mir296_gene_list))
]['mirna_strands'].value_counts()


other strand    23
same strand     23
both strands    15
Name: mirna_strands, dtype: int64

In [49]:
# 2
flipon_to_data_gi[
    (flipon_to_data_gi["group"].isin(["g4", 'z-dna']))
    & (flipon_to_data_gi['annotation'].str.contains('Promoter'))
    & (flipon_to_data_gi["gene_name"].str.upper().isin(mir_dev_gene_list))
][['group', 'mirna_strands']].value23_counts()


group  mirna_strands
g4     same strand      71
       other strand     42
       both strands     23
z-dna  other strand     15
       same strand      14
       both strands      3
dtype: int64

In [62]:
import tempfile

path_to_zdna = d_data / "mm10_kouzine_ssDNA_bed" / "mm10_kouzine_actb_ssdna_enriched_z-dna.bed"
path_to_g4 = d_data / "mm10_kouzine_ssDNA_bed" / "mm10_kouzine_actb_ssdna_enriched_quadruplex.bed"

with tempfile.TemporaryDirectory() as tmpdir:
    zdna_and_g4 = f"{tmpdir}/zdna_and_g4.bed"
    g4_and_zdna = f"{tmpdir}/g4_and_zdna.bed"

    !bedtools intersect -a {path_to_zdna} -b {path_to_g4} -u > {zdna_and_g4}
    !bedtools intersect -b {path_to_zdna} -a {path_to_g4} -u > {g4_and_zdna}

    intersection_coords = pd.concat([
        pd.read_table(zdna_and_g4, header=None),
        pd.read_table(g4_and_zdna, header=None),
    ])[3].values

    intersection = flipon_to_data_gi[flipon_to_data_gi['coordinates'].isin(intersection_coords)]
    display(intersection[
        (intersection['annotation'].str.contains('Promoter'))
        & (intersection["gene_name"].str.upper().isin(mir_dev_gene_list))
    ][['mirna_strands']].value_counts()
    )

mirna_strands
other strand     9
same strand      9
both strands     1
dtype: int64