# Обогащение R-loops

## 1. Читаем из файла

In [3]:
import pandas as pd
import numpy as np

In [4]:
loops_info = pd.read_excel('../data/dicty_loop_positions_Chr1_Chr6.xlsx')
window = 2000
loops_info.head()

Unnamed: 0,Chr,ID,"Genomic bin, Left base","Genomic bin, Right base",Size (Kb),"""Extrusion track""","Note for extrusion tracks: 0=n/a, 1=from the left base, 2=from the right base"
0,1,1,107,114,16,0,
1,1,2,114,123,20,1,
2,1,3,176,187,24,2,
3,1,4,187,194,16,0,
4,1,5,194,209,32,2,


In [5]:
chr_pattern = 'chr'
chr_with_loops = sorted(loops_info.Chr.unique())
chr_with_loops = [chr_pattern + str(chr) for chr in chr_with_loops]
chr_with_loops

['chr1', 'chr6']

### 1.1 Позиции R-loops в хромосомах, где есть петли

In [49]:
chr_r_loop_pos = {}
genome_occurs = 0
lines = []
with open('../data/R-loop/r_loops.out.bed') as inp:
    for line in inp:
        line = line.replace('\n', '')   
        row = line.split('\t')
        chr_name = row[0]
        if not(chr_name in chr_with_loops):
            continue
        start, end = int(row[1]) / window, int(row[2]) / window
        chr_r_loop_pos[chr_name] = chr_r_loop_pos.get(chr_name, {})
        chr_r_loop_pos[chr_name][start] = chr_r_loop_pos[chr_name].get(start, 0) + 1
        genome_occurs = genome_occurs + 1
        lines.append([chr_name, int(row[1]), int(row[2])])

In [23]:
chr_r_loop_pos

{'chr1': {338: 1, 520: 3, 584: 1}, 'chr6': {1366: 14, 1395: 3, 1682: 11}}

### 1.2 Схлопываем overlaps

In [66]:
lines_chr1 = [ar for ar in lines if ar[0] == 'chr1']
lines_chr6 = [ar for ar in lines if ar[0] == 'chr6']

In [56]:
lines_chr1.sort(key = lambda ar: ar[1])
lines_chr6.sort(key = lambda ar: ar[1])

In [58]:
lines = lines_chr1 + lines_chr6
lines

[['chr1', 676313, 676429],
 ['chr1', 1040105, 1040259],
 ['chr1', 1040105, 1040263],
 ['chr1', 1040123, 1040256],
 ['chr1', 1168811, 1168934],
 ['chr6', 2732148, 2733223],
 ['chr6', 2732220, 2733183],
 ['chr6', 2732292, 2733105],
 ['chr6', 2732437, 2733034],
 ['chr6', 2732505, 2733004],
 ['chr6', 2732527, 2733003],
 ['chr6', 2732595, 2732881],
 ['chr6', 2732595, 2732881],
 ['chr6', 2732690, 2732881],
 ['chr6', 2732694, 2732881],
 ['chr6', 2732804, 2733034],
 ['chr6', 2732808, 2733034],
 ['chr6', 2732852, 2733011],
 ['chr6', 2732852, 2733011],
 ['chr6', 2790735, 2790894],
 ['chr6', 2790735, 2790894],
 ['chr6', 2790758, 2790894],
 ['chr6', 3364852, 3365309],
 ['chr6', 3364852, 3365313],
 ['chr6', 3364886, 3365281],
 ['chr6', 3364886, 3365287],
 ['chr6', 3364922, 3365260],
 ['chr6', 3364922, 3365273],
 ['chr6', 3364966, 3365248],
 ['chr6', 3364974, 3365212],
 ['chr6', 3364979, 3365206],
 ['chr6', 3365023, 3365179],
 ['chr6', 3365023, 3365196]]

In [65]:
with open('../data/R-loop/r_loops_chr1_6.bed', 'w') as out:
    for line in lines:
        s = '\t'.join(str(l) for l in line) + '\n'
        out.write(s)

In [67]:
!bedtools merge -i '../data/R-loop/r_loops_chr1_6.bed' > '../data/R-loop/r_loops_merged.out.bed'

In [68]:
chr_r_loop_merged_pos = {}
genome_occurs = 0
with open('../data/R-loop/r_loops_merged.out.bed') as inp:
    for line in inp:
        line = line.replace('\n', '')   
        row = line.split('\t')
        chr_name = row[0]
        if not(chr_name in chr_with_loops):
            continue
        start, end = int(row[1]) / window, int(row[2]) / window
        chr_r_loop_merged_pos[chr_name] = chr_r_loop_merged_pos.get(chr_name, {})
        chr_r_loop_merged_pos[chr_name][start] = chr_r_loop_merged_pos[chr_name].get(start, 0) + 1
        genome_occurs = genome_occurs + 1

In [69]:
chr_r_loop_merged_pos

{'chr1': {338: 1, 520: 1, 584: 1}, 'chr6': {1366: 1, 1395: 1, 1682: 1}}

In [11]:
r_df = pd.DataFrame(columns=['Chr', 'start', 'end'])
for chr_name, d in chr_r_loop_merged_pos.iteritems():
    for start, end in d.iteritems():
        r_df = r_df.append({'Chr': chr_name, 'start': start, 'end': end}, ignore_index=True)

In [12]:
r_df

Unnamed: 0,Chr,start,end
0,chr6,3364852,3365313
1,chr6,2732148,2733223
2,chr6,2790735,2790894
3,chr1,676313,676429
4,chr1,1168811,1168934
5,chr1,1040105,1040263


### 1.3 Сопоставление с позициями петель 

In [81]:
df_rloop = pd.DataFrame(columns=['chr', 'rloop_pos', 'left_loop_cnt', 'right_loop_cnt', 'inner_loop_cnt'])

for chr_name,d in chr_r_loop_pos.iteritems():
    chr_name = int(chr_name[3:])
    l_info = loops_info[loops_info['Chr'] == chr_name] # фильтруем только петли для данной хромосомы
    for rloop_pos,cnt in d.iteritems():
        left_cnt = 0
        right_cnt = 0
        inner_cnt = 0
        for _,row in loops_info.iterrows():
            left = row['Genomic bin, Left base']
            right = row['Genomic bin, Right base']
            if rloop_pos < left - 1:
                break
                
            if (rloop_pos >= left-1) and (rloop_pos <= left+1):
                left_cnt = left_cnt + cnt
            elif (rloop_pos >= right-1) and (rloop_pos <= right+1):
                right_cnt = right_cnt + cnt
            elif (rloop_pos > left+1) and (rloop_pos < right-1):
                inner_cnt = inner_cnt + cnt
        df_rloop = df_rloop.append({'chr': 'chr' + str(chr_name), \
                              'rloop_pos': rloop_pos, \
                              'left_loop_cnt': left_cnt, \
                              'right_loop_cnt': right_cnt, \
                              'inner_loop_cnt': inner_cnt}, ignore_index = True)

In [83]:
df_merged_rloop = pd.DataFrame(columns=['chr', 'rloop_pos', 'left_loop_cnt', 'right_loop_cnt', 'inner_loop_cnt'])

for chr_name,d in chr_r_loop_merged_pos.iteritems():
    chr_name = int(chr_name[3:])
    l_info = loops_info[loops_info['Chr'] == chr_name] # фильтруем только петли для данной хромосомы
    for rloop_pos,cnt in d.iteritems():
        left_cnt = 0
        right_cnt = 0
        inner_cnt = 0
        for _,row in loops_info.iterrows():
            left = row['Genomic bin, Left base']
            right = row['Genomic bin, Right base']
            if rloop_pos < left - 1:
                break
                
            if (rloop_pos >= left-1) and (rloop_pos <= left+1):
                left_cnt = left_cnt + cnt
            elif (rloop_pos >= right-1) and (rloop_pos <= right+1):
                right_cnt = right_cnt + cnt
            elif (rloop_pos > left+1) and (rloop_pos < right-1):
                inner_cnt = inner_cnt + cnt
        df_merged_rloop = df_merged_rloop.append({'chr': 'chr' + str(chr_name), \
                              'rloop_pos': rloop_pos, \
                              'left_loop_cnt': left_cnt, \
                              'right_loop_cnt': right_cnt, \
                              'inner_loop_cnt': inner_cnt}, ignore_index = True)

In [85]:
df_rloop

Unnamed: 0,chr,rloop_pos,left_loop_cnt,right_loop_cnt,inner_loop_cnt
0,chr6,1682,0,0,0
1,chr6,1395,3,0,0
2,chr6,1366,0,0,0
3,chr1,520,3,3,0
4,chr1,584,0,0,0
5,chr1,338,0,0,0


In [84]:
df_merged_rloop

Unnamed: 0,chr,rloop_pos,left_loop_cnt,right_loop_cnt,inner_loop_cnt
0,chr6,1682,0,0,0
1,chr6,1395,1,0,0
2,chr6,1366,0,0,0
3,chr1,520,1,1,0
4,chr1,584,0,0,0
5,chr1,338,0,0,0
