In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [8]:
from itertools import count
import json
import argparse
import os
import random
import pandas as pd
import numpy as np
import pyBigWig

NARROWPEAK_SCHEMA = ["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"]

peak_regions_df = pd.read_csv("/mnt/lab_data2/vir/tf_chr_atlas/temp/docker_modelling/ENCSR142IGM_peaks_inliers.bed.gz", sep='\t', names=NARROWPEAK_SCHEMA)
peak_regions_df['region']='peak'
peak_regions_df['ind']=range(len(peak_regions_df))
nonpeak_regions_df = pd.read_csv("/mnt/lab_data2/vir/tf_chr_atlas/temp/docker_modelling/ENCSR142IGM_gc_neg_only.bed.gz", sep='\t', names=NARROWPEAK_SCHEMA)
nonpeak_regions_df['region']='nonpeak'
nonpeak_regions_df['ind']=range(len(nonpeak_regions_df))


all_regions_df = pd.concat([peak_regions_df,nonpeak_regions_df])
all_regions_df['pos']=all_regions_df['start']+all_regions_df['summit']
all_regions_df.sort_values(by=['chr', 'pos'], inplace=True)
all_regions_df=all_regions_df.reset_index(drop=True)

print("Creating Splits")

group_dict = {}

inputlen=2114
max_jitter=32

cur_chrom = ''
cur_group = ''
last_pos = 0
for index,row in all_regions_df.iterrows():
    if cur_chrom != '':
        if row['chr'] != cur_chrom:
            cur_chrom = row['chr']
            cur_group += 1
            group_dict[cur_group] = [row]
        else:
            if row['pos'] <= int(last_pos) + int(inputlen) + int(2 * max_jitter):
                group_dict[cur_group].append(row)
            else:
                cur_group += 1
                group_dict[cur_group] = [row]
    else:
        cur_chrom = row['chr']
        cur_group = 0
        group_dict[cur_group] = [row]
    last_pos = row['pos']
    
groups = []
group_counts = []
bigwig = '/mnt/lab_data2/vir/tf_chr_atlas/temp/docker_modelling/ENCSR142IGM_plus.bigWig'
bw = pyBigWig.open(bigwig)

for group in group_dict:
    groups.append(group)
    sum = 0
    for element in group_dict[group]:
        labels = bw.values(element['chr'], int(element['pos'] - (inputlen // 2)), int(element['pos'] + (inputlen // 2)))
        labels = np.array(labels)
        labels = np.nan_to_num(labels)
        labels = np.sum(labels)
        sum += labels
    group_counts.append(sum)
group_df = pd.DataFrame({'groups': groups, 'group_counts': group_counts})
group_df = group_df.sort_values(by='group_counts').reset_index(drop=True)



Creating Splits


In [9]:
number_of_folds = 5

In [10]:
# split the data into x number of chuncks for x folds. allocate the chuncks to train, test, valid folds in unique ways
# across folds
chuncksets_dict={}
for fold in range(number_of_folds):
    chuncksets_dict[f"chuncksets_{fold}"]=list(range(fold,len(group_df),number_of_folds))
    
val_chuncks = list(range(0,number_of_folds))
print("val_chuncks:",val_chuncks)
test_chuncks = list(range(1,number_of_folds))+[0]
print("test_chuncks:",test_chuncks)

group_fold_df=pd.DataFrame(index=np.arange(len(group_df)))
for fold in range(number_of_folds):
    group_fold_df[f"fold{fold}"]='train'
    group_fold_df[f"fold{fold}"][chuncksets_dict[f"chuncksets_{val_chuncks[fold]}"]] = 'valid'
    group_fold_df[f"fold{fold}"][chuncksets_dict[f"chuncksets_{test_chuncks[fold]}"]] = 'test'
print(group_fold_df)

for fold in range(number_of_folds):
    print("fold:",fold)
    len(group_fold_df[group_fold_df[f"fold{fold}"]=="train"])/len(group_fold_df)

    len(group_fold_df[group_fold_df[f"fold{fold}"]=="test"])/len(group_fold_df)

    len(group_fold_df[group_fold_df[f"fold{fold}"]=="valid"])/len(group_fold_df)



val_chuncks: [0, 1, 2, 3, 4]
test_chuncks: [1, 2, 3, 4, 0]
        fold0  fold1  fold2  fold3  fold4
0       valid  train  train  train   test
1        test  valid  train  train  train
2       train   test  valid  train  train
3       train  train   test  valid  train
4       train  train  train   test  valid
...       ...    ...    ...    ...    ...
213354  train  train  train   test  valid
213355  valid  train  train  train   test
213356   test  valid  train  train  train
213357  train   test  valid  train  train
213358  train  train   test  valid  train

[213359 rows x 5 columns]
fold: 0


0.5999981252255588

0.2000009373872206

0.2000009373872206

fold: 1


0.5999981252255588

0.2000009373872206

0.2000009373872206

fold: 2


0.5999981252255588

0.2000009373872206

0.2000009373872206

fold: 3


0.6000028121616618

0.1999962504511176

0.2000009373872206

fold: 4


0.6000028121616618

0.2000009373872206

0.1999962504511176

In [31]:
peak_regions_slice_df
peak_regions_df.iloc[peak_indices,0:10]

Unnamed: 0,chr,start,end,1,2,3,4,5,6,summit,pos
1334,chr7,75228599,75230777,.,1.0,.,36.85567,-1.0,5.19781,152,75229688
9,chr2,110250636,110252814,.,1.0,.,26.82164,-1.0,5.19781,152,110251725
37,chrX,32867572,32869750,.,1.0,.,29.43084,-1.0,5.19781,152,32868661
627,chr16,16250525,16252703,.,1.0,.,33.14505,-1.0,5.19781,152,16251614
1326,chrX,27521799,27523977,.,1.0,.,24.75781,-1.0,5.19781,152,27522888
...,...,...,...,...,...,...,...,...,...,...,...
57136,chr20,22523198,22525376,.,1.0,.,285.81033,-1.0,5.19781,172,22524287
56949,chr20,22523404,22525582,.,1.0,.,24.30614,-1.0,5.19781,378,22524493
57418,chr20,22524110,22526288,.,1.0,.,319.52598,-1.0,5.19781,150,22525199
52029,chr2,161158933,161161111,.,1.0,.,42.72515,-1.0,5.19781,55,161160022


Unnamed: 0,chr,start,end,1,2,3,4,5,6,summit
1334,chr7,75229536,75229840,.,1.0,.,36.85567,-1.0,5.19781,152
9,chr2,110251573,110251877,.,1.0,.,26.82164,-1.0,5.19781,152
37,chrX,32868509,32868813,.,1.0,.,29.43084,-1.0,5.19781,152
627,chr16,16251462,16251766,.,1.0,.,33.14505,-1.0,5.19781,152
1326,chrX,27522736,27523040,.,1.0,.,24.75781,-1.0,5.19781,152
...,...,...,...,...,...,...,...,...,...,...
57136,chr20,22524115,22524583,.,1.0,.,285.81033,-1.0,5.19781,172
56949,chr20,22524115,22524583,.,1.0,.,24.30614,-1.0,5.19781,378
57418,chr20,22525049,22525399,.,1.0,.,319.52598,-1.0,5.19781,150
52029,chr2,161159967,161160194,.,1.0,.,42.72515,-1.0,5.19781,55


In [32]:
75229688-75228599
75229536+152

1089

75229688

In [42]:
# group_fold_dict = {}
# for fold in range(number_of_folds):
#     group_fold_dict[f"fold{fold}"]=[]

# count = 0
# valid_used = []



# for index,row in group_df.iterrows():
#     if index % 10000 == 0:
#         print(index)
#     if count % 2 == 0:
#         test_or_valid = 'valid'
#     else:
#         test_or_valid = 'test'
#     test_or_valid_fold = random.choice([i for i in range(number_of_folds) if i not in valid_used])
#     for fold in range(number_of_folds):
#         if fold != test_or_valid_fold:
#             group_fold_dict[f"fold{fold}"].append('train')
#         else:
#             group_fold_dict[f"fold{fold}"].append(test_or_valid)
#     count += 1
#     valid_used.append(test_or_valid_fold)
#     if len(valid_used) == number_of_folds:
#         valid_used = []


for fold in range(number_of_folds):
    group_df['fold' + str(fold)] = group_fold_df['fold' + str(fold)]
    
group_df
output_path ="."
print("Saving Splits")
for fold in range(number_of_folds):
    print("fold:",fold)
    for split in ['valid','train','test']:
        temp_lst = [group_dict.get(key) for key in group_df['groups'][group_df[f"fold{fold}"]==split]] 
        peak_indices = [i['ind'] for b in map(lambda x:[x] if not isinstance(x, list) else x, temp_lst) for i in b if i['region']=='peak']
        nonpeak_indices = [i['ind'] for b in map(lambda x:[x] if not isinstance(x, list) else x, temp_lst) for i in b if i['region']=='nonpeak']
        print("split:",split)
        print("proportion of peaks:",len(peak_indices)/len(peak_regions_df))
        print("proportion of nonpeaks:",len(nonpeak_indices)/len(nonpeak_regions_df))
        
        # f = open(f"{output_path}/loci_{split}_indices_fold{fold}.txt", "w")
        # for items in peak_indices:
        #     f.writelines(str(items)+'\n')
        # f.close()
        # f = open(f"{output_path}/background_{split}_indices_fold{fold}.txt", "w")
        # for items in nonpeak_indices:
        #     f.writelines(str(items)+'\n')
        # f.close()
        nonpeak_regions_slice_df = nonpeak_regions_df.iloc[nonpeak_indices,0:10]
        nonpeak_regions_slice_df['pos']=nonpeak_regions_slice_df['start']+nonpeak_regions_slice_df['summit']
        nonpeak_regions_slice_df['start']=nonpeak_regions_slice_df['pos']- int(inputlen//2) - int(max_jitter)
        nonpeak_regions_slice_df['end']=nonpeak_regions_slice_df['pos']+ int(inputlen//2) + int(max_jitter)
        nonpeak_regions_slice_df = nonpeak_regions_slice_df[nonpeak_regions_slice_df['start']>=0]
        
        peak_regions_slice_df = peak_regions_df.iloc[peak_indices,0:10]
        peak_regions_slice_df['pos']=peak_regions_slice_df['start']+peak_regions_slice_df['summit']
        peak_regions_slice_df['start']=peak_regions_slice_df['pos']- int(inputlen//2) - int(max_jitter)
        peak_regions_slice_df['end']=peak_regions_slice_df['pos']+ int(inputlen//2) + int(max_jitter)
        peak_regions_slice_df = peak_regions_slice_df[peak_regions_slice_df['start']>=0]
        
        nonpeak_regions_slice_df.iloc[:,0:10].to_csv(f"{output_path}/background_peaks_{split}_fold{fold}.bed",sep="\t",header=False,index=False)
        peak_regions_slice_df.iloc[:,0:10].to_csv(f"{output_path}/peaks_{split}_fold{fold}.bed",sep="\t",header=False,index=False)
    print("\n")

from plotnine import *
group_df["log_groupcounts"]=np.log10(group_df["group_counts"]+1)
for fold in range(number_of_folds):
    print("fold:",fold)
    plot = (ggplot(group_df,aes("log_groupcounts",color=f"fold{fold}"))
                    +stat_ecdf()
                    +theme_classic()
           )
    plot.save(f'fold{fold}_counts_histogram_plot.png')

Unnamed: 0,groups,group_counts,fold0,fold1,fold2,fold3,fold4,log_groupcounts
0,164291,0.0,valid,train,train,train,test,0.000000
1,127077,0.0,test,valid,train,train,train,0.000000
2,150211,0.0,train,test,valid,train,train,0.000000
3,205607,0.0,train,train,test,valid,train,0.000000
4,205606,0.0,train,train,train,test,valid,0.000000
...,...,...,...,...,...,...,...,...
213354,45877,5272.0,train,train,train,test,valid,3.722058
213355,101000,5402.0,valid,train,train,train,test,3.732635
213356,46663,5468.0,test,valid,train,train,train,3.737908
213357,76680,7740.0,train,test,valid,train,train,3.888797


Saving Splits
fold: 0
split: valid
proportion of peaks: 0.19959687839164814
proportion of nonpeaks: 0.2004384894223639
split: train
proportion of peaks: 0.5998415077437249
proportion of nonpeaks: 0.5994500458295142
split: test
proportion of peaks: 0.20056161386462695
proportion of nonpeaks: 0.20011146474812191


fold: 1
split: valid
proportion of peaks: 0.20056161386462695
proportion of nonpeaks: 0.20011146474812191
split: train
proportion of peaks: 0.5994625045221975
proportion of nonpeaks: 0.600127125084635
split: test
proportion of peaks: 0.19997588161317553
proportion of nonpeaks: 0.1997614101672432


fold: 2
split: valid
proportion of peaks: 0.19997588161317553
proportion of nonpeaks: 0.1997614101672432
split: train
proportion of peaks: 0.6005306045101383
proportion of nonpeaks: 0.600140943028617
split: test
proportion of peaks: 0.19949351387668612
proportion of nonpeaks: 0.20009764680413986


fold: 3
split: valid
proportion of peaks: 0.19949351387668612
proportion of nonpeaks: 0.



fold: 1




fold: 2




fold: 3




fold: 4




In [None]:
group_df

In [20]:
%%bash
bedtools intersect -a /mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_valid_fold0.bed -b /mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_valid_fold0.bed


11586
34819
11642
43517
130146
43446


In [43]:

%%bash
bedtools intersect -a /mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_test_fold0.bed -b /mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_valid_fold0.bed



In [44]:
%%bash
for fold in {0..4}
do
echo "fold${fold}"
bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_test_fold${fold}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_train_fold${fold}.bed"
bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_test_fold${fold}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_train_fold${fold}.bed"
bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_test_fold${fold}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_valid_fold${fold}.bed"
bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_test_fold${fold}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_valid_fold${fold}.bed"
bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_train_fold${fold}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_valid_fold${fold}.bed"
bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_train_fold${fold}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_valid_fold${fold}.bed"
done


fold0
fold1
fold2
fold3
fold4


In [73]:
%%bash
for ((foldA = 0,foldB = 1 ; foldA <= 4,foldB <= 5 ; foldA++, foldB++))
do
    if [[ "${foldB}" == "5" ]]; then
        echo "${foldA},0"
        bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_train_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_train_fold0.bed"
    else
        echo "${foldA},${foldB}"
        bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_train_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_train_fold${foldB}.bed"
    fi
done

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [78]:
%%bash
for ((foldA = 0,foldB = 1 ; foldA <= 4,foldB <= 5 ; foldA++, foldB++))
do
    if [[ "${foldB}" == "5" ]]; then
        echo "${foldA},0"
        bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_train_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_train_fold0.bed"
    else
        echo "${foldA},${foldB}"
        bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_train_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_train_fold${foldB}.bed"
    fi
done

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [50]:
%%bash
for ((foldA = 0; foldA <= 4; foldA++))
do
for ((foldB = 0; foldB <= 4; foldB++))
do
    if [[ "${foldA}" == "${foldB}" ]]; then
        echo ""
    else
        echo "${foldA},${foldB}"
        bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_valid_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_valid_fold${foldB}.bed"
        bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_valid_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_valid_fold${foldB}.bed"
        bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_test_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_test_fold${foldB}.bed"
        bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_test_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_test_fold${foldB}.bed"
    fi
done
done


0,1
0,2
0,3
0,4
1,0

1,2
1,3
1,4
2,0
2,1

2,3
2,4
3,0
3,1
3,2

3,4
4,0
4,1
4,2
4,3



In [75]:
# %%bash
# for ((foldA = 0,foldB = 1 ; foldA <= 4,foldB <= 5 ; foldA++, foldB++))
# do
#     if [[ "${foldB}" == "5" ]]; then
#         echo "${foldA},0"
#         bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_valid_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_valid_fold0.bed"
#     else
#         echo "${foldA},${foldB}"
#         bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_valid_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_valid_fold${foldB}.bed"
#     fi
# done

0,1
1,2
2,3
3,4
4,0


In [76]:
# %%bash
# for ((foldA = 0,foldB = 1 ; foldA <= 4,foldB <= 5 ; foldA++, foldB++))
# do
#     if [[ "${foldB}" == "5" ]]; then
#         echo "${foldA},0"
#         bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_test_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_test_fold0.bed"
#     else
#         echo "${foldA},${foldB}"
#         bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_test_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/background_peaks_test_fold${foldB}.bed"
#     fi
# done

0,1
1,2
2,3
3,4
4,0


In [77]:
# %%bash
# for ((foldA = 0,foldB = 1 ; foldA <= 4,foldB <= 5 ; foldA++, foldB++))
# do
#     if [[ "${foldB}" == "5" ]]; then
#         echo "${foldA},0"
#         bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_test_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_test_fold0.bed"
#     else
#         echo "${foldA},${foldB}"
#         bedtools intersect -a "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_test_fold${foldA}.bed" -b "/mnt/lab_data2/vir/tf_chr_atlas/scripts/tf-atlas-pipeline/anvil/modeling/peaks_test_fold${foldB}.bed"
#     fi
# done

0,1
1,2
2,3
3,4
4,0


# Fake peak-wise split based on chromosome

In [39]:
from itertools import count
import json
import argparse
import os
import random
import pandas as pd
import numpy as np
import pyBigWig

NARROWPEAK_SCHEMA = ["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"]

peak_regions_df = pd.read_csv("/mnt/lab_data2/vir/tf_chr_atlas/temp/peak_wise/Hughes_GR_models_ZNF134/peaks_inliers.bed.gz", sep='\t', names=NARROWPEAK_SCHEMA)
peak_regions_df['region']='peak'
peak_regions_df['ind']=range(len(peak_regions_df))
nonpeak_regions_df = pd.read_csv("/mnt/lab_data2/vir/tf_chr_atlas/temp/peak_wise/Hughes_GR_models_ZNF134/gc_neg_only.bed.gz", sep='\t', names=NARROWPEAK_SCHEMA)
nonpeak_regions_df['region']='nonpeak'
nonpeak_regions_df['ind']=range(len(nonpeak_regions_df))


all_regions_df = pd.concat([peak_regions_df,nonpeak_regions_df])
all_regions_df['pos']=all_regions_df['start']+all_regions_df['summit']
all_regions_df.sort_values(by=['chr', 'pos'], inplace=True)
all_regions_df=all_regions_df.reset_index(drop=True)

In [40]:
splits = {"train": ["chr11", "chr13", "chrM", "chr22", "chr2", "chr5", "chr19", "chr3", "chr7", "chr14", "chr18", "chr20", "chr16", "chr21", "chr17", "chrX", "chrY", "chr4", "chr6", "chr15", "chr12", "chr9"], "valid": ["chr10", "chr8"], "test": ["chr1"]}
for split_key in splits:
    print(split_key,splits[split_key])
peak_regions_df['chr']

train ['chr11', 'chr13', 'chrM', 'chr22', 'chr2', 'chr5', 'chr19', 'chr3', 'chr7', 'chr14', 'chr18', 'chr20', 'chr16', 'chr21', 'chr17', 'chrX', 'chrY', 'chr4', 'chr6', 'chr15', 'chr12', 'chr9']
valid ['chr10', 'chr8']
test ['chr1']


0       chr12
1       chr14
2        chr4
3       chr16
4       chr11
        ...  
2788    chr12
2789    chr15
2790     chrX
2791     chr6
2792     chr5
Name: chr, Length: 2793, dtype: object

In [42]:
fold = 0
for split in splits:
    print(split_key,splits[split])
    output_path="/mnt/lab_data2/vir/tf_chr_atlas/temp/peak_wise/Hughes_GR_models_ZNF134/fake_peak_wise_actually_chrom_wise"
    f = open(f"{output_path}/loci_{split}_indices_fold{fold}.txt", "w")
    peak_indices = list(np.where(peak_regions_df['chr'].isin(splits[split]))[0])
    nonpeak_indices = list(np.where(nonpeak_regions_df['chr'].isin(splits[split]))[0])
    for items in peak_indices:
        f.writelines(str(items)+'\n')
    f.close()
    f = open(f"{output_path}/background_{split}_indices_fold{fold}.txt", "w")
    for items in nonpeak_indices:
        f.writelines(str(items)+'\n')
    f.close()
    nonpeak_regions_df.iloc[nonpeak_indices,0:10].to_csv(f"{output_path}/background_peaks_{split}_fold{fold}.bed",sep="\t",header=False,index=False)
    peak_regions_df.iloc[peak_indices,0:10].to_csv(f"{output_path}/peaks_{split}_fold{fold}.bed",sep="\t",header=False,index=False)



test ['chr11', 'chr13', 'chrM', 'chr22', 'chr2', 'chr5', 'chr19', 'chr3', 'chr7', 'chr14', 'chr18', 'chr20', 'chr16', 'chr21', 'chr17', 'chrX', 'chrY', 'chr4', 'chr6', 'chr15', 'chr12', 'chr9']
test ['chr10', 'chr8']
test ['chr1']
