This script describes how we filtered candidate pcSNPs from previous GWAS results of T2D. We defined two types of pcSNP:
+ pcSNPs that function through regulating the expression of pdTFs;
+ pcSNPs that function through regulating the chromatin accessibility of pdCREs.

We filtered these two types of pcSNPs and their downstream regulators accordingly.

**Note:**

Before running the following scripts, you need to download genomic annotations of the significant GWAS SNPs from the UCSC genome browser or biomart. Here, we provide a processed file "GWAS_T2D_hg19_UCSC.csv" in the "data" folder.

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
from pyfaidx import Fasta
from verstack import stratified_continuous_split
import matplotlib.pyplot as plt
import time
import math

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
import scipy
from scipy import stats
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset
from torchvision import datasets
from torcheval.metrics.functional import multiclass_f1_score
from torch_geometric.nn import GCNConv

import h5py
import seaborn as sns
import os
os.chdir('/nfs/public/xixi/scRegulate/T2D')

from typing import Tuple

In [2]:
df_x = pd.read_csv('./data/beta.atac.aggregate_30cells.csv', index_col=0).transpose()
df_y = pd.read_csv('./data/beta.rna.aggregate_30cells.csv', index_col=0).transpose()
df_peaks = pd.DataFrame(df_x.columns)[0].str.split('-',expand=True) 
df_peaks = df_peaks.rename(columns={0: "chrom", 1: "start", 2: "end"})
df_peaks["start"] = pd.to_numeric(df_peaks["start"])
df_peaks["end"] = pd.to_numeric(df_peaks["end"])
geneanno = pd.read_csv('../../ref_genome/hg19_geneanno.txt', sep='\t')
geneanno = geneanno.drop_duplicates(subset=['Gene name'])
motif_files = os.listdir('../../ref_genome/JASPAR_motifs_pfm_homosapiens/pfm.np')
tfs_kept = []
tf_by_region_mat = []
for i in list(motif_files):
    tf = i.split('.')[-2]#.capitalize()
    if tf in df_y.columns:
        if tf in tfs_kept:
            continue
            
        tfs_kept.append(tf)
tfs_kept = sorted(tfs_kept)
h5f = h5py.File('./predict_status/data_T2D_float16.h5', 'r')
#X = h5f['X'][:]
expr = h5f['expr'][:]
num_peaks = h5f['num_peaks'][:]
peaks = pd.DataFrame(np.array([item.decode('utf-8') for j in h5f['peaks'][:] for item in j]).reshape(-1, 3))
h5f.close()
cut = [0]
s = 0
for i in num_peaks:
    s = s+i
    cut.append(s)
y = pd.read_csv('./data/beta.label.aggregate_30cells.csv', index_col=0)

y = y['status'].to_list()
y_new = y
values_to_replace = ['Non-diabetic', 'Pre-T2D', 'T2D']
for value in values_to_replace:
    y_new = np.where(y == value, 0, y_new)
y_new = y_new.reshape((len(y), 1))
enc = OneHotEncoder(handle_unknown='ignore')
y_oht = enc.fit_transform(y_new).toarray()
expr_train, expr_test, y_oht_train, y_oht_test, y_label_train, y_label_test = train_test_split(expr, y_oht, y, test_size=0.3,
                                                                                                    random_state=2024, stratify=y)
expr_test, expr_val, y_oht_test, y_oht_val, y_label_test, y_label_val = train_test_split(expr_test, y_oht_test, y_label_test, 
                                                                                                test_size=0.5, random_state=2024, 
                                                                                            stratify=y_label_test)

class TensorDataset(Dataset[Tuple[Tensor, ...]]):
    r"""Dataset wrapping tensors.

    Each sample will be retrieved by indexing tensors along the first dimension.

    Args:
        *tensors (Tensor): tensors that have the same size of the first dimension.
    """
    tensors: Tuple[Tensor, ...]

    def __init__(self, *tensors: Tensor) -> None:
        assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors), "Size mismatch between tensors"
        self.tensors = tensors

    def __getitem__(self, index):
        return tuple(tensor[index] for tensor in self.tensors)

    def __len__(self):
        return self.tensors[0].size(0)

class subNet(nn.Module):
    def __init__(self, num_peak, num_tf):
        super(subNet, self).__init__()
        self.num_peak = num_peak
        self.num_tf = num_tf

        self.fc1 = nn.Linear(self.num_peak * self.num_tf, 1)
        self.fc1_activate = nn.ReLU()
        self.abs = nn.ReLU()
        
    def forward(self, x):
        x = x.reshape(x.shape[0], -1)
        x = self.fc1(x)
        return x

class Net(nn.Module):
    def __init__(self, num_genes, num_peaks, num_tf, cut):
        super(Net, self).__init__()
        self.num_peaks = num_peaks
        self.num_tf = num_tf
        self.num_genes = num_genes
        self.cut = cut
        self.gene_dim = 2
               
        self.subnet_modules = nn.ModuleList()
        for i in range(num_genes):
            num_peak = self.num_peaks[i]
            self.subnet = subNet(num_peak, self.num_tf)
            self.subnet_modules.append(self.subnet)
                    
        self.cat_activate = nn.ReLU()
        self.conv = GCNConv(1, self.gene_dim, add_self_loops=False)
        #self.fc1 = nn.Linear(self.num_genes, 100)
        self.conv_activate = nn.ReLU()
        self.out = nn.Linear(self.num_genes*self.gene_dim, 3)
        
        #self.initialize_parameters()
        
    def initialize_parameters(self):
        weight = self.conv.lin.weight
        bias = self.conv.bias
        nn.init.kaiming_uniform_(weight, a=math.sqrt(5))
        fan_in, _ = nn.init._calculate_fan_in_and_fan_out(weight)
        #torch.nn.init.xavier_uniform_(weight)
        if bias is not None:
            bound = 1 / math.sqrt(fan_in)
            nn.init.uniform_(bias, -bound, bound)

    def forward(self, x):
        x_cat = torch.zeros(x.shape[0], 0).to(device)
        for i in range(len(self.subnet_modules)):
            x_sub = x[:, :, self.cut[i]:self.cut[i+1]]
            x_sub = self.subnet_modules[i](x_sub)
            x_cat = torch.cat((x_cat, x_sub), dim=1)

        x_cat = self.cat_activate(x_cat)
        x = torch.unsqueeze(x_cat, 2)
        #x = F.dropout(x, p=0.3)
        x = self.conv_activate(self.conv(x, edge))
        #x = F.dropout(x, p=0.3)
        x = x.reshape(x.shape[0], -1)
        out = self.out(x)
        return x_cat, x, out

def train(model, device, train_loader, optimizer, epoch):
    model.train()
    #train_loss = 0
    for batch_idx, (data, expr, target) in enumerate(train_loader):
        data, expr, target = data.float(), expr.float(), target.float()
        optimizer.zero_grad()
        expr_hat, cluster_repr, output = model(data)
        loss_out = out_criterion(output, target)
        #loss_pseudo = expr_criterion(output[:, -1], target[:, -1])
        loss_expr = expr_criterion(expr_hat, expr)
        loss = loss_out #+ 0.7*loss_pseudo
        #loss = loss_pseudo
        loss.backward()
        optimizer.step()
        #model.fc1.weight.data = model.fc1.weight.mul(torch.repeat_interleave(mask.to(device), 4, dim=0))
#         if batch_idx % batchsize == 0:
#             print('\nTrain Epoch: {} [{}/{} ({:.0f}%)], Expr loss: {:.6f}, Cluster loss: {:.6f}'.
#                   format(
#                 epoch, batch_idx * len(data), len(train_loader.dataset),
#                 100. * batch_idx / len(train_loader), loss_expr.item(), loss_out.item()))
        #return(train_loss)

                  
def test(model, device, test_loader, num_clusters):
    model.eval()
    with torch.no_grad():
        outputs = torch.zeros(0, num_clusters).to(device)
        targets = torch.zeros(0).to(device)
        outputs_pseudo = torch.zeros(0).to(device)
        targets_pseudo = torch.zeros(0).to(device)
        test_loss = 0
        for data, expr, target in test_loader:
            data, expr, target = data.float(), expr.float(), target.float()
            expr_hat, cluster_repr, output = model(data)
            
            loss_out = out_criterion(output, target)
            #loss_pseudo = expr_criterion(output[:, -1], target[:, -1])
            loss_expr = expr_criterion(expr_hat, expr)
            loss = loss_out
            #loss = loss_pseudo
            test_loss = test_loss+loss.item()

            target_cluster = target.argmax(dim=1)
            output_cluster = output.softmax(dim=1)
            outputs = torch.cat((outputs, output_cluster), dim=0)
            targets = torch.cat((targets, target_cluster), dim=0)
            #target_pseudo = target[:, -1]
            #output_pseudo = output[:, -1]
            #outputs_pseudo = torch.cat((outputs_pseudo, output_pseudo), dim=0)
            #targets_pseudo = torch.cat((targets_pseudo, target_pseudo), dim=0)
        f1_score = multiclass_f1_score(outputs, targets, num_classes=num_clusters)
        #pearsonr, _ = stats.pearsonr(targets_pseudo.detach().cpu().numpy(), outputs_pseudo.detach().cpu().numpy())

    return(f1_score, test_loss)

def correlation_score(y_true, y_pred):
    #print(np.corrcoef(y_true, y_pred))
    return np.corrcoef(y_true, y_pred)[1, 0]

def spearman_correlation(y_true, y_pred):
    statistic, pvalue = stats.spearmanr(y_true, y_pred)
    return abs(statistic)

def pearson_correlation(y_true, y_pred):
    statistic, pvalue = stats.pearsonr(y_true, y_pred)
    return abs(statistic[0])

# pcSNP -> pdTF

In [3]:
results = np.load('./predict_status/interpret/input_output_TF_0.9quantile.GCN.string.npy')
import scipy.stats as ss
rank_all = []
for rep in range(10):
    #top = int(0.1*len(tfs_kept))
    cells = results[rep, :, :, :]
    rank = ss.rankdata(np.array([np.sum(abs(cells[:, i, :])) for i in range(len(tfs_kept))])) #rank from the smallest to the biggest
    rank_all.append(rank)

rank_all = np.array(rank_all)
top = 20
top_tfs_ind = np.argsort(np.mean(rank_all, axis=0))[::-1][:top]  # rank from the biggest to the smallest
print([tfs_kept[i] for i in top_tfs_ind])
top_over = [tfs_kept[i] for i in top_tfs_ind]

results = np.load('./predict_status/interpret/input_output_TF*0.GCN.string.npy')
import scipy.stats as ss
rank_all = []
for rep in range(10):
    #top = int(0.1*len(tfs_kept))
    cells = results[rep, :, :, :]
    rank = ss.rankdata(np.array([np.sum(abs(cells[:, i, :])) for i in range(len(tfs_kept))])) #rank from the smallest to the biggest
    rank_all.append(rank)

rank_all = np.array(rank_all)
top = 20
top_tfs_ind0 = np.argsort(np.mean(rank_all, axis=0))[::-1][:top]  # rank from the biggest to the smallest
print([tfs_kept[i] for i in top_tfs_ind0])
top_knockout = [tfs_kept[i] for i in top_tfs_ind0]

['GLIS3', 'RORA', 'RFX3', 'MEIS2', 'FOS', 'HES1', 'FOXP2', 'MLXIPL', 'ETV1', 'NR1D2', 'TRPS1', 'HNF4A', 'PBX3', 'BACH2', 'BACH1', 'TCF4', 'JUND', 'TEAD1', 'KLF6', 'ZKSCAN1']
['GLIS3', 'RFX3', 'RORA', 'MEIS2', 'FOXP2', 'ETV1', 'NR1D2', 'HES1', 'TRPS1', 'BACH1', 'FOS', 'MLXIPL', 'BACH2', 'TCF4', 'PBX3', 'TEAD1', 'ETV5', 'CUX1', 'KLF6', 'FOXO3']


In [4]:
tfs_selected = list(set(top_over + top_knockout))
print(tfs_selected)

['FOXO3', 'RFX3', 'HNF4A', 'FOXP2', 'PBX3', 'BACH2', 'JUND', 'FOS', 'ZKSCAN1', 'MEIS2', 'ETV1', 'TCF4', 'TEAD1', 'CUX1', 'RORA', 'ETV5', 'TRPS1', 'HES1', 'KLF6', 'MLXIPL', 'NR1D2', 'BACH1', 'GLIS3']


In [5]:
len(tfs_selected)

23

In [6]:
geneanno = pd.read_csv('../../ref_genome/hg19_geneanno.txt', sep='\t')
geneanno = geneanno.drop_duplicates(subset=['Gene name'])
geneanno = geneanno[geneanno['Gene name'].isin(tfs_selected)]
geneanno

Unnamed: 0,Gene stable ID,Transcript stable ID,Chromosome/scaffold name,Gene start (bp),Gene end (bp),Strand,Gene name,Transcription start site (TSS)
13069,ENSG00000067082,ENST00000461124,10,3818188,3827473,-1,KLF6,3822409
24038,ENSG00000187079,ENST00000527636,11,12695969,12966298,1,TEAD1,12696138
49405,ENSG00000170345,ENST00000303562,14,75745477,75748933,1,FOS,75745477
52928,ENSG00000134138,ENST00000338564,15,37181406,37393504,-1,MEIS2,37393406
55241,ENSG00000069667,ENST00000335670,15,60780483,61521518,-1,RORA,61521518
83979,ENSG00000196628,ENST00000356073,18,52889562,53332018,-1,TCF4,53255860
89675,ENSG00000130522,ENST00000600972,19,18390563,18392432,-1,JUND,18391739
113343,ENSG00000101076,ENST00000316673,20,42984340,43061485,1,HNF4A,42984340
114053,ENSG00000156273,ENST00000548219,21,30566392,31003071,1,BACH1,30566392
123007,ENSG00000244405,ENST00000306376,3,185764097,185828107,-1,ETV5,185826901


In [7]:
chroms = []
starts = []
ends = []
for i in range(len(geneanno)):
    strand = geneanno['Strand'].iloc[i]
    chroms.append('chr'+geneanno['Chromosome/scaffold name'].astype(str).iloc[i])
    starts.append(geneanno['Transcription start site (TSS)'].iloc[i]-250000)
    ends.append(geneanno['Transcription start site (TSS)'].iloc[i]+250000)
df_generegions = pd.DataFrame({'gene': geneanno['Gene name'],
                             'chrom': chroms,
                             'start': starts,
                             'end': ends,
                              'strand': geneanno['Strand'],
                             'TSS': geneanno['Transcription start site (TSS)']})
df_generegions = df_generegions.reset_index(drop=True)
df_generegions

Unnamed: 0,gene,chrom,start,end,strand,TSS
0,KLF6,chr10,3572409,4072409,-1,3822409
1,TEAD1,chr11,12446138,12946138,1,12696138
2,FOS,chr14,75495477,75995477,1,75745477
3,MEIS2,chr15,37143406,37643406,-1,37393406
4,RORA,chr15,61271518,61771518,-1,61521518
5,TCF4,chr18,53005860,53505860,-1,53255860
6,JUND,chr19,18141739,18641739,-1,18391739
7,HNF4A,chr20,42734340,43234340,1,42984340
8,BACH1,chr21,30316392,30816392,1,30566392
9,ETV5,chr3,185576901,186076901,-1,185826901


In [8]:
df_gwas_hg19 = pd.read_csv('/nfs/public/xixi/scRegulate/GWAS/GWAS_T2D_hg19_UCSC.csv')
df_gwas_hg19 = df_gwas_hg19.iloc[:, :7]
df_gwas_hg19 = df_gwas_hg19.rename(columns={df_gwas_hg19.columns[3]: 'snp_name'})
df_gwas_hg19

Unnamed: 0,chrom,chromStart,chromEnd,snp_name,ref,altCount,alts
0,chr1,183004333,183004334,rs4129858,A,1,"G,"
1,chr17,44083947,44083948,rs8067056,T,2,"C,G,"
2,chr1,51438364,51438365,rs3176466,C,1,"T,"
3,chr1,51457199,51457200,rs72906810,A,2,"C,G,"
4,chr1,51506885,51506886,rs12088739,A,1,"G,"
...,...,...,...,...,...,...,...
2436,chr4,941517,941518,rs2290402,C,1,"T,"
2437,chr5,36257017,36257018,rs16902871,A,1,"G,"
2438,chr11,64100775,64100776,rs1662185,A,1,"G,"
2439,chr3,72803589,72803590,rs9814945,C,2,"A,T,"


In [9]:
df_mapping = pd.DataFrame({'snp_name':[],
                          'gene': []})
for i in range(len(df_generegions)):
    chrom = df_generegions['chrom'][i]
    df_gwas_sub = df_gwas_hg19[df_gwas_hg19['chrom']==chrom]
    snps = []
    tfs = []
    for j in range(len(df_gwas_sub)):
        if df_gwas_sub['chromEnd'].iloc[j]>df_generegions['start'][i] and df_gwas_sub['chromEnd'].iloc[j]<df_generegions['end'][i]:
            snps.append(df_gwas_sub['snp_name'].iloc[j])
            tfs.append(df_generegions['gene'][i])
    df_temp = pd.DataFrame({'snp_name': snps, 'gene': tfs})
    df_mapping = df_mapping.append(df_temp, ignore_index=True)
df_mapping

Unnamed: 0,snp_name,gene
0,rs72926932,TCF4
1,rs28719468,TCF4
2,rs76811102,HNF4A
3,rs6017317,HNF4A
4,rs4812829,HNF4A
5,rs16988991,HNF4A
6,rs113810779,HNF4A
7,rs12625671,HNF4A
8,rs13039863,HNF4A
9,rs6103716,HNF4A


In [10]:
df_mapping.to_csv('/nfs/public/xixi/scRegulate/GWAS/GWAS_SNPs_TFs.csv', index=False)

# pcSNP -> pdCRE

In [13]:
files = os.listdir('/nfs/public/xixi/scRegulate/T2D/nn.best.feature6.learnW')
markers_filtered = []
for file in files:
    if len(file.split('.2.pt'))>1:
        marker = file.split('.2.pt')[0]
        if marker not in markers_filtered:
            markers_filtered.append(marker)

markers = pd.read_csv("/nfs/public/xixi/scRegulate/T2D/data/markers_status.txt", sep='\t')
markers = markers.groupby('cluster').apply(lambda x: x.nlargest(100, 'avg_log2FC')).reset_index(drop=True)
markers = markers['gene'].drop_duplicates().to_list()

genes_filtered = [i for i in markers_filtered if i not in tfs_kept]
genes_filtered = [i for i in genes_filtered if i in markers]
print(len(genes_filtered))

239


In [14]:
percent = 0.05
results = np.load('./predict_status/interpret/input_output_openness_all1_GCN.string.npy')
h5f = h5py.File('./predict_status/data_T2D_float16.h5', 'r')
peaks = pd.DataFrame(np.array([item.decode('utf-8') for j in h5f['peaks'][:] for item in j]).reshape(-1, 3))
h5f.close()
import scipy.stats as ss
rank_all = []
for rep in range(10):
    #top = int(0.1*len(tfs_kept))
    cells = results[rep, :, :, :]
    rank = ss.rankdata(np.array([np.sum(abs(cells[:, i, :])) for i in range(len(peaks))])) #rank from the smallest to the biggest
    rank_all.append(rank)

rank_all = np.array(rank_all)
top = int(percent*len(peaks))
top_peaks_ind = np.argsort(np.mean(rank_all, axis=0))[::-1][:top]  # rank from the biggest to the smallest

results = np.load('./predict_status/interpret/input_output_openness*0_GCN.string.npy')
#results_orig = np.load('./predict_status/interpret/input_output_openness*0_orig_GCN.string.npy')
#results_perturb = np.load('./predict_status/interpret/input_output_openness*0_perturb_GCN.string.npy')
rank_all = []
for rep in range(10):
    #top = int(0.1*len(tfs_kept))
    cells = results[rep, :, :, :]
    rank = ss.rankdata(np.array([np.sum(abs(cells[:, i, :])) for i in range(len(peaks))])) #rank from the smallest to the biggest
    rank_all.append(rank)

rank_all = np.array(rank_all)
top = int(percent*len(peaks))
top_peaks_ind0 = np.argsort(np.mean(rank_all, axis=0))[::-1][:top]  # rank from the biggest to the smallest
print(top)
print(len(set(top_peaks_ind).intersection(set(top_peaks_ind0))))
print(len(set(top_peaks_ind).intersection(set(top_peaks_ind0)))/top)

top_peaks_ind = list(top_peaks_ind)+list(top_peaks_ind0)
top_peaks_ind = list(pd.DataFrame(top_peaks_ind).drop_duplicates()[0])

marker_peaks = []
markers_ = []
_peaks = []
df_bed = pd.DataFrame()
for i in top_peaks_ind:
    for marker_ind in range(len(cut)):
        if i>=cut[marker_ind] and i<cut[marker_ind+1]:
            break
    marker = genes_filtered[marker_ind]
    markers_.append(marker)
    df_bed = df_bed.append(peaks.iloc[i, :])
    peak = str(peaks.iloc[i, 0])+':'+str(peaks.iloc[i, 1])+'-'+str(peaks.iloc[i, 2])
    
    _peaks.append(peak)
    feature = marker + '::' + peak
    marker_peaks.append(feature)
df_bed = df_bed.drop_duplicates().reset_index(drop=True)
df_bed

1259
1015
0.8061953931691819


Unnamed: 0,0,1,2
0,chr9,94664770,94665270
1,chr9,94914257,94914757
2,chr9,94671007,94671507
3,chr9,94558673,94559173
4,chr9,94660876,94661376
...,...,...,...
1404,chr5,138145582,138146082
1405,chr6,53950266,53950766
1406,chr5,137872322,137872822
1407,chr8,6473778,6474278


In [16]:
peaks_selected = marker_peaks
split_elements = [item.split("::") + item.split(":") for item in peaks_selected]
split_elements = [[item[0], item[4], item[5]] for item in split_elements]

df_peaks_selected = pd.DataFrame(split_elements, columns=["gene", "chrom", "startend"])
df_peaks_selected[['start', 'end']] = df_peaks_selected['startend'].str.split('-', expand=True)
df_peaks_selected = df_peaks_selected.drop('startend', axis=1)
df_peaks_selected['peak'] = peaks_selected
df_peaks_selected

Unnamed: 0,gene,chrom,start,end,peak
0,ROR2,chr9,94664770,94665270,ROR2::chr9:94664770-94665270
1,ROR2,chr9,94914257,94914757,ROR2::chr9:94914257-94914757
2,IARS,chr9,94914257,94914757,IARS::chr9:94914257-94914757
3,ROR2,chr9,94671007,94671507,ROR2::chr9:94671007-94671507
4,ROR2,chr9,94558673,94559173,ROR2::chr9:94558673-94559173
...,...,...,...,...,...
1498,HSPA9,chr5,138145582,138146082,HSPA9::chr5:138145582-138146082
1499,MLIP,chr6,53950266,53950766,MLIP::chr6:53950266-53950766
1500,HSPA9,chr5,137872322,137872822,HSPA9::chr5:137872322-137872822
1501,AGPAT5,chr8,6473778,6474278,AGPAT5::chr8:6473778-6474278


In [17]:
df_gwas_hg19 = pd.read_csv('/nfs/public/xixi/scRegulate/GWAS/GWAS_T2D_hg19_UCSC.csv')
df_gwas_hg19 = df_gwas_hg19.iloc[:, :7]
df_gwas_hg19 = df_gwas_hg19.rename(columns={df_gwas_hg19.columns[3]: 'snp_name'})
df_gwas_hg19

Unnamed: 0,chrom,chromStart,chromEnd,snp_name,ref,altCount,alts
0,chr1,183004333,183004334,rs4129858,A,1,"G,"
1,chr17,44083947,44083948,rs8067056,T,2,"C,G,"
2,chr1,51438364,51438365,rs3176466,C,1,"T,"
3,chr1,51457199,51457200,rs72906810,A,2,"C,G,"
4,chr1,51506885,51506886,rs12088739,A,1,"G,"
...,...,...,...,...,...,...,...
2436,chr4,941517,941518,rs2290402,C,1,"T,"
2437,chr5,36257017,36257018,rs16902871,A,1,"G,"
2438,chr11,64100775,64100776,rs1662185,A,1,"G,"
2439,chr3,72803589,72803590,rs9814945,C,2,"A,T,"


In [18]:
df_mapping = pd.DataFrame({'snp_name': [], 'peak': []})
for i in range(len(df_peaks_selected)):
    chrom = df_peaks_selected['chrom'][i]
    df_gwas_sub = df_gwas_hg19[df_gwas_hg19['chrom']==chrom]
    snps = []
    peaks = []
    for j in range(len(df_gwas_sub)):
        if df_gwas_sub['chromEnd'].iloc[j]>int(df_peaks_selected['start'][i]) and df_gwas_sub['chromEnd'].iloc[j]<int(df_peaks_selected['end'][i]):
            snps.append(df_gwas_sub['snp_name'].iloc[j])
            peaks.append(df_peaks_selected['peak'][i])
    df_temp = pd.DataFrame({'snp_name': snps, 'peak': peaks})
    df_mapping = df_mapping.append(df_temp, ignore_index=True)
df_mapping

Unnamed: 0,snp_name,peak
0,rs4655617,SGIP1::chr1:67010359-67010859
1,rs112667817,HSPA9::chr5:137822676-137823176


In [None]:
df_mapping.to_csv('/nfs/public/xixi/scRegulate/GWAS/GWAS_SNPs_cCREs.csv', index=False)