# Template for loading FASTA and BIGWIG data for top-ranked (by GWAS) SNPs

In [1]:
import numpy as np
import pandas as pd
import pysam

In [2]:
# Load the list of top-ranked SNPs from a whitespace-delimited text table.
snp_dir_name = "/data/list_of_snps/"

# Option A: top 200 SNPs -- uncomment the next two lines (and comment out option B).
# data = pd.read_csv(snp_dir_name + "TOP_200_SNPS.txt", sep=r"\s+")
# saved_dir_name = dir_name = '/data/top200snps/'

# Option B: top 1000 SNPs.
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which is
# deprecated as of pandas 2.2.
data = pd.read_csv(snp_dir_name + "TOP_1000_SNPS.txt", sep=r"\s+")
# Directory matching the chosen SNP list; dir_name is bound to the same path.
saved_dir_name = dir_name = '/data/top1000snps/'

# data.info()

## Read and process from FASTA 
### Use this block of code to get the sequences while processing.

In [3]:
# Global window geometry, expressed in base pairs.
num_bp_in_peak = 3000
num_bp_padding = 5000
# Half-width of the window taken on each side of a SNP. Floor division keeps
# this an integer, since it is used in genomic coordinate arithmetic.
margin = num_bp_in_peak // 2 + num_bp_padding

# Open the reference genome FASTA.
# pysam.FastaFile is the current class name; pysam.Fastafile is its legacy alias.
fasta_file = '/data/GRCh38.p3.genome.fa'
fasta_open = pysam.FastaFile(fasta_file)

In [4]:
def one_hot_encode(seq):
    """One-hot encode a nucleotide sequence.

    Columns are ordered A, C, G, T. 'N' is encoded as an all-zero row.
    Lower-case bases (as found in soft-masked FASTA regions) are encoded
    like their upper-case equivalents instead of raising.

    Args:
        seq: string made of the characters A, C, G, T, N (any case).

    Returns:
        numpy float array of shape (len(seq), 4).

    Raises:
        KeyError: if seq contains a character other than A, C, G, T or N.
    """
    values = {'A':[1,0,0,0], 'C':[0,1,0,0], 'G':[0,0,1,0], 'T':[0,0,0,1], 'N':[0,0,0,0]}
    return_arr = np.empty((len(seq), 4))
    # Normalise case once up front so soft-masked bases map to the same rows.
    for i, base in enumerate(seq.upper()):
        return_arr[i] = values[base]
    return return_arr
    

In [5]:
def get_snp_seq(fasta_open, idx, snp_chr, snp_coor, ref_snp, alt_snp, margin_bp=None):
    """Return the genomic window around a SNP, plus the same window with the allele swapped.

    The "reference" sequence is whatever the FASTA contains. The "alternative"
    sequence replaces the base at the SNP position with the other allele:
    if the FASTA base equals ``ref_snp`` it becomes ``alt_snp``, and if it
    equals ``alt_snp`` it becomes ``ref_snp``.

    Args:
        fasta_open: object exposing ``fetch(chrom, start, end)`` that returns
            the sequence string for that interval (e.g. an open pysam FASTA).
        idx: unused; kept so existing callers keep working.
        snp_chr: chromosome / contig name passed to ``fetch``.
        snp_coor: SNP coordinate; the SNP base is fetched as
            ``[snp_coor - 1, snp_coor)``.
        ref_snp: one allele of the SNP.
        alt_snp: the other allele of the SNP.
        margin_bp: number of bases taken on each side of ``snp_coor``.
            Defaults to the module-level ``margin``.

    Returns:
        Tuple ``(snp_seq_ref, snp_seq_alt)`` of sequence strings.

    Raises:
        AssertionError: if the FASTA base at the SNP position matches
            neither ``ref_snp`` nor ``alt_snp``.
    """
    if margin_bp is None:
        margin_bp = margin

    snp = fasta_open.fetch(snp_chr, snp_coor - 1, snp_coor)

    start = int(snp_coor - margin_bp)
    end = int(snp_coor + margin_bp)

    # reference sequence
    snp_seq_ref = fasta_open.fetch(snp_chr, start, end)

    # Decide which allele to substitute. Compare case-insensitively so a
    # lower-case (soft-masked) genome base still matches its allele.
    genome_base = snp.upper()
    if genome_base == str(ref_snp).upper():
        alt = alt_snp
    elif genome_base == str(alt_snp).upper():
        alt = ref_snp
    else:
        # Raised explicitly (rather than `assert False`) so the check is not
        # stripped when Python runs with -O, and so the failure is diagnosable.
        raise AssertionError(
            "genome base %r at %s:%s matches neither allele %r nor %r"
            % (snp, snp_chr, snp_coor, ref_snp, alt_snp)
        )

    # alternative sequence: reuse the window already fetched instead of
    # hitting the FASTA two more times for the flanks.
    snp_offset = snp_coor - 1 - start
    snp_seq_before = snp_seq_ref[:snp_offset]
    snp_seq_after = snp_seq_ref[snp_offset + 1:]
    snp_seq_alt = snp_seq_before + alt + snp_seq_after

    return snp_seq_ref, snp_seq_alt

In [16]:
# Fetch and one-hot encode the reference / alternative sequence of every SNP.
valid_alleles = ('A', 'T', 'C', 'G')
index_list = data.index.values.tolist()
for i, row_label in enumerate(index_list):
    snp = data.loc[row_label, :]
    # 'chr_pos_hg38' has the form "<chromosome>:<coordinate>".
    snp_chr, snp_coor = snp['chr_pos_hg38'].split(":")[:2]
    snp_coor = int(snp_coor)
    ref_snp = snp['reference_allele']
    alt_snp = snp['other_allele']

    # Only single-nucleotide A/C/G/T alleles can be substituted into the
    # window without breaking the expected encoding shape. Report anything
    # else and skip the SNP instead of processing it anyway.
    bad_alleles = [allele for allele in (alt_snp, ref_snp) if allele not in valid_alleles]
    if bad_alleles:
        for allele in bad_alleles:
            print(allele)
        continue

    # get sequence with reference and alternative alleles
    snp_seq_ref, snp_seq_alt = get_snp_seq(fasta_open, i, snp_chr, snp_coor, ref_snp, alt_snp)

    # one_hot_encode already returns a float array, so no extra cast is needed.
    ref_encoding = one_hot_encode(snp_seq_ref)
    alt_encoding = one_hot_encode(snp_seq_alt)

    # Apply model to the sequences: ref_encoding and alt_encoding
    # encoding's shape: (13000,4)
    # ......
    

## Read from saved encoding directly
### Use this block of code to get the saved sequences in pairs.

In [3]:
# Load the encodings directly from saved .npy files.
# Files are named by the SNP's position in `data`:
#   <saved_dir_name>snp<i>_ref.npy and <saved_dir_name>snp<i>_alt.npy
#
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
# which can execute code from a malicious file. Only load files from a trusted
# source; if the arrays were saved as plain numeric arrays, allow_pickle=False
# would be safer -- confirm how they were written before changing it.

index_list = data.index.values.tolist()
for i, row_label in enumerate(index_list):
    snp = data.loc[row_label, :]
    ref_encoding = np.load(saved_dir_name + "snp" + str(i) + "_ref.npy", allow_pickle=True)
    alt_encoding = np.load(saved_dir_name + "snp" + str(i) + "_alt.npy", allow_pickle=True)

    # Apply model to the sequences: ref_encoding and alt_encoding
    # encoding's shape: (13000,4)
    # ......

In [4]:
# Sanity check: show the shapes of the last pair of encodings that were
# loaded, followed by the SNP record they belong to.
print(*(encoding.shape for encoding in (ref_encoding, alt_encoding)))
print(snp)

(13000, 4) (13000, 4)
SNP                         rs2760736
chr_pos_hg38            chr17:2133359
reference_allele                    G
other_allele                        A
ref_allele_frequency           0.2923
pvalue                       3.31e-05
het_pvalue                   0.120969
log_odds                    0.0627338
log_odds_se                 0.0151129
N_case                          20740
N_control                       61672
model                              FE
chr_pos_(b36)           chr17:1983403
Name: 2136339, dtype: object
