# Process List of SNPs: from HG18 to HG38

In [1]:
import numpy as np
import pandas as pd
from pyliftover import LiftOver
from tqdm import tqdm

In [2]:
dir_name = "/data/list_of_snps/"

# Load the whitespace-delimited table of SNPs into a DataFrame.
# `sep=r"\s+"` is the supported replacement for `delim_whitespace=True`,
# which is deprecated in recent pandas releases.
data = pd.read_csv(dir_name + "CARDIoGRAM_GWAS_RESULTS.txt", sep=r"\s+")
# Number of rows read from the file.
n = len(data)

In [3]:
# Build the hg18 -> hg38 coordinate converter.
# NOTE(review): pyliftover presumably needs the hg18ToHg38 chain file (downloaded
# or cached) at this point -- confirm it is available where the notebook runs.
lo = LiftOver('hg18', 'hg38')

In [4]:
# Source positions, one "chrom:position" string per row of the table.
pos_hg18s = data["chr_pos_(b36)"].values
# Output buffer of the same length, pre-filled with empty strings so that
# entries which are never assigned stay "".
pos_hg38s = [""] * len(pos_hg18s)

In [5]:
# (row index, original position) for every entry that could not be converted.
errors = list()

# NOTE(review): pyliftover documents its coordinates as 0-based; the input
# positions are passed through unchanged here -- confirm the intended convention.
for i in tqdm(range(len(pos_hg18s))):
    pos_hg18 = pos_hg18s[i]
    try:
        # Split "chrom:position" once; a malformed or non-string entry raises
        # here and is recorded as an error below.
        chrom, position = pos_hg18.split(":")[:2]
        hits = lo.convert_coordinate(chrom, int(position))
    except Exception:
        # Best-effort: record the failure and move on. Unlike a bare `except:`,
        # this lets KeyboardInterrupt / SystemExit stop the loop.
        errors.append((i, pos_hg18))
        continue

    if not hits:
        # Per the pyliftover docs: None means the chromosome is unknown, an
        # empty list means the locus has no counterpart in the target assembly.
        errors.append((i, pos_hg18))
        continue

    # Keep only the first hit, formatted back as "chrom:position".
    pos_hg38s[i] = hits[0][0] + ":" + str(hits[0][1])


100%|██████████| 2420360/2420360 [00:25<00:00, 94119.39it/s] 


In [6]:
# Show the table's schema as it is before the new column is attached.
data.info()
# Attach the converted positions as a new column; entries still equal to ""
# are positions that were never filled in.
data["chr_pos_hg38"] = pos_hg38s

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2420360 entries, 0 to 2420359
Data columns (total 12 columns):
SNP                     object
chr_pos_(b36)           object
reference_allele        object
other_allele            object
ref_allele_frequency    float64
pvalue                  float64
het_pvalue              float64
log_odds                float64
log_odds_se             float64
N_case                  int64
N_control               int64
model                   object
dtypes: float64(5), int64(2), object(5)
memory usage: 221.6+ MB


In [7]:
# Put the hg38 position column where the original b36 position column sits and
# move the b36 column to where the hg38 column was.
# The swap is done by column name and only when the hg38 column still comes
# after the b36 column, so re-running this cell does not swap them back.
header = list(data.columns)
chr_pos = "chr_pos_hg38"
chr_pos_b36 = "chr_pos_(b36)"

hg38_idx = header.index(chr_pos)
b36_idx = header.index(chr_pos_b36)
if hg38_idx > b36_idx:
    header[b36_idx], header[hg38_idx] = chr_pos, chr_pos_b36

data = data.reindex(columns=header)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2420360 entries, 0 to 2420359
Data columns (total 13 columns):
SNP                     object
chr_pos_hg38            object
reference_allele        object
other_allele            object
ref_allele_frequency    float64
pvalue                  float64
het_pvalue              float64
log_odds                float64
log_odds_se             float64
N_case                  int64
N_control               int64
model                   object
chr_pos_(b36)           object
dtypes: float64(5), int64(2), object(6)
memory usage: 240.1+ MB


In [13]:
# Write the table as a tab-separated file, including the column names and the
# row index.
output_path = dir_name + "LIST_OF_SNPS.txt"
data.to_csv(output_path, sep="\t", header=True, index=True)