In [1]:
import os
import re

import numpy as np
import pandas as pd

# os.chdir("/home/yujia/Project/2023-07-20-individual_MR/dat/02_base_data/")

In [3]:
IGF1 = pd.read_csv("/mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/download/GCST90102623.tsv.gz", sep="\t", compression = 'gzip')

print("Total SNP in the GWAS data: ")
print(IGF1.shape)

Total SNP in the GWAS data: 
(2459688, 10)


In [4]:
IGF1

Unnamed: 0,chromosome,base_pair_location,effect_allele,other_allele,beta,standard_error,effect_allele_frequency,p_value,rs_id,n
0,7,92221824,a,c,-0.026983,0.019768,0.0585,0.184661,rs10,23232
1,12,125456933,a,g,-0.011914,0.010451,0.2108,0.268088,rs1000000,27518
2,4,21227772,t,c,0.022209,0.008525,0.4968,0.011356,rs10000010,27518
3,4,1347325,c,g,-0.011136,0.012498,0.8656,0.386379,rs10000012,27513
4,4,36901464,a,c,0.004987,0.010135,0.7704,0.632576,rs10000013,27518
...,...,...,...,...,...,...,...,...,...,...
2459683,4,4987062,t,c,-0.019480,0.017285,0.0690,0.273526,rs9999987,26052
2459684,4,123121534,a,g,-0.010892,0.020474,0.0731,0.605462,rs9999992,17604
2459685,4,98781694,a,t,0.007998,0.008545,0.5339,0.363304,rs9999993,27517
2459686,4,69817056,a,c,0.025295,0.012085,0.8544,0.042019,rs9999996,27519


In [5]:
# rename the data
IGF1.rename(columns={"chromosome":"CHR", "base_pair_location":"BP", "rs_id":"SNP", "effect_allele":"A1", "other_allele":"A2", "effect_allele_frequency":"EAF", "beta":"BETA", "standard_error":"SE", "p_value":"P", "n":"N"}, inplace=True)

# convert EAF to MAF
IGF1['MAF'] = IGF1['EAF'].apply(lambda x : x if x <= 0.5 else 1-x)

# Uppercase the A1 and A2
IGF1['A1'] = IGF1['A1'].str.upper()
IGF1['A2'] = IGF1['A2'].str.upper()

In [7]:
# reorder the column
cols = ["CHR", "BP", "SNP", "A1", "A2", "N", "SE", "P", "BETA", "MAF", "EAF"]
IGF1 = IGF1[cols]

# Remove NaN row
IGF1 = IGF1.dropna()

# convert to correct column type
IGF1 = IGF1.astype({'CHR':'int', 'BP':'int', "SNP":'str', 'A1':'str', 'A2':'str', 'N':'int', 'SE':'float', 'P':'float', 'BETA':'float', 'MAF':'float', 'EAF':'float'})

# remove SNP without SNPID
IGF1 = IGF1[IGF1['SNP'].str.startswith('rs')]

# save the formatted results
IGF1.to_csv("/mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/QC/IGF1_formated_nadrop.tsv.gz", sep="\t", index=False)

print("SNP left after basic QC: ")
print(IGF1.shape)

SNP left after basic QC: 
(2459688, 11)


In [8]:
IGF1

Unnamed: 0,CHR,BP,SNP,A1,A2,N,SE,P,BETA,MAF,EAF
0,7,92221824,rs10,A,C,23232,0.019768,0.184661,-0.026983,0.0585,0.0585
1,12,125456933,rs1000000,A,G,27518,0.010451,0.268088,-0.011914,0.2108,0.2108
2,4,21227772,rs10000010,T,C,27518,0.008525,0.011356,0.022209,0.4968,0.4968
3,4,1347325,rs10000012,C,G,27513,0.012498,0.386379,-0.011136,0.1344,0.8656
4,4,36901464,rs10000013,A,C,27518,0.010135,0.632576,0.004987,0.2296,0.7704
...,...,...,...,...,...,...,...,...,...,...,...
2459683,4,4987062,rs9999987,T,C,26052,0.017285,0.273526,-0.019480,0.0690,0.0690
2459684,4,123121534,rs9999992,A,G,17604,0.020474,0.605462,-0.010892,0.0731,0.0731
2459685,4,98781694,rs9999993,A,T,27517,0.008545,0.363304,0.007998,0.4661,0.5339
2459686,4,69817056,rs9999996,A,C,27519,0.012085,0.042019,0.025295,0.1456,0.8544


In [9]:
# =============================
# Step 2: remove SNPs with MAF < 0.01
# =============================

cmd = "gunzip -c /mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/QC/IGF1_formated_nadrop.tsv.gz |" + \
      "awk 'NR==1 || ($10 > 0.01) {print}' |" + \
      "gzip > /mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/QC/IGF1.standardGWASQC.gz"
os.system(cmd)

cmd = "zcat /mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/QC/IGF1.standardGWASQC.gz | wc -l"
print("SNP left after removing SNPs with MAF < 0.01: ")
print(os.popen(cmd).read())

SNP left after removing SNPs with MAF < 0.01: 
2459689



In [10]:
# =============================
# Step 3: remove duplicate SNPS
# =============================

cmd = "gunzip -c /mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/QC/IGF1.standardGWASQC.gz |" + \
      "awk '{seen[$3]++; if(seen[$3]==1){ print}}' |" + \
      "gzip - > /mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/QC/IGF1.nodup.gz"
os.system(cmd)

cmd = "zcat /mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/QC/IGF1.nodup.gz | wc -l"
print("SNP left after removing duplicate SNPs: ")
print(os.popen(cmd).read())

SNP left after removing duplicate SNPs: 
2459689



In [11]:
# =============================
# Step 4: keep non-ambiguous SNPS
# =============================

cmd = "gunzip -c /mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/QC/IGF1.nodup.gz | " + \
      "awk '!( ($4==\"A\" && $5==\"T\") || ($4==\"T\" && $5==\"A\") || ($4==\"G\" && $5==\"C\") || ($4==\"C\" && $5==\"G\")) {print}' | " + \
      "gzip > /mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/QC/IGF1.QC.gz"
os.system(cmd)

cmd = "zcat /mnt/md0/yujia/project/2023-07-20-individual_MR/dat/02_base_data/IGF1/QC/IGF1.QC.gz | wc -l"
print("SNP left after removing ambiguous SNPs: ")
print(os.popen(cmd).read())

SNP left after removing ambiguous SNPs: 
2080031

