In [1]:
import pandas as pd
import os

In [53]:
# File paths
input_file = "./equitable-prs-cardiovascular/gwas/HF_Bothsex_afr_inv_var_meta_GBMI_052021_nbbkgt1.txt.gz"
output_dir = "./data/processed_sumstats/"
output_file = os.path.join(output_dir, "HF_Bothsex_afr_19.sumstats")

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

In [17]:
# Load data and assign column names
df = pd.read_csv(input_file, compression='gzip', sep='\t', header=0)

In [18]:
df

Unnamed: 0,#CHR,POS,REF,ALT,rsid,all_meta_AF,inv_var_meta_beta,inv_var_meta_sebeta,inv_var_meta_p,inv_var_het_p,direction,N_case,N_ctrl,n_dataset,n_bbk,is_strand_flip,is_diff_AF_gnomAD
0,1,727233,G,A,rs151190501,0.004810,1.140300,0.692990,0.09989,0.56720,+?+??,688,9281,2,2,no,no
1,1,727242,G,A,rs61769339,0.189800,-0.167700,0.121750,0.16840,0.34610,-?-??,688,9281,2,2,no,no
2,1,730869,C,T,,0.011300,0.169100,0.445660,0.70440,0.50020,+?-??,688,9281,2,2,no,no
3,1,732966,G,A,rs115048193,0.010740,-0.383350,0.385450,0.32000,0.54150,-?+??,688,9281,2,2,no,no
4,1,752895,C,T,rs569317217,0.008124,0.200930,0.465050,0.66570,0.17980,+?-??,688,9281,2,2,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19989429,23,155696715,C,T,rs116490668,0.096840,0.165720,0.134860,0.21910,0.08611,??-?+,244,9239,2,2,no,no
19989430,23,155697134,G,A,rs142669955,0.038790,0.394650,0.236710,0.09547,0.52140,??+?+,244,9239,2,2,no,no
19989431,23,155697920,G,A,rs644138,0.845500,0.070104,0.119690,0.55810,0.71490,??+?+,244,9239,2,2,no,no
19989432,23,155699751,C,T,,0.566700,-0.024151,0.081190,0.76610,0.20180,??+?-,244,9239,2,2,no,no


In [19]:
# Extract relevant columns and rename them
df["Z"] = df["inv_var_meta_beta"] / df["inv_var_meta_sebeta"]  # Compute Z-score
df["CHISQ"] = df["Z"] ** 2  # Compute CHISQ


In [20]:
# Select and rename columns
df_sumstats = df.rename(columns={
    "rsid": "SNP",
    "ALT": "A1",
    "REF": "A2"
})[["SNP", "A1", "A2", "N_ctrl", "CHISQ", "Z"]]

In [None]:
# Rename N_ctrl to N
df_sumstats = df_sumstats.rename(columns={"N_ctrl": "N"})
#19989434 rows

In [None]:
df_sumstats = df_sumstats[df_sumstats['SNP'].notna()]
#~16000000 rows

In [None]:
df_sumstats = df_sumstats[df_sumstats['A1'].apply(len) == 1]
df_sumstats = df_sumstats[df_sumstats['A2'].apply(len) == 1]
#15512642 rows

In [38]:
# df_sumstats is still much larger than we would like to work with
# Let me go pull all of the SNPs from the reference data for the chromosome we'll work with

In [50]:
import subprocess
import os

# Define the path to PLINK executable
plink_path = os.path.join("equitable-prs-cardiovascular", "src", "plink")

# Define the command to extract SNPs
command = [
    plink_path,
    "--bfile", "data/LDREF_filtered/1000G.EUR.19",  # Path to your binary files (BED/BIM/FAM)
    "--write-snplist",  # Option to output SNP list
    "--out", "data/LDREF_filtered/snp_list_19"  # Output file
]

# Run the command
subprocess.run(command, check=True)

print("PLINK SNP extraction complete.")

PLINK SNP extraction complete.


In [52]:
snp_set = set()
with open("data/LDREF_filtered/snp_list_19.snplist", "r") as f:
    for line in f:
        snp_set.add(line.strip())

print(f"Loaded {len(snp_set):,} SNPs.")
print("Example SNPs:", list(snp_set)[:5])

Loaded 22,509 SNPs.
Example SNPs: ['rs8102308', 'rs3752203', 'rs3764633', 'rs4482395', 'rs2230611']


In [None]:
df_sumstats = df_sumstats[df_sumstats['SNP'].isin(snp_set)]
#~17865 rows

In [56]:
df_sumstats

Unnamed: 0,SNP,A1,A2,N,CHISQ,Z
18048949,rs8100066,A,G,28783,0.310079,0.556847
18048953,rs8105536,G,A,15777,0.997402,0.998700
18048976,rs2312724,C,T,28783,0.312550,-0.559062
18048979,rs1020382,T,C,15777,0.720259,0.848681
18049034,rs11084928,A,G,29835,1.203963,-1.097252
...,...,...,...,...,...,...
18482450,rs1550813,T,C,15777,0.359334,0.599445
18482472,rs10448,C,T,28783,8.432388,2.903857
18482474,rs7910,T,C,15777,3.745604,1.935356
18482480,rs8106379,G,A,16829,0.320022,-0.565705


In [57]:
# Save the processed file
df_sumstats.to_csv(output_file, sep="\t", index=False)

print(f"Processed file saved to: {output_file}")

Processed file saved to: ./data/processed_sumstats/HF_Bothsex_afr_19.sumstats
