In [None]:
import os
import subprocess

import dxdy  # NOTE(review): the commented-out project lookup in this notebook uses `dxpy` — confirm the module name
import pandas as pd

## QC process

In [None]:
# download command
def dx_download(project_name, file_name, dest_name):
    """Download one file from a DNAnexus project with the `dx` CLI.

    Args:
        project_name: Project identifier placed before the ':' of the remote path.
        file_name: Path of the file inside the project.
        dest_name: Local path handed to `dx download -o`.

    Raises:
        subprocess.CalledProcessError: If `dx download` exits with a non-zero status.
    """
    remote_path = f"{project_name}:{file_name}"
    print(f"Downloading {file_name} from {project_name} to {dest_name}")
    # An argument list (no shell) keeps every path a single argument, so
    # spaces or shell metacharacters in either path cannot split the command.
    subprocess.run(["dx", "download", remote_path, "-o", dest_name], check=True)

# upload command
def dx_upload(file_name, dest_name):
    """Upload one local file to DNAnexus with the `dx` CLI.

    Args:
        file_name: Local path of the file to upload.
        dest_name: Remote destination handed to `dx upload -o`.

    Raises:
        subprocess.CalledProcessError: If `dx upload` exits with a non-zero status.
    """
    print(f"Uploading {file_name} to {dest_name}")
    # An argument list (no shell) keeps each path a single argument even when
    # it contains spaces or shell metacharacters.
    subprocess.run(["dx", "upload", file_name, "-o", dest_name], check=True)

In [None]:
## Download genotype data from DNA Nexus
# Local working directory for the genotype files; created on first run
saving_dir = "UKB62k_genotype_data"
if not os.path.exists(saving_dir):
    os.makedirs(saving_dir)

# The project ID is hard-coded; the programmatic lookup is kept for reference.
# project_id = dxpy.api.user_get_project({'project': dxpy.DXProject('').get_id()})['project']
# project_name = dxpy.DXProject(project_id).describe()['name']
project_name = "project-GxqpVq0Jpp5Py82xVbZV198y"

# Folder inside the project that holds the genotype files
genotype_origin_folder = "/GWAS_pipeline/Genotype_data"

# Chromosome labels used in the file names: "1".."22", then "X" and "XY"
chr_num_list = [str(number) for number in range(1, 23)] + ["X", "XY"]


## Download all chromosomes plink files

In [None]:
# Fetch the .bed/.bim/.fam triplet of every chromosome into saving_dir
for chrom in chr_num_list:
    for ext in ("bed", "bim", "fam"):
        file_name = f"chr{chrom}_62k.{ext}"
        dest_name = f"{saving_dir}/{file_name}"
        dx_download(project_name, f"{genotype_origin_folder}/{file_name}", dest_name)

## Create merge list for plink merge test and remove triallelic SNPs


In [None]:
# Create merge_list for plink merge.
# chr1 is the base fileset given to --bfile below, so the list must hold
# chr2 through chr22 (range's upper bound is exclusive, hence 23).
with open("merge_list.txt", "w") as f:
    for i in range(2, 23):
        f.write(f"{saving_dir}/chr{i}_62k\n")

# Trial merge of the autosomes. A non-zero exit is tolerated on purpose: the
# next step passes {saving_dir}/test_merge.missnp to the cleaning function.
command = f"plink --bfile {saving_dir}/chr1_62k --merge-list merge_list.txt --out {saving_dir}/test_merge"
try:
    subprocess.run(command, shell=True, check=True)
except subprocess.CalledProcessError as e:
    print("!Merge failed — expected during SNP cleanup.")
    print(f"Return code: {e.returncode}")

In [None]:
# Remove triallelic SNPs and create cleaned files
def clean_chromosomes(base_path, missnp_file, output_suffix="_cleaned"):
    """Run `plink --exclude` on chromosomes 1-22 and write new filesets.

    For each chromosome N the input prefix is `{base_path}/chrN_62k` and the
    output prefix is the same path with `output_suffix` appended.

    Args:
        base_path: Directory that holds the per-chromosome PLINK filesets.
        missnp_file: Variant list passed to `plink --exclude`.
        output_suffix: Suffix appended to each input prefix for the output.

    Raises:
        subprocess.CalledProcessError: If any PLINK run exits non-zero.
    """
    # `chrom` rather than `chr`, so the builtin chr() is not shadowed.
    for chrom in range(1, 23):  # chr 1-22
        prefix = f"{base_path}/chr{chrom}_62k"
        output = f"{prefix}{output_suffix}"

        # Argument list (no shell): paths with spaces stay single arguments.
        cmd = [
            "plink",
            "--bfile", prefix,
            "--exclude", str(missnp_file),
            "--make-bed",
            "--out", output,
        ]

        print(f"Running PLINK cleaning for chr{chrom}...")
        subprocess.run(cmd, check=True)
        print(f"chr cleaned: {output}")
        
# Clean every autosome using the .missnp list from the trial merge
clean_chromosomes(saving_dir, f"{saving_dir}/test_merge.missnp")

In [None]:
# Recreate merge_list_cleaned for plink merge.
# chr1 is the base fileset given to --bfile below, so the list must hold
# chr2 through chr22 (range's upper bound is exclusive, hence 23).
with open("merge_list_cleaned.txt", "w") as f:
    for i in range(2, 23):
        f.write(f"{saving_dir}/chr{i}_62k_cleaned\n")

# Combine all plink files to one autosome file: file name is ukb62k_autosome
command = f"plink --bfile {saving_dir}/chr1_62k_cleaned --merge-list merge_list_cleaned.txt --make-bed --out {saving_dir}/ukb62k_autosome"
subprocess.run(command, shell=True, check=True)

## QC starting

In [None]:
#Sample QC function
def sample_qc(input_file, output_file, mind, geno, hwe):
    """Run plink2 with --mind, --geno and --hwe filters and write a new fileset.

    Despite the name, the command carries the --geno and --hwe filters in
    addition to --mind.

    Args:
        input_file: PLINK fileset prefix passed to `--bfile`.
        output_file: Output prefix passed to `--out`.
        mind: Value passed to `--mind`.
        geno: Value passed to `--geno`.
        hwe: Value passed to `--hwe`.

    Raises:
        subprocess.CalledProcessError: If plink2 exits with a non-zero status.
    """
    print(f"Performing sample QC on {input_file}...")
    # Argument list (no shell): no indentation runs end up inside the command
    # and paths with spaces stay single arguments.
    cmd = [
        "plink2",
        "--bfile", str(input_file),
        "--mind", str(mind),
        "--geno", str(geno),
        "--hwe", str(hwe),
        "--make-bed",
        "--out", str(output_file),
    ]
    print(" ".join(cmd))
    subprocess.run(cmd, check=True)
    
    
# Run QC on the merged autosome fileset.
# The three thresholds are forwarded to plink2 --mind / --geno / --hwe.
mind = 0.1
geno = 0.05
hwe = 1e-6
input_file = f"{saving_dir}/ukb62k_autosome"
output_file = f"{input_file}_qc"
sample_qc(input_file, output_file, mind, geno, hwe)

In [None]:
# Upload the QCed ukb62k_autosome fileset (.bed, .bim, .fam) to the genotype folder
for ext in ("bed", "bim", "fam"):
    qc_file = f"ukb62k_autosome_qc.{ext}"
    dx_upload(f"{saving_dir}/{qc_file}", f"{genotype_origin_folder}/{qc_file}")

## Subset and filter subjects (to be updated)

In [None]:
# Build the sample keep-list and subset the genotype data (to be updated)
# Filter the participant CSV file
def subset_samples(input_file, output_file, filter_caucasian: bool = True):
    """Write a two-column FID/IID keep file from a participant CSV.

    Args:
        input_file: Comma-separated participant table; must contain an `eid`
            column and, when filtering, a `p22006` column.
        output_file: Path of the tab-separated, header-less file to write;
            both columns hold the `eid` value.
        filter_caucasian: When True, keep only rows whose `p22006` equals 1.

    Raises:
        KeyError: If a required column is missing from the table.
    """
    print(f"Subsetting samples from {input_file} to {output_file}...")
    # Load CSV and standardize column names
    df = pd.read_csv(input_file, sep=",")
    df.columns = [col.strip() for col in df.columns]

    # Filter by Caucasian
    if filter_caucasian:
        # read_csv parses a column of 1s and blanks as numeric, so comparing
        # against the string "1" would match nothing. Coercing to numbers
        # accepts 1, 1.0 and "1" alike; unparseable values become NaN.
        ancestry_code = pd.to_numeric(df["p22006"], errors="coerce")
        df = df[ancestry_code == 1]
    else:
        print("No filtering applied for Caucasian samples.")

    # format for plink keep: FID and IID are both the participant eid
    keep_df = pd.DataFrame({"FID": df["eid"], "IID": df["eid"]})

    # Save to file
    keep_df.to_csv(output_file, sep="\t", index=False, header=False)
    print(f"Saved {len(keep_df)} samples to {output_file}")
    
    
# Filter participant for plink
def filter_participant(input_file, output_file, keep_file):
    """Run `plink2 --keep` to write a fileset restricted to the listed samples.

    Args:
        input_file: PLINK fileset prefix passed to `--bfile`.
        output_file: Output prefix passed to `--out`.
        keep_file: Sample list passed to `--keep`.

    Raises:
        subprocess.CalledProcessError: If plink2 exits with a non-zero status.
    """
    print(f"Performing filtering on {input_file}...")
    # Argument list (no shell): no indentation runs end up inside the command
    # and paths with spaces stay single arguments.
    cmd = [
        "plink2",
        "--bfile", str(input_file),
        "--keep", str(keep_file),
        "--make-bed",
        "--out", str(output_file),
    ]
    print(" ".join(cmd))
    subprocess.run(cmd, check=True)


In [None]:
# Filter for Caucasian individuals

# Subset samples: the initial file has already been prepared with the cohort app
dx_download(project_name, "full_participant.csv", "participant_table.csv")

# Make sure the directory that will hold the keep list exists
os.makedirs(f"{saving_dir}/ukb62k_autosome_qced", exist_ok=True)

# Filter the participant table
subset_samples("participant_table.csv", f"{saving_dir}/ukb62k_autosome_qced/keep.txt", filter_caucasian=True)

# Filter the genotype data.
# The input prefix is the QC output (ukb62k_autosome_qc.bed/.bim/.fam);
# ukb62k_autosome_qced is only the directory created above for keep.txt and
# holds no PLINK fileset of that name.
filter_participant(f"{saving_dir}/ukb62k_autosome_qc", f"{saving_dir}/ukb54k_EUR_qced", f"{saving_dir}/ukb62k_autosome_qced/keep.txt")


## Create GRM



In [None]:
# LD-pruned SNP list.
# Saved to the working directory, which is where the GRM step reads
# "hapmap3.prune.in" from; the previous target folder "UKB_genotype_data"
# is never created in this notebook.
dx_download(project_name, f"{genotype_origin_folder}/hapmap3.prune.in", "hapmap3.prune.in")

In [None]:
# Create sparse GRM: make sure the GRM output directory is in place
print("Creating sparse GRM...")
# NOTE(review): this file name differs from the ukb54k_EUR_qced_sprs_grm.*
# files uploaded at the end of the notebook, and this branch only prints —
# confirm which file should be checked.
sparse_grm_marker = f"{saving_dir}/GRM/ukb62k_autosome_qced_sparse.grm.bin"
if not os.path.exists(sparse_grm_marker):
    os.makedirs(f"{saving_dir}/GRM", exist_ok=True)
else:
    print("Sparse GRM already exists. Skipping creation.")

# Create GRM function
def grm_create(input_file, output_file, extract_snp_file, thread=10, sparse_cutoff=0.05):
    """Run `gcta64 --make-grm` with a sparse cutoff on the autosomes.

    Args:
        input_file: PLINK fileset prefix passed to `--bfile`.
        output_file: Output prefix passed to `--out`.
        extract_snp_file: SNP list passed to `--extract`.
        thread: Value passed to `--thread-num`.
        sparse_cutoff: Value passed to `--sparse-cutoff`.

    Raises:
        subprocess.CalledProcessError: If gcta64 exits with a non-zero status.
    """
    print(f"Creating GRM for {input_file}...")
    # Argument list (no shell): no indentation runs end up inside the command
    # and paths with spaces stay single arguments.
    cmd = [
        "gcta64",
        "--bfile", str(input_file),
        "--autosome",
        "--extract", str(extract_snp_file),
        "--make-grm",
        "--thread-num", str(thread),
        "--sparse-cutoff", str(sparse_cutoff),
        "--out", str(output_file),
    ]
    print(" ".join(cmd))
    subprocess.run(cmd, check=True)

In [None]:
# Create GRM, unless the sparse GRM file is already on disk.
# The .grm.sp file is one of the outputs uploaded at the end of the notebook,
# so its presence is used as the marker for a finished run.
grm_prefix = f"{saving_dir}/GRM/ukb54k_EUR_qced_sprs_grm"
if os.path.exists(f"{grm_prefix}.grm.sp"):
    print(f"{grm_prefix}.grm.sp already exists. Skipping GRM creation.")
else:
    grm_create(f"{saving_dir}/ukb54k_EUR_qced", grm_prefix, "hapmap3.prune.in", thread=30)

In [None]:
# Push the GRM outputs (.grm.id, .grm.sp, .log) to the GRM folder on DNA Nexus
print("Uploading GRM files to DNA Nexus...")
for suffix in ("grm.id", "grm.sp", "log"):
    grm_file = f"GRM/ukb54k_EUR_qced_sprs_grm.{suffix}"
    dx_upload(f"{saving_dir}/{grm_file}", f"{genotype_origin_folder}/{grm_file}")