In [None]:
import os
import subprocess
import pandas as pd

In [None]:
# download command
def dx_download(project_name, file_name, dest_name):
    """Download one file from a DNAnexus project with the ``dx`` CLI.

    Args:
        project_name: Project name or ID placed before the ``:`` in the
            remote path.
        file_name: Path of the file inside the project.
        dest_name: Local path handed to ``dx download -o``.

    Raises:
        subprocess.CalledProcessError: If ``dx download`` exits non-zero.
        FileNotFoundError: If the ``dx`` executable is not on ``PATH``.
    """
    # Pass an argument list instead of a shell string: spaces or shell
    # metacharacters in any path can no longer split or alter the command.
    download_command = [
        "dx", "download",
        f"{project_name}:{file_name}",
        "-o", str(dest_name),
    ]
    print(f"Downloading {file_name} from {project_name} to {dest_name}")
    subprocess.run(download_command, check=True)

# upload command
def dx_upload(file_name, dest_name):
    """Upload one local file to the DNAnexus platform with the ``dx`` CLI.

    Args:
        file_name: Local path of the file to upload.
        dest_name: Remote destination handed to ``dx upload -o``.

    Raises:
        subprocess.CalledProcessError: If ``dx upload`` exits non-zero.
        FileNotFoundError: If the ``dx`` executable is not on ``PATH``.
    """
    # Pass an argument list instead of a shell string: spaces or shell
    # metacharacters in either path can no longer split or alter the command.
    upload_command = ["dx", "upload", str(file_name), "-o", str(dest_name)]
    print(f"Uploading {file_name} to {dest_name}")
    subprocess.run(upload_command, check=True)

# FastGWAS

In [None]:

# DNAnexus project that holds the inputs (hard-coded project ID).
# Earlier attempt at resolving it through dxpy, kept for reference:
# project_id = dxpy.api.user_get_project({'project': dxpy.DXProject('').get_id()})['project']
# project_name = dxpy.DXProject(project_id).describe()['name']
project_name = "project-GxqpVq0Jpp5Py82xVbZV198y"

# Root folder of the pipeline data inside the project
genotype_origin_folder = "/GWAS_pipeline"


# Instance storage path: create the working directory.
# exist_ok=True makes the cell safe to re-run and avoids the separate
# "check, then create" step.
In_saving_folder = "UKB54k_genotype_data"
os.makedirs(In_saving_folder, exist_ok=True)

## Data Preparation

In [None]:
##################################################################
# Download the genotype data (.bed / .bim / .fam share one prefix)
genotype_folder = f"{genotype_origin_folder}/QCed_autosomes"
for ext in ("bed", "bim", "fam"):
    file_name = f"ukb54k_EUR_qced.{ext}"
    dest_name = f"{In_saving_folder}/{file_name}"
    dx_download(project_name, f"{genotype_folder}/{file_name}", dest_name)

##################################################################
# Download the GRM data
grm_dir = f"{In_saving_folder}/GRM"
os.makedirs(grm_dir, exist_ok=True)

for ext in ("grm.id", "grm.sp"):
    grm_name = f"ukb54k_EUR_qced_sprs_grm.{ext}"
    grm_file = f"{genotype_origin_folder}/GRM/{grm_name}"
    dest_name = f"{grm_dir}/{grm_name}"
    dx_download(project_name, grm_file, dest_name)

##################################################################
# Download the phenotype data
# The folder must be "phenotypes" (plural): the later cells read the table
# from, and write the per-trait files to, f"{In_saving_folder}/phenotypes".
pheno_dir = f"{In_saving_folder}/phenotypes"
os.makedirs(pheno_dir, exist_ok=True)

# NOTE(review): the remote file is a .txt saved locally as .csv and later read
# with pandas' default comma separator -- confirm it is comma-delimited.
phenotype_file = f"{genotype_origin_folder}/Image_dataset/final_pheno_caucasian.txt"
dest_name = f"{pheno_dir}/UKB_cortical_phenotypes.csv"
dx_download(project_name, phenotype_file, dest_name)


##################################################################
# Download the covariate data
covar_dir = f"{In_saving_folder}/covariates"
os.makedirs(covar_dir, exist_ok=True)

covariate_file = f"{genotype_origin_folder}/Covariates/ukb_covar.txt"
dest_name = f"{covar_dir}/ukb_covar.txt"
dx_download(project_name, covariate_file, dest_name)
qcov_file = f"{genotype_origin_folder}/Covariates/ukb_qcovar.txt"
dest_name = f"{covar_dir}/ukb_qcovar.txt"
dx_download(project_name, qcov_file, dest_name)

## Step 1: Write one phenotype file per trait

In [None]:
# Create the feature list for the phenotype files
pheno_dir = f"{In_saving_folder}/phenotypes"
pheno = pd.read_csv(f"{pheno_dir}/UKB_cortical_phenotypes.csv")

# Every column except the ID / bookkeeping columns is treated as a phenotype.
pheno_features = [col for col in pheno.columns if col not in ['FID', 'IID', 'version']]

# Make sure the output directory exists before writing the per-trait files.
os.makedirs(pheno_dir, exist_ok=True)

for feature in pheno_features:
    ## Create the phenotype file per column
    pheno_file = f"{pheno_dir}/pheno_{feature}.txt"

    # Keep only rows that have a value for this trait.
    pheno_df_subset = pheno[["FID", "IID", feature]].dropna()

    # Write phenotype file: tab-separated FID, IID, value.
    # GCTA documents the --pheno input as having NO header line, so the
    # column names are not written.
    pheno_df_subset.to_csv(pheno_file, sep="\t", index=False, header=False)

## Step 2: Run fastGWA for each phenotype

In [None]:
# Create fastGWAS function
def fastGWA(input_file, output_file, grm_file, pheno_file, covar_file, qcovar_file, threads=10):
    """Run ``gcta64 --fastGWA-mlm`` for a single phenotype file.

    Author's notes: fastGWA will auto match the FID and IID from all the
    files. Be cautious with any missing values in the phenotype file
    (cause crash immediately).

    Args:
        input_file: Value passed to ``--bfile``.
        output_file: Value passed to ``--out``.
        grm_file: Value passed to ``--grm-sparse``.
        pheno_file: Value passed to ``--pheno``.
        covar_file: Value passed to ``--covar``.
        qcovar_file: Value passed to ``--qcovar``.
        threads: Value passed to ``--threads`` (default 10).

    Raises:
        subprocess.CalledProcessError: If ``gcta64`` exits non-zero.
        FileNotFoundError: If the ``gcta64`` executable is not on ``PATH``.
    """
    # Build an argument list instead of a shell string: paths that contain
    # spaces or shell metacharacters are passed through unchanged.
    cmd = [
        "gcta64",
        "--bfile", str(input_file),
        "--grm-sparse", str(grm_file),
        "--fastGWA-mlm",
        "--covar", str(covar_file),
        "--qcovar", str(qcovar_file),
        "--pheno", str(pheno_file),
        "--threads", str(threads),
        "--out", str(output_file),
    ]
    print(f"Running fastGWA for {pheno_file} ...")
    print(" ".join(cmd))
    subprocess.run(cmd, check=True)

In [None]:
# Run fastGWA for each phenotype

# exist_ok=True keeps the cell re-runnable without a separate existence check.
results_dir = f"{In_saving_folder}/fastGWA_results"
os.makedirs(results_dir, exist_ok=True)

# These inputs are identical for every phenotype, so build them once.
grm_file = f"{In_saving_folder}/GRM/ukb54k_EUR_qced_sprs_grm"
covar_file = f"{In_saving_folder}/covariates/ukb_covar.txt"
qcovar_file = f"{In_saving_folder}/covariates/ukb_qcovar.txt"
input_file = f"{In_saving_folder}/ukb54k_EUR_qced"

for fea in pheno_features:
    # Only the phenotype input and the output prefix change per trait.
    pheno_file = f"{In_saving_folder}/phenotypes/pheno_{fea}.txt"
    output_file = f"{results_dir}/{fea}_sumstat"

    fastGWA(input_file, output_file, grm_file, pheno_file, covar_file, qcovar_file)

## Step 3: Upload the fastGWA results to the project

In [None]:
# Local results directory and the project folder that receives its files
instance_folder = f"{In_saving_folder}/fastGWA_results"
project_folder = f"{genotype_origin_folder}/GWAS_dataset/fastGWAS_sumstats"

# Upload every regular file that sits directly in the results directory
for fname in os.listdir(instance_folder):
    local_path = os.path.join(instance_folder, fname)
    if not os.path.isfile(local_path):
        # Anything that is not a regular file (e.g. a sub-directory) is skipped.
        continue
    dest_path = f"{project_folder}/{fname}"
    dx_upload(local_path, dest_path)