In [4]:
import subprocess
import os

# Inputs: per-chromosome PLINK fileset prefix (the chromosome number is
# appended to it) and the sample list handed to --keep
original_file = "../data/LDREF/1000G.EUR."
keep_file = "../data/samples.txt"

# Outputs: one folder for the sample-filtered filesets, one for the
# LD-pruning results
output_dir_filter = "../data/LDREF_filtered"
output_dir_pruned = "../data/LDREF_pruned"

# Create both output folders up front; an already existing folder is fine
os.makedirs(output_dir_filter, exist_ok=True)
os.makedirs(output_dir_pruned, exist_ok=True)

# Arguments for --indep-pairwise (window size, step size, r^2 threshold),
# kept as strings because they are placed directly on the command line
window_size, step_size, r2_threshold = "50", "5", "0.2"

def _run_plink_step(command, chrom, success_prefix, failure_prefix):
    """Run one PLINK command for a chromosome and print the outcome.

    Args:
        command: Argument list passed to ``subprocess.run``.
        chrom: Chromosome number, appended to the printed messages.
        success_prefix: Message text printed before the chromosome number
            when the command exits with status 0.
        failure_prefix: Message text printed before the chromosome number
            when the command exits with a non-zero status.

    Returns:
        True if the command succeeded, False if it exited non-zero.
    """
    try:
        subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # Fall back to stdout when stderr is empty, and decode leniently so
        # unexpected bytes in the tool output cannot mask the real failure.
        details = (e.stderr or e.stdout or b"").decode("utf-8", errors="replace")
        print(f"{failure_prefix} {chrom}: {details}")
        return False
    print(f"{success_prefix} {chrom}")
    return True


# Chromosomes for which at least one step failed
failed_chromosomes = []

# Loop through chromosomes 1 to 22
for chrom in range(1, 23):
    filtered_file = f"{output_dir_filter}/1000G.EUR.{chrom}"
    pruned_prefix = f"{output_dir_pruned}/1000G.EUR.{chrom}"

    # Step 1: Filter samples using --keep
    filter_command = [
        "plink",
        "--bfile", f"{original_file}{chrom}",
        "--keep", keep_file,
        "--make-bed",
        "--out", filtered_file,
    ]

    # Step 2: Perform LD pruning (writes {pruned_prefix}.prune.in)
    prune_command = [
        "plink",
        "--bfile", filtered_file,
        "--indep-pairwise", window_size, step_size, r2_threshold,
        "--out", pruned_prefix,
    ]

    # Step 3: Apply pruning (extract only pruned SNPs)
    # NOTE: steps 2 and 3 share the same --out prefix, so any file both
    # steps write under that prefix (e.g. the run log) is kept only for step 3.
    threshold_command = [
        "plink",
        "--bfile", filtered_file,
        "--extract", f"{pruned_prefix}.prune.in",
        "--make-bed",
        "--out", pruned_prefix,
    ]

    # Each step consumes the previous step's output, so order matters
    steps = [
        (filter_command, "Successfully filtered chromosome", "Error filtering chromosome"),
        (prune_command, "Successfully performed LD pruning for chromosome", "Error in LD pruning for chromosome"),
        (threshold_command, "Successfully applied pruning for chromosome", "Error applying pruning for chromosome"),
    ]

    # all() short-circuits: once a step fails the remaining steps for this
    # chromosome are skipped and processing moves on to the next chromosome.
    if not all(_run_plink_step(cmd, chrom, ok, err) for cmd, ok, err in steps):
        failed_chromosomes.append(chrom)

# Only claim full completion when every chromosome made it through all steps
if failed_chromosomes:
    failed_list = ", ".join(str(c) for c in failed_chromosomes)
    print(f"Finished with errors; steps failed for chromosome(s): {failed_list}")
else:
    print("Filtering, LD pruning, and thresholding completed for all chromosomes!")


Successfully filtered chromosome 1
Successfully performed LD pruning for chromosome 1
Successfully applied pruning for chromosome 1
Successfully filtered chromosome 2
Successfully performed LD pruning for chromosome 2
Successfully applied pruning for chromosome 2
Successfully filtered chromosome 3
Successfully performed LD pruning for chromosome 3
Successfully applied pruning for chromosome 3
Successfully filtered chromosome 4
Successfully performed LD pruning for chromosome 4
Successfully applied pruning for chromosome 4
Successfully filtered chromosome 5
Successfully performed LD pruning for chromosome 5
Successfully applied pruning for chromosome 5
Successfully filtered chromosome 6
Successfully performed LD pruning for chromosome 6
Successfully applied pruning for chromosome 6
Successfully filtered chromosome 7
Successfully performed LD pruning for chromosome 7
Successfully applied pruning for chromosome 7
Successfully filtered chromosome 8
Successfully performed LD pruning for chr