In [2]:
import os
import random
import shlex
import subprocess

import numpy as np
import pandas as pd

from helper_fns import *

# Seed both RNGs used in this notebook. pandas' DataFrame.sample() draws from
# NumPy's global RNG when no random_state is given, which random.seed() alone
# does not control.
random.seed(42)
np.random.seed(42)

In [3]:
# Unpack the four frames returned by load_data() (presumably provided by the
# star import from helper_fns). Going by the shapes printed below, and assuming
# they are printed in the same order: two expression frames with 13942 columns
# (373 rows for EUR, 89 rows for YRI) and two 13942 x 5 gene tables.
EUR_ge_regressed, YRI_ge_regressed, EUR_protein_genes, YRI_protein_genes = load_data()

Shapes of the dataframes: (373, 13942) (89, 13942) (13942, 5) (13942, 5)


In [22]:
# Function to extract SNPs for a gene and split into train/test
def process_gene(gene, chr, start, end, full_y_df=EUR_ge_regressed, ancestry='EUR', plink_path='/new-stg/home/banghua/anaconda3/envs/CSE_284/bin/plink', random_state=None, project_dir='/new-stg/home/banghua/TWAS_ASSOC/project_data'):
    """
    Extract the SNPs in a genomic window with PLINK and build an 80/20 train/test split.

    Files written under ``{project_dir}/geno_gene_specific/{ancestry}/{gene}/``
    (all sharing the prefix ``{gene}``):

    - ``{gene}.bed/.bim/.fam``: all individuals, SNPs in ``chr:start-end``.
    - ``{gene}_train.txt`` / ``{gene}_test.txt``: the first two ``.fam`` columns
      (family ID, individual ID) of each split, tab-separated, no header.
    - ``{gene}_train.*`` / ``{gene}_test.*``: PLINK filesets restricted to each split.
    - ``{gene}_train.pheno`` / ``{gene}_test.pheno``: tab-separated, no header,
      one row per individual: the ``full_y_df`` index label and the expression value.

    NOTE(review): phenotypes are only written to the separate ``.pheno`` files;
    the ``.fam`` files are left exactly as PLINK produced them. Tools that read
    the phenotype from the ``.fam`` (GEMMA does, per its manual) will presumably
    need it merged in a later step - confirm.

    Parameters:
    - gene: Column label of the gene in ``full_y_df``; also used for the output names.
    - chr: Chromosome passed to PLINK ``--chr`` and used in the input fileset name.
    - start, end: Window bounds passed to PLINK ``--from-bp`` / ``--to-bp``.
    - full_y_df: Expression frame whose index holds the IDs found in the second
      ``.fam`` column and whose columns are gene IDs.
    - ancestry: Sub-directory name under ``geno/`` and ``geno_gene_specific/``.
    - plink_path: Path to the PLINK executable.
    - random_state: Forwarded to ``DataFrame.sample`` for the shuffle. ``None``
      (default) keeps the previous behaviour of using NumPy's global RNG; pass an
      int for a split that is reproducible regardless of global state.
    - project_dir: Root directory holding ``geno/`` and ``geno_gene_specific/``.

    Returns:
    - None. All results are written to disk.

    Raises:
    - subprocess.CalledProcessError: if any PLINK invocation exits non-zero.
    - KeyError: if ``gene`` is not a column of ``full_y_df`` or an individual ID
      from the ``.fam`` is missing from its index.
    """
    gene_dir = f'{project_dir}/geno_gene_specific/{ancestry}/{gene}'
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(gene_dir, exist_ok=True)

    # NOTE(review): the fileset prefix contains "EUR" even when ancestry is not
    # 'EUR' - confirm the non-EUR genotype files really use this name.
    b_file_path = f'{project_dir}/geno/{ancestry}/GEUVADIS_EUR_chr{chr}'
    out_b_file_path = f'{gene_dir}/{gene}'

    # Step 2: Extract gene-specific SNPs.
    # check=True: stop here if PLINK fails rather than continuing with missing
    # or stale files from an earlier run.
    subprocess.run([plink_path, '--bfile', b_file_path, '--chr', str(chr),
                    '--from-bp', str(start), '--to-bp', str(end),
                    '--make-bed', '--out', out_b_file_path],
                   check=True)

    # Read the .fam once; its row count is the number of individuals.
    individuals = pd.read_csv(f'{out_b_file_path}.fam', sep=r'\s+', header=None)
    num_individuals = len(individuals)

    # 80% of individuals (rounded down) go to the training split.
    train_num = int(num_individuals * 0.8)

    # Shuffle individuals, then cut the shuffled frame at train_num.
    shuffled = individuals.sample(frac=1, random_state=random_state).reset_index(drop=True)
    train = shuffled.iloc[:train_num]
    test = shuffled.iloc[train_num:]

    # Keep-lists for PLINK: first two .fam columns.
    train[[0, 1]].to_csv(f'{out_b_file_path}_train.txt', sep='\t', index=False, header=False)
    test[[0, 1]].to_csv(f'{out_b_file_path}_test.txt', sep='\t', index=False, header=False)

    # Generate train/test datasets
    for split in ('train', 'test'):
        subprocess.run([plink_path, '--bfile', out_b_file_path, '--keep', f'{out_b_file_path}_{split}.txt',
                        '--make-bed', '--out', f'{out_b_file_path}_{split}'],
                       check=True)

    # Step 3: Generate gene-specific phenotype
    gene_expression = full_y_df[gene]

    # Label-based lookup of each split's individuals (second .fam column).
    train_phenotype = gene_expression.loc[train[1]]
    test_phenotype = gene_expression.loc[test[1]]

    # Save gene-specific phenotype
    train_phenotype.to_csv(f'{out_b_file_path}_train.pheno', sep='\t', header=False)
    test_phenotype.to_csv(f'{out_b_file_path}_test.pheno', sep='\t', header=False)

In [13]:
# Switch the kernel's working directory to the gene-specific genotype folder, so
# relative paths in later cells (and in external tools launched from them)
# resolve against it.
# NOTE: per the execution counts this cell ran (In [13]) before process_gene was
# defined (In [22]), although it appears after it in the notebook.
os.chdir("/new-stg/home/banghua/TWAS_ASSOC/project_data/geno_gene_specific/")

In [23]:
# Build the gene-specific filesets for ENSG00000187634 on chromosome 1, positions
# 360260-1360261, using the function's defaults (EUR expression and genotypes).
# NOTE(review): the window is ~1 Mb wide; presumably the gene +/- 500 kb - confirm.
process_gene('ENSG00000187634', 1, 360260, 1360261)

PLINK v1.90b6.21 64-bit (19 Oct 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to /new-stg/home/banghua/TWAS_ASSOC/project_data/geno_gene_specific/EUR/ENSG00000187634/ENSG00000187634.log.
Options in effect:
  --bfile /new-stg/home/banghua/TWAS_ASSOC/project_data/geno/EUR/GEUVADIS_EUR_chr1
  --chr 1
  --from-bp 360260
  --make-bed
  --out /new-stg/home/banghua/TWAS_ASSOC/project_data/geno_gene_specific/EUR/ENSG00000187634/ENSG00000187634
  --to-bp 1360261

63755 MB RAM detected; reserving 31877 MB for main workspace.
137 out of 85764 variants loaded from .bim file.
373 people (0 males, 0 females, 373 ambiguous) loaded from .fam.
Ambiguous sex IDs written to
/new-stg/home/banghua/TWAS_ASSOC/project_data/geno_gene_specific/EUR/ENSG00000187634/ENSG00000187634.nosex
.
Using 1 thread (no multithreaded calculations invoked).
Before main variant filters, 373 founders and 0 nonfounders present.
Calculating al

In [33]:
import subprocess
import pandas as pd
import numpy as np

def weights_bslmm(input, bv_type=1, out=None, gemma_path="/new-stg/home/banghua/anaconda3/envs/CSE_284/bin/gemma", sys_print=True):
    """
    Run BSLMM analysis using GEMMA and extract effect sizes for the input SNPs.

    Parameters:
    - input: Base name (no extension) of the PLINK fileset passed to GEMMA's
      ``-bfile``; ``{input}.bim`` is also read here to get the SNP order.
    - bv_type: Value passed to GEMMA's ``-bslmm`` option.
    - out: Optional base path for GEMMA's output files. Defaults to
      ``f"{input}.BSLMM"``. Its directory part is passed as ``-outdir`` and its
      final component as ``-o``, so the files land at ``{out}.*``.
    - gemma_path: GEMMA executable (optionally followed by extra arguments).
      Defaults to the CSE_284 conda environment's ``gemma``.
    - sys_print: If True, GEMMA's output is streamed to the console as it runs.
      If False, it is captured and only surfaced in the error message on failure.

    Returns:
    - A numpy float array with one weight per row of ``{input}.bim``, in .bim
      order: ``alpha + beta * gamma`` from GEMMA's ``.param.txt``, or NaN for
      SNPs that GEMMA did not report.

    Raises:
    - FileNotFoundError: if GEMMA did not produce ``{out}.param.txt`` (including
      when the GEMMA run itself failed) or the executable cannot be found.
    """
    if out is None:
        out = f"{input}.BSLMM"

    # GEMMA treats -o as a file *prefix* inside -outdir (default "./output/"),
    # so split the requested path to make the files land at exactly `out`.
    out_dir, out_prefix = os.path.split(str(out))
    out_dir = out_dir or "."
    os.makedirs(out_dir, exist_ok=True)
    param_path = os.path.join(out_dir, f"{out_prefix}.param.txt")

    # Drop results left by a previous run so a failed GEMMA call can never be
    # mistaken for a successful one.
    if os.path.exists(param_path):
        os.remove(param_path)

    # Argument list (no shell): paths need no quoting. shlex.split keeps
    # supporting a gemma_path that carries extra arguments.
    cmd = shlex.split(gemma_path) + [
        "-miss", "1", "-maf", "0", "-r2", "1",
        "-rpace", "1000", "-wpace", "1000",
        "-bfile", str(input),
        "-bslmm", str(bv_type),
        "-outdir", out_dir,
        "-o", out_prefix,
    ]

    # Stream output live when sys_print is True; otherwise capture it quietly.
    result = subprocess.run(cmd, capture_output=not sys_print, text=True)

    if not os.path.isfile(param_path):
        # stderr/stdout are None when the output was streamed instead of captured.
        detail = (result.stderr or result.stdout or "").strip()
        message = (
            "GEMMA output file not found. Check GEMMA execution and output path. "
            f"(expected {param_path}; GEMMA exit code {result.returncode})"
        )
        if detail:
            message = f"{message}\n{detail}"
        raise FileNotFoundError(message)

    # Read the output parameter file
    eff = pd.read_table(param_path, header=0, sep='\t')

    # SNP IDs (second .bim column) define the order and length of the result.
    snp = pd.read_table(f"{input}.bim", header=None, sep='\t')[1].values

    # One weight per reported SNP; if an ID is listed more than once, the first
    # row wins. reindex() then yields NaN for every SNP GEMMA did not report.
    first_hit = eff.drop_duplicates(subset='rs', keep='first').set_index('rs')
    combined = first_hit['alpha'] + first_hit['beta'] * first_hit['gamma']

    return combined.reindex(snp).to_numpy(dtype=float)

In [25]:
# Base path (no extension) of the training PLINK fileset handed to weights_bslmm.
# NOTE: the variable name `input` shadows Python's built-in input() for the rest
# of the kernel session.
# NOTE(review): this is a different gene from the one processed by the
# process_gene call shown above - confirm its files were generated the same way.
input = "/new-stg/home/banghua/TWAS_ASSOC/project_data/geno_gene_specific/EUR/ENSG00000000938/ENSG00000000938_train"

In [34]:
# Fit BSLMM on the training fileset and collect the per-SNP weights.
# The recorded run below fails: GEMMA stops with "number of analyzed individuals
# equals 0", so no .param.txt is written and weights_bslmm raises
# FileNotFoundError.
# NOTE(review): GEMMA takes phenotypes from the .fam file; presumably the
# _train.fam carries no phenotype values - confirm and merge them before re-running.
test_weights = weights_bslmm(input)

GEMMA 0.98.3 (2020-11-28) by Xiang Zhou and team (C) 2012-2020
Reading Files ... 
ERROR: Enforce failed for number of analyzed individuals equals 0. in src/param.cpp at line 2073 in ProcessCvtPhen


FileNotFoundError: GEMMA output file not found. Check GEMMA execution and output path.