In [1]:
import pandas as pd
import numpy as np

import os
import sys
from typing import List, Tuple, Dict, Any, Optional
import pickle

# Project root. Defaults to the original checkout location; set the
# TWAS_ASSOC_DIR environment variable to run the notebook from another machine
# without editing this cell. All "./project_data/..." paths below are relative
# to this directory.
os.chdir(os.environ.get("TWAS_ASSOC_DIR", '/Users/xbh0403/Desktop/TWAS_ASSOC'))

from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from pyplink import PyPlink

In [15]:
def load_data():
    """
    Read the four GEUVADIS tables used by this notebook.

    All files are gzip-compressed TSVs under ./project_data (relative to the
    current working directory) and are read with their first column as index.
    For the two protein-gene tables the index values are copied into a "chr"
    column and the index is then replaced by a default integer index.

    Returns:
    tuple: (EUR_ge_regressed, YRI_ge_regressed, EUR_protein_genes, YRI_protein_genes)
    """
    def read_gz_tsv(path):
        # Shared reader: tab-separated, gzip-compressed, first column as index.
        return pd.read_csv(path, sep="\t", index_col=0, compression="gzip")

    def read_gene_table(path):
        # Keep the index values as a regular "chr" column, then drop the index.
        table = read_gz_tsv(path)
        table["chr"] = table.index
        return table.reset_index(drop=True)

    eur_expr = read_gz_tsv("./project_data/GEUVADIS_EUR_ge_regressed.tsv.gz")
    yri_expr = read_gz_tsv("./project_data/GEUVADIS_YRI_ge_regressed.tsv.gz")
    eur_genes = read_gene_table("./project_data/GEUVADIS_EUR_protein_genes.tsv.gz")
    yri_genes = read_gene_table("./project_data/GEUVADIS_YRI_protein_genes.tsv.gz")

    loaded = (eur_expr, yri_expr, eur_genes, yri_genes)
    print("Shapes of the dataframes:", *(frame.shape for frame in loaded))

    return loaded

# Load all four tables once into notebook-level variables that later cells read.
EUR_ge_regressed, YRI_ge_regressed, EUR_protein_genes, YRI_protein_genes = load_data()

Shapes of the dataframes: (373, 13942) (89, 13942) (13942, 5) (13942, 5)


In [16]:
# Inspect the YRI expression table (rows are sample IDs, columns are Ensembl gene IDs).
YRI_ge_regressed

Unnamed: 0,ENSG00000187634,ENSG00000188976,ENSG00000187961,ENSG00000187583,ENSG00000187642,ENSG00000188290,ENSG00000187608,ENSG00000188157,ENSG00000131591,ENSG00000186891,...,ENSG00000187609,ENSG00000188747,ENSG00000165802,ENSG00000130653,ENSG00000182154,ENSG00000148399,ENSG00000165724,ENSG00000197070,ENSG00000181090,ENSG00000148408
NA18486,0.094744,-0.351085,-1.932980,-0.321105,-0.515498,-2.079754,-0.273891,0.033437,0.364211,-0.428842,...,-0.928062,-1.769592,0.392149,0.521271,0.846738,-0.396933,0.285021,-0.399697,1.098842,-0.650935
NA18487,-0.902019,-1.003504,-0.070440,-0.136875,-0.813910,-1.176463,0.851783,-0.342203,-2.013583,-1.726217,...,0.363013,1.649147,0.490006,-1.074849,0.904444,-0.928404,-1.029141,1.378654,1.601134,-1.061971
NA18488,0.787695,0.997348,-0.302960,0.569240,0.139434,0.274250,-0.596212,0.165744,0.722004,-1.196606,...,-0.730004,-0.802126,1.158687,1.805478,0.252108,-1.789603,-0.507488,-0.404853,1.079572,-0.565762
NA18489,-0.424581,0.991987,-0.191124,0.463024,0.538183,-0.787763,-0.231541,1.012617,0.530263,-0.254174,...,-1.166257,0.819447,1.003430,-0.000927,-0.424602,-0.038372,-0.091295,-1.516224,0.934990,-1.346285
NA18498,-0.873080,-0.675177,0.048259,-1.092280,0.573952,0.626174,-0.147917,0.465355,0.619457,-0.494885,...,0.757072,-0.423255,0.246138,-0.073313,0.088300,0.017429,-0.811266,0.273057,2.637008,-0.349992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
NA19236,-0.719438,0.514831,1.334156,-0.265431,1.475308,-1.031606,-0.005093,0.409364,-0.029977,-0.324761,...,-0.779381,-1.241429,0.302309,0.729772,-0.236441,0.554705,-0.371973,0.184894,-0.352576,0.254183
NA19247,0.488869,0.378698,-0.522324,-0.267890,-0.828062,1.360805,-0.170798,0.100119,-0.029440,-0.231236,...,0.310839,0.378431,-0.850058,-1.266535,1.118367,0.303224,-1.360509,0.627126,-1.371108,0.134620
NA19248,1.038800,1.203831,0.051683,-0.004533,0.150353,0.020862,0.338120,1.413611,0.678311,0.289597,...,0.629750,-0.256761,-0.026252,-0.836387,-0.844268,0.802872,-0.047072,-0.640519,-0.734871,-0.343510
NA19256,-1.718986,-0.760087,0.382552,0.990427,0.493457,-0.789862,-0.642850,0.329378,0.879047,-0.565545,...,1.304044,1.385512,0.332867,0.383803,-1.767664,1.280153,-0.669369,0.240813,0.055344,0.389836


In [3]:
def find_snps_in_gene(chr_num: int, start: int, end: int, ancsetry: str) -> pd.DataFrame:
    """
    List the SNPs of one chromosome whose position lies in [start, end].

    Parameters:
    chr_num (int): Chromosome number; selects which per-chromosome PLINK fileset is opened.
    start (int): Lower bound of the region (inclusive).
    end (int): Upper bound of the region (inclusive).
    ancsetry (str): Ancestry label used to build the PLINK fileset path (e.g. "EUR").

    Returns:
    pd.DataFrame: Rows of the BIM table (columns snp, chrom, pos, cm, a1, a2)
    whose "pos" falls inside the region.
    """
    plink_prefix = "./project_data/geno/"+ancsetry+"/GEUVADIS_"+ancsetry+"_chr"+str(chr_num)
    with PyPlink(plink_prefix) as bed:
        marker_table = bed.get_bim()
        # The SNP ID is the BIM index; expose it as an ordinary column.
        marker_table["snp"] = marker_table.index
        marker_table = marker_table.reset_index(drop=True)
        marker_table = marker_table[["snp", "chrom", "pos", "cm", "a1", "a2"]]

    positions = marker_table["pos"]
    at_or_after_start = positions >= start
    at_or_before_end = positions <= end
    return marker_table[at_or_after_start & at_or_before_end]

In [4]:
def process_geno(ancsetry: str, chr_num: int, snps: List[str], start: int, gene_id: str, save_result: bool = False) -> Tuple[Dict[str, Any], np.ndarray]:
    """
    Build the genotype design matrix for a set of SNPs.

    Individuals with a missing call (genotype value -1) at any requested SNP
    are dropped, so the returned matrix contains no missing entries.

    Parameters:
    ancsetry (str): Ancestry label used to build the PLINK fileset path (e.g. "EUR").
    chr_num (int): Chromosome number; selects the per-chromosome PLINK fileset.
    snps (List[str]): SNP IDs to extract. A repeated ID is used once, at its first position.
    start (int): Not used by this function; kept so existing callers keep working.
    gene_id (str): Gene ID used to name the output files when save_result is True.
    save_result (bool): If True, pickle the results dict and save X as .npy
        under ./project_data/processed_Xy/ (directories are created if needed).

    Returns:
    Tuple[Dict[str, Any], np.ndarray]: (results, X).
        results has the keys "snp_info" (BIM rows of the requested SNPs, in BIM
        order), "iids" (all individual IDs from the FAM file), one entry per
        SNP ID (its genotype array for all individuals), "keep_bool" (1/0 flag
        per individual), "<snp>_filtered" (genotypes of kept individuals) and
        "iids_filtered" (IDs of kept individuals).
        X has one row per kept individual and one column per SNP, with columns
        in the order of `snps`.

    Raises:
    ValueError: If snps is empty.
    """
    if len(snps) == 0:
        raise ValueError("No SNPs provided.")

    # Use each SNP ID once (first occurrence wins) so that X always has exactly
    # one column per row of snp_info.
    snps = list(dict.fromkeys(snps))

    with PyPlink("./project_data/geno/"+ancsetry+"/GEUVADIS_"+ancsetry+"_chr"+str(chr_num)) as bed:
        # Getting the BIM and FAM
        bim = bed.get_bim()
        bim["snp"] = bim.index
        bim.reset_index(drop=True, inplace=True)
        bim = bim[["snp", "chrom", "pos", "cm", "a1", "a2"]]
        snp_info = bim[bim["snp"].isin(snps)]

        iids = bed.get_fam()["iid"].tolist()

        # Read every marker from disk exactly once and keep it in memory.
        genotypes_by_snp = {
            snp_id: np.asarray(genotypes)
            for snp_id, genotypes in bed.iter_geno_marker(snps)
        }

    # An individual is kept only if none of the requested SNPs is missing (-1).
    keep_mask = np.ones(len(iids), dtype=bool)
    for genotypes in genotypes_by_snp.values():
        keep_mask &= genotypes != -1

    results = {
        "snp_info": snp_info,
        "iids": iids,
    }
    results.update(genotypes_by_snp)
    results["keep_bool"] = keep_mask.astype(int).tolist()
    for snp_id, genotypes in genotypes_by_snp.items():
        results[snp_id+"_filtered"] = genotypes[keep_mask].tolist()
    results["iids_filtered"] = [iid for iid, keep in zip(iids, keep_mask) if keep]

    # Stack the filtered genotype vectors as columns: (kept individuals, SNPs).
    X = np.array([genotypes_by_snp[snp_id][keep_mask] for snp_id in snps], dtype=float).T

    if save_result:
        # Make sure both output directories exist before writing.
        os.makedirs("./project_data/processed_Xy/X/", exist_ok=True)
        with open("./project_data/processed_Xy/"+gene_id+"_results.pkl", "wb") as f:
            pickle.dump(results, f)
        # save x as npy
        np.save("./project_data/processed_Xy/X/"+gene_id+"_X.npy", X)

    return results, X

In [5]:
def get_y(gene_id: str, iids: List[str], y_full_df: Optional[pd.DataFrame] = None, save_y: bool = False) -> np.ndarray:
    """
    Get the expression values of one gene for the given individuals.

    The values are returned in the order of `iids`, so that they line up
    row-for-row with a genotype matrix built for the same list of individuals.

    Parameters:
    gene_id (str): Gene ID; must be a column of y_full_df.
    iids (List[str]): Individual IDs; each must be in the index of y_full_df.
    y_full_df (pd.DataFrame | None): Expression table (individuals x genes).
        When None, the notebook-level EUR_ge_regressed table is used.
    save_y (bool): If True, save the array to ./project_data/processed_Xy/y/<gene_id>_y.npy.

    Returns:
    np.ndarray: Expression values of gene_id, one per entry of iids.

    Raises:
    KeyError: If gene_id is not a column, or any ID in iids is not in the index.
    """
    if y_full_df is None:
        # Looked up at call time, not at definition time, so the function does
        # not capture a stale table if the data are reloaded.
        y_full_df = EUR_ge_regressed

    iids = list(iids)
    known_individuals = y_full_df.index
    missing = [iid for iid in iids if iid not in known_individuals]
    if missing:
        # Silently dropping them would leave y shorter than the matching X.
        raise KeyError(f"Individuals not found in expression data: {missing}")

    # Label-based selection returns rows in the order of `iids`.
    y = y_full_df.loc[iids, gene_id].to_numpy()
    if save_y:
        os.makedirs("./project_data/processed_Xy/y/", exist_ok=True)
        np.save("./project_data/processed_Xy/y/"+gene_id+"_y.npy", y)
    return y

In [6]:
def process_one_gene(gene_id: str, protein_genes: pd.DataFrame, ancsetry: str, y_full_df: Optional[pd.DataFrame] = None) -> Tuple[Dict[str, Any], np.ndarray, np.ndarray]:
    """
    Build the genotype matrix X and expression vector y for one gene.

    Looks the gene up in `protein_genes`, collects the SNPs between its
    "start" and "end" on its chromosome, extracts their genotypes and fetches
    the expression values of the individuals that were kept. The intermediate
    results, X and y are always written to ./project_data/processed_Xy/.

    Parameters:
    gene_id (str): Gene ID; must match exactly one row of protein_genes["gene_id"].
    protein_genes (pd.DataFrame): Gene table with columns gene_id, chr, start, end, name.
    ancsetry (str): Ancestry label used to locate the genotype files (e.g. "EUR").
    y_full_df (pd.DataFrame | None): Expression table to read y from. When None,
        get_y falls back to its own default (the EUR table), so pass the
        matching expression table explicitly for any other ancestry.

    Returns:
    Tuple[Dict[str, Any], np.ndarray, np.ndarray]: (processed_geno, X, y) where
    processed_geno is the results dict returned by process_geno.
    """
    gene = protein_genes[protein_genes["gene_id"] == gene_id]
    assert gene.shape[0] == 1, f"expected exactly one row for {gene_id}, found {gene.shape[0]}"
    chr_num = gene["chr"].values[0]
    start = gene["start"].values[0]
    end = gene["end"].values[0]
    gene_name = gene["name"].values[0]
    print("Gene id: ", gene_id, " Gene name: ", gene_name, " Chr: ", chr_num, " Start: ", start, " End: ", end)

    snps = find_snps_in_gene(chr_num, start, end, ancsetry)
    snps_name = snps["snp"].tolist()
    print("Number of SNPs: ", len(snps_name))
    processed_geno, X = process_geno(ancsetry, chr_num, snps_name, start, gene_id, save_result=True)

    kept_iids = processed_geno["iids_filtered"]
    if y_full_df is None:
        # Keep the historical behaviour: let get_y pick its default table.
        y = get_y(gene_id, kept_iids, save_y=True)
    else:
        y = get_y(gene_id, kept_iids, y_full_df=y_full_df, save_y=True)
    return processed_geno, X, y

In [7]:
# Build X (genotypes) and y (expression) for a single gene. process_one_gene
# also writes the pickle/.npy outputs to disk on every call.
# The commented-out calls are other gene IDs that can be swapped in.
# one_gene_result, X, y = process_one_gene("ENSG00000187634", EUR_protein_genes, "EUR")
one_gene_result, X, y = process_one_gene("ENSG00000187642", EUR_protein_genes, "EUR")
# one_gene_result, X, y = process_one_gene("ENSG00000187583", EUR_protein_genes, "EUR")
# one_gene_result, X, y = process_one_gene("ENSG00000187961", EUR_protein_genes, "EUR")
# one_gene_result, X, y = process_one_gene("ENSG00000144182", EUR_protein_genes, "EUR")
# one_gene_result, X, y = process_one_gene("ENSG00000215784", EUR_protein_genes, "EUR")

Gene id:  ENSG00000187642  Gene name:  PERM1  Chr:  1  Start:  410579  End:  1410580
Number of SNPs:  138


In [8]:
def standardize(X: np.ndarray) -> np.ndarray:
    """
    Standardize the columns of X to zero mean and unit variance.

    Columns with zero standard deviation are only centered (they become all
    zeros) instead of being divided by zero. The input array is not modified.

    Parameters:
    X (np.ndarray): X matrix with one row per sample and one column per feature.

    Returns:
    np.ndarray: Standardized X matrix with the same shape as X.
    """
    X = np.asarray(X, dtype=float)
    mean = np.mean(X, axis=0)
    std = np.std(X, axis=0)
    # mean and std are per-column; broadcasting applies them to every row.
    # A scale of 1 for constant columns avoids 0/0 producing NaN.
    scale = np.where(std == 0, 1.0, std)
    return (X - mean) / scale

In [9]:
# Standardize every SNP column, then hold out 20% of the individuals.
# (StandardScaler is already imported in the imports cell at the top.)
# NOTE(review): the scaler is fit on all rows before the split, so held-out
# rows contribute to the mean/std applied to the training rows.
X_std = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.2, random_state=42)

In [13]:
# Linear regression
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
y_pred_train = reg.predict(X_train)
# Label the two errors so the training and held-out values cannot be confused.
print("Mean squared error (train): ", np.mean((y_train - y_pred_train)**2))
print("Mean squared error (test): ", np.mean((y_pred - y_test)**2))

Mean squared error:  0.5590618915480747
Mean squared error:  3.664491328632582e+26


In [10]:
# Fit an L1-penalised regression, then print its score on the training rows
# followed by its score on the held-out rows (one value per line).
lasso = linear_model.Lasso(alpha=0.1, random_state=42)
lasso.fit(X_train, y_train)
print(
    *(lasso.score(features, target) for features, target in ((X_train, y_train), (X_test, y_test))),
    sep="\n",
)

0.03634878314031986
-0.022963852380138983


In [11]:
def LASSO(X, Y, alpha=0.1, test_size=0.2, random_state=42):
    """
    Fit a Lasso regression on a train/test split of (X, Y).

    The rows are split first; the standardization is then fit on the training
    rows only and applied to both parts, so no statistics of the held-out rows
    are used when preparing the training data.

    Parameters:
    X: Feature matrix (rows are samples).
    Y: Target values, one per row of X.
    alpha (float): Lasso regularization strength.
    test_size (float): Fraction of rows held out for testing.
    random_state (int): Seed for the split (also passed to the estimator).

    Returns:
    dict: Keys "clf" (fitted model), "X_train", "X_test" (standardized
    features), "Y_train", "Y_test", "Y_train_pred", "Y_test_pred",
    "r2_train" and "r2_test".
    """
    X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    # Learn mean/std from the training rows only, then reuse them for the test rows.
    scaler = StandardScaler().fit(X_train_raw)
    X_train = scaler.transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    clf = linear_model.Lasso(alpha=alpha, random_state=random_state)
    clf.fit(X_train, Y_train)
    return {
        "clf": clf,
        "X_train": X_train,
        "X_test": X_test,
        "Y_train": Y_train,
        "Y_train_pred": clf.predict(X_train),
        "Y_test": Y_test,
        "Y_test_pred": clf.predict(X_test),
        "r2_train": clf.score(X_train, Y_train),
        "r2_test": clf.score(X_test, Y_test)
    }

In [12]:
# Run the LASSO helper with the same alpha, test fraction and seed as the
# standalone Lasso cell, keeping the fitted model, splits, predictions and R^2.
lasso_result = LASSO(X, y, alpha=0.1, test_size=0.2, random_state=42)