In [26]:
import gc

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc
from cellink.io import read_plink
from liftover import get_lifter
import zarr


from pysrc.cis_pipeline_v7.data import column_normalize
from pysrc.paths import DATA

In [18]:
# Coordinate converter from the hg19 assembly to hg38, built with liftover's get_lifter.
# NOTE(review): one_based=True presumably makes lookups use 1-based positions — confirm against the liftover docs.
converter = get_lifter("hg19", "hg38", one_based=True)

def try_liftover(row):
    """Look up the lifted-over position for a single variant row.

    Parameters
    ----------
    row
        Any object exposing ``chrom`` and ``pos`` attributes (for example one
        row of a variant table passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int or float
        The position of the first hit returned by the module-level
        ``converter`` for ``(str(row.chrom), row.pos)``, or ``np.nan`` when
        the lookup fails for any ordinary reason (unknown chromosome, no hit,
        malformed row, ...).
    """
    try:
        hits = converter[str(row.chrom)][row.pos]
        # Only the first hit is used; its second field is the position.
        return int(hits[0][1])
    except Exception:  # noqa: BLE001
        # Deliberately best-effort: unmappable variants become NaN. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate so
        # a long row-wise liftover can still be interrupted.
        return np.nan

In [14]:
# Load the single-cell expression matrix and collect the sorted unique individual IDs.
adata = ad.read_h5ad(DATA / "OneK1K_cohort_gene_expression_matrix_14_celltypes_f32.h5ad")
indis = np.array(list(adata.obs.individual.sort_values().unique()))
# One row per individual: take the first recorded sex/age among that individual's cells.
obs = adata.obs.groupby("individual", observed=True)[["sex", "age"]].first().sort_index()
obs.index = obs.index.astype(str)
# Shift the sex coding down by one (presumably 1/2 -> 0/1 — TODO confirm the raw coding).
obs["sex"] = obs["sex"] - 1
# Z-score age across individuals (pandas .std() is the sample standard deviation).
obs["age_std"] = (obs["age"] - obs["age"].mean()) / obs["age"].std()
# Put the covariate rows in the same order as `indis`.
obs = obs.loc[indis]

# Cell-level normalisation followed by a per-individual mean of the expression values.
# NOTE(review): normalize_total is applied a second time after log1p — confirm this is intentional.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.normalize_total(adata)
mdata = sc.get.aggregate(adata, by="individual", func="mean")
# The aggregated values live in the "mean" layer; move them into X for the steps below.
mdata.X = mdata.layers.pop("mean")
sc.pp.highly_variable_genes(mdata, n_top_genes=5000)
sc.pp.pca(mdata)
# Reorder the aggregated data to follow `indis`.
mdata = mdata[indis].copy()
# Drop the cell-level object and run the garbage collector
# (as the last expression, gc.collect()'s return value is what the cell displays).
del adata
gc.collect()

4751

In [19]:
# Chromosomes 1..22 are loaded one PLINK file at a time and concatenated along variants.
chroms = np.arange(1, 23)
gdatas = []
GENODATA = DATA / "OneK1K_imputation_post_qc_r2_08"
plinkdir = GENODATA / "plink_v2"
for chrom in chroms:
    gdata = read_plink(plinkdir / f"chr{chrom}.dose.filtered.R2_0.8")
    # Drop every sample-level column; covariates are attached once after concatenation.
    gdata.obs = gdata.obs[[]]
    gdata.var = gdata.var.drop(columns=["contig"])
    # Restrict to, and order by, the individuals present in the expression data.
    gdata = gdata[indis].copy()
    # Row-wise liftover; variants that cannot be lifted come back as NaN and
    # are kept as <NA> in the nullable integer column.
    # NOTE(review): `converter` lifts hg19 -> hg38, so these values look like
    # hg38 coordinates despite the `_hg19` suffix. The column names are left
    # untouched because downstream code may rely on them — confirm and rename
    # in a coordinated change.
    new_pos = gdata.var.apply(try_liftover, axis=1)
    gdata.var["pos_hg19"] = new_pos.astype(pd.Int64Dtype())
    gdata.var["id_hg19"] = (
        gdata.var.chrom
        + "_"
        + gdata.var.pos_hg19.astype(str)
        + "_"
        + gdata.var.a0
        + "_"
        + gdata.var.a1
    )
    gdata.var["id_hg19"] =  gdata.var["id_hg19"].astype(str)
    gdata.obs.index = gdata.obs.index.astype(str)
    gdatas.append(gdata)
gdata = ad.concat(gdatas, axis=1)
# Assigning `obs` replaces the sample annotations (and their labels) wholesale,
# so a silent order mismatch would attach covariates to the wrong genotypes.
if not np.array_equal(
    gdata.obs_names.astype(str).to_numpy(), obs.index.astype(str).to_numpy()
):
    raise ValueError(
        "Sample order of the concatenated genotypes does not match `obs`; "
        "refusing to attach covariates to misaligned individuals."
    )
gdata.obs = obs
gdata

AnnData object with n_obs × n_vars = 981 × 7406085
    obs: 'sex', 'age', 'age_std'
    var: 'chrom', 'pos', 'a0', 'a1', 'id', 'pos_hg19', 'id_hg19'

In [20]:
# Expression PCs from the per-individual PCA, labelled ePC1..ePCk and keyed
# by the genotype object's sample names (both follow the order of `indis`).
gdata.obsm["ePCs"] = pd.DataFrame(
    data=mdata.obsm["X_pca"],
    index=gdata.obs_names,
    columns=pd.RangeIndex(1, mdata.obsm["X_pca"].shape[1] + 1).map(lambda i: f"ePC{i}"),
)

In [21]:
# Genotype PCs from the space-separated, header-less eigenvec file:
# column 1 becomes the index, column 0 is discarded, the rest are the PCs.
gpcdf = pd.read_csv(
    GENODATA / "pcdir_v2/wgs.dose.filtered.R2_0.8.filtered.pruned.eigenvec",
    sep=" ",
    header=None,
)
gpcdf = gpcdf.set_index(1).drop(columns=[0])
gpcdf = gpcdf.set_axis(
    [f"gPC{i}" for i in range(1, gpcdf.shape[1] + 1)], axis="columns"
)
# Same individual order as every other per-sample table.
gpcdf = gpcdf.loc[indis]
gdata.obsm["gPCs"] = gpcdf
gdata.obsm["gPCs"].index.name = "donor_id"

In [22]:
# Sample IDs for the relationship matrix come from column 1 of the .rel.id file.
indi_index = pd.read_csv(
    GENODATA / "kinship_v2/wgs.dose.filtered.R2_0.8.filtered.pruned.rel.id", sep="\t", header=None
)[1].to_numpy()
# The .rel file is a header-less, tab-separated square table; label both axes
# with the sample IDs, then reorder rows and columns to follow `indis`.
kindf = pd.read_csv(
    GENODATA / "kinship_v2/wgs.dose.filtered.R2_0.8.filtered.pruned.rel", sep="\t", header=None
)
kindf = kindf.set_axis(indi_index, axis="index").set_axis(indi_index, axis="columns")
kindf = kindf.loc[indis, indis]
gdata.obsp["kinship"] = kindf

In [23]:
# Covariate matrix: intercept, sex, standardised age, the first 15 expression
# PCs and the first 20 genotype PCs, all cast to float64.
F = pd.concat(
    [
        pd.DataFrame(
            {"const": 1, "sex": obs["sex"], "age_std": obs["age_std"]},
            index=gdata.obs_names,
        ),
        gdata.obsm["ePCs"].iloc[:, :15],
        gdata.obsm["gPCs"].iloc[:, :20],
    ],
    axis=1,
).astype(np.float64)
# The first two columns (const, sex) are left untouched; every column from
# age_std onwards is passed through column_normalize.
F.iloc[:, 2:] = column_normalize(F.iloc[:, 2:].to_numpy())
gdata.obsm["F"] = F


In [24]:
# Persist the assembled genotype object (X, obs, var, obsm, obsp) as a zarr store under GENODATA.
gdata.write_zarr(GENODATA / "gdata_v2.zarr")

... storing 'chrom' as categorical
... storing 'a0' as categorical
... storing 'a1' as categorical
... storing 'id_hg19' as categorical


In [25]:
# Progress marker emitted once the zarr store has been written.
print("Gdata created")

Gdata created


In [27]:
# Number of variant columns per chunk of the X matrix loaded below.
SPARSE_CHUNK_SIZE = 100_000

# Re-open the "gdata_v2.zarr" store read-only: every element except X is read
# with read_elem, while X goes through read_elem_as_dask with chunks that span
# all rows and SPARSE_CHUNK_SIZE columns.
# NOTE(review): X is presumably lazy and is still referenced after the `with`
# block exits — confirm the store remains readable once it has been closed.
with zarr.open(GENODATA / "gdata_v2.zarr", "r") as f:
    adata = ad.AnnData(**{k: ad.io.read_elem(f[k]) for k in f.keys() if k != "X"})
    adata.X = ad.experimental.read_elem_as_dask(
        f["X"], chunks=(adata.shape[0], SPARSE_CHUNK_SIZE)
    )
adata


AnnData object with n_obs × n_vars = 981 × 7406085
    obs: 'sex', 'age', 'age_std'
    var: 'chrom', 'pos', 'a0', 'a1', 'id', 'pos_hg19', 'id_hg19'
    obsm: 'F', 'ePCs', 'gPCs'
    obsp: 'kinship'

In [28]:
# Sanity check: distinct lifted-over positions stored in the reloaded variant table.
# NOTE(review): the column was filled from an hg19 -> hg38 converter, so the `_hg19` suffix may be misleading — confirm.
adata.var.pos_hg19.unique()

<IntegerArray>
[  779885,   779987,   782105,   785001,   785910,   791414,   794252,
   794299,   796338,   798969,
 ...
 50759174, 50760141, 50760440, 50760478, 50760570, 50764320, 50770140,
 50772603, 50775185, 50778136]
Length: 7255546, dtype: Int64