In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from pathlib import Path
from scipy.stats import pointbiserialr

from tqdm import tqdm
from gtfparse import read_gtf

import scanpy
import anndata
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Project directories, all relative to the notebook's working directory.
# Raw Geuvadis inputs (expression matrix, run table, merged GTF).
fdir_raw = Path("../data/raw/")
# Interim data directory (not referenced in the cells visible here).
fdir_processed = Path("../data/interim")
# 'sex' subfolder of the processed data directory (not referenced in the cells visible here).
fdir_traintest = Path("../data/processed") / 'sex'
# External datasets; the evaluation data is read from <organ>/reg below this folder.
fdir_external = Path("../data/external")
# Model directory (not referenced in the cells visible here).
ml_models_fdir = Path("../models")


In [None]:
def read_geuvadis(fname_data: Path | str,
                  fname_header: Path | str,
                  fname_gtf: Path | str):
    """Load the expression matrix, the run table and the transcript annotation.

    Parameters
    ----------
    fname_data
        CSV file with the expression values; its first column is used as the
        index and the table is transposed after reading.
    fname_header
        CSV file with the per-run metadata; its first column is used as the index.
    fname_gtf
        GTF file parsed with ``gtfparse.read_gtf``.

    Returns
    -------
    tuple
        ``(data_raw, data_header, gtf_data)`` where ``data_raw`` is the
        transposed expression table cast to float32, ``data_header`` is the
        metadata table, and ``gtf_data`` is the annotation indexed by
        ``transcript_id`` with one row kept per transcript id.
    """
    expression = pd.read_csv(fname_data, index_col=0).T.astype(np.float32)
    run_table = pd.read_csv(fname_header, index_col=0)

    annotation = (
        read_gtf(fname_gtf)
        .to_pandas()
        .set_index('transcript_id')
        # keep the id available as a regular column as well as the index
        .assign(transcript_id=lambda frame: frame.index)
        # the first row seen for each transcript id is the one that is kept
        # NOTE(review): presumably that is the transcript-level record, not an exon row -- confirm
        .drop_duplicates(subset="transcript_id")
    )

    print('Dataset shape: ', expression.shape)

    return expression, run_table, annotation

# Load the Geuvadis expression matrix, its run table and the merged transcript GTF.
data_raw, data_header, gtf_data = read_geuvadis(
    fname_data=fdir_raw / 'Geuvadis.all.csv',
    fname_header=fdir_raw / 'Geuvadis.SraRunTable.txt',
    fname_gtf=fdir_raw / 'all_transcripts_strigtie_merged.gtf',
)

In [None]:
# Keep only transcripts present in both the expression matrix and the annotation,
# using one shared index so both tables end up in the same order.
shared_transcripts = data_raw.columns.intersection(gtf_data.index)
data_raw = data_raw[shared_transcripts]
gtf_data = gtf_data.loc[shared_transcripts]

In [None]:
organ = 'HEART'
eval_dir = fdir_external / organ / 'reg'

# sorted() makes the choice deterministic when several files match, and the
# explicit check replaces the bare StopIteration that next() raised on no match.
processed_files = sorted(eval_dir.glob("*processed.h5"))
if not processed_files:
    raise FileNotFoundError(f"No '*processed.h5' file found in {eval_dir}")
fname = processed_files[0].name

# read_hdf has no `index_col` parameter (the stored frame already carries its
# index), so the stray keyword from the earlier version is dropped.
data_eval = pd.read_hdf(eval_dir / fname)
data_eval_header = pd.read_csv(eval_dir / 'SraRunTable.txt', sep=',', index_col=0)
# data_eval_header


In [None]:
# Wrap the expression table in an AnnData object (rows of data_raw become
# observations, columns become variables).
adata = anndata.AnnData(data_raw)
# gtf_data was re-indexed with the same transcript index as data_raw's columns.
adata.var = gtf_data
# NOTE(review): data_header is assigned as-is, with no re-indexing against
# data_raw.index here -- confirm both tables list the runs in the same order.
adata.obs = data_header

# Show the object summary.
adata

In [None]:
# Wrap the evaluation table in an AnnData object and attach its run metadata.
adata_eval = anndata.AnnData(data_eval)
# NOTE(review): data_eval_header is assigned without re-indexing against
# data_eval.index -- confirm both tables list the runs in the same order.
adata_eval.obs = data_eval_header

In [None]:
# Restrict both datasets to the variables they share, in the same order.
common_transcripts = adata_eval.var_names.intersection(adata.var_names)
adata_eval, adata = [
    dataset[:, common_transcripts] for dataset in (adata_eval, adata)
]


In [None]:
# [start, end] coordinate windows that are excluded from the chrX / chrY subsets.
# NOTE(review): these look like the GRCh38 pseudoautosomal regions (PAR1 / PAR2)
# -- confirm the GTF uses the same assembly.

# chrX windows
pseudoautosoms_X1 = [10_001, 2_781_479]
pseudoautosoms_X2 = [155_701_383, 156_030_895]

# chrY windows
pseudoautosoms_Y1 = [10_001, 2_781_479]
pseudoautosoms_Y2 = [56_887_903, 57_217_415]



In [None]:
def _outside_windows(var, first_window, second_window):
    """Mask of variables lying entirely before, between, or after the two windows."""
    before_first = var['end'] < first_window[0]
    between = (var['start'] > first_window[1]) & (var['end'] < second_window[0])
    after_second = var['start'] > second_window[1]
    return before_first | between | after_second


seqname = adata.var['seqname']

# Per-chromosome subsets, then drop everything overlapping the excluded windows.
adata_X = adata[:, seqname == 'chrX']
adata_Y = adata[:, seqname == 'chrY']

adata_X = adata_X[:, _outside_windows(adata_X.var, pseudoautosoms_X1, pseudoautosoms_X2)]
adata_Y = adata_Y[:, _outside_windows(adata_Y.var, pseudoautosoms_Y1, pseudoautosoms_Y2)]

# Everything that is on neither chrX nor chrY (this is not limited to chr1-22).
adata_autosomes = adata[:, (seqname != 'chrX') & (seqname != 'chrY')]
# Full copy of the dataset; no window filtering is applied to this one.
adata_XY = adata.copy()

# "Autosomes" combined with the filtered chrX / chrY variables.
adata_auto_X = adata[:, adata_X.var_names.union(adata_autosomes.var_names)]
adata_auto_Y = adata[:, adata_Y.var_names.union(adata_autosomes.var_names)]


In [None]:
def preprocess(adata):
    """Normalise and log-transform ``adata`` in place, keeping the raw values.

    The untouched matrix is stored in ``adata.layers["counts"]`` before
    ``scanpy.pp.normalize_total`` and ``scanpy.pp.log1p`` are applied to ``.X``.
    If a ``"counts"`` layer already exists (the object was preprocessed before,
    e.g. because the cell was re-run), ``.X`` is first reset from that layer so
    the raw values are never overwritten with transformed ones and ``.X`` is
    not normalised twice.

    Parameters
    ----------
    adata
        AnnData object to process; it is modified in place.

    Returns
    -------
    The same ``adata`` object, for convenient reassignment.
    """
    if "counts" in adata.layers:
        # Second or later call: start again from the stashed raw matrix.
        adata.X = adata.layers["counts"].copy()
    else:
        adata.layers["counts"] = adata.X.copy()
    scanpy.pp.normalize_total(adata)
    scanpy.pp.log1p(adata)
    adata.obs_names_make_unique()
    return adata


# Run the same preprocessing over every subset, in the original order.
(
    adata_X,
    adata_Y,
    adata_autosomes,
    adata_XY,
    adata_auto_X,
    adata_auto_Y,
) = [
    preprocess(subset)
    for subset in (
        adata_X,
        adata_Y,
        adata_autosomes,
        adata_XY,
        adata_auto_X,
        adata_auto_Y,
    )
]

: 

In [235]:
# Keep only the highly variable features of every subset (each object is subset in place).
for _data in tqdm([adata_X, adata_Y, adata_autosomes, adata_XY, adata_auto_X, adata_auto_Y]):
    # flavor="seurat_v3" expects raw counts, but preprocess() has already
    # normalised and log-transformed .X, so point it at the raw values that
    # preprocess() stored in layers["counts"].
    scanpy.pp.highly_variable_genes(
        _data,
        flavor="seurat_v3",
        layer="counts",
        # batch_key="Experimental_Factor:_laboratory (exp)",
        subset=True,
    )


In [None]:
# Show the chrY subset summary after the highly-variable-feature selection above.
adata_Y

In [None]:
# Number of principal components to compute and to show in the variance plot.
n_pcs = 100

scanpy.tl.pca(adata_autosomes, n_comps=n_pcs)
scanpy.pl.pca_variance_ratio(adata_autosomes, n_pcs=n_pcs, log=True)
adata_autosomes

In [None]:
# First column of the 'PCs' matrix in .varm, i.e. one value per variable for
# the first component (presumably filled in by the scanpy.tl.pca call above).
adata_autosomes.varm['PCs'][:,0]