In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold
from pathlib import Path
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, auc
from sklearn.decomposition import PCA
import umap
import json
import cupy

from sklearn.neighbors import KNeighborsClassifier

from gtfparse import read_gtf

from tqdm import tqdm

# mlflow.set_tracking_uri(uri="http://localhost:8080")


# --- Directory layout (all paths are relative to the notebook's working directory) ---
fdir_raw = Path("..", "data", "raw")
fdir_processed = Path("..", "data", "interim")
fdir_traintest = Path("..", "data", "processed", "sex")
fdir_external = Path("..", "data", "external")
ml_models_fdir = Path("..", "models")

# --- Run options ---
use_CV = True

# Change the trailing index to switch option (same selector idiom as `sex` below).
model_type = ['catboost', 'xgboost', 'knn'][1]
feature_importance_method = ['native', 'SHAP'][1]

In [2]:
# Pick the external dataset and the HDF key by changing the trailing index.
organ = ["HEART", "BRAIN0", "BRAIN1"][0]

sex = ['chrXY', 'chrX', 'chrY', 'autosome'][0]
# sex = 'female'

In [3]:
def read_dataset(fname_data: Path | str,
                 fname_header: Path | str,
                 fname_gtf: Path | str,
                 separator=','):
    """Load a data table, its header table and a GTF annotation.

    Parameters
    ----------
    fname_data
        Delimited text file whose first column is used as the index. The table
        is transposed after reading and cast to ``float32``.
    fname_header
        Comma-separated table; its first column is used as the index.
    fname_gtf
        GTF file, parsed with ``read_gtf`` and converted to pandas.
    separator
        Field delimiter of ``fname_data`` only (the header is always read
        with ``','``).

    Returns
    -------
    tuple
        ``(data_raw, data_header, gtf_data)``. ``gtf_data`` is indexed by
        ``transcript_id``, also carries ``transcript_id`` as a regular column,
        and has rows with a repeated ``transcript_id`` removed.

    Side effects
    ------------
    Prints the shape of the transposed data table.
    """
    # Read, flip rows/columns, and downcast in one pass.
    expression = (
        pd.read_csv(fname_data, index_col=0, sep=separator)
        .T
        .astype(np.float32)
    )

    header = pd.read_csv(fname_header, index_col=0, sep=',')

    annotation = read_gtf(fname_gtf).to_pandas().set_index('transcript_id')
    # set_index removed the column; re-add it so it is available both as the
    # index and as a column (it is needed as a column for drop_duplicates).
    annotation['transcript_id'] = annotation.index
    annotation = annotation.drop_duplicates("transcript_id")

    print('Dataset shape: ', expression.shape)

    return expression, header, annotation

In [4]:
# Load the Geuvadis table, its SRA run table and the merged GTF.
data_raw, data_header, gtf_data = read_dataset(
    fname_data=fdir_raw / 'Geuvadis.all.csv',
    fname_header=fdir_raw / 'Geuvadis.SraRunTable.txt',
    fname_gtf=fdir_raw / 'all_transcripts_strigtie_merged.gtf',
)
data_raw

INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'exon_number', 'gene_name', 'ref_gene_id']


Dataset shape:  (667, 338725)


Unnamed: 0,ENST00000378604.3,ENST00000307786.8,ENST00000482402.1,ENST00000462293.1,ENST00000378602.3,ENST00000310991.8,ENST00000378598.4,ENST00000470931.2,ENST00000602604.1,ENST00000416272.1,...,ENST00000426199.1,ENST00000517139.1,MSTRG.36845.1,ENST00000619329.1,ENST00000258589.8,ENST00000420810.1,ENST00000431853.1,ENST00000711259.1,ENST00000711266.1,ENST00000711267.1
ERR188021,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
ERR188022,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.074617
ERR188023,0.000000,0.169178,0.114391,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.014311,0.0,0.000000,0.000000,0.000000,0.000000
ERR188024,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.219131,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
ERR188025,0.000000,0.107439,0.017849,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR204940,6.127458,0.000000,4.660298,0.0,4.823868,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.180813,0.0,59.465179,2.708632,3.095435,9.169821
ERR205020,0.134626,0.000000,0.000000,0.0,0.152967,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.769428,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
ERR205021,0.009035,0.000000,0.152969,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
ERR205022,0.031064,0.000000,0.192926,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000


In [5]:
organ = ["HEART", "BRAIN0", "BRAIN1"][1]

# Directory holding the external dataset for the selected organ.
eval_dir = fdir_external / organ / 'reg'

# BRAIN1 is matched as a "*.csv" file read with ","; the other organs are
# matched as "*TPM.txt" files read with a tab.
if organ == "BRAIN1":
    pattern, separator = "*.csv", ","
else:
    pattern, separator = "*TPM.txt", "\t"

# First matching file in the directory; only its bare name is kept in `fname`.
fname = next(eval_dir.glob(pattern)).name

# Pass the separator chosen above. It was previously hard-coded to "\t",
# which would mis-parse the comma-separated BRAIN1 file.
data_eval_raw, data_eval_header, _ = read_dataset(
    eval_dir / fname,
    eval_dir / 'SraRunTable.txt',
    fdir_raw / 'all_transcripts_strigtie_merged.gtf',
    separator=separator,
)

INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'exon_number', 'gene_name', 'ref_gene_id']


Dataset shape:  (34, 377460)


In [6]:
# Shapes of both tables plus the number of column labels they have in common.
shared_columns = data_raw.columns.intersection(data_eval_raw.columns)
data_raw.shape, data_eval_raw.shape, len(shared_columns)

((667, 338725), (34, 377460), 336380)

In [54]:
# HDF key to load from the preprocessed file; change the index to switch.
sex = ['chrXY', 'chrX', 'chrY', 'autosome'][3]

data = pd.read_hdf(fdir_traintest / 'geuvadis.preprocessed.sex.h5', key=sex)

# For each external dataset, print: number of columns in `data`, number of
# columns in the external table, and how many column labels they share.
for organ in ["HEART", "BRAIN0", "BRAIN1"]:
    fname = next((fdir_external / organ / 'reg').glob("*processed.h5")).name
    # `index_col` is a read_csv option, not a read_hdf one, so it is dropped here.
    data_eval = pd.read_hdf(fdir_external / organ / 'reg' / fname)
    print(organ)
    print(len(data.columns), len(data_eval.columns), len(data.columns.intersection(data_eval.columns)))

HEART
15354 102378 8441
BRAIN0
15354 150715 9408
BRAIN1
15354 146083 9162


In [10]:


# Read every external dataset once and keep only its column labels. The files
# do not depend on the HDF key, so re-reading them for each key is wasted I/O.
eval_columns = {}
for organ in ["HEART", "BRAIN0", "BRAIN1"]:
    fname = next((fdir_external / organ / 'reg').glob("*processed.h5")).name
    # `index_col` is a read_csv option, not a read_hdf one, so it is dropped here.
    data_eval = pd.read_hdf(fdir_external / organ / 'reg' / fname)
    eval_columns[organ] = data_eval.columns

# For each key of the preprocessed file, print how many of its column labels
# are also present in each external dataset.
for sex in ['chrXY', 'chrX', 'chrY', 'autosome']:
    data = pd.read_hdf(fdir_traintest / 'geuvadis.preprocessed.sex.h5', key=sex)
    print(sex)
    for organ, columns in eval_columns.items():
        print(organ, ": ", len(data.columns.intersection(columns)))

chrXY
HEART :  8668
BRAIN0 :  9703
BRAIN1 :  9429
chrX
HEART :  8660
BRAIN0 :  9671
BRAIN1 :  9400
chrY
HEART :  8441
BRAIN0 :  9460
BRAIN1 :  9175
autosome
HEART :  8441
BRAIN0 :  9448
BRAIN1 :  9162
