In [33]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, KFold
from pathlib import Path
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, auc
from sklearn.decomposition import PCA
import umap
import json

import scanpy
import anndata

from tqdm import tqdm



In [34]:
# --- Configuration -----------------------------------------------------------
# Directory layout, all relative to the notebook location.
fdir_raw = Path("../data/raw/")
fdir_processed = Path("../data/interim")
fdir_traintest = Path("../data/processed") / 'sex'
fdir_external = Path("../data/external")
ml_models_fdir = Path("../models")

use_CV = True

# Exactly one model type is active; the alternative stays commented out so the
# selection is explicit (previously 'catboost' was assigned and then
# immediately overwritten by 'xgboost').
# model_type = 'catboost'
model_type = 'xgboost'

# The trailing index picks the active entry from the list of options.
organ = ["BLOOD1", 'BRAIN0', "HEART", "BRAIN1", 'None'][2]

# feature_importance_method = 'native'
feature_importance_method = 'SHAP'

value_to_predict = 'Sex'

# The trailing index picks the active entry from the list of options.
sex = ['chrXY', 'chrX', 'chrY', 'autosome'][2]


In [38]:
# All three tables live in the same preprocessed HDF5 file under different keys.
geuvadis_h5 = fdir_processed / 'geuvadis.preprocessed.h5'
data_gtf = pd.read_hdf(geuvadis_h5, key='gtf')
data_header = pd.read_hdf(geuvadis_h5, key='header')
data_geuvadis = pd.read_hdf(geuvadis_h5, key='geuvadis')

# Wrap the matrix in an AnnData object: `var` rows are selected by the matrix
# columns and `obs` rows by the matrix index, so both annotations follow the
# ordering of `data_geuvadis`.
adata_geuvadis = anndata.AnnData(
    X=data_geuvadis,
    var=data_gtf.loc[data_geuvadis.columns],
    obs=data_header.loc[data_geuvadis.index],
)
adata_geuvadis

AnnData object with n_obs × n_vars = 667 × 15771
    obs: 'Assay Type', 'AvgSpotLen', 'Bases', 'BioProject', 'BioSample', 'Broker_name', 'Bytes', 'Center Name', 'common_name', 'Consent', 'DATASTORE filetype', 'DATASTORE provider', 'DATASTORE region', 'ENA-FIRST-PUBLIC (run)', 'ena_first_public', 'ENA-LAST-UPDATE (run)', 'ENA_last_update', 'Experiment', 'Experimental_Factor:_laboratory (exp)', 'Experimental_Factor:_population (exp)', 'external_id', 'INSDC_center_alias', 'INSDC_center_name', 'INSDC_first_public', 'INSDC_last_update', 'INSDC_status', 'Instrument', 'Library Name', 'LibraryLayout', 'LibrarySelection', 'LibrarySource', 'Organism', 'Platform', 'population', 'ReleaseDate', 'create_date', 'version', 'Sample Name', 'Sample_name', 'SRA Study', 'strain', 'Submitter_Id', 'Sample', 'Sex'
    var: 'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'gene_id', 'exon_number', 'gene_name', 'ref_gene_id', 'transcript_id'

In [36]:
# The feature table is stored per model type, named after the selected
# `sex` and `organ` settings; its first CSV column becomes the index.
features_fname = f"geuvadis_features_{sex}_calibration_{organ}.csv"
features_path = ml_models_fdir / model_type / features_fname
features_list = pd.read_csv(features_path, index_col=0)
features_list.index

Index(['MSTRG.22335.16', 'MSTRG.22400.11', 'MSTRG.2239.5', 'MSTRG.22356.1',
       'MSTRG.22352.2', 'MSTRG.22336.2', 'MSTRG.22336.1', 'MSTRG.22353.3',
       'MSTRG.22336.3', 'MSTRG.22436.3',
       ...
       'MSTRG.30407.18', 'MSTRG.30237.5', 'MSTRG.30397.15', 'MSTRG.2897.25',
       'MSTRG.30318.2', 'MSTRG.30310.5', 'MSTRG.30309.12', 'MSTRG.303.49',
       'MSTRG.303.37', 'MSTRG.2906.7'],
      dtype='object', length=8440)

In [42]:
# Load the autosome table here, before its first use, so this cell also works
# on a fresh kernel (Restart & Run All) instead of relying on leftover state.
data_autosome = pd.read_hdf(fdir_traintest / 'geuvadis.preprocessed.sex.h5', key='autosome')

# Features from the saved feature list that are also columns of the autosome table.
features_list.index.intersection(data_autosome.columns)

Index(['MSTRG.22335.16', 'MSTRG.22400.11', 'MSTRG.2239.5', 'MSTRG.22356.1',
       'MSTRG.22352.2', 'MSTRG.22336.2', 'MSTRG.22336.1', 'MSTRG.22353.3',
       'MSTRG.22336.3', 'MSTRG.22436.3',
       ...
       'MSTRG.30407.18', 'MSTRG.30237.5', 'MSTRG.30397.15', 'MSTRG.2897.25',
       'MSTRG.30318.2', 'MSTRG.30310.5', 'MSTRG.30309.12', 'MSTRG.303.49',
       'MSTRG.303.37', 'MSTRG.2906.7'],
      dtype='object', length=8440)

In [47]:
# Both tables are stored in the same HDF5 file under different keys.
sex_h5 = fdir_traintest / 'geuvadis.preprocessed.sex.h5'
data_autosome = pd.read_hdf(sex_h5, key='autosome')
data_chrY = pd.read_hdf(sex_h5, key='chrY')

# Columns present in the chrY table but absent from the autosome table.
chrY_columns = data_chrY.columns
true_chrY_transcripts = chrY_columns.difference(data_autosome.columns)
true_chrY_transcripts

Index(['ENST00000471409.1', 'ENST00000485154.1', 'ENST00000485584.1',
       'ENST00000495478.1', 'MSTRG.36713.13', 'MSTRG.36720.1', 'MSTRG.36778.7',
       'MSTRG.36782.10', 'MSTRG.36782.3', 'MSTRG.36785.2', 'MSTRG.36786.4',
       'MSTRG.36809.22', 'MSTRG.36833.1', 'MSTRG.36838.1'],
      dtype='object')

In [50]:
# Locate the preprocessed HDF5 table for the selected organ.
# Sorting makes the choice deterministic if several files match, and an
# explicit error replaces the bare StopIteration that `next()` raised when
# nothing matched.
organ_reg_dir = fdir_external / organ / 'reg'
processed_files = sorted(organ_reg_dir.glob("*processed.h5"))
if not processed_files:
    raise FileNotFoundError(f"No '*processed.h5' file found in {organ_reg_dir}")

# `fname` stays a plain file name (str), as it was before.
fname = processed_files[0].name

# `index_col` is a `read_csv` option, not a `read_hdf` one (extra keyword
# arguments are only forwarded to the HDF store), so it is dropped here.
# No `key` is given, so the file is expected to hold a single pandas object.
data_heart = pd.read_hdf(organ_reg_dir / fname)

# Which of the chrY-only transcripts are present in the external table?
data_heart.columns.intersection(true_chrY_transcripts)

Index([], dtype='object')

In [51]:
# Columns shared between the external organ table and the table loaded under the 'chrY' key.
data_heart.columns.intersection(data_chrY.columns)

Index(['ENST00000005260.9', 'ENST00000008180.13', 'ENST00000026218.9',
       'ENST00000039989.9', 'ENST00000040663.8', 'ENST00000064778.8',
       'ENST00000066544.8', 'ENST00000166534.8', 'ENST00000170168.9',
       'ENST00000184956.11',
       ...
       'MSTRG.9922.19', 'MSTRG.9924.24', 'MSTRG.9925.19', 'MSTRG.9929.3',
       'MSTRG.9947.1', 'MSTRG.9947.5', 'MSTRG.9948.7', 'MSTRG.9968.1',
       'MSTRG.9981.8', 'MSTRG.9995.3'],
      dtype='object', length=8441)

In [52]:
# Columns shared between the external organ table and the table loaded under the 'autosome' key.
data_heart.columns.intersection(data_autosome.columns)

Index(['ENST00000005260.9', 'ENST00000008180.13', 'ENST00000026218.9',
       'ENST00000039989.9', 'ENST00000040663.8', 'ENST00000064778.8',
       'ENST00000066544.8', 'ENST00000166534.8', 'ENST00000170168.9',
       'ENST00000184956.11',
       ...
       'MSTRG.9922.19', 'MSTRG.9924.24', 'MSTRG.9925.19', 'MSTRG.9929.3',
       'MSTRG.9947.1', 'MSTRG.9947.5', 'MSTRG.9948.7', 'MSTRG.9968.1',
       'MSTRG.9981.8', 'MSTRG.9995.3'],
      dtype='object', length=8441)

In [53]:
# Display the full column index of the autosome table for reference.
data_autosome.columns

Index(['MSTRG.239.15', 'MSTRG.249.12', 'MSTRG.250.13', 'ENST00000432521.2',
       'MSTRG.259.5', 'ENST00000435221.6', 'MSTRG.259.7', 'MSTRG.259.9',
       'ENST00000378567.8', 'MSTRG.237.18',
       ...
       'ENST00000509155.1', 'ENST00000513224.1', 'MSTRG.27144.1',
       'MSTRG.29150.1', 'MSTRG.30211.1', 'MSTRG.30408.1', 'MSTRG.30433.1',
       'ENST00000363945.1', 'MSTRG.32721.13', 'ENST00000358022.6'],
      dtype='object', length=15354)