In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from pathlib import Path
from scipy.stats import pointbiserialr

from tqdm import tqdm
from gtfparse import read_gtf

In [11]:
# Data directories, all relative to the notebook's working directory.
# NOTE(review): `fdir_processed` points at "interim" and `fdir_traintest`
# points at "processed" -- the variable names and folder names do not line up.
fdir_raw = Path("..", "data", "raw")
fdir_processed = Path("..", "data", "interim")
fdir_traintest = Path("..", "data", "processed")

In [12]:
def read_geuvadis(fname_data: Path | str,
                  fname_header: Path | str,
                  fname_gtf: Path | str):
    """Load the expression table, the run table and the transcript annotation.

    Parameters
    ----------
    fname_data
        CSV file read with its first column as index; the table is then
        transposed and cast to ``float32``.
    fname_header
        CSV file read with its first column as index.
    fname_gtf
        GTF file parsed with ``read_gtf``.

    Returns
    -------
    tuple
        ``(data_raw, data_header, gtf_data)``. ``gtf_data`` is indexed by
        ``transcript_id``, also carries ``transcript_id`` as a regular column,
        and holds one row per transcript id.
    """
    expression = pd.read_csv(fname_data, index_col=0).T.astype(np.float32)
    header = pd.read_csv(fname_header, index_col=0)

    annotation = read_gtf(fname_gtf).to_pandas().set_index('transcript_id')
    # Keep the id reachable as a normal column in addition to the index.
    annotation['transcript_id'] = annotation.index
    # Collapse repeated ids to a single row each (pandas keeps the first one).
    annotation = annotation.drop_duplicates("transcript_id")

    print('Dataset shape: ', expression.shape)

    return expression, header, annotation

# Load the three input files from the raw-data directory: the expression
# table, the run table and the merged transcript GTF.
data_raw, data_header, gtf_data = read_geuvadis(
    fdir_raw / 'Geuvadis.all.csv',
    fdir_raw / 'Geuvadis.SraRunTable.txt',
    fdir_raw / 'all_transcripts_strigtie_merged.gtf'
)

INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'exon_number', 'gene_name', 'ref_gene_id']


Dataset shape:  (667, 338725)


In [13]:
def filter_zero_median(df: pd.DataFrame) -> pd.DataFrame:
    """Remove every column whose median equals zero.

    The resulting shape is printed. When no column has a zero median the
    input frame itself is returned (no copy is made).
    """
    is_zero_median = df.median() == 0

    if is_zero_median.any():
        df = df.drop(columns=df.columns[is_zero_median])

    print('Dataset shape: ', df.shape)
    return df

# Drop the columns of the raw matrix whose median is zero.
data = filter_zero_median(data_raw)

Dataset shape:  (667, 100253)


In [14]:
def filter_correlated(X: pd.DataFrame, y: pd.DataFrame | pd.Series, threshold=0.8) -> pd.DataFrame:
    """Drop the columns of ``X`` that correlate too strongly with the labels ``y``.

    Each column is compared with the integer-encoded labels using the
    point-biserial correlation; a column is removed when the absolute value of
    the coefficient is strictly greater than ``threshold``.

    Parameters
    ----------
    X
        Feature matrix; every column is tested independently.
    y
        Labels, positionally aligned with the rows of ``X``. They are encoded
        with ``LabelEncoder``.
        NOTE(review): point-biserial correlation is meant for a two-class
        variable -- confirm that ``y`` only ever has two distinct values.
    threshold
        Absolute correlation above which a column is dropped.

    Returns
    -------
    pd.DataFrame
        ``X`` without the dropped columns. The resulting shape is printed.

    Notes
    -----
    A column whose coefficient is NaN is kept, because ``NaN > threshold`` is
    ``False``.
    """
    # The encoded labels are the same for every column, so encode them once
    # here instead of once per column inside the loop.
    y_encoded = LabelEncoder().fit_transform(y.values)

    columns_to_drop = []
    for c in tqdm(X.columns):
        corr, _ = pointbiserialr(X[c], y_encoded)
        if np.abs(corr) > threshold:
            columns_to_drop.append(c)

    X = X.drop(columns=columns_to_drop)
    print('Dataset shape: ', X.shape)
    return X

# Remove the columns that are strongly correlated with the 'Sex' column of the
# run table; the labels are reordered to follow the rows of `data`.
data = filter_correlated(data, data_header['Sex'].loc[data.index])
data

100%|██████████| 100253/100253 [01:07<00:00, 1476.75it/s]


Dataset shape:  (667, 100247)


Unnamed: 0,ENST00000401095.9,ENST00000356575.9,ENST00000687047.2,ENST00000378344.7,MSTRG.239.2,ENST00000469733.5,MSTRG.239.15,ENST00000606372.5,ENST00000469374.5,ENST00000475812.1,...,ENST00000601841.1,MSTRG.36543.1,MSTRG.36550.1,ENST00000538162.3,ENST00000654222.1,MSTRG.36809.22,ENST00000465253.1,ENST00000485584.1,MSTRG.36838.1,ENST00000509776.2
ERR188021,0.158521,0.066814,0.132960,30.733223,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ERR188022,0.318309,0.158957,0.894535,24.547054,0.346369,0.043148,0.000000,0.000000,1.735198,0.107082,...,0.149824,0.056120,0.313784,0.000000,0.536379,1.771619,0.454383,1.544379,0.274368,0.085241
ERR188023,0.092998,0.066917,0.343150,23.050041,0.000000,0.000000,1.058649,0.000000,0.678814,0.008838,...,0.019751,0.229643,0.366073,0.144311,0.000000,0.009294,0.000000,0.000000,0.000000,0.083113
ERR188024,0.189705,0.381285,0.666632,23.621725,1.621334,0.000000,2.290493,0.640245,1.034789,0.031561,...,0.108731,1.429073,0.868414,0.047884,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ERR188025,0.033657,0.000000,0.033345,21.236263,0.000000,0.000000,0.066430,0.143671,0.109519,0.057051,...,0.014493,0.174317,0.000000,0.077541,0.000000,0.004546,0.000000,0.000000,0.000000,0.103260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR204940,4.187159,2.133955,5.693484,11.895696,0.000000,0.000000,0.000000,4.800893,4.180713,0.000000,...,8.419396,4.071667,0.000000,1.995198,0.000000,0.161466,0.000000,0.000000,0.087160,1.979575
ERR205020,0.150606,0.146151,0.081598,23.710396,0.000000,0.179394,0.000000,0.000000,0.468108,0.033033,...,0.062772,0.500722,0.696421,0.212891,0.153988,2.622310,0.095074,0.164385,0.240571,0.000000
ERR205021,0.127224,0.154483,0.000000,32.652328,0.499237,0.288096,0.000000,0.000000,0.726109,0.025708,...,0.000000,0.188987,0.000000,0.280794,0.039893,0.503987,0.253193,0.113689,0.197303,0.158687
ERR205022,0.127737,0.079789,0.127716,23.853119,0.473199,0.813438,0.000000,0.000000,1.180683,0.013071,...,0.025737,1.045413,0.691371,0.114437,0.000000,0.018198,0.000000,0.000000,0.000000,0.169997


In [15]:
def logarithmization(df: pd.DataFrame):
    """Return ``log2(df + 1)`` computed element-wise.

    Adding one before taking the logarithm maps a value of 0 to 0.
    """
    shifted = df + 1
    return np.log2(shifted)

# Apply the log2(x + 1) transform to the filtered matrix.
data = logarithmization(data)
data

Unnamed: 0,ENST00000401095.9,ENST00000356575.9,ENST00000687047.2,ENST00000378344.7,MSTRG.239.2,ENST00000469733.5,MSTRG.239.15,ENST00000606372.5,ENST00000469374.5,ENST00000475812.1,...,ENST00000601841.1,MSTRG.36543.1,MSTRG.36550.1,ENST00000538162.3,ENST00000654222.1,MSTRG.36809.22,ENST00000465253.1,ENST00000485584.1,MSTRG.36838.1,ENST00000509776.2
ERR188021,0.212284,0.093309,0.180097,4.987922,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ERR188022,0.398689,0.212827,0.921844,4.675085,0.429074,0.060944,0.000000,0.000000,1.451645,0.146762,...,0.201413,0.078774,0.393728,0.000000,0.619534,1.470729,0.540407,1.347314,0.349782,0.118015
ERR188023,0.128291,0.093448,0.425620,4.587967,0.000000,0.000000,1.041698,0.000000,0.747442,0.012695,...,0.028217,0.298239,0.450035,0.194479,0.000000,0.013347,0.000000,0.000000,0.000000,0.115184
ERR188024,0.250604,0.466011,0.736936,4.621860,1.390301,0.000000,1.718304,0.713911,1.024879,0.044829,...,0.148909,1.280406,0.901814,0.067479,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ERR188025,0.047757,0.000000,0.047322,4.474843,0.000000,0.000000,0.092789,0.193672,0.149934,0.080045,...,0.020759,0.231822,0.000000,0.107743,0.000000,0.006544,0.000000,0.000000,0.000000,0.141773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR204940,2.374945,1.647985,2.742757,3.688818,0.000000,0.000000,0.000000,2.536275,2.373151,0.000000,...,3.235635,2.342460,0.000000,1.582651,0.000000,0.215947,0.000000,0.000000,0.120564,1.575107
ERR205020,0.202394,0.196797,0.113164,4.627046,0.000000,0.238046,0.000000,0.000000,0.553958,0.046886,...,0.087832,0.585657,0.762494,0.278450,0.206628,1.856910,0.131028,0.219568,0.311004,0.000000
ERR205021,0.172774,0.207247,0.000000,5.072634,0.584229,0.365240,0.000000,0.000000,0.787524,0.036620,...,0.000000,0.249733,0.000000,0.357038,0.056435,0.588792,0.325609,0.155346,0.259788,0.212491
ERR205022,0.173431,0.110749,0.173404,4.635355,0.558952,0.858727,0.000000,0.000000,1.124780,0.018735,...,0.036661,1.032392,0.758193,0.156315,0.000000,0.026018,0.000000,0.000000,0.000000,0.226505


In [16]:
def filter_cv_threshold(df: pd.DataFrame, threshold: float):
    """Remove columns whose coefficient of variation is below ``threshold``.

    The coefficient of variation of a column is its standard deviation divided
    by its mean. Columns with a value strictly below ``threshold`` are dropped;
    if there are none, the input frame itself is returned. The resulting shape
    is printed.
    """
    coefficient_of_variation = df.std() / df.mean()
    below_threshold = coefficient_of_variation.loc[coefficient_of_variation < threshold].index

    if not below_threshold.empty:
        df = df.drop(columns=below_threshold)

    print('Dataset shape: ', df.shape)
    return df
# Drop the columns whose coefficient of variation is below 0.7.
data = filter_cv_threshold(data, 0.7)
data

Dataset shape:  (667, 63084)


Unnamed: 0,ENST00000401095.9,ENST00000356575.9,ENST00000687047.2,MSTRG.239.2,ENST00000469733.5,MSTRG.239.15,ENST00000606372.5,ENST00000469374.5,ENST00000475812.1,ENST00000378518.5,...,ENST00000601841.1,MSTRG.36543.1,MSTRG.36550.1,ENST00000538162.3,ENST00000654222.1,MSTRG.36809.22,ENST00000465253.1,ENST00000485584.1,MSTRG.36838.1,ENST00000509776.2
ERR188021,0.212284,0.093309,0.180097,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ERR188022,0.398689,0.212827,0.921844,0.429074,0.060944,0.000000,0.000000,1.451645,0.146762,0.000000,...,0.201413,0.078774,0.393728,0.000000,0.619534,1.470729,0.540407,1.347314,0.349782,0.118015
ERR188023,0.128291,0.093448,0.425620,0.000000,0.000000,1.041698,0.000000,0.747442,0.012695,1.233444,...,0.028217,0.298239,0.450035,0.194479,0.000000,0.013347,0.000000,0.000000,0.000000,0.115184
ERR188024,0.250604,0.466011,0.736936,1.390301,0.000000,1.718304,0.713911,1.024879,0.044829,0.154032,...,0.148909,1.280406,0.901814,0.067479,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ERR188025,0.047757,0.000000,0.047322,0.000000,0.000000,0.092789,0.193672,0.149934,0.080045,0.986891,...,0.020759,0.231822,0.000000,0.107743,0.000000,0.006544,0.000000,0.000000,0.000000,0.141773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR204940,2.374945,1.647985,2.742757,0.000000,0.000000,0.000000,2.536275,2.373151,0.000000,0.000000,...,3.235635,2.342460,0.000000,1.582651,0.000000,0.215947,0.000000,0.000000,0.120564,1.575107
ERR205020,0.202394,0.196797,0.113164,0.000000,0.238046,0.000000,0.000000,0.553958,0.046886,0.000000,...,0.087832,0.585657,0.762494,0.278450,0.206628,1.856910,0.131028,0.219568,0.311004,0.000000
ERR205021,0.172774,0.207247,0.000000,0.584229,0.365240,0.000000,0.000000,0.787524,0.036620,0.000000,...,0.000000,0.249733,0.000000,0.357038,0.056435,0.588792,0.325609,0.155346,0.259788,0.212491
ERR205022,0.173431,0.110749,0.173404,0.558952,0.858727,0.000000,0.000000,1.124780,0.018735,0.000000,...,0.036661,1.032392,0.758193,0.156315,0.000000,0.026018,0.000000,0.000000,0.000000,0.226505


In [17]:
def filter_median_q34(data: pd.DataFrame):
    """Keep the columns whose mean is above the median of all column means.

    Despite the name, the per-column statistic is the mean; the median is
    taken over those means and used as the cut-off (strictly greater than).
    The resulting shape is printed.
    """
    column_means = data.mean(axis=0)
    above_cutoff = column_means > column_means.median()
    data = data.loc[:, above_cutoff]
    print('Dataset shape: ', data.shape)
    return data

def filter_cv_q34(data: pd.DataFrame):
    """Keep the columns whose coefficient of variation is above the median one.

    The coefficient of variation of a column is its standard deviation divided
    by its mean; columns with a value strictly greater than the median of all
    coefficients are retained. The resulting shape is printed.
    """
    coefficient_of_variation = data.std() / data.mean()
    above_cutoff = coefficient_of_variation > coefficient_of_variation.median()
    data = data.loc[:, above_cutoff]
    print('Dataset shape: ', data.shape)
    return data

# First keep the columns whose mean is above the median of column means, then,
# among those, keep the columns whose coefficient of variation is above the
# median coefficient of variation.
data = filter_median_q34(data)
data = filter_cv_q34(data)
data

Dataset shape:  (667, 31542)
Dataset shape:  (667, 15771)


Unnamed: 0,MSTRG.239.15,MSTRG.249.12,MSTRG.250.13,ENST00000432521.2,MSTRG.259.5,ENST00000435221.6,MSTRG.259.7,MSTRG.259.9,ENST00000378567.8,MSTRG.237.18,...,MSTRG.30211.1,MSTRG.30408.1,MSTRG.30433.1,ENST00000363945.1,MSTRG.32721.13,ENST00000358022.6,MSTRG.36284.1,MSTRG.36809.22,ENST00000485584.1,MSTRG.36838.1
ERR188021,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.717116,0.421942,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ERR188022,0.000000,0.000000,0.799756,1.642583,1.559544,1.346990,1.463835,0.468743,0.000000,0.000000,...,0.278405,0.000000,2.210809,0.373931,0.000000,0.320373,0.382722,1.470729,1.347314,0.349782
ERR188023,1.041698,0.172106,0.181875,0.341884,0.690392,0.000000,0.000000,0.000000,2.055131,0.000000,...,0.982559,0.148974,0.981267,0.794413,0.305703,0.153957,0.728266,0.013347,0.000000,0.000000
ERR188024,1.718304,2.636580,0.000000,2.412521,2.015744,2.011945,1.641659,2.708559,1.183267,0.000000,...,0.764759,0.645045,1.971979,1.484448,0.811534,0.737685,0.000000,0.000000,0.000000,0.000000
ERR188025,0.092789,0.325591,0.000000,0.198449,0.000000,0.310230,1.547893,1.979350,0.000000,1.278243,...,0.488217,0.147106,0.810465,0.543106,0.000000,0.233628,0.000000,0.006544,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERR204940,0.000000,0.000000,2.407727,0.000000,0.000000,0.000000,0.000000,0.000000,1.551798,0.000000,...,3.289873,3.606890,3.795911,0.000000,0.357103,2.424006,3.423396,0.215947,0.000000,0.120564
ERR205020,0.000000,0.000000,0.789096,0.000000,0.000000,1.115314,2.104318,1.862446,0.000000,0.743505,...,0.400225,0.778359,1.510771,0.000000,0.000000,1.163828,0.473227,1.856910,0.219568,0.311004
ERR205021,0.000000,2.766302,0.238701,0.000000,2.002541,2.427571,4.303934,1.500727,3.062893,0.757271,...,0.000000,0.000000,2.417925,0.000000,0.000000,0.000000,0.000000,0.588792,0.155346,0.259788
ERR205022,0.000000,2.351532,1.025341,1.407937,0.000000,0.000000,0.000000,0.000000,2.688904,0.000000,...,1.020148,0.748980,1.716177,0.000000,0.324686,0.319735,0.000000,0.026018,0.000000,0.000000


In [18]:
def _select_outside_par(transcripts: pd.DataFrame, par1, par2) -> pd.DataFrame:
    """Return the rows of ``transcripts`` lying entirely outside both PAR intervals.

    ``par1`` and ``par2`` are ``(start, end)`` pairs with ``par1`` located
    before ``par2`` on the chromosome.
    """
    before_par1 = transcripts['end'] < par1[0]
    between_pars = (transcripts['start'] > par1[1]) & (transcripts['end'] < par2[0])
    after_par2 = transcripts['start'] > par2[1]
    return transcripts.loc[before_par1 | between_pars | after_par2]


def locate_sex_transcripts(gtf_data: pd.DataFrame) -> tuple[np.ndarray, np.ndarray]:
    """Find chrX and chrY transcripts located outside the pseudoautosomal regions.

    Parameters
    ----------
    gtf_data
        Annotation table with the columns ``seqname``, ``start``, ``end`` and
        ``transcript_id``.

    Returns
    -------
    tuple
        ``(x_ids, y_ids)``: the unique ``transcript_id`` values of the chrX and
        chrY rows that do not touch either pseudoautosomal region. Each element
        is whatever ``Series.unique()`` returns (a NumPy array for ordinary
        dtypes), not a ``pd.Series``.
    """
    # Pseudoautosomal region coordinates, taken from
    # https://www.ensembl.org/info/genome/genebuild/human_PARS.html
    # NOTE(review): presumably GRCh38 coordinates -- confirm that they match
    # the assembly the GTF was built on.
    par_x1 = (10001, 2781479)
    par_x2 = (155701383, 156030895)
    par_y1 = (10001, 2781479)
    par_y2 = (56887903, 57217415)

    rows_x = gtf_data.loc[gtf_data['seqname'] == 'chrX']
    rows_y = gtf_data.loc[gtf_data['seqname'] == 'chrY']

    true_transcripts_x = _select_outside_par(rows_x, par_x1, par_x2)
    true_transcripts_y = _select_outside_par(rows_y, par_y1, par_y2)

    return (true_transcripts_x['transcript_id'].unique(),
            true_transcripts_y['transcript_id'].unique())


# Ids of the chrX / chrY transcripts that lie outside the pseudoautosomal regions.
transcripts_x, transcripts_y = locate_sex_transcripts(gtf_data)

transcripts_x = transcripts_x.tolist()
transcripts_y = transcripts_y.tolist()

# Restrict both id lists to the transcripts that are still columns of `data`.
# From here on the two names are rebound to the result of Index.intersection.
transcripts_x = data.columns.intersection(transcripts_x)
transcripts_y = data.columns.intersection(transcripts_y)

# Expression of the sex-chromosome transcripts alone.
# NOTE(review): the second intersection repeats the one done just above, and
# these two frames are not used again in the visible part of the notebook.
data_X_only = data[data.columns.intersection(transcripts_x)]
data_Y_only = data[data.columns.intersection(transcripts_y)]

# Annotation rows of the surviving columns. "Autosomes" here means every
# seqname other than chrX and chrY.
gtf_transcripts = gtf_data.loc[data.columns]
transcripts_autosomes = gtf_transcripts.loc[(gtf_transcripts['seqname'] != "chrX") & (gtf_transcripts['seqname'] != "chrY")].index


# Four views of the same matrix: all columns, autosomes + chrX, autosomes +
# chrY, and autosomes only.
# NOTE(review): chrX / chrY transcripts inside the pseudoautosomal regions are
# in neither `transcripts_x`, `transcripts_y` nor `transcripts_autosomes`, so
# they appear only in `data_XY` -- confirm this is intended.
data_XY = data
data_X = data[transcripts_x.union(transcripts_autosomes)]
data_Y = data[transcripts_y.union(transcripts_autosomes)]
data_autosomes = data[transcripts_autosomes]


print('dataXY shape: ', data_XY.shape)
print('dataX shape: ', data_X.shape)
print('dataY shape: ', data_Y.shape)
print('data_autosome shape: ', data_autosomes.shape)

# print(transcripts_x)
# print(transcripts_y)
# print(transcripts_autosomes)
# data_Y_only

dataXY shape:  (667, 15771)
dataX shape:  (667, 15725)
dataY shape:  (667, 15368)
data_autosome shape:  (667, 15354)


In [20]:
# dataXY shape:  (667, 15772)
# dataX shape:  (667, 15720)
# dataY shape:  (667, 15369)
# data_autosome shape:  (667, 15349)


In [None]:
# 12:21:56.237 | INFO    | Task run 'remove_sex_transcripts-0' - dataXY shape:  (667, 15771)
# 12:21:56.237 | INFO    | Task run 'remove_sex_transcripts-0' - dataX shape:  (667, 15725)
# 12:21:56.237 | INFO    | Task run 'remove_sex_transcripts-0' - dataY shape:  (667, 15368)
# 12:21:56.238 | INFO    | Task run 'remove_sex_transcripts-0' - data_autosome shape:  (667, 15354)