In [None]:
import pandas as pd
import numpy as np
from gtfparse import read_gtf
import pyranges as pr 

from sklearn.preprocessing import LabelEncoder
from pathlib import Path
from scipy.stats import pointbiserialr, pearsonr, spearmanr

from tqdm import tqdm
from gtfparse import read_gtf
from prefect import flow, task
import anndata as ad
from loguru import logger

from config import FDIR_EXTERNAL, FDIR_RAW, FDIR_PROCESSED, FDIR_INTEMEDIATE

In [None]:
# Load the raw AnnData objects of both cohorts from the intermediate folder.
adata_geuvadis, adata_heart = (
    ad.read_h5ad(FDIR_INTEMEDIATE / f"{cohort}.raw.h5ad")
    for cohort in ("GEUVADIS", "HEART")
)

In [None]:
# Distinct labels of the GEUVADIS variable index (var_names is an alias of var.index).
adata_geuvadis.var_names.unique()

In [None]:
# Distinct 'seqname' values annotated on the HEART variables, shown as a plain list.
heart_seqnames = adata_heart.var['seqname'].unique()
list(heart_seqnames)

In [None]:
def read_dataset(
    fname_data: Path | str, fname_header: Path | str, separator=","
):
    data_raw = pd.read_csv(fname_data, index_col=0, sep=separator).T
    data_raw = data_raw.astype(np.float32)
    
    # print(data_raw)

    samples_annot = pd.read_csv(fname_header, index_col=0, sep=",")

    columns = data_raw.columns
    indices = data_raw.index.intersection(samples_annot.index)

    data_raw = data_raw.loc[indices, columns]
    samples_annot = samples_annot.loc[indices]

    adata = ad.AnnData(X=data_raw, obs=samples_annot)

    return adata

In [None]:
# Load the HEART "reg" TPM table together with its SRA run table.
heart_reg_dir = FDIR_EXTERNAL / "HEART" / "reg"

# Sort the matches so the chosen file is deterministic if several fit the pattern.
tpm_matches = sorted(heart_reg_dir.glob("*TPM.txt"))
if not tpm_matches:
    raise FileNotFoundError(f"No '*TPM.txt' file found in {heart_reg_dir}")

# glob() already yields paths that include `heart_reg_dir`; joining the
# directory onto the match again would duplicate the prefix for relative roots.
fname = tpm_matches[0]
adata_heart_raw = read_dataset(
    fname,
    heart_reg_dir / "SraRunTable.txt",
    separator="\t",
)

In [None]:
# NEW DATA
# Store every GTF file of the HEART_GTF folder as one table (one key per file)
# inside a shared HDF5 file.

heart_gtf_dir = FDIR_EXTERNAL / "HEART" / "HEART_GTF"
cleaned_gtfs_path = heart_gtf_dir / 'cleaned_gtfs.h5'
PATIENT_ID_LENGTH = 11  # leading characters of the file stem that are used as the HDF key

# Materialise and sort the matches so tqdm knows the total and the order is reproducible.
for fname in tqdm(sorted(heart_gtf_dir.glob("*.gtf"))):
    # NOTE(review): files sharing the same first 11 stem characters would overwrite each other's key.
    patient_id = fname.stem[:PATIENT_ID_LENGTH]

    gtf_rawdata = read_gtf(fname)
    gtf_data = gtf_rawdata.to_pandas()
    # Keep only the rows whose TPM field is not an empty string.
    gtf_data = gtf_data.loc[gtf_data['TPM'] != ""]

    gtf_data.to_hdf(cleaned_gtfs_path, key=patient_id, format='table')


In [None]:
# Collect the keys (one per stored table) of the cleaned-GTF HDF5 file.
N_KEYS_PREVIEW = 20  # number of leading keys kept for the cells below

# mode="r": the store is only inspected here. The default mode "a" opens it
# for writing and silently creates an empty file when the path does not exist.
with pd.HDFStore(FDIR_EXTERNAL / "HEART"/ "HEART_GTF"/'cleaned_gtfs.h5', mode="r") as hdf:
    keys = hdf.keys()
keys = keys[:N_KEYS_PREVIEW]

In [None]:
# TPM column of every selected table, keyed by its HDF key.
TPMs = {}

for key in keys:
    data = pd.read_hdf(FDIR_EXTERNAL / "HEART"/ "HEART_GTF"/'cleaned_gtfs.h5', key=key)
    data['TPM'] = data['TPM'].astype(float)
    # Without this assignment the dict stays empty and the loop has no lasting effect.
    # NOTE(review): storing the numeric TPM column is the assumed intent — confirm.
    TPMs[key] = data['TPM']


In [None]:
# Load the table stored under the second key and convert its TPM column to float.
data = (
    pd.read_hdf(FDIR_EXTERNAL / "HEART"/ "HEART_GTF"/'cleaned_gtfs.h5', key=keys[1])
    .assign(TPM=lambda table: table['TPM'].astype(float))
)
data


In [None]:
# Number of rows vs. number of distinct transcript IDs in the loaded table.
transcript_ids = data['transcript_id']
len(transcript_ids), len(transcript_ids.unique())

In [None]:
# Inspect the "start" column of the loaded table.
data.loc[:, "start"]

In [None]:
# Merged HEART TPM table, transposed after loading and cast to float32.
# It is read from the project data folder instead of a user-specific absolute
# path so the notebook also runs on other machines; the file is space-separated.
# NOTE(review): make sure merged_heart_tpm.txt is present in HEART/HEART_GTF.
fname_merged_tpm = FDIR_EXTERNAL / "HEART" / "HEART_GTF" / "merged_heart_tpm.txt"
adata_heart_new = pd.read_csv(fname_merged_tpm, sep=" ", index_col=0).T
adata_heart_new = adata_heart_new.astype(np.float32)
adata_heart_new

In [None]:
# Parse the merged GTF and index it by transcript ID, keeping the ID
# available as a regular (last) column as well.
gtf_rawdata = read_gtf(FDIR_RAW / "all_transcripts_strigtie_merged.gtf")
gtf_data = (
    gtf_rawdata.to_pandas()
    .set_index("transcript_id")
    .assign(transcript_id=lambda annot: annot.index)
)

gtf_data

In [None]:
# Keep only the columns of the TPM table whose labels also occur in the GTF index.
columns = adata_heart_new.columns.intersection(gtf_data.index)
adata_heart_new_ = adata_heart_new.loc[:, columns]

In [None]:
# Display the table produced by selecting `columns` from adata_heart_new.
adata_heart_new_

In [None]:
# Labels among `columns` whose GTF rows have seqname 'chrY'.
# The comparison is made on the `columns` subset, so the mask must be applied
# to that same subset; applying it to the full gtf_data.index mismatches in
# length and order.
seqname_subset = gtf_data.loc[columns, 'seqname']
# unique(): a label can match several GTF rows, report each label once.
seqname_subset[seqname_subset == 'chrY'].index.unique()

In [None]:
# adata_heart_new.loc[gtf_data.index[gtf_data.loc[columns, 'seqname'] == 'chrY']]
# NOTE(review): the chrY row selection above is disabled; the unfiltered table is displayed instead.
adata_heart_new