# Parsing dataset

## Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from collections import OrderedDict
import os

In [None]:
class Data():
    '''
    Container for an RNA expression matrix, its samples metadata and supplementary data.

    self.exp_mat: (pd.DataFrame) RNA expression matrix, genes x samples
    self.meta: (pd.DataFrame) metadata, samples x features
    self.supp: (OrderedDict) supplementary data
    '''

    def __init__(self, exp_mat: "pd.DataFrame | None" = None, meta: "pd.DataFrame | None" = None, supp: "OrderedDict | None" = None):
        '''
        exp_mat: (pd.DataFrame) RNA expression matrix, genes x samples; an empty DataFrame when omitted
        meta: (pd.DataFrame) metadata, samples x features; an empty DataFrame when omitted
        supp: (OrderedDict) supplementary data; an empty OrderedDict when omitted
        '''
        # `value or default` cannot be used with DataFrames: their truth value is
        # ambiguous and evaluating it raises ValueError, so test against None explicitly.
        self.exp_mat = pd.DataFrame() if exp_mat is None else exp_mat
        self.meta = pd.DataFrame() if meta is None else meta
        self.supp = OrderedDict() if supp is None else supp

    def update_exp_mat(self, exp_mat: pd.DataFrame):
        '''
        Update the instance self.exp_mat

        exp_mat: (pd.DataFrame) RNA expression matrix, genes x samples
        '''
        self.exp_mat = exp_mat

    def update_meta(self, meta: pd.DataFrame):
        '''
        Update the instance self.meta

        meta: (pd.DataFrame) metadata, samples x features
        '''
        self.meta = meta

    def add_exp_mat(self, path_exp_mat: str, **kwargs):
        '''
        Read the RNA expression matrix and store it in self.exp_mat

        path_exp_mat: (str) path of the RNA expression matrix file
        kwargs: forwarded to pd.read_csv; `sep` defaults to a tab and `on_bad_lines` to 'warn'

        Raises ValueError if the file does not exist.
        '''

        if not os.path.exists(path_exp_mat):
            raise ValueError(f"File {path_exp_mat} does not exist")

        # default values for kwargs (values given by the caller take precedence)
        kwargs.setdefault('sep', '\t')
        kwargs.setdefault('on_bad_lines', 'warn')

        self.exp_mat = pd.read_csv(path_exp_mat, **kwargs)

    def add_meta(self, path_meta: str, **kwargs):
        '''
        Read the metadata and store it in self.meta

        path_meta: (str) path of the metadata file
        kwargs: forwarded unchanged to pd.read_csv

        Raises ValueError if the file does not exist.
        '''

        if not os.path.exists(path_meta):
            raise ValueError(f"File {path_meta} does not exist")

        self.meta = pd.read_csv(path_meta, **kwargs)

    def add_genes_name(self, genes_name: "list | None" = None, exp_mat: "pd.DataFrame | None" = None, inplace: bool = True) -> "pd.DataFrame | None":
        '''
        Add genes name annotation as row names of the RNA expression matrix (genes x samples)

        genes_name: (list) list of unique genes names, one per row of the matrix (None means empty)
        exp_mat: (pd.DataFrame) RNA expression matrix, genes x samples; self.exp_mat is used when omitted
        inplace: (bool) if True the matrix is re-indexed and stored in self.exp_mat (None is returned),
                 otherwise a re-indexed copy is returned and neither the input nor self.exp_mat is modified

        Raises ValueError if no matrix is available, or if the names are duplicated or their
        number differs from the number of rows.
        '''

        if self.exp_mat is None and exp_mat is None:
            raise ValueError("Provide RNA expression matrix (genes x samples)")

        # None replaces the former mutable `[]` default
        genes_name = [] if genes_name is None else genes_name

        if exp_mat is None:
            exp_mat = self.exp_mat

        if len(genes_name) != exp_mat.shape[0] or len(genes_name) != len(set(genes_name)):
            raise ValueError("The genes names list mismatching with the expression matrix rows")

        if inplace:
            # add genes name list as rownames
            exp_mat.index = genes_name
            self.update_exp_mat(exp_mat)
            return None

        # shallow copy: the values are shared, but assigning a new index to the copy
        # leaves the caller's frame untouched
        renamed = exp_mat.copy(deep=False)
        renamed.index = genes_name
        return renamed

    def add_samples_name(self, samples_name: "list | None" = None, meta: "pd.DataFrame | None" = None, inplace: bool = True) -> "pd.DataFrame | None":
        '''
        Add samples name annotation as row names of the metadata (samples x features)

        samples_name: (list) list of unique samples names, one per row of the metadata (None means empty)
        meta: (pd.DataFrame) metadata, samples x features; self.meta is used when omitted
        inplace: (bool) if True the metadata is re-indexed and stored in self.meta (None is returned),
                 otherwise a re-indexed copy is returned and neither the input nor self.meta is modified

        Raises ValueError if no metadata is available, or if the names are duplicated or their
        number differs from the number of rows.
        '''

        if self.meta is None and meta is None:
            raise ValueError("Provide metadata (samples x features)")

        # None replaces the former mutable `[]` default
        samples_name = [] if samples_name is None else samples_name

        if meta is None:
            meta = self.meta

        if len(samples_name) != meta.shape[0] or len(samples_name) != len(set(samples_name)):
            raise ValueError("The samples names list mismatching with the  metadata rows")

        if inplace:
            # add samples list as metadata rownames
            meta.index = samples_name
            self.update_meta(meta)
            return None

        # shallow copy: see add_genes_name
        renamed = meta.copy(deep=False)
        renamed.index = samples_name
        return renamed

    @staticmethod
    def search_duplicates(values: list) -> set:
        '''
        Search for duplicate values in a list of values

        values: (list) list of hashable values

        Returns the set of values occurring more than once.
        '''

        # single pass with a "seen" set instead of calling list.count for every element (O(n^2))
        seen = set()
        duplicated = set()
        for value in values:
            if value in seen:
                duplicated.add(value)
            else:
                seen.add(value)
        return duplicated

    def add_supp(self, key: str, value, inplace=True):
        '''
        Add supplementary data in OrderedDict

        key: (str) key to OrderedDict
        value: (any) data/variables to store in object
        inplace: (bool) replace the stored value if the key is already in OrderedDict

        Raises KeyError if the key already exists and inplace is False.
        '''
        if key in self.supp and not inplace:
            raise KeyError(f"The key {key} already exist in self.supp instance")
        self.supp[key] = value

    def sel_samples_and_genes_name(self, samples_name: "list | None" = None, genes_name: "list | None" = None, exp_mat: "pd.DataFrame | None" = None, meta: "pd.DataFrame | None" = None, inplace: bool = True):
        '''
        Select the samples in the metadata and in the expression matrix, and the genes in the
        expression matrix, at the same time

        samples_name: (list) samples to keep: rows of the metadata and columns of the matrix; nothing is filtered when empty/None
        genes_name: (list) genes to keep: rows of the matrix; nothing is filtered when empty/None
        exp_mat: (pd.DataFrame) RNA expression matrix, genes x samples; self.exp_mat is used when omitted
        meta: (pd.DataFrame) metadata, samples x features; self.meta is used when omitted
        inplace: (bool) if True self.meta and self.exp_mat are updated (None is returned),
                 otherwise the tuple (expression matrix, metadata) is returned

        Raises ValueError if no matrix or no metadata is available.
        '''

        if self.exp_mat is None and exp_mat is None:
            raise ValueError("Provide RNA expression matrix (genes x samples)")

        if self.meta is None and meta is None:
            raise ValueError("Provide metadata (samples x features)")

        if meta is None:
            meta = self.meta

        if exp_mat is None:
            exp_mat = self.exp_mat

        # None replaces the former mutable `[]` defaults
        samples_name = [] if samples_name is None else samples_name
        genes_name = [] if genes_name is None else genes_name

        # len() rather than truthiness so that arrays / pandas Index objects are accepted too
        if len(samples_name) > 0:
            # list() prevents a tuple from being read by .loc as a (row, column) key
            samples = list(samples_name)
            meta = meta.loc[samples]
            exp_mat = exp_mat[samples]

        # independent of the samples filter: both selections may be requested together
        if len(genes_name) > 0:
            exp_mat = exp_mat.loc[list(genes_name)]

        if inplace:
            self.meta = meta
            self.exp_mat = exp_mat
            return None
        return exp_mat, meta

In [None]:
# Kaggle input locations of the expression matrix and of the subgroups metadata.
# The expression file sits inside a directory carrying the same name as the file.
path_exp_mat=(
    '/kaggle/input/medulloblastoma-omics-data/'
    'GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab.txt/'
    'GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab.txt'
)
path_meta=(
    '/kaggle/input/medulloblastoma-omics-data/'
    'GSE85217_Cavalli_subgroups_information.csv'
)

In [None]:
# Start from an empty container, then read the expression matrix and the metadata from disk
data=Data()
data.add_exp_mat(path_exp_mat=path_exp_mat)
data.add_meta(path_meta=path_meta)

## Annotation

Get a list of genes and list of samples

### Genes list

1) Drop the rows with NaN values in the gene-name column: **HGNC_symbol_from_ensemblv77**

2) Check whether any gene names are duplicated, and remove or rename them if necessary

3) Add the gene names list as index/rownames to RNA expression matrix


In [None]:
# Remove the rows whose gene symbol is missing (NaN)
data.exp_mat.dropna(axis=0, how='any', subset=["HGNC_symbol_from_ensemblv77"], inplace=True)
# Search for duplicated gene names (displayed as the cell output)
Data.search_duplicates(data.exp_mat["HGNC_symbol_from_ensemblv77"].tolist())

#### CCDC7

The identifier **ENSG00000150076** and its mapping for this gene have been deprecated in the Ensembl database ([Ensembl source][EnsCCDC7]).
The transcript will be deleted from the expression matrix.

[EnsCCDC7]: http://www.ensembl.org/Human/Search/Results?q=ENSG00000150076%3Bfacet_species%3DHuman%3Bpage%3D1

In [None]:
# Display the rows annotated with the gene symbol CCDC7
data.exp_mat.loc[data.exp_mat["HGNC_symbol_from_ensemblv77"].eq("CCDC7")]

#### KLK9

The **ENSG00000269741** entry is interesting ([Ensembl source][EnsKL9splice]); it will be re-labelled with its Ensembl identifier.

[EnsKL9splice]: http://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core;g=ENSG00000269741;r=19:50996018-51009581

![Human_ENSG00000269741.png](attachment:988cb776-53dd-434f-bf69-a0ec26685227.png)

In [None]:
# Display the rows annotated with the gene symbol KLK9
data.exp_mat.loc[data.exp_mat["HGNC_symbol_from_ensemblv77"].eq("KLK9")]

#### WDR92/DNAAF10

The transcripts associated with this locus are not clearly defined ([Ensembl source][DNAAF10new]); **ENSG00000273398** will be deleted.

[DNAAF10new]: https://www.ensembl.org/Homo_sapiens/Gene/Summary?db=core%3Bg%3DENSG00000273398%3Br%3D2%3A68131238-68261230%3Bt%3DENST00000406334

![Human_ENSG00000273398.png](attachment:068bbd07-9c3e-4ea0-a79d-7752294272aa.png)

In [None]:
# Display the rows annotated with the gene symbol WDR92
data.exp_mat.loc[data.exp_mat["HGNC_symbol_from_ensemblv77"].eq("WDR92")]

**Apply the modifications and add the index**

In [None]:
# Re-label the KLK9 entry ENSG00000269741 with its Ensembl identifier.
# The row is selected by its Ensembl ID instead of the hard-coded row label 21278:
# assigning through .loc with a label that is absent would silently append a new row.
# NOTE(review): assumes row 21278 is the ENSG00000269741 entry, as the text above states - confirm.
data.exp_mat.loc[
    data.exp_mat["EnsemblGeneID_from_ensemblv77"]=="ENSG00000269741",
    "HGNC_symbol_from_ensemblv77",
]="ENSG00000269741"
# Delete the genes whose annotation is unclear (WDR92/DNAAF10 and the deprecated CCDC7 entry)
data.exp_mat=data.exp_mat[~data.exp_mat["EnsemblGeneID_from_ensemblv77"].isin(["ENSG00000273398","ENSG00000150076"])]

In [None]:
# Store the gene symbols and the sample identifiers as supplementary data
data.add_supp("genes", data.exp_mat["HGNC_symbol_from_ensemblv77"].tolist())
data.add_supp("samples", data.meta["Study_ID"].tolist())

In [None]:
# Use the stored gene symbols as row names of the expression matrix,
# and the stored sample identifiers as row names of the metadata
data.add_genes_name(data.supp["genes"])
data.add_samples_name(data.supp["samples"])

In [None]:
# Select the stored samples in both the metadata and the expression matrix
data.sel_samples_and_genes_name(data.supp["samples"])

## Plot the distribution

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
#ht_tt_genes = sns.histplot(data=data.exp_mat.sum(axis=1))
#ht_tt_genes.set(xlabel='Total genes expression')

In [None]:
#ht_counts = sns.histplot(data=np.asarray(data.exp_mat).ravel())
#ht_counts.set(xlabel='Expression (counts genes/samples)')

## Save

In [None]:
# save exp_mat and meta parsed
# data.exp_mat.to_csv("/kaggle/working/GSE85217_M_exp_763_MB_SubtypeStudy_TaylorLab_parsed.txt",sep='\t',index_label='genes_name')
#data.meta.to_csv("/kaggle/working/GSE85217_Cavalli_subgroups_information_parsed.csv",index_label='samples_name')

In [None]:
# save the fig
# ht_tt_genes.figure.savefig("/kaggle/working/histogram_total_genes_expressions.png")
# ht_counts.figure.savefig("/kaggle/working/histogram_total_counts.png")