# ML Data Formatter

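This notebook contains a Python implementation for preprocessing tabular data stored in a TSV file for machine learning. It reads the file with polars, fills Null values in the `MAX_PATH` column (with the row's `EXOMISER_VARIANT_SCORE`) and in the `MAX_FREQ` column (with 0), and then splits the rows into training and testing datasets using the `TRAIN_STATUS` column (1 = training, 0 = testing).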

In [1]:
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union

import polars as pl

@dataclass
class MLData:
    """
    Container pairing the training and testing dataframes for machine learning models.

    Instances are built by FormatMLData.return_ml_data, which fills training_data with
    the rows whose TRAIN_STATUS is 1 and testing_data with the rows whose TRAIN_STATUS
    is 0.

    Attributes:
        training_data (pl.DataFrame): Training data.
        testing_data (pl.DataFrame): Testing data.
    """
    training_data: pl.DataFrame
    testing_data: pl.DataFrame
    
class FormatMLData:
    """
    Class for formatting the training and testing dataframes for machine learning models.

    The full pipeline is run by ``return_ml_data``: the input tsv is read, Null values in
    the ``MAX_PATH`` and ``MAX_FREQ`` columns are filled, and the rows are split into
    training and testing dataframes according to the ``TRAIN_STATUS`` column.
    """

    def __init__(
        self,
        input_data_path: Union[Path, str],
        *,
        infer_schema_length: Optional[int] = 100_000_000,
    ):
        """
        Initialise the FormatMLData class.
        Args:
            input_data_path (Path | str): Path to the input data tsv. It is stored as given
                and handed to ``pl.read_csv`` unchanged.
            infer_schema_length (int | None): Forwarded to ``pl.read_csv`` as its
                ``infer_schema_length`` argument, i.e. the maximum number of rows polars
                scans when inferring column dtypes (polars documents ``None`` as
                "scan the full data"). Defaults to 100_000_000, the value this class
                has always used.
        """
        self.input_data_path = input_data_path
        self.infer_schema_length = infer_schema_length

    def read_input_data(self) -> pl.DataFrame:
        """
        Read the input data tsv.
        Returns:
            pl.DataFrame: The input data dataframe.
        """
        return pl.read_csv(
            self.input_data_path,
            separator="\t",
            infer_schema_length=self.infer_schema_length,
        )

    @staticmethod
    def fix_max_path_null(input_data: pl.DataFrame) -> pl.DataFrame:
        """
        Fix max path null values - fills the Null values with the EXOMISER_VARIANT_SCORE.
        Args:
            input_data (pl.DataFrame): The input data dataframe. Must contain the
                MAX_PATH and EXOMISER_VARIANT_SCORE columns.
        Returns:
            pl.DataFrame: The input data dataframe with Null MAX_PATH values replaced with
                the EXOMISER_VARIANT_SCORE of the same row
        """
        return input_data.with_columns(
            pl.col("MAX_PATH").fill_null(pl.col("EXOMISER_VARIANT_SCORE"))
        )

    @staticmethod
    def fix_max_freq_null(input_data: pl.DataFrame) -> pl.DataFrame:
        """
        Fix max freq null values - fills the Null values with 0
        Args:
            input_data (pl.DataFrame): The input data dataframe. Must contain the
                MAX_FREQ column.
        Returns:
            pl.DataFrame: The input data dataframe with Null MAX_FREQ values replaced with 0
        """
        return input_data.with_columns(pl.col("MAX_FREQ").fill_null(0))

    @staticmethod
    def _filter_by_train_status(input_data: pl.DataFrame, train_status: int) -> pl.DataFrame:
        """
        Keep only the rows whose TRAIN_STATUS equals the given value.
        Args:
            input_data (pl.DataFrame): The input data dataframe. Must contain the
                TRAIN_STATUS column.
            train_status (int): The TRAIN_STATUS value to keep.
        Returns:
            pl.DataFrame: The rows of the input data with the requested TRAIN_STATUS
        """
        return input_data.filter(pl.col("TRAIN_STATUS") == train_status)

    @staticmethod
    def retrieve_training_data(input_data: pl.DataFrame) -> pl.DataFrame:
        """
        Retrieve training data - the rows where TRAIN_STATUS is 1.
        Args:
            input_data (pl.DataFrame): The input data dataframe
        Returns:
            pl.DataFrame: The training data dataframe
        """
        return FormatMLData._filter_by_train_status(input_data, 1)

    @staticmethod
    def retrieve_test_data(input_data: pl.DataFrame) -> pl.DataFrame:
        """
        Retrieve testing data - the rows where TRAIN_STATUS is 0.
        Args:
            input_data (pl.DataFrame): The input data dataframe
        Returns:
            pl.DataFrame: The testing data dataframe
        """
        return FormatMLData._filter_by_train_status(input_data, 0)

    def return_ml_data(self) -> MLData:
        """
        Retrieve the formatted training and testing data, with Null values replaced.

        Rows whose TRAIN_STATUS is neither 1 nor 0 end up in neither dataframe.
        Returns:
            MLData: The formatted training and testing data
        """
        input_data = self.read_input_data()
        # Fill the Nulls before splitting so both splits receive the same treatment.
        input_data = self.fix_max_path_null(input_data)
        input_data = self.fix_max_freq_null(input_data)
        return MLData(
            training_data=self.retrieve_training_data(input_data),
            testing_data=self.retrieve_test_data(input_data),
        )

In [9]:
# Usage: run the whole formatting pipeline on the example tsv.
# The location is wrapped in Path because FormatMLData.__init__ is annotated to take a Path.
input_data = Path('../data/ml_data_subset.tsv')
ml_data = FormatMLData(input_data).return_ml_data()
# Bare expression so the notebook displays the resulting MLData.
ml_data

MLData(training_data=shape: (2_734, 144)
┌───────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ #RANK ┆ ID         ┆ GENE_SYMBO ┆ ENTREZ_GEN ┆ … ┆ WEIGHTED_ ┆ WEIGHTED_ ┆ WEIGHTED_ ┆ WEIGHTED_ │
│ ---   ┆ ---        ┆ L          ┆ E_ID       ┆   ┆ BS4       ┆ BP6       ┆ BP4       ┆ BA1       │
│ i64   ┆ str        ┆ ---        ┆ ---        ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│       ┆            ┆ str        ┆ f64        ┆   ┆ i64       ┆ i64       ┆ i64       ┆ i64       │
╞═══════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1     ┆ 15-9130346 ┆ BLM        ┆ 641.0      ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0         │
│       ┆ 2-G-GA_AR  ┆            ┆            ┆   ┆           ┆           ┆           ┆           │
│ 1     ┆ 15-9134695 ┆ BLM        ┆ 641.0      ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0         │
│       ┆ 1-G-A_AR   ┆            ┆            ┆  

In [8]:
# Preview the first rows of the training dataframe (head() is called with its default row count).
ml_data.training_data.head()

#RANK,ID,GENE_SYMBOL,ENTREZ_GENE_ID,MOI,P-VALUE,EXOMISER_GENE_COMBINED_SCORE,EXOMISER_GENE_PHENO_SCORE,EXOMISER_GENE_VARIANT_SCORE,EXOMISER_VARIANT_SCORE,CONTRIBUTING_VARIANT,WHITELIST_VARIANT,VCF_ID,RS_ID,CONTIG,START,END,REF,ALT,CHANGE_LENGTH,QUAL,FILTER,GENOTYPE,FUNCTIONAL_CLASS,HGVS,EXOMISER_ACMG_CLASSIFICATION,EXOMISER_ACMG_EVIDENCE,EXOMISER_ACMG_DISEASE_ID,EXOMISER_ACMG_DISEASE_NAME,CLINVAR_VARIATION_ID,CLINVAR_PRIMARY_INTERPRETATION,CLINVAR_STAR_RATING,GENE_CONSTRAINT_LOEUF,GENE_CONSTRAINT_LOEUF_LOWER,GENE_CONSTRAINT_LOEUF_UPPER,MAX_FREQ_SOURCE,MAX_FREQ,…,EXOMISER_ACMG_EVIDENCE_PP3_Strong,EXOMISER_ACMG_EVIDENCE_PP4,EXOMISER_ACMG_EVIDENCE_PP5,EXOMISER_ACMG_EVIDENCE_PP5_Strong,EXOMISER_ACMG_EVIDENCE_PP5_VeryStrong,EXOMISER_ACMG_EVIDENCE_PS1,EXOMISER_ACMG_EVIDENCE_PS1_Moderate,EXOMISER_ACMG_EVIDENCE_PS1_Supporting,EXOMISER_ACMG_EVIDENCE_PS2,EXOMISER_ACMG_EVIDENCE_PVS1,EXOMISER_ACMG_EVIDENCE_PVS1_Strong,ACMG_EVIDENCE_SCORE,TALLY_P_SUPPORTING,TALLY_P_MODERATE,TALLY_P_STRONG,TALLY_P_VERY_STRONG,TALLY_B_SUPPORTING,TALLY_B_MODERATE,TALLY_B_STRONG,TALLY_B_VERY_STRONG,TALLY_BA1,ACMG_PPP,ACMG_PPP_MEAN,WEIGHTED_PVS1,WEIGHTED_PS1,WEIGHTED_PS2,WEIGHTED_PP5,WEIGHTED_PP4,WEIGHTED_PP3,WEIGHTED_PM5,WEIGHTED_PM4,WEIGHTED_PM2,WEIGHTED_PM1,WEIGHTED_BS4,WEIGHTED_BP6,WEIGHTED_BP4,WEIGHTED_BA1
i64,str,str,f64,str,f64,f64,f64,f64,f64,i64,i64,str,str,i64,i64,i64,str,str,i64,f64,str,str,str,str,str,str,str,str,f64,str,i64,f64,f64,f64,str,f64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
1,"""15-91303462-G-GA_AR""","""BLM""",641.0,"""AR""",0.0,0.9882,0.8207,1.0,1.0,1,1,,"""rs1358266880""",15,91303462,91303462,"""G""","""GA""",1,100.0,"""PASS""","""1/1""","""frameshift_variant""","""BLM:ENST00000355112.3:c.1176du…","""PATHOGENIC""","""PVS1,PM2_Supporting,PP4,PP5_St…","""OMIM:210900""","""Bloom syndrome""",1069515.0,"""PATHOGENIC""",2,0.71191,0.6,0.847,"""GNOMAD_E_AMR""",0.002891,…,0,1,0,1,0,0,0,0,0,1,0,14.0,2,0,1,1,0,0,0,0,0,1.0,1.0,8,0,0,4,1,0,0,0,1,0,0,0,0,0
1,"""15-91346951-G-A_AR""","""BLM""",641.0,"""AR""",0.0,0.9938,0.8839,1.0,1.0,1,1,,"""rs148969222""",15,91346951,91346951,"""G""","""A""",0,100.0,"""PASS""","""1/1""","""splice_donor_variant""","""BLM:ENST00000355112.3:c.3558+1…","""LIKELY_PATHOGENIC""","""PVS1_Strong,PM2_Supporting,PP4…","""OMIM:210900""","""Bloom syndrome""",2137744.0,"""PATHOGENIC""",1,0.71191,0.6,0.847,"""GNOMAD_E_EAS""",0.005437,…,0,1,1,0,0,0,0,0,0,0,1,7.0,3,0,1,0,0,0,0,0,0,0.949,0.949,4,0,0,1,1,0,0,0,1,0,0,0,0,0
1,"""15-91328204-GACACGTT-G_AR""","""BLM""",641.0,"""AR""",0.0,0.9967,0.9429,1.0,1.0,1,1,,,15,91328204,91328211,"""GACACGTT""","""G""",-7,100.0,"""PASS""","""1/1""","""frameshift_truncation""","""BLM:ENST00000355112.3:c.2720_2…","""PATHOGENIC""","""PVS1,PM2_Supporting,PP4,PP5""","""OMIM:210900""","""Bloom syndrome""",370390.0,"""LIKELY_PATHOGENIC""",1,0.71191,0.6,0.847,,0.0,…,0,1,1,0,0,0,0,0,0,1,0,11.0,3,0,0,1,0,0,0,0,0,0.997,0.997,8,0,0,1,1,0,0,0,1,0,0,0,0,0
1,"""15-91292671-CTG-C_AR""","""BLM""",641.0,"""AR""",0.0,0.992,0.8589,1.0,1.0,1,1,,,15,91292671,91292673,"""CTG""","""C""",-2,100.0,"""PASS""","""1/1""","""frameshift_truncation""","""BLM:ENST00000355112.3:c.175_17…","""PATHOGENIC""","""PVS1,PM2_Supporting,PP4,PP5""","""OMIM:210900""","""Bloom syndrome""",1177404.0,"""PATHOGENIC""",1,0.71191,0.6,0.847,,0.0,…,0,1,1,0,0,0,0,0,0,1,0,11.0,3,0,0,1,0,0,0,0,0,0.997,0.997,8,0,0,1,1,0,0,0,1,0,0,0,0,0
1,"""15-91303840-G-T_AR""","""BLM""",641.0,"""AR""",0.0,0.9954,0.9127,1.0,1.0,1,1,,"""rs887921909""",15,91303840,91303840,"""G""","""T""",0,100.0,"""PASS""","""1/1""","""stop_gained""","""BLM:ENST00000355112.3:c.1237G>…","""PATHOGENIC""","""PVS1,PM2_Supporting,PP4,PP5""","""OMIM:210900""","""Bloom syndrome""",1432994.0,"""PATHOGENIC""",1,0.71191,0.6,0.847,,0.0,…,0,1,1,0,0,0,0,0,0,1,0,11.0,3,0,0,1,0,0,0,0,0,0.997,0.997,8,0,0,1,1,0,0,0,1,0,0,0,0,0
