# ML Data Formatter

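This notebook contains a Python implementation for preprocessing tabular data stored in TSV files for machine learning purposes. It reads the input TSV with polars, fills null values in the `MAX_PATH` column (with `EXOMISER_VARIANT_SCORE`) and in the `MAX_FREQ` column (with 0), and then splits the rows into training and testing datasets using the `TRAIN_STATUS` column (1 = training, 0 = testing).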

In [1]:
from pathlib import Path
from dataclasses import dataclass
import polars as pl

@dataclass
class MLData:
    """
    Container holding the two dataframes used by machine learning models.

    Both fields can be supplied positionally (training first, then testing)
    or by keyword.
    Attributes:
        training_data (pl.DataFrame): Dataframe used for training.
        testing_data (pl.DataFrame): Dataframe used for testing.
    """
    # Dataframe used for training.
    training_data: pl.DataFrame
    # Dataframe used for testing.
    testing_data: pl.DataFrame
    
class FormatMLData:
    """
    Class for formatting the training and testing dataframes for machine learning models.

    The input TSV is read with polars, Null values in the MAX_PATH and MAX_FREQ columns are
    filled, and the rows are split on the TRAIN_STATUS column (1 = training, 0 = testing).
    Rows whose TRAIN_STATUS is neither 1 nor 0 end up in neither dataframe.
    """

    def __init__(self, input_data_path: "Path | str"):
        """
        Initialise the FormatMLData class.
        Args:
            input_data_path (Path | str): Path to the input data tsv. A string is converted to a Path.
        """
        # Normalise so the attribute is always a Path, whatever the caller passed in.
        self.input_data_path = Path(input_data_path)

    def read_input_data(self) -> pl.DataFrame:
        """
        Read the input data tsv.
        Returns:
            pl.DataFrame: The input data dataframe.
        """
        # infer_schema_length=None lets polars use the whole file for dtype inference,
        # rather than relying on an arbitrarily large row limit.
        return pl.read_csv(self.input_data_path, separator="\t", infer_schema_length=None)

    @staticmethod
    def fix_max_path_null(input_data: pl.DataFrame) -> pl.DataFrame:
        """
        Fix max path null values - fills the Null values with the EXOMISER_VARIANT_SCORE.
        Args:
            input_data (pl.DataFrame): The input data dataframe
        Returns:
            pl.DataFrame: The input data dataframe with Null MAX_PATH values replaced with the
                EXOMISER_VARIANT_SCORE of the same row
        """
        max_path_filled = pl.col("MAX_PATH").fill_null(pl.col("EXOMISER_VARIANT_SCORE"))
        return input_data.with_columns(max_path_filled)

    @staticmethod
    def fix_max_freq_null(input_data: pl.DataFrame) -> pl.DataFrame:
        """
        Fix max freq null values - fills the Null values with 0
        Args:
            input_data (pl.DataFrame): The input data dataframe
        Returns:
            pl.DataFrame: The input data dataframe with Null MAX_FREQ values replaced with 0
        """
        return input_data.with_columns(pl.col("MAX_FREQ").fill_null(0))

    @staticmethod
    def retrieve_training_data(input_data: pl.DataFrame) -> pl.DataFrame:
        """
        Retrieve training data.
        Args:
            input_data (pl.DataFrame): The input data dataframe
        Returns:
            pl.DataFrame: The training data dataframe (rows where TRAIN_STATUS is 1)
        """
        return input_data.filter(pl.col("TRAIN_STATUS") == 1)

    @staticmethod
    def retrieve_test_data(input_data: pl.DataFrame) -> pl.DataFrame:
        """
        Retrieve testing data.
        Args:
            input_data (pl.DataFrame): The input data dataframe
        Returns:
            pl.DataFrame: The testing data dataframe (rows where TRAIN_STATUS is 0)
        """
        return input_data.filter(pl.col("TRAIN_STATUS") == 0)

    def return_ml_data(self) -> MLData:
        """
        Retrieve the formatted training and testing data, with Null values replaced.
        Returns:
            MLData: The formatted training and testing data
        """
        input_data = self.read_input_data()
        # Fill Nulls before splitting so both dataframes receive the same treatment.
        input_data = self.fix_max_path_null(input_data)
        input_data = self.fix_max_freq_null(input_data)
        return MLData(
            training_data=self.retrieve_training_data(input_data),
            testing_data=self.retrieve_test_data(input_data),
        )

In [2]:
# Usage: point FormatMLData at the TSV, then read, null-fill and split it in one call.
input_data = Path('../data/ml_data_subset.tsv')  # a Path, as the constructor's annotation declares
ml_data = FormatMLData(input_data).return_ml_data()
ml_data  # last expression of the cell, so the notebook displays the MLData repr

MLData(training_data=shape: (1_159, 144)
┌───────┬────────────┬────────────┬────────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ #RANK ┆ ID         ┆ GENE_SYMBO ┆ ENTREZ_GEN ┆ … ┆ WEIGHTED_ ┆ WEIGHTED_ ┆ WEIGHTED_ ┆ WEIGHTED_ │
│ ---   ┆ ---        ┆ L          ┆ E_ID       ┆   ┆ BS4       ┆ BP6       ┆ BP4       ┆ BA1       │
│ i64   ┆ str        ┆ ---        ┆ ---        ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│       ┆            ┆ str        ┆ f64        ┆   ┆ i64       ┆ i64       ┆ i64       ┆ i64       │
╞═══════╪════════════╪════════════╪════════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1     ┆ 9-13941749 ┆ NOTCH1     ┆ 4851.0     ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0         │
│       ┆ 4-G-A_AD   ┆            ┆            ┆   ┆           ┆           ┆           ┆           │
│ 1     ┆ 9-13940130 ┆ NOTCH1     ┆ 4851.0     ┆ … ┆ 0         ┆ 0         ┆ 0         ┆ 0         │
│       ┆ 4-G-T_AD   ┆            ┆            ┆  

In [3]:
# Preview the first rows of the training dataframe.
ml_data.training_data.head()

#RANK,ID,GENE_SYMBOL,ENTREZ_GENE_ID,MOI,P-VALUE,EXOMISER_GENE_COMBINED_SCORE,EXOMISER_GENE_PHENO_SCORE,EXOMISER_GENE_VARIANT_SCORE,EXOMISER_VARIANT_SCORE,CONTRIBUTING_VARIANT,WHITELIST_VARIANT,VCF_ID,RS_ID,CONTIG,START,END,REF,ALT,CHANGE_LENGTH,QUAL,FILTER,GENOTYPE,FUNCTIONAL_CLASS,HGVS,EXOMISER_ACMG_CLASSIFICATION,EXOMISER_ACMG_EVIDENCE,EXOMISER_ACMG_DISEASE_ID,EXOMISER_ACMG_DISEASE_NAME,CLINVAR_VARIATION_ID,CLINVAR_PRIMARY_INTERPRETATION,CLINVAR_STAR_RATING,GENE_CONSTRAINT_LOEUF,GENE_CONSTRAINT_LOEUF_LOWER,GENE_CONSTRAINT_LOEUF_UPPER,MAX_FREQ_SOURCE,MAX_FREQ,…,EXOMISER_ACMG_EVIDENCE_PP3_Strong,EXOMISER_ACMG_EVIDENCE_PP4,EXOMISER_ACMG_EVIDENCE_PP5,EXOMISER_ACMG_EVIDENCE_PP5_Strong,EXOMISER_ACMG_EVIDENCE_PP5_VeryStrong,EXOMISER_ACMG_EVIDENCE_PS1,EXOMISER_ACMG_EVIDENCE_PS1_Moderate,EXOMISER_ACMG_EVIDENCE_PS1_Supporting,EXOMISER_ACMG_EVIDENCE_PS2,EXOMISER_ACMG_EVIDENCE_PVS1,EXOMISER_ACMG_EVIDENCE_PVS1_Strong,ACMG_EVIDENCE_SCORE,TALLY_P_SUPPORTING,TALLY_P_MODERATE,TALLY_P_STRONG,TALLY_P_VERY_STRONG,TALLY_B_SUPPORTING,TALLY_B_MODERATE,TALLY_B_STRONG,TALLY_B_VERY_STRONG,TALLY_BA1,ACMG_PPP,ACMG_PPP_MEAN,WEIGHTED_PVS1,WEIGHTED_PS1,WEIGHTED_PS2,WEIGHTED_PP5,WEIGHTED_PP4,WEIGHTED_PP3,WEIGHTED_PM5,WEIGHTED_PM4,WEIGHTED_PM2,WEIGHTED_PM1,WEIGHTED_BS4,WEIGHTED_BP6,WEIGHTED_BP4,WEIGHTED_BA1
i64,str,str,f64,str,f64,f64,f64,f64,f64,i64,i64,str,str,i64,i64,i64,str,str,i64,f64,str,str,str,str,str,str,str,str,f64,str,i64,f64,f64,f64,str,f64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64
1,"""9-139417494-G-A_AD""","""NOTCH1""",4851.0,"""AD""",0.0,0.9972,0.9604,1.0,1.0,1,1,,"""rs1589072024""",9,139417494,139417494,"""G""","""A""",0,100.0,"""PASS""","""0/1""","""stop_gained""","""NOTCH1:ENST00000277541.6:c.550…","""PATHOGENIC""","""PVS1,PM2_Supporting,PP4,PP5""","""OMIM:616028""","""Adams-Oliver syndrome 5""",645087.0,"""PATHOGENIC""",1,0.16418,0.125,0.217,,0.0,…,0,1,1,0,0,0,0,0,0,1,0,11.0,3,0,0,1,0,0,0,0,0,0.997,0.997,8,0,0,1,1,0,0,0,1,0,0,0,0,0
1,"""9-139401304-G-T_AD""","""NOTCH1""",4851.0,"""AD""",0.0,0.9956,0.9155,1.0,1.0,1,0,,"""rs1057515423""",9,139401304,139401304,"""G""","""T""",0,100.0,"""PASS""","""0/1""","""stop_gained""","""NOTCH1:ENST00000277541.6:c.376…","""PATHOGENIC""","""PVS1,PM2_Supporting,PP4""","""OMIM:109730""","""Aortic valve disease 1""",221997.0,"""PATHOGENIC""",0,0.16418,0.125,0.217,,0.0,…,0,1,0,0,0,0,0,0,0,1,0,10.0,2,0,0,1,0,0,0,0,0,0.994,0.994,8,0,0,0,1,0,0,0,1,0,0,0,0,0
1,"""9-139399985-G-A_AD""","""NOTCH1""",4851.0,"""AD""",0.0,0.9905,0.8415,1.0,1.0,1,1,,"""rs2133339552""",9,139399985,139399985,"""G""","""A""",0,100.0,"""PASS""","""0/1""","""stop_gained""","""NOTCH1:ENST00000277541.6:c.436…","""PATHOGENIC""","""PVS1,PM2_Supporting,PP4,PP5""","""OMIM:616028""","""Adams-Oliver syndrome 5""",1072334.0,"""PATHOGENIC""",1,0.16418,0.125,0.217,,0.0,…,0,1,1,0,0,0,0,0,0,1,0,11.0,3,0,0,1,0,0,0,0,0,0.997,0.997,8,0,0,1,1,0,0,0,1,0,0,0,0,0
1,"""9-139404312-C-A_AD""","""NOTCH1""",4851.0,"""AD""",0.0,0.9938,0.8833,1.0,1.0,1,0,,,9,139404312,139404312,"""C""","""A""",0,100.0,"""PASS""","""0/1""","""stop_gained""","""NOTCH1:ENST00000277541.6:c.284…","""PATHOGENIC""","""PVS1,PM2_Supporting,PP4""","""OMIM:616028""","""Adams-Oliver syndrome 5""",2503443.0,"""LIKELY_PATHOGENIC""",0,0.16418,0.125,0.217,,0.0,…,0,1,0,0,0,0,0,0,0,1,0,10.0,2,0,0,1,0,0,0,0,0,0.994,0.994,8,0,0,0,1,0,0,0,1,0,0,0,0,0
1,"""9-139407491-A-AG_AD""","""NOTCH1""",4851.0,"""AD""",0.0,0.9899,0.8359,1.0,1.0,1,1,,,9,139407491,139407491,"""A""","""AG""",1,100.0,"""PASS""","""0/1""","""frameshift_variant""","""NOTCH1:ENST00000277541.6:c.244…","""PATHOGENIC""","""PVS1,PM2_Supporting,PP4,PP5_St…","""OMIM:616028""","""Adams-Oliver syndrome 5""",817997.0,"""PATHOGENIC""",2,0.16418,0.125,0.217,,0.0,…,0,1,0,1,0,0,0,0,0,1,0,14.0,2,0,1,1,0,0,0,0,0,1.0,1.0,8,0,0,4,1,0,0,0,1,0,0,0,0,0
