In [1]:
import dataclasses
import numpy as np
import pandas as pd
import toml
import pathlib2 as pathlib

# Load the run configuration. TOML documents are UTF-8 by specification, so
# decode explicitly instead of relying on the platform's default text encoding.
with open("config.toml", encoding="utf-8") as f:
    config = toml.load(f)

@dataclasses.dataclass
class Thresholds:
    """Filtering thresholds whose defaults come from the module-level ``config``.

    Defaults are read from the ``QC_Thresholds``, ``In_silico_predictions`` and
    ``AF_Thresholds`` tables of ``config`` when the class is defined, except
    ``MutationTaster``, which is read each time an instance is created.
    """
    GQ: int = config['QC_Thresholds']['GQ']
    DP: int = config['QC_Thresholds']['DP']
    # Not annotated, so the dataclass decorator leaves these two as plain class
    # attributes: they are readable on instances but are not __init__ parameters.
    AB_het = config['QC_Thresholds']['AB_het']
    AB_hom = config['QC_Thresholds']['AB_hom']
    SIFT: float = config['In_silico_predictions']['SIFT']
    PP2: float = config['In_silico_predictions']['PolyPhen-2']
    CADD: float = config['In_silico_predictions']['CADD']
    # The factory has to read the config itself. With `default_factory=list`
    # every instance received its own empty list, which shadowed the class
    # attribute assigned below, so `Thresholds().MutationTaster` was always [].
    # A copy is returned so instances never share one mutable list.
    MutationTaster: list = dataclasses.field(
        default_factory=lambda: list(
            config['In_silico_predictions']['exclude_MutationTaster']))
    GGM_AF: float = config['AF_Thresholds']['GGM']

# Define the thresholds of 'MutationTaster' as a list of strings.
# Kept so that the value is also readable on the class itself
# (`Thresholds.MutationTaster`) without creating an instance.
Thresholds.MutationTaster = config['In_silico_predictions']['exclude_MutationTaster']

In [2]:
# Location of the input CSV and the base name reused for the output file.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
input_dir = '/Volumes/vol/work/Github/TestData/ggm'
input_csv = pathlib.Path(f"{input_dir}/ggm_trio_BRS3.csv")
input_basename = input_csv.stem

In [3]:
def _split_qc_col(df: pd.DataFrame) -> pd.DataFrame:
    for rel in ['pro', 'pat', 'mat']:
        df = pd.concat(
            [df, df[f'GQ:DP:AD({rel})'].str.split(':', expand=True)], axis=1)
        for i in range(3):
            df[i] = df[i].replace('.', np.nan)
            df[i] = df[i].replace('-', np.nan)
        df = df.astype({0: float, 1: float, 2: float})
        df.rename(columns={0: f'GQ({rel})', 1: f'DP({rel})', 2: f'AD({rel})'}, 
                inplace=True)
    return df

def _add_ab_col(df: pd.DataFrame) -> pd.DataFrame:
    for rel in ['pro', 'pat', 'mat']:
        df[f'AB({rel})'] = df[f'AD({rel})'] / df[f'DP({rel})']
    return df

def _split_ggmacan_col(df: pd.DataFrame) -> pd.DataFrame:
    df = pd.concat(
        [df, df['GGM(AC/AN)'].str.split('/', expand=True)], axis=1)
    df = df.astype({0: float, 1: float})
    df.rename(columns={0: 'GGM(AC)', 1: 'GGM(AN)'}, inplace=True)

    return df


def _add_ggmaf_col(df: pd.DataFrame) -> pd.DataFrame:
    df['GGM(AF)'] = df['GGM(AC)'] / df['GGM(AN)']
    
    return df

def _rename_cols(df: pd.DataFrame) -> pd.DataFrame:
    rename_dict: dict = {
        'Chr': 'CHROM',
        'Position': 'POS',
        'Ref': 'REF',
        'Alt': 'ALT'
        }
    df.rename(columns=rename_dict, inplace=True)

    return df

def generate_variant_id(df: pd.DataFrame) -> pd.DataFrame:
    """Add 'variant_id' formatted as '<Chr>:<Position>-<Ref>-<Alt>'.

    The four source columns are joined by string concatenation. The column is
    written onto ``df`` itself, which is also returned.
    """
    locus = df['Chr'] + ':' + df['Position']
    alleles = df['Ref'] + '-' + df['Alt']
    df['variant_id'] = locus + '-' + alleles
    return df

def add_qc_filter(row, thresholds: Thresholds) -> str:
    """Return the QC label for one row: 'PASS' or '.'.

    Intended for ``DataFrame.apply(..., axis=1)``.

    Args:
        row: a row providing 'GQ(pro)', 'Vtype' and 'AB(pro)'.
        thresholds: object providing ``GQ``, ``AB_hom`` and ``AB_het``.

    Returns:
        '.' when 'GQ(pro)' is below ``thresholds.GQ``. Otherwise 'PASS' when
        'AB(pro)' is acceptable for the row's 'Vtype', else '.':
        - 'homo': AB(pro) >= 1 - AB_hom
        - anything else: AB_het <= AB(pro) <= 1 - AB_het

    NOTE(review): ``thresholds.DP`` is not consulted here — confirm whether a
    depth check was intended.
    """
    if row['GQ(pro)'] < thresholds.GQ:
        return '.'

    allele_balance = row['AB(pro)']
    if row['Vtype'] == 'homo':
        balanced = allele_balance >= (1 - thresholds.AB_hom)
    else:
        balanced = (
            thresholds.AB_het <= allele_balance <= (1 - thresholds.AB_het)
        )
    # A NaN allele balance fails both comparisons and therefore yields '.'.
    return 'PASS' if balanced else '.'

def add_insilico_filter(row, thresholds: Thresholds) -> str:
    """Return the in-silico label for one row: 'FAIL' or 'PASS'.

    'FAIL' is returned only when all four checks hold:
    SIFT >= thresholds.SIFT, PolyPhen-2 <= thresholds.PP2,
    CADD < thresholds.CADD, and the row's MutationTaster value is contained in
    thresholds.MutationTaster. Any check that does not hold gives 'PASS'.
    """
    # Each guard is written as `not (...)` so a NaN score, whose comparisons
    # are all false, still ends in 'PASS'.
    if not (row['SIFT'] >= thresholds.SIFT):
        return 'PASS'
    if not (row['PolyPhen-2'] <= thresholds.PP2):
        return 'PASS'
    if not (row['CADD'] < thresholds.CADD):
        return 'PASS'
    if row['MutationTaster'] not in thresholds.MutationTaster:
        return 'PASS'
    return 'FAIL'

def add_identified_filter(df: pd.DataFrame) -> pd.DataFrame:
    """Set 'Not_Identified_FILTER' to 'PASS' where 'Analysis status' is not 'Identified'.

    The column is written onto ``df`` itself, which is also returned. Rows
    whose status is 'Identified' are not assigned a value.
    """
    not_identified = df['Analysis status'] != 'Identified'
    df.loc[not_identified, 'Not_Identified_FILTER'] = 'PASS'
    return df

def add_ggmmaf_filter(df: pd.DataFrame, thresholds: Thresholds) -> pd.DataFrame:
    """Set 'GGM_FILTER' to 'PASS' where 'GGM(AF)' is below ``thresholds.GGM_AF``.

    The column is written onto ``df`` itself, which is also returned. Rows
    that do not satisfy the comparison are not assigned a value.
    """
    below_cutoff = df['GGM(AF)'] < thresholds.GGM_AF
    df.loc[below_cutoff, 'GGM_FILTER'] = 'PASS'
    return df
    
def add_hard_filter(df: pd.DataFrame) -> pd.DataFrame:
    """Set 'HARD_FILTER' to 'PASS' on rows where all four filters are 'PASS'.

    The filters combined are QC_FILTER, Not_Identified_FILTER, insilico_FILTER
    and GGM_FILTER. The column is written onto ``df`` itself, which is also
    returned; other rows are not assigned a value.
    """
    required = [
        'QC_FILTER',
        'Not_Identified_FILTER',
        'insilico_FILTER',
        'GGM_FILTER',
    ]
    passes_all = df[required[0]] == 'PASS'
    for column in required[1:]:
        passes_all = passes_all & (df[column] == 'PASS')

    df.loc[passes_all, 'HARD_FILTER'] = 'PASS'
    return df

# Pre-processing functions
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Derive the numeric and identifier columns the filters rely on.

    Steps, in order: split the GQ:DP:AD columns, add the AB columns, split
    'GGM(AC/AN)', add 'GGM(AF)', add 'variant_id', turn '-' into NaN in the
    SIFT / PolyPhen-2 / CADD columns, then cast the score and QC columns to
    float.
    """
    derived = (
        df.pipe(_split_qc_col)
        .pipe(_add_ab_col)
        .pipe(_split_ggmacan_col)
        .pipe(_add_ggmaf_col)
        .pipe(generate_variant_id)
    )

    # '-' is replaced only in these three columns; other columns keep it.
    score_columns = ['SIFT', 'PolyPhen-2', 'CADD']
    derived = derived.replace(dict.fromkeys(score_columns, '-'), np.nan)

    float_columns = score_columns + ['GQ(pro)', 'AB(pro)', 'GGM(AF)']
    return derived.astype(dict.fromkeys(float_columns, float))

def reorder_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Drop the first column, rename the locus columns and fix the column order.

    Only the columns listed below are kept; any other column is left out of
    the returned frame.
    NOTE(review): 'GGM_FILTER' is not in the list although the other *_FILTER
    columns are — confirm that omitting it is intended.
    """
    # The first column is removed by position, whatever its label is.
    trimmed = _rename_cols(df.drop(columns=df.columns[0]))

    leading = [
        'HARD_FILTER', 'Gene', 'Transcript', 'Family', 'Sample', 'Disease',
        'Vtype', 'variant_id', 'Amino acid change2', 'Effect',
        'Distance', 'SIFT', 'PolyPhen-2', 'MutationTaster', 'CADD',
        'gnomAD(AF)', 'ExAC(AF)', 'ToMMo3.5K(AF)', 'GGM(AF)', 'JPNCTL(SC)',
        'GGM(AC)', 'gnomAD(AC)', 'ToMMo3.5K(AC)',
    ]
    # The same seven per-sample fields, repeated for pro, pat and mat in turn.
    per_sample = [
        f'{field}({member})'
        for member in ('pro', 'pat', 'mat')
        for field in ('ID', 'AS', 'GT', 'GQ', 'DP', 'AD', 'AB')
    ]
    trailing = [
        'Impact', 'QC_FILTER', 'Not_Identified_FILTER', 'insilico_FILTER',
        'Analysis status', 'Identified gene', 'Variant description',
        'CHROM', 'POS', 'REF', 'ALT',
    ]

    column_order = leading + per_sample + trailing
    return trimmed[column_order]

def postprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Replace every NaN in ``df`` with '.' in place and return ``df``."""
    missing_marker = '.'
    df.replace(to_replace=np.nan, value=missing_marker, inplace=True)
    return df

In [4]:
# Run the whole annotation pipeline on the input CSV. Every column is read as
# text (dtype=str); preprocess() casts the columns the filters compare.
df = pd.read_csv(input_csv, header=0, dtype=str)

# Build the thresholds once so that every filter below is evaluated against
# the same settings object instead of three separately constructed ones.
thresholds = Thresholds()

df = preprocess(df)
df = add_identified_filter(df)
df = add_ggmmaf_filter(df, thresholds)
# The row-wise filters return one label per row.
df['QC_FILTER'] = df.apply(add_qc_filter, args=(thresholds,), axis=1)
df['insilico_FILTER'] = df.apply(add_insilico_filter, args=(thresholds,), axis=1)
# HARD_FILTER combines the four filter columns, so it must come after them.
df = add_hard_filter(df)
df = reorder_columns(df)
df = postprocess(df)


In [22]:
# Keep the rows whose HARD_FILTER is 'PASS' and display a few key columns.
df_hf = df.loc[df['HARD_FILTER'] == 'PASS']
df_hf.loc[
    :,
    ['Sample', 'Disease', 'Vtype', 'GQ(pro)', 'DP(pro)', 'AD(pro)', 'AB(pro)', 'CADD'],
]

Unnamed: 0,Sample,Disease,Vtype,GQ(pro),DP(pro),AD(pro),AB(pro),CADD
4,Sample_21958,Ascending_spastic_paralysis,homo,99.0,77.0,77.0,1.0,26.5
9,Sample_26134,Acute_encephalitis_with_refractory_repetitive_...,denovo,23.0,10.0,2.0,0.2,.
10,Sample_23722,Epilepsy_West_synd,homo,99.0,39.0,39.0,1.0,22.2
11,Sample_16449,Inherited_glycosilation_disorder,homo,99.0,61.0,61.0,1.0,.
13,Sample_2657,Epilepsy:Ohtahara_syndrome,homo,99.0,96.0,96.0,1.0,21.3
15,Sample_6093,Autism,denovo,99.0,144.0,40.0,0.277778,35.0
16,Sample_6093,Autism,denovo,99.0,141.0,37.0,0.262411,.
19,Sample_4491,Fibular_aplasia_tibial_campomelia_and_oligosyn...,homo,99.0,172.0,172.0,1.0,25.2
21,Sample_29017,Hypothalamic_Hamartoma,denovo,20.0,10.0,2.0,0.2,.
27,Sample_33720,long_QT_syndrome,denovo,34.0,5.0,2.0,0.4,34.0


In [23]:
# Write the full annotated table next to the input CSV as
# "<basename>.parsed.xlsx", leaving the DataFrame index out of the sheet.
df.to_excel("{}/{}.parsed.xlsx".format(input_dir, input_basename), index=False)