# **Patient Analysis**
## This notebook tests patient parsing. The scripts tested here are "patient_avinput.py" and "patient_concatenator".

In [1]:
# Importing required packages
import os
import glob
import re
import pandas as pd
import numpy as np

# Recording the current working directory (it is only read, not changed).
# Every data path used in the cells below is built by appending to `cwd`,
# so the notebook must be launched from the folder that holds the data sub-folders.
cwd = os.getcwd()
print(cwd)

c:\Users\TooFastDan\OneDrive - Baylor College of Medicine\BCM\Projects\Autosomal Dominant Predictor of IRDs\manuscript\GitHub


## **Run this block if shell script does not merge all annovar csv output files**

In [2]:
# Getting a list of all annotated csv files from ANNOVAR
#patient_data = glob.glob(cwd+"/patient_results/*.csv")
#print("Number of patient csv files: " + str(len(patient_data)))

# Attempting to read all ANNOVAR csv files, adding a patient_ID column and merging into a final df
#annovar_df_list = []
#for f in patient_data:
#    try:
#        patient_id = f.split("/")[-1].replace(".hg19_multianno.csv", "")
#        df = pd.read_csv(f)
#        df["Patient_ID"] = patient_id
#        IDs = df.pop("Patient_ID")
#        df.insert(5, "Patient_ID", IDs)
#        annovar_df_list.append(df)
#    except:
#        print("{} failed csv parsing - likely to no input in .final.analysis".format(f))

# Printing info about final_df
#final_df = pd.concat(annovar_df_list, axis=0)
#display(final_df.shape)
#display(final_df.head())

# Optional export of final_df
#final_df.to_csv(cwd+"/patient_data/annovar_patient_data.csv", index=False)

## **Importing and Cleaning patient annovar data export**

In [None]:
# Location of the merged ANNOVAR export written by the shell script
annovar_export_path = cwd+"/patient_data/annovar_patient_data.csv"

# Load the export and preview the first rows
final_df = pd.read_csv(annovar_export_path)
final_df.head()

## **Test cleaning of extracted txt files with read counts and allele frequencies**

In [None]:
# List of all patient txt files. glob returns matches in arbitrary order, so the list is
# sorted to make the positional pick below select the same file on every run.
patient_txt_files = sorted(glob.glob(cwd+"/patient_data/*.txt"))
test_txt_file = patient_txt_files[2]

# Reading the df, cleaning up any rows with NA values, and reorganizing/making new columns
df = pd.read_table(test_txt_file, sep="\t", na_values=".")
df = df.dropna(axis=0, how="all") #dropping any rows with all NaN values
df.columns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Geno', 'gnomad_WES', 'gnomad_WGS']
# "Geno" is split on ":" into five pieces; the fifth ("delete") is dropped further down
df[['Genotype', 'alt_reads', 'ref_reads', 'total_reads', 'delete']] = df["Geno"].str.split(":", expand=True)

# Finding allele count (AC), frequency (AF), and number (AN) from the whole exome seq (WES)
# and whole genome seq (WGS) columns. Each value is searched for as "<field>:<number>;";
# rows where the field is missing (or the cell is NaN) end up as NaN.
gnomad_number = r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?"
for seq_type in ("WES", "WGS"):
    # astype(str) keeps the .str accessor usable even if the whole column was read as NaN
    gnomad_text = df["gnomad_" + seq_type].astype(str)
    for field in ("AC", "AF", "AN"):
        df["gnomad_{}_{}".format(seq_type, field)] = (
            gnomad_text.str.extract(field + ":(" + gnomad_number + ");", expand=False).astype(float)
        )

# Deleting unnecessary columns
df = df.drop(["delete", "gnomad_WES", "gnomad_WGS"], axis=1)

# Converting start/end and other columns to integers (otherwise ANNOVAR won't work)
int_columns = ["Start", "End", "alt_reads", "ref_reads", "total_reads"]
df[int_columns] = df[int_columns].astype(int)

# Making a patient ID column from the file name.
# os.path.basename strips the directory with either "/" or "\" separators; splitting on "/"
# alone leaves the folder name in the ID when glob returns Windows-style paths.
patient_ID = os.path.basename(test_txt_file).replace(".txt", "")
df["Patient_ID"] = patient_ID

# Reorganizing columns
df = df[['Chr', 'Start', 'End', 'Ref', 'Alt', 'Patient_ID', 'Geno', 'Genotype', 'alt_reads', 'ref_reads', 'total_reads', 
         'gnomad_WES_AC', 'gnomad_WGS_AC', 'gnomad_WES_AF', 'gnomad_WGS_AF', 'gnomad_WES_AN', 'gnomad_WGS_AN']]

# Displaying the results
display(df.shape)
display(df.head())

In [None]:
def clean_patient_txt(txt_path):
    """Read one patient txt file and return it as a cleaned DataFrame.

    Parameters
    ----------
    txt_path : str
        Path to a tab-separated patient txt file with eight columns
        (renamed here to Chr, Start, End, Ref, Alt, Geno, gnomad_WES, gnomad_WGS).

    Returns
    -------
    pandas.DataFrame
        One row per input row, with the "Geno" pieces split out, the gnomAD
        AC/AF/AN values as float columns (NaN where absent), integer
        coordinates/read counts, and a Patient_ID taken from the file name.

    Raises
    ------
    Whatever pandas raises for an unreadable/empty file, a file whose column
    count is not eight, or read-count/coordinate values that cannot be cast to int.
    """
    # Reading the df, cleaning up any rows with NA values, and reorganizing/making new columns
    cleaned = pd.read_table(txt_path, sep="\t", na_values=".")
    cleaned = cleaned.dropna(axis=0, how="all") #dropping any rows with all NaN values
    cleaned.columns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Geno', 'gnomad_WES', 'gnomad_WGS']
    # "Geno" is split on ":" into five pieces; the fifth ("delete") is dropped further down
    cleaned[['Genotype', 'alt_reads', 'ref_reads', 'total_reads', 'delete']] = cleaned["Geno"].str.split(":", expand=True)

    # Finding allele count (AC), frequency (AF), and number (AN) from the whole exome seq (WES)
    # and whole genome seq (WGS) columns. Each value is searched for as "<field>:<number>;";
    # rows where the field is missing (or the cell is NaN) end up as NaN.
    gnomad_number = r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?"
    for seq_type in ("WES", "WGS"):
        # astype(str) keeps the .str accessor usable even if the whole column was read as NaN
        gnomad_text = cleaned["gnomad_" + seq_type].astype(str)
        for field in ("AC", "AF", "AN"):
            cleaned["gnomad_{}_{}".format(seq_type, field)] = (
                gnomad_text.str.extract(field + ":(" + gnomad_number + ");", expand=False).astype(float)
            )

    # Deleting unnecessary columns
    cleaned = cleaned.drop(["delete", "gnomad_WES", "gnomad_WGS"], axis=1)

    # Converting start/end and other columns to integers (otherwise ANNOVAR won't work)
    int_columns = ["Start", "End", "alt_reads", "ref_reads", "total_reads"]
    cleaned[int_columns] = cleaned[int_columns].astype(int)

    # Making a patient ID column from the file name.
    # os.path.basename strips the directory with either "/" or "\" separators; splitting on "/"
    # alone leaves the folder name in the ID when glob returns Windows-style paths.
    cleaned["Patient_ID"] = os.path.basename(txt_path).replace(".txt", "")

    # Reorganizing columns
    return cleaned[['Chr', 'Start', 'End', 'Ref', 'Alt', 'Patient_ID', 'Geno', 'Genotype', 'alt_reads', 'ref_reads', 'total_reads', 
                    'gnomad_WES_AC', 'gnomad_WGS_AC', 'gnomad_WES_AF', 'gnomad_WGS_AF', 'gnomad_WES_AN', 'gnomad_WGS_AN']]


# List of all patient txt files (sorted so the files are processed in a reproducible order)
patient_txt_files = sorted(glob.glob(cwd+"/patient_data/*.txt"))

for f in patient_txt_files:
    df = clean_patient_txt(f)
    patient_ID = df["Patient_ID"].iloc[0] if len(df) else os.path.basename(f).replace(".txt", "")

    # Displaying the results
    print("\n"+patient_ID)
    display(df.shape)
    display(df.head())

## **Test Concatenating and merging ANNOVAR CSV with avinput**

In [68]:
# List of patient avinput files (sorted so the concatenated row order is reproducible;
# glob itself returns matches in arbitrary order)
patient_av_files = sorted(glob.glob(cwd+"/patient_samples/*.avinput"))

# avinput files carry no header row, so the column names are supplied here
avinput_columns = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Patient_ID', 'Geno', 'Genotype',
                   'alt_reads', 'ref_reads', 'total_reads', 'gnomad_WES_AC', 'gnomad_WGS_AC',
                   'gnomad_WES_AF', 'gnomad_WGS_AF', 'gnomad_WES_AN', 'gnomad_WGS_AN']

# Looping through avinput files, importing dfs, and appending to a list
patient_av_list = []
for av in patient_av_files:
    try:
        df = pd.read_table(av, header=None, names=avinput_columns)
    except Exception as err:
        # Best-effort: a file that cannot be parsed is reported and skipped.
        # `Exception` (instead of a bare except) lets a kernel interrupt still stop the loop,
        # and the actual error is printed so failures other than an empty file are visible.
        print("{} failed table parsing, likely due to no input in .final.analysis file".format(av))
        print("    reason: {}: {}".format(type(err).__name__, err))
    else:
        patient_av_list.append(df)

# Concatenating avinput files
patient_avinput = pd.concat(patient_av_list, axis=0)
patient_avinput.shape

(1701, 17)

In [69]:
# List of patient annotated csv files from ANNOVAR (sorted so the concatenated row order
# is reproducible; glob itself returns matches in arbitrary order)
annovar_files = sorted(glob.glob(cwd+"/patient_samples/*.csv"))

# Looping through csv files, importing dfs, and appending to a list
annovar_df_list = []
for ann in annovar_files:
    try:
        # os.path.basename strips the directory with either "/" or "\" separators; splitting
        # on "/" alone leaves the folder name in the ID when glob returns Windows-style paths,
        # and Patient_ID is one of the keys used to merge with the avinput data.
        patient_id = os.path.basename(ann).replace(".hg19_multianno.csv", "")
        df = pd.read_csv(ann)
        # Placing the ID as the sixth column (position 5), right after the first five columns
        df.insert(5, "Patient_ID", patient_id)
    except Exception as err:
        # Best-effort: a file that cannot be parsed is reported and skipped.
        # `Exception` (instead of a bare except) lets a kernel interrupt still stop the loop,
        # and the actual error is printed so failures other than an empty file are visible.
        print("{} failed csv parsing, likely due to no input in .final.analysis file".format(ann))
        print("    reason: {}: {}".format(type(err).__name__, err))
    else:
        annovar_df_list.append(df)

# Concatenating csv ANNOVAR files
patient_annovar = pd.concat(annovar_df_list, axis=0)
patient_annovar.shape

(1701, 122)

In [None]:
# Columns shared by both frames that the join is performed on
merge_keys = ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Patient_ID']

# Inner join of the avinput rows (read counts, allele frequencies) with the ANNOVAR annotations:
# only rows whose key columns match in both frames are kept
final_df = patient_avinput.merge(patient_annovar, how="inner", on=merge_keys)
display(final_df.shape, final_df.head())

In [72]:
#final_df.to_excel(cwd+"/patient_data/test_output.xlsx", index=False)