In [4]:
import pandas as pd
import numpy as np
import random

In [6]:
# Define the function that lets users choose how many samples (AD and AR) to simulate.
def pathway_variants(n, m):
    """
    Simulate causative variants of samples for AD (Autosomal Dominant) and AR (Autosomal Recessive) inheritance.

    Reads `variants_pathway_vcf.csv` from the working directory, keeps the first
    `n` and last `m` rows, and keeps the first 9 + `n` columns plus the last `m`
    columns of that table.

    Args:
        n (int): A non-negative integer specifying the number of Autosomal Dominant (AD) samples to simulate.
            Values above 1026 are capped at 1026.
        m (int): A non-negative integer specifying the number of Autosomal Recessive (AR) samples to simulate.
            Values above 1182 are capped at 1182.

    Returns:
        pandas.DataFrame: The extracted sub-table. As a side effect, two
        tab-separated text files without header or index are written:
            1. `pathway_variants.txt`: The full extracted sub-table.
            2. `pathway_chrom_pos.txt`: The `CHROM` and `POS` columns of the extracted sub-table.

    """

    # The input file is comma-separated (pandas' default delimiter); only the outputs are tab-separated.
    df = pd.read_csv('variants_pathway_vcf.csv')

    # Cap the requested sample counts at the hard-coded limits.
    n = min(n, 1026)
    m = min(m, 1182)

    # First n rows and last m rows.
    rows_extracted = pd.concat([df.head(n), df.tail(m)])

    # First 9+n columns and last m columns, chosen by position.
    # (The 9 leading columns are presumably the fixed VCF annotation columns - confirm against the input file.)
    # A plain `-m:` slice would select EVERY column when m == 0 (because -0 == 0),
    # so the start of the trailing slice is computed explicitly. It is also clamped
    # to the end of the leading slice so that no column is ever selected twice.
    n_cols = df.shape[1]
    head_stop = min(max(9 + n, 0), n_cols)
    tail_start = max(n_cols - m, head_stop)
    selected_columns = list(df.columns[:head_stop]) + list(df.columns[tail_start:])

    # Restrict the extracted rows to the selected columns.
    extracted_df = rows_extracted[selected_columns]

    # Save the result to a tab-separated txt file.
    extracted_df.to_csv('pathway_variants.txt', sep='\t', index=False, header=False)

    # Save the chromosome/position pairs of the extracted variants to their own file.
    chrom_pos_columns = extracted_df[['CHROM', 'POS']]
    chrom_pos_columns.to_csv("pathway_chrom_pos.txt", index=False, header=False, sep='\t')

    return extracted_df

In [8]:
# Define the function to allow users to get the causative genes for the simulated patients.
def pathway_genes(n, m):
    """
    Simulate causative genes of samples for AD (Autosomal Dominant) and AR (Autosomal Recessive) inheritance.

    Reads `pathway_variants_gene_orpha_inher.csv` from the working directory and
    keeps its first `n` rows and its last `m` rows, with every column.

    Args:
        n (int): Number of Autosomal Dominant (AD) samples to simulate. Described by the author as a
            non-negative, even integer; this is not validated here. Values above 1026 are capped at 1026.
        m (int): Number of Autosomal Recessive (AR) samples to simulate. Described by the author as a
            non-negative, even integer; this is not validated here. Values above 1182 are capped at 1182.

    Returns:
        pandas.DataFrame: The selected rows. As a side effect the same table is
        written to `pathway_genes.csv` (comma-separated, with header, without index).

    """
    # The input file is comma-separated (pandas' default delimiter).
    df = pd.read_csv('pathway_variants_gene_orpha_inher.csv')

    # Cap the requested sample counts at the hard-coded limits.
    n = min(n, 1026)
    m = min(m, 1182)

    # First n rows and last m rows.
    # Every column is kept exactly once: this is a per-variant annotation table
    # (CHROM ... GENEINFO, ORPHACODE, INHERITANCE), not a one-column-per-sample
    # matrix, so slicing columns by sample count either dropped GENEINFO/ORPHACODE
    # (small n, m) or emitted each column twice (larger n, m).
    extracted_df = pd.concat([df.head(n), df.tail(m)])

    # Save the result to a csv file.
    extracted_df.to_csv('pathway_genes.csv', index=False)

    return extracted_df

In [10]:
# Load the variant/gene/Orphanet/inheritance table (the same file pathway_genes reads)
# and display it for inspection.
df = pd.read_csv('pathway_variants_gene_orpha_inher.csv')
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE
0,4,1806104,16330,G,T,FGFR3,15,AD
1,4,1805658,2664079,C,G,FGFR3,15,AD
2,15,66727442,13350,T,C,MAP2K1,15,AD
3,7,138391446,1344704,T,C,ATP6V0A4,18,AD
4,2,71185243,12228,T,C,ATP6V1B1,18,AD
...,...,...,...,...,...,...,...,...
2203,17,8076766,929264,G,C,TMEM107,542310,AR
2204,4,108852800,989059,A,C,CYP2U1,542310,AR
2205,1,150990382,932732,T,C,PRUNE1,544469,AR
2206,1,150990970,427231,C,A,PRUNE1,544469,AR


In [12]:
# Simulate causative genes for 30 AD and 30 AR samples; the call also writes pathway_genes.csv.
df_pathway_genes = pathway_genes(30, 30)

In [14]:
# Define the function allowing users to get the phenotypes for the simulated patients.
def pathway_phenotypes(n, m):
    """
    Simulate phenotypes of samples for AD (Autosomal Dominant) and AR (Autosomal Recessive) inheritance.

    Loads `pathway_orphacode_hposet.csv` from the working directory and keeps
    its first `n` rows followed by its last `m` rows.

    Args:
        n (int): Number of Autosomal Dominant (AD) samples to simulate. Described by the author as a
            non-negative, even integer; this is not validated here. Values above 1184 are capped at 1184.
        m (int): Number of Autosomal Recessive (AR) samples to simulate. Described by the author as a
            non-negative, even integer; this is not validated here. Values above 1410 are capped at 1410.

    Returns:
        pandas.DataFrame: The selected rows, keeping their original index labels.
        As a side effect the same table is written to `pathway_phenotypes.csv`
        (comma-separated, including the index column).

    """
    # The input file is comma-separated (pandas' default delimiter).
    hposet_table = pd.read_csv('pathway_orphacode_hposet.csv')

    # Cap both requested counts at their hard-coded limits.
    ad_count = min(n, 1184)
    ar_count = min(m, 1410)

    # Leading rows first, trailing rows after them.
    leading_rows = hposet_table.head(ad_count)
    trailing_rows = hposet_table.tail(ar_count)
    selected_rows = pd.concat([leading_rows, trailing_rows])

    # Persist the selection; the index is written as the first column.
    selected_rows.to_csv('pathway_phenotypes.csv')

    return selected_rows

In [16]:
# Function to randomly select HPO terms to simulate different scenarios for phenotype simulation
def pathway_hpo(i, n, m, seed=None):

    """
    Simulate user-defined scenarios by randomly selecting HPO terms for each sample.

    Reads `pathway_phenotypes.csv` (must contain an `HPO_ALL` column holding
    "; "-separated HPO terms) and `hpo_id_all.csv` (must contain an `hpo_id`
    column) from the working directory and adds two columns:

        * `HPO_{i}`: up to `i` terms sampled from the row's `HPO_ALL`.
        * `HPO_NOISE`: up to `n` terms sampled from the row's `HPO_ALL`, followed by
          up to `m` terms sampled from the full `hpo_id` list as noise. The noise terms
          are drawn without excluding the row's own terms.

    Args:
        i (int): A non-negative integer specifying how many rare disease-related HPO terms
            to select for the `HPO_{i}` column.
        n (int): A non-negative integer specifying how many rare disease-related HPO terms
            to select for the `HPO_NOISE` column.
        m (int): A non-negative integer specifying how many HPO terms to draw from the entire
            set of HPO terms and append to `HPO_NOISE` as noise.
        seed (int, optional): Seed for a private random generator, making the selection
            reproducible. When None (the default) the module-level `random` generator is
            used, so a prior `random.seed(...)` call by the caller still governs the result.

        Each count is capped at the number of terms available to sample from.

    Returns:
        pandas.DataFrame: The phenotype table with the two added columns. As a side effect
        the same table is written to `pathway_hpo.csv` (comma-separated, including the index column).

    """

    # Use the shared module-level generator unless the caller asked for a dedicated seed.
    rng = random if seed is None else random.Random(seed)

    # read the orphanet code and hpoid of pathway information
    df_pathway_hposet = pd.read_csv('pathway_phenotypes.csv')

    # read all the hpo terms as a plain list to sample noise from
    all_hpo_terms = pd.read_csv('hpo_id_all.csv')['hpo_id'].tolist()

    def sample_terms(hpo_str, count):
        # Split the HPO terms by "; " and randomly pick `count` of them (limited to available terms)
        hpo_terms = hpo_str.split("; ")
        return rng.sample(hpo_terms, min(count, len(hpo_terms)))

    def combine_hpo_noise(hpo_str):
        # n terms from the row's own HPO_ALL ...
        selected_hpo_all = sample_terms(hpo_str, n)
        # ... plus m terms from the complete HPO list
        selected_hpo_noise = rng.sample(all_hpo_terms, min(m, len(all_hpo_terms)))
        # Combine and return as a single string with "; " delimiter
        return "; ".join(selected_hpo_all + selected_hpo_noise)

    hpo_all_column = df_pathway_hposet['HPO_ALL']

    # The HPO_{i} column is filled for every row before HPO_NOISE is started, so the
    # order of random draws is: all HPO_{i} samples first, then per row the disease
    # sample followed by the noise sample.
    # Iterating over the column (rather than a row-wise DataFrame.apply) also works
    # when the table has no rows.
    df_pathway_hposet[f'HPO_{i}'] = ["; ".join(sample_terms(terms, i)) for terms in hpo_all_column]
    df_pathway_hposet['HPO_NOISE'] = [combine_hpo_noise(terms) for terms in hpo_all_column]

    # Save the result to a csv file
    df_pathway_hposet.to_csv('pathway_hpo.csv')

    return df_pathway_hposet