In [2]:
import pandas as pd
import numpy as np
import random

In [4]:
# Load the per-sample table of Orphanet codes, disease names and HPO term sets
df_orpha_hposet = pd.read_csv('pathway_orphacode_hposet.csv')
# Bare last expression so the notebook renders the DataFrame
df_orpha_hposet

Unnamed: 0,ORPHACODE,DISEASE,HPO_ALL
0,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
1,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
2,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
3,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
4,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
...,...,...,...
2203,542310,Leukoencephalopathy with calcifications and cysts,HP:0100320; HP:0100543; HP:0001260; HP:0002516...
2204,542310,Leukoencephalopathy with calcifications and cysts,HP:0100320; HP:0100543; HP:0001260; HP:0002516...
2205,544469,PRUNE1-related neurological syndrome,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2206,544469,PRUNE1-related neurological syndrome,HP:0002540; HP:0000347; HP:0001285; HP:0001639...


In [11]:
# Keep only the first occurrence of each fully identical row
is_repeat = df_orpha_hposet.duplicated()
unique_df = df_orpha_hposet[~is_repeat]
unique_df

Unnamed: 0,ORPHACODE,DISEASE,HPO_ALL
0,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
3,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
6,41,Dyschromatosis symmetrica hereditaria,HP:0012733; HP:0001304; HP:0011509; HP:0007988
9,51,Aicardi-Goutières syndrome,HP:0007052; HP:0002071; HP:0001288; HP:0002187...
12,63,Alport syndrome,HP:0011488; HP:0006756; HP:0002837; HP:0012576...
...,...,...,...
2193,536467,B3GALT6-related spondylodysplastic Ehlers-Danl...,HP:0001252; HP:0003414; HP:0002091; HP:0002779...
2196,541423,Growth delay-intellectual disability-hepatopat...,HP:0100511; HP:0002719; HP:0006129; HP:0001399...
2199,542306,GNB5-related intellectual disability-cardiac a...,HP:0002521; HP:0001344; HP:0000639; HP:0011675...
2202,542310,Leukoencephalopathy with calcifications and cysts,HP:0100320; HP:0100543; HP:0001260; HP:0002516...


In [13]:
# Extract the (ORPHACODE, DISEASE) columns. unique_df was de-duplicated on all
# columns (including HPO_ALL), so a code/disease pair could still repeat when
# its HPO_ALL strings differ; de-duplicate again on just these two columns so
# the saved mapping has one row per pair.
df_orpha_disease = unique_df[['ORPHACODE', 'DISEASE']].drop_duplicates()

# Save to a csv file (without the DataFrame index)
df_orpha_disease.to_csv("pathway_orphacode_disease.csv", index=False)

In [21]:
# Define the function with file-saving capability
def pathway_phenotypes(n, m, max_head=1026, max_tail=1182,
                       source_path='pathway_orphacode_hposet.csv',
                       output_path='pathway_phenotypes.csv'):
    """Extract the first ``n`` and last ``m`` rows of the source table and save them.

    Parameters
    ----------
    n : int
        Number of rows to take from the top of the table (capped at ``max_head``).
    m : int
        Number of rows to take from the bottom of the table (capped at ``max_tail``).
    max_head, max_tail : int, optional
        Upper limits applied to ``n`` and ``m``. The defaults keep the limits
        the function has always used.
    source_path : str, optional
        Comma-separated file to read.
    output_path : str, optional
        Comma-separated file the selected rows are written to (without index).

    Returns
    -------
    pandas.DataFrame
        The selected rows, keeping their index labels from the source table.

    Raises
    ------
    ValueError
        If ``n`` or ``m`` is negative.
    """
    # A negative count would make head()/tail() mean "all but k rows", which is
    # never what a row count is meant to express here.
    if n < 0 or m < 0:
        raise ValueError("n and m must be non-negative row counts")

    # The file is a regular comma-separated CSV
    df = pd.read_csv(source_path)
    total_rows = len(df)

    # Apply the caps, then make sure the tail slice cannot reach back into the
    # head slice; otherwise overlapping rows would be written twice.
    head_count = min(n, max_head, total_rows)
    tail_count = min(m, max_tail, total_rows - head_count)

    rows_extracted = pd.concat([
        df.iloc[:head_count],
        df.iloc[total_rows - tail_count:],
    ])

    # Save the result to a csv file
    rows_extracted.to_csv(output_path, index=False)

    return rows_extracted

In [29]:
# Take the first 30 and last 30 rows of the source table; the call also writes
# them to pathway_phenotypes.csv as a side effect.
df_pathway_phenotypes = pathway_phenotypes(n=30, m=30)
print(df_pathway_phenotypes)

      ORPHACODE                                            DISEASE  \
0            15                                     Achondroplasia   
1            15                                     Achondroplasia   
2            15                                     Achondroplasia   
3            18                      Distal renal tubular acidosis   
4            18                      Distal renal tubular acidosis   
5            18                      Distal renal tubular acidosis   
6            41              Dyschromatosis symmetrica hereditaria   
7            41              Dyschromatosis symmetrica hereditaria   
8            41              Dyschromatosis symmetrica hereditaria   
9            51                         Aicardi-Goutières syndrome   
10           51                         Aicardi-Goutières syndrome   
11           51                         Aicardi-Goutières syndrome   
12           63                                    Alport syndrome   
13           63     

In [31]:
# Build two simulated phenotype columns per row: a random subset of the row's
# HPO terms (HPO_<i>) and a subset mixed with random terms from the full HPO list (HPO_NOISE)
def pathway_hpo(i, n, m, seed=None,
                phenotypes_path='pathway_phenotypes.csv',
                all_hpo_path='hpo_id_all.csv',
                output_path='pathway_hpo.csv'):
    """Add randomly sampled HPO columns to the phenotype table and save it.

    Parameters
    ----------
    i : int
        Number of terms drawn from each row's ``HPO_ALL`` for the ``HPO_<i>`` column.
    n : int
        Number of terms drawn from each row's ``HPO_ALL`` for ``HPO_NOISE``.
    m : int
        Number of terms drawn from the full HPO list (``hpo_id`` column of
        ``all_hpo_path``) and appended after the ``n`` terms in ``HPO_NOISE``.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        result is reproducible. When ``None`` the module-level ``random``
        generator is used, exactly as before.
    phenotypes_path, all_hpo_path, output_path : str, optional
        Input table, full HPO term list and output file.

    Returns
    -------
    pandas.DataFrame
        The phenotype table with the ``HPO_<i>`` and ``HPO_NOISE`` columns added.

    Raises
    ------
    ValueError
        If any of ``i``, ``n`` or ``m`` is negative.

    Notes
    -----
    Every count is limited to the number of available terms. The ``m`` extra
    terms are sampled from the whole HPO list, so they are not guaranteed to be
    absent from the row's own ``HPO_ALL``.
    NOTE(review): confirm whether such terms should be excluded.
    """
    if min(i, n, m) < 0:
        raise ValueError("i, n and m must be non-negative term counts")

    # Both objects expose .sample(); the private generator leaves global state alone
    rng = random.Random(seed) if seed is not None else random

    def split_terms(hpo_str):
        # Missing cells are read by pandas as NaN (a float), which has no .split
        if not isinstance(hpo_str, str):
            return []
        return [term for term in hpo_str.split("; ") if term]

    def sample_terms(terms, count):
        # Limit the draw to the number of available terms
        return rng.sample(terms, min(count, len(terms)))

    # Read the Orphanet code / HPO set table and the list of all HPO terms
    df_pathway_hposet = pd.read_csv(phenotypes_path)
    all_hpo_terms = pd.read_csv(all_hpo_path)['hpo_id'].tolist()

    # HPO_<i>: i terms from the row's own annotation. This column is filled for
    # every row before HPO_NOISE so the order of random draws is unchanged.
    df_pathway_hposet[f'HPO_{i}'] = df_pathway_hposet['HPO_ALL'].map(
        lambda hpo_str: "; ".join(sample_terms(split_terms(hpo_str), i))
    )

    # HPO_NOISE: n terms from the row's own annotation followed by m terms from the full list
    df_pathway_hposet['HPO_NOISE'] = df_pathway_hposet['HPO_ALL'].map(
        lambda hpo_str: "; ".join(
            sample_terms(split_terms(hpo_str), n) + sample_terms(all_hpo_terms, m)
        )
    )

    # Save without the index, consistent with the other CSV files this notebook writes
    df_pathway_hposet.to_csv(output_path, index=False)

    return df_pathway_hposet

In [33]:
# Example usage: pathway_hpo reads pathway_phenotypes.csv and hpo_id_all.csv
# itself, so no DataFrame has to be defined beforehand. Here each row gets up to
# 5 of its own terms in HPO_5, and up to 3 of its own terms plus 2 terms from
# the full HPO list in HPO_NOISE.
df_pathway_hposet = pathway_hpo(i=5, n=3, m=2)
print(df_pathway_hposet)

    ORPHACODE                                            DISEASE  \
0          15                                     Achondroplasia   
1          15                                     Achondroplasia   
2          15                                     Achondroplasia   
3          18                      Distal renal tubular acidosis   
4          18                      Distal renal tubular acidosis   
5          18                      Distal renal tubular acidosis   
6          41              Dyschromatosis symmetrica hereditaria   
7          41              Dyschromatosis symmetrica hereditaria   
8          41              Dyschromatosis symmetrica hereditaria   
9          51                         Aicardi-Goutières syndrome   
10         51                         Aicardi-Goutières syndrome   
11         51                         Aicardi-Goutières syndrome   
12         63                                    Alport syndrome   
13         63                                   

In [35]:
# Print only the first row to inspect the columns of the generated table
print(df_pathway_hposet.head(1))

   ORPHACODE         DISEASE  \
0         15  Achondroplasia   

                                             HPO_ALL  \
0  HP:0001513; HP:0000463; HP:0011867; HP:0000238...   

                                               HPO_5  \
0  HP:0002938; HP:0000256; HP:0008905; HP:0003375...   

                                           HPO_NOISE  
0  HP:0012418; HP:0010536; HP:0001513; HP:6000754...  


In [37]:
# Pull the full, sampled and noisy HPO strings out of the row with index label 0
hpo_all, hpo_5, hpo_noise = df_pathway_hposet.loc[0, ['HPO_ALL', 'HPO_5', 'HPO_NOISE']]

In [39]:
# One value per line: full term set, sampled subset, noisy subset
print(hpo_all, hpo_5, hpo_noise, sep="\n")

HP:0001513; HP:0000463; HP:0011867; HP:0000238; HP:0005819; HP:0003498; HP:0001377; HP:0001156; HP:0005257; HP:0012418; HP:0008445; HP:0002091; HP:0005280; HP:0005619; HP:0003416; HP:0003375; HP:0000956; HP:0010536; HP:0008905; HP:0045086; HP:0002808; HP:0000242; HP:0011452; HP:0000365; HP:0003180; HP:0000260; HP:0008947; HP:0004060; HP:0045087; HP:0002979; HP:0000256; HP:0009826; HP:0003194; HP:0002938; HP:0000309; HP:0002007; HP:0003026; HP:0002870; HP:0010241
HP:0002938; HP:0000256; HP:0008905; HP:0003375; HP:0000242
HP:0012418; HP:0010536; HP:0001513; HP:6000754; HP:0030142


In [41]:
# Print the complete HPO_ALL string on its own (repeats the first line of the previous cell's output)
print(hpo_all)

HP:0001513; HP:0000463; HP:0011867; HP:0000238; HP:0005819; HP:0003498; HP:0001377; HP:0001156; HP:0005257; HP:0012418; HP:0008445; HP:0002091; HP:0005280; HP:0005619; HP:0003416; HP:0003375; HP:0000956; HP:0010536; HP:0008905; HP:0045086; HP:0002808; HP:0000242; HP:0011452; HP:0000365; HP:0003180; HP:0000260; HP:0008947; HP:0004060; HP:0045087; HP:0002979; HP:0000256; HP:0009826; HP:0003194; HP:0002938; HP:0000309; HP:0002007; HP:0003026; HP:0002870; HP:0010241


In [43]:
# Same three columns, this time from the row with index label 2
hpo_all, hpo_5, hpo_noise = (
    df_pathway_hposet.loc[2, column_name]
    for column_name in ('HPO_ALL', 'HPO_5', 'HPO_NOISE')
)

In [45]:
# Show the three strings in order, each on its own line
for hpo_string in (hpo_all, hpo_5, hpo_noise):
    print(hpo_string)

HP:0001513; HP:0000463; HP:0011867; HP:0000238; HP:0005819; HP:0003498; HP:0001377; HP:0001156; HP:0005257; HP:0012418; HP:0008445; HP:0002091; HP:0005280; HP:0005619; HP:0003416; HP:0003375; HP:0000956; HP:0010536; HP:0008905; HP:0045086; HP:0002808; HP:0000242; HP:0011452; HP:0000365; HP:0003180; HP:0000260; HP:0008947; HP:0004060; HP:0045087; HP:0002979; HP:0000256; HP:0009826; HP:0003194; HP:0002938; HP:0000309; HP:0002007; HP:0003026; HP:0002870; HP:0010241
HP:0002007; HP:0008445; HP:0010536; HP:0008947; HP:0000238
HP:0005257; HP:0005819; HP:0010241; HP:0002296; HP:0011316
