In [17]:
import pandas as pd
import numpy as np
import random

In [31]:
# Load the HPO / Orphanet table; as displayed, each row pairs one
# orphanet_code and disease_name with a single hpo_id.
df = pd.read_csv(filepath_or_buffer='hpo_id_orpha.csv')
df

Unnamed: 0,orphanet_code,disease_name,hpo_id
0,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001249
1,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001939
2,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0011968
3,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0000532
4,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0000533
...,...,...,...
114044,658549,Idiopathic small fibers neuropathy,HP:0007550
114045,658549,Idiopathic small fibers neuropathy,HP:0008652
114046,658549,Idiopathic small fibers neuropathy,HP:0002579
114047,658549,Idiopathic small fibers neuropathy,HP:0032147


In [33]:
# Collapse the table to one row per "orphanet_code" (the first occurrence wins)
# and discard the per-row "hpo_id" column, leaving code + disease name only.
df = (
    df
    .drop_duplicates(subset="orphanet_code", keep="first")
    .drop(columns="hpo_id")
)
df

Unnamed: 0,orphanet_code,disease_name
0,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...
25,6,3-methylcrotonyl-CoA carboxylase deficiency
35,7,3C syndrome
99,8,"47,XYY syndrome"
144,9,Tetrasomy X
...,...,...
113858,592574,Menke-Hennekam syndrome
113902,596448,IgG4-related systemic disease
113932,599373,STXBP1-related encephalopathy
113963,647799,MYT1L-related developmental delay-intellectual...


In [35]:
# Load the case table: as displayed, one ORPHACODE per row with its
# "; "-separated HPO ids in the HPOID column.
# NOTE(review): a later cell writes a file with this same name but with the
# columns ORPHACODE / DISEASE / HPO_ALL; once that has run, re-running this
# cell no longer yields an HPOID column.
df_orpha_hposet = pd.read_csv(filepath_or_buffer='case_orphacode_hposet.csv')
df_orpha_hposet

Unnamed: 0,ORPHACODE,HPOID
0,15,HP:0000242; HP:0011452; HP:0002979; HP:0000365...
1,18,HP:0032066; HP:0012405; HP:0011387; HP:0003355...
2,41,HP:0007988; HP:0011509; HP:0001304; HP:0012733
3,51,HP:0030038; HP:0000508; HP:0004809; HP:0011834...
4,53,HP:0001373; HP:0002758; HP:0002754; HP:0002007...
...,...,...
1503,542306,HP:0011704; HP:0000546; HP:0005155; HP:0010864...
1504,542310,HP:0010576; HP:0000712; HP:0011153; HP:0001250...
1505,544469,HP:0000648; HP:0010864; HP:0002313; HP:0002650...
1506,557003,HP:0000278; HP:0000286; HP:0012758; HP:0000519...


In [37]:
# Attach the disease name to every case row: inner join of ORPHACODE (left)
# against orphanet_code (right), so codes missing on either side are dropped.
df_merged = df_orpha_hposet.merge(
    df,
    how='inner',
    left_on='ORPHACODE',
    right_on='orphanet_code',
)

# Show the joined table
df_merged

Unnamed: 0,ORPHACODE,HPOID,orphanet_code,disease_name
0,15,HP:0000242; HP:0011452; HP:0002979; HP:0000365...,15,Achondroplasia
1,18,HP:0032066; HP:0012405; HP:0011387; HP:0003355...,18,Distal renal tubular acidosis
2,41,HP:0007988; HP:0011509; HP:0001304; HP:0012733,41,Dyschromatosis symmetrica hereditaria
3,51,HP:0030038; HP:0000508; HP:0004809; HP:0011834...,51,Aicardi-Goutières syndrome
4,53,HP:0001373; HP:0002758; HP:0002754; HP:0002007...,53,Albers-Schönberg osteopetrosis
...,...,...,...,...
1503,542306,HP:0011704; HP:0000546; HP:0005155; HP:0010864...,542306,GNB5-related intellectual disability-cardiac a...
1504,542310,HP:0010576; HP:0000712; HP:0011153; HP:0001250...,542310,Leukoencephalopathy with calcifications and cysts
1505,544469,HP:0000648; HP:0010864; HP:0002313; HP:0002650...,544469,PRUNE1-related neurological syndrome
1506,557003,HP:0000278; HP:0000286; HP:0012758; HP:0000519...,557003,Oculoskeletodental syndrome


In [41]:
# Keep only the code, the disease name and the HPO id list, in that order,
# then print the first five rows as plain text.
df_case_hposet = df_merged[
    ['ORPHACODE', 'disease_name', 'HPOID']
]
print(df_case_hposet.head(n=5))

   ORPHACODE                           disease_name  \
0         15                         Achondroplasia   
1         18          Distal renal tubular acidosis   
2         41  Dyschromatosis symmetrica hereditaria   
3         51             Aicardi-Goutières syndrome   
4         53         Albers-Schönberg osteopetrosis   

                                               HPOID  
0  HP:0000242; HP:0011452; HP:0002979; HP:0000365...  
1  HP:0032066; HP:0012405; HP:0011387; HP:0003355...  
2     HP:0007988; HP:0011509; HP:0001304; HP:0012733  
3  HP:0030038; HP:0000508; HP:0004809; HP:0011834...  
4  HP:0001373; HP:0002758; HP:0002754; HP:0002007...  


In [68]:
# Give the disease-name and HPO-list columns their final upper-case names
# in a single rename call.
df_case_hposet = df_case_hposet.rename(
    columns={
        "disease_name": "DISEASE",
        "HPOID": "HPO_ALL",
    }
)
df_case_hposet

Unnamed: 0,ORPHACODE,DISEASE,HPO_ALL
0,15,Achondroplasia,HP:0000242; HP:0011452; HP:0002979; HP:0000365...
1,18,Distal renal tubular acidosis,HP:0032066; HP:0012405; HP:0011387; HP:0003355...
2,41,Dyschromatosis symmetrica hereditaria,HP:0007988; HP:0011509; HP:0001304; HP:0012733
3,51,Aicardi-Goutières syndrome,HP:0030038; HP:0000508; HP:0004809; HP:0011834...
4,53,Albers-Schönberg osteopetrosis,HP:0001373; HP:0002758; HP:0002754; HP:0002007...
...,...,...,...
1503,542306,GNB5-related intellectual disability-cardiac a...,HP:0011704; HP:0000546; HP:0005155; HP:0010864...
1504,542310,Leukoencephalopathy with calcifications and cysts,HP:0010576; HP:0000712; HP:0011153; HP:0001250...
1505,544469,PRUNE1-related neurological syndrome,HP:0000648; HP:0010864; HP:0002313; HP:0002650...
1506,557003,Oculoskeletodental syndrome,HP:0000278; HP:0000286; HP:0012758; HP:0000519...


In [70]:
# Two-column lookup table: each ORPHACODE with its DISEASE name
df_orpha_disease = df_case_hposet[
    ['ORPHACODE', 'DISEASE']
]
# Persist the lookup table as CSV, leaving the DataFrame index out of the file
df_orpha_disease.to_csv(path_or_buf="case_orphacode_disease.csv", index=False)

In [72]:
# Display the renamed case table (ORPHACODE, DISEASE, HPO_ALL)
df_case_hposet

Unnamed: 0,ORPHACODE,DISEASE,HPO_ALL
0,15,Achondroplasia,HP:0000242; HP:0011452; HP:0002979; HP:0000365...
1,18,Distal renal tubular acidosis,HP:0032066; HP:0012405; HP:0011387; HP:0003355...
2,41,Dyschromatosis symmetrica hereditaria,HP:0007988; HP:0011509; HP:0001304; HP:0012733
3,51,Aicardi-Goutières syndrome,HP:0030038; HP:0000508; HP:0004809; HP:0011834...
4,53,Albers-Schönberg osteopetrosis,HP:0001373; HP:0002758; HP:0002754; HP:0002007...
...,...,...,...
1503,542306,GNB5-related intellectual disability-cardiac a...,HP:0011704; HP:0000546; HP:0005155; HP:0010864...
1504,542310,Leukoencephalopathy with calcifications and cysts,HP:0010576; HP:0000712; HP:0011153; HP:0001250...
1505,544469,PRUNE1-related neurological syndrome,HP:0000648; HP:0010864; HP:0002313; HP:0002650...
1506,557003,Oculoskeletodental syndrome,HP:0000278; HP:0000286; HP:0012758; HP:0000519...


In [74]:
# Write the case table (ORPHACODE, DISEASE, HPO_ALL) to CSV without the index.
# NOTE(review): this is the same filename an earlier cell reads as raw input
# (there with an HPOID column), so the save replaces the notebook's own input.
df_case_hposet.to_csv(path_or_buf="case_orphacode_hposet.csv", index=False)

In [85]:
# Select the first n and last m case rows and save them to a CSV file
def case_phenotypes(n, m, max_n=814, max_m=694):
    """Select the first ``n`` and last ``m`` case rows and save them to CSV.

    Reads the comma-separated file ``case_orphacode_hposet.csv`` from the
    working directory, keeps its first ``n`` rows plus its last ``m`` rows,
    writes the selection to ``case_phenotypes.csv`` (index omitted) and
    returns it.

    Parameters
    ----------
    n : int
        Number of rows to take from the top of the file; capped at ``max_n``.
        Expected to be non-negative.
    m : int
        Number of rows to take from the bottom of the file; capped at
        ``max_m``. Expected to be non-negative.
    max_n, max_m : int, optional
        Upper limits applied to ``n`` and ``m``. The defaults, 814 and 694,
        are the limits that used to be hard-coded in the function body.
        NOTE(review): presumably the sizes of two groups of cases in the
        input file -- confirm.

    Returns
    -------
    pandas.DataFrame
        The selected rows with their original row labels. A row that falls
        into both the head and the tail slice appears only once.
    """
    cases = pd.read_csv('case_orphacode_hposet.csv')

    # Cap the requested counts at their limits
    n = min(n, max_n)
    m = min(m, max_m)

    rows_extracted = pd.concat([cases.head(n), cases.tail(m)])

    # When n + m exceeds the number of rows, the head and tail slices overlap
    # and concat would emit the shared rows twice; keep only the first copy.
    # read_csv assigns unique row labels, so a repeated label means a repeated row.
    rows_extracted = rows_extracted[~rows_extracted.index.duplicated(keep='first')]

    # Save the selection as CSV without the index column
    rows_extracted.to_csv('case_phenotypes.csv', index=False)

    return rows_extracted

In [87]:
# Take the first 20 and the last 20 cases and print the selection as text
df_case_phenotypes = case_phenotypes(20, 20)
print(df_case_phenotypes)

      ORPHACODE                                            DISEASE  \
0            15                                     Achondroplasia   
1            18                      Distal renal tubular acidosis   
2            41              Dyschromatosis symmetrica hereditaria   
3            51                         Aicardi-Goutières syndrome   
4            53                     Albers-Schönberg osteopetrosis   
5            58                                  Alexander disease   
6            63                                    Alport syndrome   
7            65                         Leber congenital amaurosis   
8            82  Hereditary thrombophilia due to congenital ant...   
9            87                                     Apert syndrome   
10           97                         Familial paroxysmal ataxia   
11          107                                       BOR syndrome   
12          112                                   Bartter syndrome   
13          115     

In [89]:
# Build a randomly sampled HPO column and a noise-augmented HPO column per case
def case_hpo(i, n, m, seed=None):
    """Add sampled and noise-augmented HPO columns to the case table.

    Reads ``case_phenotypes.csv`` (must contain an ``HPO_ALL`` column holding
    ";"-separated HPO ids) and ``hpo_id_all.csv`` (must contain an ``hpo_id``
    column) from the working directory, adds two columns to the case table,
    writes the result to ``pairs_hpo.csv`` (index omitted) and returns it.

    Added columns
    -------------
    ``HPO_{i}``
        Up to ``i`` terms drawn without replacement from the row's
        ``HPO_ALL`` terms, joined with "; ".
    ``HPO_NOISE``
        Up to ``n`` terms drawn from the row's ``HPO_ALL`` terms, followed by
        up to ``m`` terms drawn from ``hpo_id_all.csv`` that are not among
        the row's own terms, joined with "; ".

    Parameters
    ----------
    i : int
        Number of own terms to sample for the ``HPO_{i}`` column.
    n : int
        Number of own terms to keep in the ``HPO_NOISE`` column.
    m : int
        Number of noise terms to append in the ``HPO_NOISE`` column.
    seed : int, optional
        When given, sampling uses a private ``random.Random(seed)`` so the
        result is reproducible. When ``None`` (default) the module-level
        ``random`` generator is used, so an earlier ``random.seed(...)`` call
        still takes effect.

    Returns
    -------
    pandas.DataFrame
        The case table with the two added columns.
    """
    # The random module itself exposes .sample, so either object works below
    rng = random if seed is None else random.Random(seed)

    def split_terms(hpo_str):
        # A missing cell is read back as NaN (a float), which has no .split
        if not isinstance(hpo_str, str):
            return []
        # Split on ";" and strip, so "a; b" and "a;b" are treated alike and
        # empty fragments (e.g. from a trailing delimiter) are ignored
        return [term.strip() for term in hpo_str.split(";") if term.strip()]

    def sample_terms(terms, count):
        # Never ask for more terms than are available
        return rng.sample(terms, min(count, len(terms)))

    # Case table produced by case_phenotypes, plus the full list of HPO ids
    df_case_hposet = pd.read_csv('case_phenotypes.csv')
    all_hpo_terms = pd.read_csv('hpo_id_all.csv')['hpo_id'].tolist()

    def random_hpo(hpo_str):
        return "; ".join(sample_terms(split_terms(hpo_str), i))

    def combine_hpo_noise(hpo_str):
        own_terms = split_terms(hpo_str)
        own_lookup = set(own_terms)
        # Noise must not coincide with a term the case really has
        noise_pool = [term for term in all_hpo_terms if term not in own_lookup]
        return "; ".join(sample_terms(own_terms, n) + sample_terms(noise_pool, m))

    # Both columns are derived from the frame loaded above; the previous code
    # read from a global `df_pairs_hposet` that this function never defined.
    df_case_hposet[f'HPO_{i}'] = df_case_hposet['HPO_ALL'].apply(random_hpo)
    df_case_hposet['HPO_NOISE'] = df_case_hposet['HPO_ALL'].apply(combine_hpo_noise)

    # Save without the index, matching the other CSV exports in this notebook
    df_case_hposet.to_csv('pairs_hpo.csv', index=False)

    return df_case_hposet

In [91]:
# Example run (needs case_phenotypes.csv and hpo_id_all.csv in the working
# directory): HPO_5 samples up to 5 terms per case, HPO_NOISE combines up to 3
# of the case's own terms with up to 2 terms drawn from hpo_id_all.csv.
# NOTE(review): this rebinds df_case_hposet, which earlier cells used for the
# full case table.
df_case_hposet = case_hpo(5, 3, 2)
print(df_case_hposet)

    ORPHACODE                                            DISEASE  \
0          15                                     Achondroplasia   
1          18                      Distal renal tubular acidosis   
2          41              Dyschromatosis symmetrica hereditaria   
3          51                         Aicardi-Goutières syndrome   
4          53                     Albers-Schönberg osteopetrosis   
5          58                                  Alexander disease   
6          63                                    Alport syndrome   
7          65                         Leber congenital amaurosis   
8          82  Hereditary thrombophilia due to congenital ant...   
9          87                                     Apert syndrome   
10         97                         Familial paroxysmal ataxia   
11        107                                       BOR syndrome   
12        112                                   Bartter syndrome   
13        115             Congenital contractura

In [93]:
# Print only the first row of the sampled table
print(df_case_hposet.iloc[:1])

   ORPHACODE         DISEASE  \
0         15  Achondroplasia   

                                             HPO_ALL  \
0  HP:0000242; HP:0011452; HP:0002979; HP:0000365...   

                                               HPO_5  \
0  HP:0010241; HP:0000256; HP:0001377; HP:0005619...   

                                           HPO_NOISE  
0  HP:0003416; HP:0008445; HP:0005280; HP:0100530...  


In [95]:
# Fetch the three HPO strings of the row labelled 0 with a single lookup
hpo_all, hpo_5, hpo_noise = df_case_hposet.loc[0, ['HPO_ALL', 'HPO_5', 'HPO_NOISE']]

In [97]:
# Full term list, the sampled subset and the noisy subset, one per line
print(hpo_all, hpo_5, hpo_noise, sep="\n")

HP:0000242; HP:0011452; HP:0002979; HP:0000365; HP:0003180; HP:0000260; HP:0008947; HP:0004060; HP:0045087; HP:0010241; HP:0000256; HP:0009826; HP:0003194; HP:0002938; HP:0000309; HP:0002007; HP:0003026; HP:0002870; HP:0002808; HP:0045086; HP:0008905; HP:0000956; HP:0001513; HP:0011867; HP:0000238; HP:0005819; HP:0003498; HP:0001377; HP:0000463; HP:0001156; HP:0010536; HP:0012418; HP:0008445; HP:0002091; HP:0005280; HP:0005619; HP:0003416; HP:0005257; HP:0003375
HP:0010241; HP:0000256; HP:0001377; HP:0005619; HP:0000238
HP:0003416; HP:0008445; HP:0005280; HP:0100530; HP:0031983


In [99]:
# Print the complete HPO term list currently held in hpo_all
print(hpo_all)

HP:0000242; HP:0011452; HP:0002979; HP:0000365; HP:0003180; HP:0000260; HP:0008947; HP:0004060; HP:0045087; HP:0010241; HP:0000256; HP:0009826; HP:0003194; HP:0002938; HP:0000309; HP:0002007; HP:0003026; HP:0002870; HP:0002808; HP:0045086; HP:0008905; HP:0000956; HP:0001513; HP:0011867; HP:0000238; HP:0005819; HP:0003498; HP:0001377; HP:0000463; HP:0001156; HP:0010536; HP:0012418; HP:0008445; HP:0002091; HP:0005280; HP:0005619; HP:0003416; HP:0005257; HP:0003375


In [101]:
# Fetch the three HPO strings of the row labelled 2 with a single lookup
hpo_all, hpo_5, hpo_noise = df_case_hposet.loc[2, ['HPO_ALL', 'HPO_5', 'HPO_NOISE']]

In [46]:
# Full term list, the sampled subset and the noisy subset, one per line
print(hpo_all, hpo_5, hpo_noise, sep="\n")

HP:0000114; HP:0011964; HP:0002013; HP:0004396; HP:0002014; HP:0002653; HP:0004322; HP:0001878; HP:0001996; HP:0001510; HP:0012213; HP:0002659; HP:0003470; HP:0002019; HP:0003109; HP:0001508; HP:0002150; HP:0000121; HP:0002749; HP:0002900; HP:0002747; HP:0000107; HP:0032066; HP:0012405; HP:0003355; HP:0004349; HP:0004918; HP:0012608; HP:0000128; HP:0011387; HP:0000787; HP:0001959; HP:0000407; HP:0003126; HP:0002748; HP:0001944; HP:0001324; HP:0032944
HP:0002014; HP:0011964; HP:0002013; HP:0011387; HP:0003470
HP:0002900; HP:0002013; HP:0003126; HP:0031728; HP:0003173
