In [4]:
import pandas as pd
import numpy as np
import random

In [6]:
# read hpo, orphanet_code and disease information
df = pd.read_csv('hpo_id_orpha.csv')
df

Unnamed: 0,orphanet_code,disease_name,hpo_id
0,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001249
1,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001939
2,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0011968
3,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0000532
4,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0000533
...,...,...,...
114044,658549,Idiopathic small fibers neuropathy,HP:0007550
114045,658549,Idiopathic small fibers neuropathy,HP:0008652
114046,658549,Idiopathic small fibers neuropathy,HP:0002579
114047,658549,Idiopathic small fibers neuropathy,HP:0032147


In [8]:
# Dropping duplicate rows based on "orphanet_code" and keeping the first occurrence
df = df.drop_duplicates(subset="orphanet_code")
df = df.drop(columns = "hpo_id")
df

Unnamed: 0,orphanet_code,disease_name
0,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...
25,6,3-methylcrotonyl-CoA carboxylase deficiency
35,7,3C syndrome
99,8,"47,XYY syndrome"
144,9,Tetrasomy X
...,...,...
113858,592574,Menke-Hennekam syndrome
113902,596448,IgG4-related systemic disease
113932,599373,STXBP1-related encephalopathy
113963,647799,MYT1L-related developmental delay-intellectual...


In [10]:
# read the orphanet code and hpoid of pairs information
df_orpha_hposet = pd.read_csv('pairs_orphacode_hposet.csv')
df_orpha_hposet

Unnamed: 0,ORPHACODE,DISEASE,HPO_ALL
0,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
1,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
2,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
3,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
4,41,Dyschromatosis symmetrica hereditaria,HP:0012733; HP:0001304; HP:0011509; HP:0007988
...,...,...,...
2589,542310,Leukoencephalopathy with calcifications and cysts,HP:0100320; HP:0100543; HP:0001260; HP:0002516...
2590,544469,PRUNE1-related neurological syndrome,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2591,544469,PRUNE1-related neurological syndrome,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2592,565624,Combined oxidative phosphorylation defect type 39,HP:0000028; HP:0011344; HP:0007371; HP:0006956...


In [12]:
# Merging the data based on the 'ORPHACODE' and 'orphanet_code' columns
df_merged = pd.merge(df_orpha_hposet, df, left_on='ORPHACODE', right_on='orphanet_code', how='inner')

# Display the merged DataFrame
df_merged

Unnamed: 0,ORPHACODE,DISEASE,HPO_ALL,orphanet_code,disease_name
0,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...,15,Achondroplasia
1,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...,15,Achondroplasia
2,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...,18,Distal renal tubular acidosis
3,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...,18,Distal renal tubular acidosis
4,41,Dyschromatosis symmetrica hereditaria,HP:0012733; HP:0001304; HP:0011509; HP:0007988,41,Dyschromatosis symmetrica hereditaria
...,...,...,...,...,...
2589,542310,Leukoencephalopathy with calcifications and cysts,HP:0100320; HP:0100543; HP:0001260; HP:0002516...,542310,Leukoencephalopathy with calcifications and cysts
2590,544469,PRUNE1-related neurological syndrome,HP:0002540; HP:0000347; HP:0001285; HP:0001639...,544469,PRUNE1-related neurological syndrome
2591,544469,PRUNE1-related neurological syndrome,HP:0002540; HP:0000347; HP:0001285; HP:0001639...,544469,PRUNE1-related neurological syndrome
2592,565624,Combined oxidative phosphorylation defect type 39,HP:0000028; HP:0011344; HP:0007371; HP:0006956...,565624,Combined oxidative phosphorylation defect type 39


In [14]:
# Extract the three columns (ORPHACODE', 'disease_name', 'HPO_ALL')
df_pairs_hposet = df_merged[['ORPHACODE', 'disease_name', 'HPO_ALL']]
print(df_pairs_hposet.head())

   ORPHACODE                           disease_name  \
0         15                         Achondroplasia   
1         15                         Achondroplasia   
2         18          Distal renal tubular acidosis   
3         18          Distal renal tubular acidosis   
4         41  Dyschromatosis symmetrica hereditaria   

                                             HPO_ALL  
0  HP:0001513; HP:0000463; HP:0011867; HP:0000238...  
1  HP:0001513; HP:0000463; HP:0011867; HP:0000238...  
2  HP:0000114; HP:0011964; HP:0002013; HP:0004396...  
3  HP:0000114; HP:0011964; HP:0002013; HP:0004396...  
4     HP:0012733; HP:0001304; HP:0011509; HP:0007988  


In [16]:
df_pairs_hposet = df_pairs_hposet.rename(columns={"disease_name": "DISEASE"})
df_pairs_hposet

Unnamed: 0,ORPHACODE,DISEASE,HPO_ALL
0,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
1,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
2,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
3,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
4,41,Dyschromatosis symmetrica hereditaria,HP:0012733; HP:0001304; HP:0011509; HP:0007988
...,...,...,...
2589,542310,Leukoencephalopathy with calcifications and cysts,HP:0100320; HP:0100543; HP:0001260; HP:0002516...
2590,544469,PRUNE1-related neurological syndrome,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2591,544469,PRUNE1-related neurological syndrome,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2592,565624,Combined oxidative phosphorylation defect type 39,HP:0000028; HP:0011344; HP:0007371; HP:0006956...


In [18]:
# Extract the first two columns (ORPHACODE, DISEASE)
df_orpha_disease = df_pairs_hposet[['ORPHACODE', 'DISEASE']]
# Save to a csv file
df_orpha_disease.to_csv("paire_orphacode_disease.csv", index=False)

In [20]:
df_pairs_hposet

Unnamed: 0,ORPHACODE,DISEASE,HPO_ALL
0,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
1,15,Achondroplasia,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
2,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
3,18,Distal renal tubular acidosis,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
4,41,Dyschromatosis symmetrica hereditaria,HP:0012733; HP:0001304; HP:0011509; HP:0007988
...,...,...,...
2589,542310,Leukoencephalopathy with calcifications and cysts,HP:0100320; HP:0100543; HP:0001260; HP:0002516...
2590,544469,PRUNE1-related neurological syndrome,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2591,544469,PRUNE1-related neurological syndrome,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2592,565624,Combined oxidative phosphorylation defect type 39,HP:0000028; HP:0011344; HP:0007371; HP:0006956...


In [22]:
# Save to a csv file
df_pairs_hposet.to_csv("pairs_orphacode_hposet.csv", index=False)

In [24]:
# Define the function with file-saving capability
def pairs_phenotypes(n, m):
    
    # Read the file into a DataFrame, assuming it is tab-separated
    df = pd.read_csv('pairs_orphacode_hposet.csv')
    
    # Ensure n and m are within the specified limits
    n = min(n, 1184)
    m = min(m, 1410)
    
    # Extract first n rows and last m rows
    rows_extracted = pd.concat([df.head(n), df.tail(m)])
    
    # Save the result to a txt file
    rows_extracted.to_csv('pairs_phenotypes.csv')
    
    print(rows_extracted.head())
    print("File saved as 'pairs_phenotypes.csv'")
    
    return rows_extracted

In [26]:
df_paris_phenotypes = pairs_phenotypes(n=20, m=20)
print(df_paris_phenotypes)

   ORPHACODE                                DISEASE  \
0         15                         Achondroplasia   
1         15                         Achondroplasia   
2         18          Distal renal tubular acidosis   
3         18          Distal renal tubular acidosis   
4         41  Dyschromatosis symmetrica hereditaria   

                                             HPO_ALL  
0  HP:0001513; HP:0000463; HP:0011867; HP:0000238...  
1  HP:0001513; HP:0000463; HP:0011867; HP:0000238...  
2  HP:0000114; HP:0011964; HP:0002013; HP:0004396...  
3  HP:0000114; HP:0011964; HP:0002013; HP:0004396...  
4     HP:0012733; HP:0001304; HP:0011509; HP:0007988  
File saved as 'pairs_phenotypes.csv'
      ORPHACODE                                            DISEASE  \
0            15                                     Achondroplasia   
1            15                                     Achondroplasia   
2            18                      Distal renal tubular acidosis   
3            18       

In [28]:
# Function to randomly select n HPO terms from the HPOALL column in df_pairs_hposet
def pairs_hpo(i, n, m):
    
    # read the orphanet code and hpoid of pairs information
    df_pairs_hposet = pd.read_csv('pairs_phenotypes.csv')
    
    def random_hpo(hpo_str, i):
        # Split the HPO terms by "; " and randomly select n terms
        hpo_terms = hpo_str.split("; ")
        selected_terms = random.sample(hpo_terms, min(i, len(hpo_terms)))  # limit to available terms
        return "; ".join(selected_terms)
    
    # Function to select random HPO terms from a string
    def random_hpo_noise(hpo_str, num_terms):
        # Split the HPO terms by "; " and randomly select the specified number of terms
        hpo_terms = hpo_str.split("; ")
        selected_terms = random.sample(hpo_terms, min(num_terms, len(hpo_terms)))  # limit to available terms
        return selected_terms
   
    # read all the hpo term
    df_all_hpo = pd.read_csv('hpo_id_all.csv')
    
    # Convert df_all_hpo 'hpo_id' column to a list of all HPO terms
    all_hpo_terms = df_all_hpo['hpo_id'].tolist()
    
    # Apply the function to create the HPO_NOISE column in df_pairs_hposet
    def combine_hpo_noise(row):
        # Select n terms from the HPO_ALL column in df_pairs_hposet
        selected_hpo_all = random_hpo_noise(row['HPO_ALL'], n)
        # Select m terms from the df_all_hpo HPO list
        selected_hpo_noise = random.sample(all_hpo_terms, min(m, len(all_hpo_terms)))
        # Combine and return as a single string with "; " delimiter
        return "; ".join(selected_hpo_all + selected_hpo_noise)

    # Apply the function to each row in df_pairs_hposet
    df_pairs_hposet[f'HPO_{i}'] = df_pairs_hposet['HPO_ALL'].apply(lambda x: random_hpo(x, i))    

    # Add the new column HPO_NOISE to df_pairs_hposet
    df_pairs_hposet['HPO_NOISE'] = df_pairs_hposet.apply(combine_hpo_noise, axis=1)
    
    # Save the result to a csv file
    df_pairs_hposet.to_csv('pairs_hpo.csv')

    print(df_pairs_hposet.head())
    print("File saved as 'pairs_hpo.csv'")
    
    return df_pairs_hposet

In [30]:
# Example usage (assuming df_pairs_hposet and df_all_hpo are defined):
df_pairs_hposet = pairs_hpo(i=5, n=3, m=2)
print(df_pairs_hposet)

   Unnamed: 0  ORPHACODE                                DISEASE  \
0           0         15                         Achondroplasia   
1           1         15                         Achondroplasia   
2           2         18          Distal renal tubular acidosis   
3           3         18          Distal renal tubular acidosis   
4           4         41  Dyschromatosis symmetrica hereditaria   

                                             HPO_ALL  \
0  HP:0001513; HP:0000463; HP:0011867; HP:0000238...   
1  HP:0001513; HP:0000463; HP:0011867; HP:0000238...   
2  HP:0000114; HP:0011964; HP:0002013; HP:0004396...   
3  HP:0000114; HP:0011964; HP:0002013; HP:0004396...   
4     HP:0012733; HP:0001304; HP:0011509; HP:0007988   

                                               HPO_5  \
0  HP:0005619; HP:0009826; HP:0005819; HP:0008905...   
1  HP:0003498; HP:0000256; HP:0000365; HP:0004060...   
2  HP:0002014; HP:0011964; HP:0002013; HP:0011387...   
3  HP:0002014; HP:0001510; HP:000312

In [32]:
print(df_pairs_hposet.head(1))

   Unnamed: 0  ORPHACODE         DISEASE  \
0           0         15  Achondroplasia   

                                             HPO_ALL  \
0  HP:0001513; HP:0000463; HP:0011867; HP:0000238...   

                                               HPO_5  \
0  HP:0005619; HP:0009826; HP:0005819; HP:0008905...   

                                           HPO_NOISE  
0  HP:0005819; HP:0008947; HP:0010536; HP:0010622...  


In [34]:
# Access and print values from the first row for specific columns
hpo_all = df_pairs_hposet.loc[0, 'HPO_ALL']
hpo_5 = df_pairs_hposet.loc[0, 'HPO_5']
hpo_noise = df_pairs_hposet.loc[0, 'HPO_NOISE']

In [36]:
print(hpo_all)
print(hpo_5)
print(hpo_noise)

HP:0001513; HP:0000463; HP:0011867; HP:0000238; HP:0005819; HP:0003498; HP:0001377; HP:0001156; HP:0005257; HP:0012418; HP:0008445; HP:0002091; HP:0005280; HP:0005619; HP:0003416; HP:0003375; HP:0000956; HP:0010536; HP:0008905; HP:0045086; HP:0002808; HP:0000242; HP:0011452; HP:0000365; HP:0003180; HP:0000260; HP:0008947; HP:0004060; HP:0045087; HP:0002979; HP:0000256; HP:0009826; HP:0003194; HP:0002938; HP:0000309; HP:0002007; HP:0003026; HP:0002870; HP:0010241
HP:0005619; HP:0009826; HP:0005819; HP:0008905; HP:0001513
HP:0005819; HP:0008947; HP:0010536; HP:0010622; HP:6000577


In [38]:
print(hpo_all)

HP:0001513; HP:0000463; HP:0011867; HP:0000238; HP:0005819; HP:0003498; HP:0001377; HP:0001156; HP:0005257; HP:0012418; HP:0008445; HP:0002091; HP:0005280; HP:0005619; HP:0003416; HP:0003375; HP:0000956; HP:0010536; HP:0008905; HP:0045086; HP:0002808; HP:0000242; HP:0011452; HP:0000365; HP:0003180; HP:0000260; HP:0008947; HP:0004060; HP:0045087; HP:0002979; HP:0000256; HP:0009826; HP:0003194; HP:0002938; HP:0000309; HP:0002007; HP:0003026; HP:0002870; HP:0010241


In [44]:
hpo_all = df_pairs_hposet.loc[2, 'HPO_ALL']
hpo_5 = df_pairs_hposet.loc[2, 'HPO_5']
hpo_noise = df_pairs_hposet.loc[2, 'HPO_NOISE']

In [46]:
print(hpo_all)
print(hpo_5)
print(hpo_noise)

HP:0000114; HP:0011964; HP:0002013; HP:0004396; HP:0002014; HP:0002653; HP:0004322; HP:0001878; HP:0001996; HP:0001510; HP:0012213; HP:0002659; HP:0003470; HP:0002019; HP:0003109; HP:0001508; HP:0002150; HP:0000121; HP:0002749; HP:0002900; HP:0002747; HP:0000107; HP:0032066; HP:0012405; HP:0003355; HP:0004349; HP:0004918; HP:0012608; HP:0000128; HP:0011387; HP:0000787; HP:0001959; HP:0000407; HP:0003126; HP:0002748; HP:0001944; HP:0001324; HP:0032944
HP:0002014; HP:0011964; HP:0002013; HP:0011387; HP:0003470
HP:0002900; HP:0002013; HP:0003126; HP:0031728; HP:0003173
