In [18]:
import pandas as pd
import numpy as np

In [97]:
# read the rare disease variants with "pathogetic"
df = pd.read_csv('clinvar_orphanet_variants.tsv', sep='\t')

In [99]:
print(df)

      CHROM     POS       ID REF ALT  \
0         1  949523   183381   C   T   
1         1  957748  3255595   C   A   
2         1  976962   126556   C   T   
3         1  977355  2749616   C   A   
4         1  977433   644955   C   G   
...     ...     ...      ...  ..  ..   
51761    MT   14724   805947   G   A   
51762    MT   14739   690211   G   A   
51763    MT   15150     9681   G   A   
51764    MT   15915   690233   G   A   
51765    MT   15967     9572   G   A   

                                                CLNDISDB  \
0      MONDO:MONDO:0014502,MedGen:C4015293,OMIM:61612...   
1      MONDO:MONDO:0014052,MedGen:C3808739,OMIM:61512...   
2      MONDO:MONDO:0014052,MedGen:C3808739,OMIM:61512...   
3      MONDO:MONDO:0014052,MedGen:C3808739,OMIM:61512...   
4      MONDO:MONDO:0014052,MedGen:C3808739,OMIM:61512...   
...                                                  ...   
51761  MONDO:MONDO:0044970,MeSH:D028361,MedGen:C07516...   
51762  MONDO:MONDO:0010789,MedGen:C0162

In [101]:
# To extract the Orphanet information from the "CLNDISDB" column and create a new column named "ORPHACODE".
df['ORPHACODE'] = df['CLNDISDB'].str.extract(r'Orphanet:(\d+)')

In [103]:
print(df.head(2))

  CHROM     POS       ID REF ALT  \
0     1  949523   183381   C   T   
1     1  957748  3255595   C   A   

                                            CLNDISDB  \
0  MONDO:MONDO:0014502,MedGen:C4015293,OMIM:61612...   
1  MONDO:MONDO:0014052,MedGen:C3808739,OMIM:61512...   

                                               CLNDN      CLNSIG  \
0  Mendelian_susceptibility_to_mycobacterial_dise...  Pathogenic   
1                   Congenital_myasthenic_syndrome_8  Pathogenic   

                             GENEINFO                   MC ORPHACODE  
0                          ISG15:9636  SO:0001587|nonsense    319563  
1  AGRN:375790|LOC126805576:126805576  SO:0001587|nonsense       590  


In [107]:
# Extract the required columns
df = df[["CHROM", "POS", "ID", "REF", "ALT", "GENEINFO", "ORPHACODE"]]
# Extract only the gene name from each "GENEINFO" entry
df["GENEINFO"] = df["GENEINFO"].apply(lambda x: x.split(":")[0])
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE
0,1,949523,183381,C,T,ISG15,319563
1,1,957748,3255595,C,A,AGRN,590
2,1,976962,126556,C,T,AGRN,590
3,1,977355,2749616,C,A,AGRN,590
4,1,977433,644955,C,G,AGRN,590
...,...,...,...,...,...,...,...
51761,MT,14724,805947,G,A,MT-TE,68380
51762,MT,14739,690211,G,A,MT-TE,550
51763,MT,15150,9681,G,A,MT-CYB,254864
51764,MT,15915,690233,G,A,MT-TT,550


In [109]:
# Remove rows where CHROM is "X", "Y", or "MT"
df = df[~df["CHROM"].isin(["X", "Y", "MT"])]
df

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE
0,1,949523,183381,C,T,ISG15,319563
1,1,957748,3255595,C,A,AGRN,590
2,1,976962,126556,C,T,AGRN,590
3,1,977355,2749616,C,A,AGRN,590
4,1,977433,644955,C,G,AGRN,590
...,...,...,...,...,...,...,...
46762,22,51160726,975999,G,T,SHANK3,48652
46763,22,51169227,1803071,C,G,SHANK3,48652
46764,22,51169229,3069185,C,A,SHANK3,48652
46765,22,51169297,800506,G,T,SHANK3,48652


In [112]:
df.to_csv('variants_pathogenic.csv', index=False)

In [20]:
# To keep only the first two occurrences of each "ORPHCODE" value and remove any further duplicates in a DataFrame.
df_filtered = df.groupby('ORPHACODE').head(2)
df_filtered

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE
0,1,949523,183381,C,T,ISG15,319563
1,1,957748,3255595,C,A,AGRN,590
2,1,976962,126556,C,T,AGRN,590
12,1,1149118,96692,G,A,TNFRSF4,431149
13,1,1167659,60484,A,G,B3GALT6,536467
...,...,...,...,...,...,...,...
46687,22,51063824,3094,G,T,ARSA,309263
46704,22,51064656,3093,C,A,ARSA,309256
46738,22,51065649,3089,A,G,ARSA,309271
46753,22,51117022,1029037,A,T,SHANK3,48652


In [22]:
# Filter out rows where the values in the "ORPHACODE" column are unique (occur only once). 

# Step 1: Count occurrences of each value in the 'ORPHID' column.
orphacode_counts = df_filtered['ORPHACODE'].value_counts()

# Step 2: Filter rows where 'ORPHACODE' occurs more than once.
non_unique_orphacode = orphacode_counts[orphacode_counts > 1].index
df_twice = df_filtered[df_filtered['ORPHACODE'].isin(non_unique_orphacode)]

In [24]:
print(orphacode_counts)

ORPHACODE
891       2
306498    2
65285     2
404463    2
75233     2
         ..
69084     1
137776    1
99811     1
3406      1
319563    1
Name: count, Length: 2726, dtype: int64


In [26]:
df_twice

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE
1,1,957748,3255595,C,A,AGRN,590
2,1,976962,126556,C,T,AGRN,590
13,1,1167659,60484,A,G,B3GALT6,536467
14,1,1167660,1452181,T,C,B3GALT6,536467
16,1,1167851,60488,A,G,B3GALT6,642099
...,...,...,...,...,...,...,...
46667,22,51018405,1452176,G,A,CHKB,280671
46686,22,51063820,3052,G,A,ARSA,309263
46687,22,51063824,3094,G,T,ARSA,309263
46753,22,51117022,1029037,A,T,SHANK3,48652


In [28]:
# Count occurrences of each value in the 'ORPHID' column.
print(df_twice['ORPHACODE'].value_counts())

ORPHACODE
590       2
708       2
284426    2
53697     2
206549    2
         ..
6         2
77298     2
67047     2
66634     2
48652     2
Name: count, Length: 2190, dtype: int64


In [30]:
# Extract the ORPHACODE column
df_orpha = df_twice.groupby('ORPHACODE').head(1)
df_orpha = df_orpha[["ORPHACODE"]]

# Convert the column 'ORPHACODE' from object to int64
df_orpha["ORPHACODE"] = df_orpha["ORPHACODE"].astype("int64")

# Sort by 'orphanet_code' in ascending order
df_orpha = df_orpha.sort_values(by="ORPHACODE", ascending=True)

df_orpha.info()
df_orpha

<class 'pandas.core.frame.DataFrame'>
Index: 2190 entries, 542 to 31590
Data columns (total 1 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   ORPHACODE  2190 non-null   int64
dtypes: int64(1)
memory usage: 34.2 KB


Unnamed: 0,ORPHACODE
542,5
11100,6
20517,7
26939,13
11987,14
...,...
8480,662367
32505,662829
5960,664430
5776,664923


In [32]:
# read the orpha hpo file "hpo_id_orpha.csv"
df_orpha_hpo = pd.read_csv('hpo_id_orpha.csv')

# Sort by 'orphanet_code' in ascending order
df_orpha_hpo = df_orpha_hpo.sort_values(by="orphanet_code", ascending=True)
print(df_orpha_hpo)

        orphanet_code                                       disease_name  \
0                   5  Long chain 3-hydroxyacyl-CoA dehydrogenase def...   
24                  5  Long chain 3-hydroxyacyl-CoA dehydrogenase def...   
23                  5  Long chain 3-hydroxyacyl-CoA dehydrogenase def...   
22                  5  Long chain 3-hydroxyacyl-CoA dehydrogenase def...   
21                  5  Long chain 3-hydroxyacyl-CoA dehydrogenase def...   
...               ...                                                ...   
114028         658549                 Idiopathic small fibers neuropathy   
114027         658549                 Idiopathic small fibers neuropathy   
114047         658549                 Idiopathic small fibers neuropathy   
114036         658549                 Idiopathic small fibers neuropathy   
114048         658549                 Idiopathic small fibers neuropathy   

            hpo_id  
0       HP:0001249  
24      HP:0007703  
23      HP:0001943  
22 

In [34]:
# Merging the data based on the 'ORPHID' and 'database_id' columns
df_merged = pd.merge(df_orpha, df_orpha_hpo, left_on='ORPHACODE', right_on='orphanet_code', how='inner')

# Display the merged DataFrame
df_merged

Unnamed: 0,ORPHACODE,orphanet_code,disease_name,hpo_id
0,5,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001249
1,5,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0007703
2,5,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001943
3,5,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001290
4,5,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001252
...,...,...,...,...
44215,589905,589905,PHIP-related behavioral problems-intellectual ...,HP:0002020
44216,589905,589905,PHIP-related behavioral problems-intellectual ...,HP:0000486
44217,589905,589905,PHIP-related behavioral problems-intellectual ...,HP:0002019
44218,589905,589905,PHIP-related behavioral problems-intellectual ...,HP:0001319


In [36]:
# Extract the 'orphanet_code', and 'hpo_id' columns
df_merged = df_merged[['orphanet_code', 'hpo_id']]

# Sort by 'orphanet_code' in ascending order
df_merged = df_merged.sort_values(by="orphanet_code", ascending=True)

# Display the extracted columns
df_merged

Unnamed: 0,orphanet_code,hpo_id
0,5,HP:0001249
24,5,HP:0000488
23,5,HP:0001939
22,5,HP:0011968
21,5,HP:0000532
...,...,...
44195,589905,HP:0000400
44196,589905,HP:0000233
44197,589905,HP:0001182
44189,589905,HP:0000316


In [38]:
# Group by 'orphanet_code' and concatenate 'hpo_id' values for each unique 'orphanet_code'
df_pair_hpo = df_merged.groupby('orphanet_code')['hpo_id'].apply(lambda x: '; '.join(x)).reset_index()

# Display the result
print(df_pair_hpo)

      orphanet_code                                             hpo_id
0                 5  HP:0001249; HP:0000488; HP:0001939; HP:0011968...
1                 6  HP:0002093; HP:0001943; HP:0001252; HP:0004357...
2                 7  HP:0003196; HP:0000369; HP:0000648; HP:0001636...
3                13  HP:0002179; HP:0000711; HP:0000508; HP:0001249...
4                14  HP:0000707; HP:0002066; HP:0002013; HP:0001762...
...             ...                                                ...
1414         567548  HP:0002027; HP:0011947; HP:0001510; HP:0003073...
1415         570422  HP:0002240; HP:0100806; HP:0001508; HP:0001410...
1416         572798  HP:0002352; HP:0000316; HP:0002365; HP:0001249...
1417         589618  HP:0007256; HP:0000252; HP:0000414; HP:0000826...
1418         589905  HP:0007874; HP:0100710; HP:0000508; HP:0000540...

[1419 rows x 2 columns]


In [40]:
df_twice
df_twice.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4380 entries, 1 to 46754
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   CHROM      4380 non-null   object
 1   POS        4380 non-null   int64 
 2   ID         4380 non-null   int64 
 3   REF        4380 non-null   object
 4   ALT        4380 non-null   object
 5   GENEINFO   4380 non-null   object
 6   ORPHACODE  4380 non-null   object
dtypes: int64(2), object(5)
memory usage: 273.8+ KB


In [42]:
orpha_hpo_list = df_pair_hpo["orphanet_code"].tolist()

In [44]:
# Convert the column 'ORPHACODE' from object to int64
# Fitler the rare disease variants with ORPHACODE in the orpha_hpo_list
df_twice.loc[:, "ORPHACODE"] = df_twice["ORPHACODE"].astype("int64")
df_pair_variants = df_twice[df_twice['ORPHACODE'].isin(orpha_hpo_list)]

# Sort by 'ORPHACODE' in ascending order
df_pair_variants = df_pair_variants.sort_values(by="ORPHACODE", ascending=True).reset_index(drop=True)

df_pair_variants

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE
0,2,26414203,1427255,A,T,HADHA,5
1,1,24151875,521752,G,A,HMGCL,5
2,3,182738013,2676471,C,A,MCCC1,6
3,3,182737965,503626,C,A,MCCC1,6
4,8,126044481,92129,A,T,WASHC5,7
...,...,...,...,...,...,...,...
2833,1,119576820,440920,C,G,WARS2,572798
2834,19,36210431,807435,C,T,KMT2B,589618
2835,19,36210859,3066001,C,T,KMT2B,589618
2836,6,79655946,450225,G,A,PHIP,589905


In [82]:
# read the rare disease inheritance file "orpha_inheritance.csv"
df_inheritance = pd.read_csv('orpha_inheritance.csv')

# Sort by 'OrphaCode' in ascending order
df_inheritance = df_inheritance.sort_values(by="OrphaCode", ascending=True)
df_inheritance

Unnamed: 0,OrphaCode,TypeOfInheritance
1469,5,AR
1394,6,AR
508,7,AR
424,13,AR
131,14,AR
...,...,...
3465,675216,AD
3470,675767,AD
3501,675775,AR
3502,675782,AR


In [164]:
# Merging the data df_pair_variants and df_inheritance based on the 'ORPHACODE' and 'database_id' columns
df_variants_inheritance = pd.merge(df_pair_variants, df_inheritance, left_on='ORPHACODE', right_on='OrphaCode', how='inner')

# Display the merged DataFrame
df_variants_inheritance

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,OrphaCode,TypeOfInheritance
0,2,26414203,1427255,A,T,HADHA,5,5,AR
1,1,24151875,521752,G,A,HMGCL,5,5,AR
2,3,182738013,2676471,C,A,MCCC1,6,6,AR
3,3,182737965,503626,C,A,MCCC1,6,6,AR
4,8,126044481,92129,A,T,WASHC5,7,7,AR
...,...,...,...,...,...,...,...,...,...
2589,5,74054703,440788,T,G,GFM2,565624,565624,AR
2590,19,36210431,807435,C,T,KMT2B,589618,589618,AD
2591,19,36210859,3066001,C,T,KMT2B,589618,589618,AD
2592,6,79655946,450225,G,A,PHIP,589905,589905,AD


In [166]:
# Dropping the "OrphaCode" column and renaming "TypeOfInheritance" to "INHERITANCE"
df_variants_inheritance = df_variants_inheritance.drop(columns=["OrphaCode"])
df_variants_inheritance = df_variants_inheritance.rename(columns={"TypeOfInheritance": "INHERITANCE"})
df_variants_inheritance

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE
0,2,26414203,1427255,A,T,HADHA,5,AR
1,1,24151875,521752,G,A,HMGCL,5,AR
2,3,182738013,2676471,C,A,MCCC1,6,AR
3,3,182737965,503626,C,A,MCCC1,6,AR
4,8,126044481,92129,A,T,WASHC5,7,AR
...,...,...,...,...,...,...,...,...
2589,5,74054703,440788,T,G,GFM2,565624,AR
2590,19,36210431,807435,C,T,KMT2B,589618,AD
2591,19,36210859,3066001,C,T,KMT2B,589618,AD
2592,6,79655946,450225,G,A,PHIP,589905,AD


In [168]:
# Sorting the DataFrame by "INHERITANCE" and "ORPHACODE"
df_variants_pair = df_variants_inheritance.sort_values(by=["INHERITANCE", "ORPHACODE"]).reset_index(drop=True)
df_variants_pair

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE
0,4,1806104,16330,G,T,FGFR3,15,AD
1,4,1805658,2664079,C,G,FGFR3,15,AD
2,7,138391446,1344704,T,C,ATP6V0A4,18,AD
3,2,71185243,12228,T,C,ATP6V1B1,18,AD
4,1,154557469,14820,A,G,ADAR,41,AD
...,...,...,...,...,...,...,...,...
2589,17,8076766,929264,G,C,TMEM107,542310,AR
2590,1,150990382,932732,T,C,PRUNE1,544469,AR
2591,1,150990970,427231,C,A,PRUNE1,544469,AR
2592,5,74018387,225206,C,T,GFM2,565624,AR


In [170]:
# Counting the values in the "INHERITANCE" column
inheritance_counts = df_variants_pair["INHERITANCE"].value_counts()
print(inheritance_counts)

INHERITANCE
AR    1410
AD    1184
Name: count, dtype: int64


In [172]:
df_variants_pair.to_csv('pairs_variants_gene_orpha_inher.csv', index=False)

In [22]:
# Define the function with file-saving capability
def pairs_genes(n, m):
    
    # Read the file into a DataFrame, assuming it is tab-separated
    df = pd.read_csv('pairs_variants_gene_orpha_inher.csv')
    
    # Ensure n and m are within the specified limits
    n = min(n, 1184)
    m = min(m, 1410)
    
    # Extract first n rows and last m rows
    rows_extracted = pd.concat([df.head(n), df.tail(m)])
    
    # Extract first 5+n columns and last m columns
    columns_extracted = pd.concat([df.iloc[:, :5+n], df.iloc[:, -m:]], axis=1)
    
    # Combine extracted rows and columns
    extracted_df = rows_extracted[columns_extracted.columns]
    
    # Save the result to a txt file
    extracted_df.to_csv('pairs_genes.csv', index=False)
    
    return extracted_df

In [24]:
df_pairs_genes = pairs_genes(20, 20)

In [174]:
df_variants_pair.loc[:, 'QUAL'] = '.'
df_variants_pair.loc[:, 'FILTER'] = '.'
df_variants_pair.loc[:, 'INFO'] = 'PR'
df_variants_pair.loc[:, 'FORMAT'] = 'GT'
df_variants_pair

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE,QUAL,FILTER,INFO,FORMAT
0,4,1806104,16330,G,T,FGFR3,15,AD,.,.,PR,GT
1,4,1805658,2664079,C,G,FGFR3,15,AD,.,.,PR,GT
2,7,138391446,1344704,T,C,ATP6V0A4,18,AD,.,.,PR,GT
3,2,71185243,12228,T,C,ATP6V1B1,18,AD,.,.,PR,GT
4,1,154557469,14820,A,G,ADAR,41,AD,.,.,PR,GT
...,...,...,...,...,...,...,...,...,...,...,...,...
2589,17,8076766,929264,G,C,TMEM107,542310,AR,.,.,PR,GT
2590,1,150990382,932732,T,C,PRUNE1,544469,AR,.,.,PR,GT
2591,1,150990970,427231,C,A,PRUNE1,544469,AR,.,.,PR,GT
2592,5,74018387,225206,C,T,GFM2,565624,AR,.,.,PR,GT


In [58]:
# To implement this logic where each of the new columns follows a specific pattern,
# with only one row containing "0/1" or "1/0"  if the value of INHERITANCE is "AD", 
# with only one row cotaining "1/1" if the value of INHERITANCE is "AR" and the rest containing "0/0", 
# we can programmatically generate the values for each of the 2594 new columns. 
# For column "1", the value at row 1 will be "0/1" or "1/0", if the value of INHERITANCE is "AD", 
# For column "1", the value at row 1 will be "1/1", if the value of INHERITANCE is "AR", 
# For column "2", the value at row 2 will be "1/0" or "0/1",if the value of INHERITANCE is "AD", 
# For column "2", the value at row 2 will be "1/1", if the value of INHERITANCE is "AR",  and so on.

In [176]:
# Number of new columns to generate
num_new_columns = 2594
num_rows = len(df_variants_pair)

# Initialize all new columns to "0/0" at once for efficiency
new_columns = pd.DataFrame("0/0", index=df_variants_pair.index, columns=[f'col_{i+1}' for i in range(num_new_columns)])
df_variants_pair = pd.concat([df_variants_pair, new_columns], axis=1)

# Creating lists of indices and values to update
ad_indices = df_variants_pair[df_variants_pair["INHERITANCE"] == "AD"].index
ar_indices = df_variants_pair[df_variants_pair["INHERITANCE"] == "AR"].index

# Update "AD" rows with "0/1" or "1/0"
for idx in ad_indices:
    col_name = f'col_{idx + 1}'
    df_variants_pair.at[idx, col_name] = np.random.choice(["0/1", "1/0"])

# Update "AR" rows with "1/1"
for idx in ar_indices:
    col_name = f'col_{idx + 1}'
    df_variants_pair.at[idx, col_name] = "1/1"

# De-fragmenting DataFrame for performance improvement
df_variants_pair = df_variants_pair.copy()

df_variants_pair

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE,QUAL,FILTER,...,col_2585,col_2586,col_2587,col_2588,col_2589,col_2590,col_2591,col_2592,col_2593,col_2594
0,4,1806104,16330,G,T,FGFR3,15,AD,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1,4,1805658,2664079,C,G,FGFR3,15,AD,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2,7,138391446,1344704,T,C,ATP6V0A4,18,AD,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
3,2,71185243,12228,T,C,ATP6V1B1,18,AD,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
4,1,154557469,14820,A,G,ADAR,41,AD,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2589,17,8076766,929264,G,C,TMEM107,542310,AR,.,.,...,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0,0/0
2590,1,150990382,932732,T,C,PRUNE1,544469,AR,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0
2591,1,150990970,427231,C,A,PRUNE1,544469,AR,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0
2592,5,74018387,225206,C,T,GFM2,565624,AR,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0


In [178]:
# Removing the specified columns "GENEINFO", "ORPHACODE", and "INHERITANCE"
df_variants_pair_vcf = df_variants_pair.drop(columns=["GENEINFO", "ORPHACODE", "INHERITANCE"])
df_variants_pair_vcf

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,col_1,...,col_2585,col_2586,col_2587,col_2588,col_2589,col_2590,col_2591,col_2592,col_2593,col_2594
0,4,1806104,16330,G,T,.,.,PR,GT,0/1,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1,4,1805658,2664079,C,G,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2,7,138391446,1344704,T,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
3,2,71185243,12228,T,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
4,1,154557469,14820,A,G,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2589,17,8076766,929264,G,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0,0/0
2590,1,150990382,932732,T,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0
2591,1,150990970,427231,C,A,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0
2592,5,74018387,225206,C,T,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0


In [180]:
df_variants_pair_vcf.to_csv('variants_pairs_vcf.csv', index=False)

In [67]:
# Extracting the "ORPHACODE" column as a new DataFrame
df_orphacode = df_variants_pair[["ORPHACODE"]]
df_orphacode

Unnamed: 0,ORPHACODE
0,15
1,15
2,18
3,18
4,41
...,...
2589,542310
2590,544469
2591,544469
2592,565624


In [70]:
# Merging the data df_orphacode and df_pair_hpo based on the 'ORPHACODE' and 'orphanet_code' columns
df_orpha_hpoid = pd.merge(df_orphacode, df_pair_hpo, left_on='ORPHACODE', right_on='orphanet_code', how='inner')

# Display the merged DataFrame
df_orpha_hpoid

Unnamed: 0,ORPHACODE,orphanet_code,hpo_id
0,15,15,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
1,15,15,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
2,18,18,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
3,18,18,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
4,41,41,HP:0012733; HP:0001304; HP:0011509; HP:0007988
...,...,...,...
2589,542310,542310,HP:0100320; HP:0100543; HP:0001260; HP:0002516...
2590,544469,544469,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2591,544469,544469,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2592,565624,565624,HP:0000028; HP:0011344; HP:0007371; HP:0006956...


In [72]:
# Removing the specified columns "orphanet_code"
df_orpha_hpoid = df_orpha_hpoid.drop(columns=["orphanet_code"])
df_orpha_hpoid = df_orpha_hpoid.rename(columns={"hpo_id": "HPOID"})
df_orpha_hpoid

Unnamed: 0,ORPHACODE,HPOID
0,15,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
1,15,HP:0001513; HP:0000463; HP:0011867; HP:0000238...
2,18,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
3,18,HP:0000114; HP:0011964; HP:0002013; HP:0004396...
4,41,HP:0012733; HP:0001304; HP:0011509; HP:0007988
...,...,...
2589,542310,HP:0100320; HP:0100543; HP:0001260; HP:0002516...
2590,544469,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2591,544469,HP:0002540; HP:0000347; HP:0001285; HP:0001639...
2592,565624,HP:0000028; HP:0011344; HP:0007371; HP:0006956...


In [75]:
df_orpha_hpoid.to_csv('pairs_orphacode_hposet.csv', index=False)