In [96]:
import pandas as pd
import numpy as np

In [98]:
# Load the pathogenic variant table; the preview printed below shows the columns
# CHROM, POS, ID, REF, ALT, GENEINFO and ORPHACODE.
df = pd.read_csv('variants_pathogenic.csv')

In [100]:
# Print a plain-text (truncated) preview of the variant table.
print(df)

       CHROM       POS       ID REF ALT GENEINFO  ORPHACODE
0          1    949523   183381   C   T    ISG15     319563
1          1    957748  3255595   C   A     AGRN        590
2          1    976962   126556   C   T     AGRN        590
3          1    977355  2749616   C   A     AGRN        590
4          1    977433   644955   C   G     AGRN        590
...      ...       ...      ...  ..  ..      ...        ...
46762     22  51160726   975999   G   T   SHANK3      48652
46763     22  51169227  1803071   C   G   SHANK3      48652
46764     22  51169229  3069185   C   A   SHANK3      48652
46765     22  51169297   800506   G   T   SHANK3      48652
46766     22  51169426   975165   C   A   SHANK3      48652

[46767 rows x 7 columns]


In [102]:
# Make sure ORPHACODE is int64 so it can be joined against the Orphanet tables.
# The column is re-assigned rather than written through `df.loc[:, ...]`: a full-column
# `.loc` assignment writes the values into the existing column and can keep its old
# dtype, which would silently turn this cast into a no-op.
df["ORPHACODE"] = df["ORPHACODE"].astype("int64")

In [104]:
# Keep only the first row listed for each ORPHACODE value (one representative variant
# per code) and drop every further row with the same code.
# `astype` returns a new frame, so the int64 cast neither writes into a slice of `df`
# (no SettingWithCopyWarning) nor relies on `.loc` changing the column dtype in place.
df_filtered = df.groupby('ORPHACODE').head(1).astype({"ORPHACODE": "int64"})
df_filtered

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE
0,1,949523,183381,C,T,ISG15,319563
1,1,957748,3255595,C,A,AGRN,590
12,1,1149118,96692,G,A,TNFRSF4,431149
13,1,1167659,60484,A,G,B3GALT6,536467
16,1,1167851,60488,A,G,B3GALT6,642099
...,...,...,...,...,...,...,...
46666,22,51018184,1075204,C,A,CHKB,280671
46686,22,51063820,3052,G,A,ARSA,309263
46704,22,51064656,3093,C,A,ARSA,309256
46738,22,51065649,3089,A,G,ARSA,309271


In [106]:
# Load the rare-disease inheritance table "orpha_inheritance.csv" and
# order its rows by ascending 'OrphaCode'.
df_inheritance = pd.read_csv('orpha_inheritance.csv').sort_values("OrphaCode")
df_inheritance

Unnamed: 0,OrphaCode,TypeOfInheritance
1469,5,AR
1394,6,AR
508,7,AR
424,13,AR
131,14,AR
...,...,...
3465,675216,AD
3470,675767,AD
3501,675775,AR
3502,675782,AR


In [108]:
# Attach the inheritance mode to each representative variant: inner join of
# df_filtered['ORPHACODE'] against df_inheritance['OrphaCode'].
df_variants_inheritance = df_filtered.merge(
    df_inheritance,
    how='inner',
    left_on='ORPHACODE',
    right_on='OrphaCode',
)

# Show the joined table
df_variants_inheritance

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,OrphaCode,TypeOfInheritance
0,1,949523,183381,C,T,ISG15,319563,319563,AR
1,1,957748,3255595,C,A,AGRN,590,590,AD
2,1,1149118,96692,G,A,TNFRSF4,431149,431149,AR
3,1,1167659,60484,A,G,B3GALT6,536467,536467,AR
4,1,1470772,161193,G,C,TMEM240,98773,98773,AD
...,...,...,...,...,...,...,...,...,...
2210,22,42456982,2725305,A,C,NAGA,3137,3137,AR
2211,22,43011324,2443787,A,G,RNU12,85199,85199,AR
2212,22,50297711,3438,G,C,ALG12,79324,79324,AR
2213,22,50844792,1120017,A,G,PPP6R2,363981,363981,AR


In [110]:
# Load the Orphanet-to-HPO table "hpo_id_orpha.csv" and
# order its rows by ascending 'orphanet_code'.
df_orpha_hpo = pd.read_csv('hpo_id_orpha.csv').sort_values("orphanet_code")
df_orpha_hpo

Unnamed: 0,orphanet_code,disease_name,hpo_id
0,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001249
24,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0007703
23,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001943
22,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001290
21,5,Long chain 3-hydroxyacyl-CoA dehydrogenase def...,HP:0001252
...,...,...,...
114028,658549,Idiopathic small fibers neuropathy,HP:0010829
114027,658549,Idiopathic small fibers neuropathy,HP:0100502
114047,658549,Idiopathic small fibers neuropathy,HP:0032147
114036,658549,Idiopathic small fibers neuropathy,HP:0001097


In [112]:
# Attach the HPO terms of each disease: inner join of
# df_variants_inheritance['ORPHACODE'] against df_orpha_hpo['orphanet_code'].
df_merged_hpo = df_variants_inheritance.merge(
    df_orpha_hpo,
    how='inner',
    left_on='ORPHACODE',
    right_on='orphanet_code',
)

# Show the joined table
df_merged_hpo

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,OrphaCode,TypeOfInheritance,orphanet_code,disease_name,hpo_id
0,1,957748,3255595,C,A,AGRN,590,590,AD,590,Congenital myasthenic syndrome,HP:0000276
1,1,957748,3255595,C,A,AGRN,590,590,AD,590,Congenital myasthenic syndrome,HP:0001761
2,1,957748,3255595,C,A,AGRN,590,590,AD,590,Congenital myasthenic syndrome,HP:0001558
3,1,957748,3255595,C,A,AGRN,590,590,AD,590,Congenital myasthenic syndrome,HP:0002751
4,1,957748,3255595,C,A,AGRN,590,590,AD,590,Congenital myasthenic syndrome,HP:0001249
...,...,...,...,...,...,...,...,...,...,...,...,...
44983,22,50297711,3438,G,C,ALG12,79324,79324,AR,79324,ALG12-CDG,HP:0000028
44984,22,50297711,3438,G,C,ALG12,79324,79324,AR,79324,ALG12-CDG,HP:0100776
44985,22,50297711,3438,G,C,ALG12,79324,79324,AR,79324,ALG12-CDG,HP:0000119
44986,22,50297711,3438,G,C,ALG12,79324,79324,AR,79324,ALG12-CDG,HP:0003645


In [114]:
# Narrow the table to the disease code, the HPO term and the inheritance mode
df_merged_hpo = df_merged_hpo.loc[:, ['orphanet_code', 'hpo_id', 'TypeOfInheritance']]

# Show the narrowed table
df_merged_hpo

Unnamed: 0,orphanet_code,hpo_id,TypeOfInheritance
0,590,HP:0000276,AD
1,590,HP:0001761,AD
2,590,HP:0001558,AD
3,590,HP:0002751,AD
4,590,HP:0001249,AD
...,...,...,...
44983,79324,HP:0000028,AR
44984,79324,HP:0100776,AR
44985,79324,HP:0000119,AR
44986,79324,HP:0003645,AR


In [116]:
# Collapse to one row per 'orphanet_code', joining all of its 'hpo_id' values
# into a single "; "-separated string.
df_case_hpo = (
    df_merged_hpo
    .groupby('orphanet_code')['hpo_id']
    .agg('; '.join)
    .reset_index()
)
df_case_hpo

Unnamed: 0,orphanet_code,hpo_id
0,5,HP:0001249; HP:0007703; HP:0001943; HP:0001290...
1,6,HP:0100022; HP:0001987; HP:0001992; HP:0100659...
2,7,HP:0001522; HP:0001305; HP:0000256; HP:0002808...
3,13,HP:0001266; HP:0002487; HP:0001251; HP:0000713...
4,14,HP:0012804; HP:0001640; HP:0001635; HP:0003198...
...,...,...
1503,562528,HP:0002650; HP:0011968; HP:0001250; HP:0002093...
1504,565624,HP:0001347; HP:0002013; HP:0001260; HP:0002059...
1505,572013,HP:0010864; HP:0001302; HP:0000324; HP:0004305...
1506,589618,HP:0000821; HP:0033049; HP:0031959; HP:0000473...


In [118]:
# Re-display the inheritance table (OrphaCode, TypeOfInheritance) for reference.
df_inheritance

Unnamed: 0,OrphaCode,TypeOfInheritance
1469,5,AR
1394,6,AR
508,7,AR
424,13,AR
131,14,AR
...,...,...
3465,675216,AD
3470,675767,AD
3501,675775,AR
3502,675782,AR


In [173]:
# Attach the inheritance mode to each per-disease HPO set: inner join of
# df_case_hpo['orphanet_code'] against df_inheritance['OrphaCode'].
df_inheritance_hposet = df_case_hpo.merge(
    df_inheritance,
    how='inner',
    left_on='orphanet_code',
    right_on='OrphaCode',
)

# Show the joined table
df_inheritance_hposet

Unnamed: 0,orphanet_code,hpo_id,OrphaCode,TypeOfInheritance
0,5,HP:0001249; HP:0007703; HP:0001943; HP:0001290...,5,AR
1,6,HP:0100022; HP:0001987; HP:0001992; HP:0100659...,6,AR
2,7,HP:0001522; HP:0001305; HP:0000256; HP:0002808...,7,AR
3,13,HP:0001266; HP:0002487; HP:0001251; HP:0000713...,13,AR
4,14,HP:0012804; HP:0001640; HP:0001635; HP:0003198...,14,AR
...,...,...,...,...
1503,562528,HP:0002650; HP:0011968; HP:0001250; HP:0002093...,562528,AD
1504,565624,HP:0001347; HP:0002013; HP:0001260; HP:0002059...,565624,AR
1505,572013,HP:0010864; HP:0001302; HP:0000324; HP:0004305...,572013,AD
1506,589618,HP:0000821; HP:0033049; HP:0031959; HP:0000473...,589618,AD


In [175]:
# The right-hand join key duplicates 'orphanet_code'; remove it.
df_inheritance_hposet = df_inheritance_hposet.drop("OrphaCode", axis=1)

In [177]:
# Switch to upper-case column names, then order the rows by inheritance mode
# and disease code with a fresh 0..n-1 index.
df_inheritance_hposet = (
    df_inheritance_hposet
    .rename(columns={
        "TypeOfInheritance": "INHERITANCE",
        "hpo_id": "HPOID",
        "orphanet_code": "ORPHACODE",
    })
    .sort_values(by=["INHERITANCE", "ORPHACODE"])
    .reset_index(drop=True)
)
df_inheritance_hposet

Unnamed: 0,ORPHACODE,HPOID,INHERITANCE
0,15,HP:0000242; HP:0011452; HP:0002979; HP:0000365...,AD
1,18,HP:0032066; HP:0012405; HP:0011387; HP:0003355...,AD
2,41,HP:0007988; HP:0011509; HP:0001304; HP:0012733,AD
3,51,HP:0030038; HP:0000508; HP:0004809; HP:0011834...,AD
4,53,HP:0001373; HP:0002758; HP:0002754; HP:0002007...,AD
...,...,...,...
1503,542306,HP:0011704; HP:0000546; HP:0005155; HP:0010864...,AR
1504,542310,HP:0010576; HP:0000712; HP:0011153; HP:0001250...,AR
1505,544469,HP:0000648; HP:0010864; HP:0002313; HP:0002650...,AR
1506,557003,HP:0000278; HP:0000286; HP:0012758; HP:0000519...,AR


In [179]:
# INHERITANCE was only needed for the ordering above; keep ORPHACODE and HPOID.
df_inheritance_hposet = df_inheritance_hposet.drop("INHERITANCE", axis=1)
df_inheritance_hposet

Unnamed: 0,ORPHACODE,HPOID
0,15,HP:0000242; HP:0011452; HP:0002979; HP:0000365...
1,18,HP:0032066; HP:0012405; HP:0011387; HP:0003355...
2,41,HP:0007988; HP:0011509; HP:0001304; HP:0012733
3,51,HP:0030038; HP:0000508; HP:0004809; HP:0011834...
4,53,HP:0001373; HP:0002758; HP:0002754; HP:0002007...
...,...,...
1503,542306,HP:0011704; HP:0000546; HP:0005155; HP:0010864...
1504,542310,HP:0010576; HP:0000712; HP:0011153; HP:0001250...
1505,544469,HP:0000648; HP:0010864; HP:0002313; HP:0002650...
1506,557003,HP:0000278; HP:0000286; HP:0012758; HP:0000519...


In [182]:
# Export the ORPHACODE -> HPO-set table (without the row index) and keep its
# ORPHACODE values as a plain Python list for filtering the variants later.
df_inheritance_hposet.to_csv('case_orphacode_hposet.csv', index=False)
case_hpo_list = df_inheritance_hposet["ORPHACODE"].to_list()

In [184]:
# Convert the column 'ORPHACODE' to int64 before filtering the rare-disease variants
# by the ORPHACODE values in case_hpo_list.
# The column is re-assigned rather than written through `df.loc[:, ...]`: a full-column
# `.loc` assignment writes the values into the existing column and can keep its old
# dtype, which would silently turn this cast into a no-op.
df["ORPHACODE"] = df["ORPHACODE"].astype("int64")

In [133]:
# Keep the variants whose ORPHACODE appears in case_hpo_list, ordered by
# ascending ORPHACODE with a fresh 0..n-1 index.
in_case_list = df['ORPHACODE'].isin(case_hpo_list)
df_case_variants = (
    df.loc[in_case_list]
    .sort_values("ORPHACODE")
    .reset_index(drop=True)
)

# Show the filtered variants
df_case_variants

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE
0,1,24151875,521752,G,A,HMGCL,5
1,2,26414203,1427255,A,T,HADHA,5
2,2,26437359,8737,G,A,HADHA,5
3,2,26453059,944315,C,T,HADHA,5
4,2,26455132,2107593,G,A,HADHA,5
...,...,...,...,...,...,...,...
32878,6,79671492,545398,G,A,PHIP,589905
32879,6,79679778,982876,G,T,PHIP,589905
32880,6,79656427,1708980,C,T,PHIP,589905
32881,6,79672955,2664246,C,A,PHIP,589905


In [135]:
# Attach the inheritance mode to every case variant: inner join of
# df_case_variants['ORPHACODE'] against df_inheritance['OrphaCode'].
df_merged_inheritance = df_case_variants.merge(
    df_inheritance,
    how='inner',
    left_on='ORPHACODE',
    right_on='OrphaCode',
)

# Show the joined table
df_merged_inheritance

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,OrphaCode,TypeOfInheritance
0,1,24151875,521752,G,A,HMGCL,5,5,AR
1,2,26414203,1427255,A,T,HADHA,5,5,AR
2,2,26437359,8737,G,A,HADHA,5,5,AR
3,2,26453059,944315,C,T,HADHA,5,5,AR
4,2,26455132,2107593,G,A,HADHA,5,5,AR
...,...,...,...,...,...,...,...,...,...
32878,6,79671492,545398,G,A,PHIP,589905,589905,AD
32879,6,79679778,982876,G,T,PHIP,589905,589905,AD
32880,6,79656427,1708980,C,T,PHIP,589905,589905,AD
32881,6,79672955,2664246,C,A,PHIP,589905,589905,AD


In [137]:
# Remove the duplicate join key "OrphaCode" and expose the inheritance mode
# under the name "INHERITANCE".
df_variants_case = (
    df_merged_inheritance
    .drop("OrphaCode", axis=1)
    .rename(columns={"TypeOfInheritance": "INHERITANCE"})
)
df_variants_case

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE
0,1,24151875,521752,G,A,HMGCL,5,AR
1,2,26414203,1427255,A,T,HADHA,5,AR
2,2,26437359,8737,G,A,HADHA,5,AR
3,2,26453059,944315,C,T,HADHA,5,AR
4,2,26455132,2107593,G,A,HADHA,5,AR
...,...,...,...,...,...,...,...,...
32878,6,79671492,545398,G,A,PHIP,589905,AD
32879,6,79679778,982876,G,T,PHIP,589905,AD
32880,6,79656427,1708980,C,T,PHIP,589905,AD
32881,6,79672955,2664246,C,A,PHIP,589905,AD


In [139]:
# Order the case variants by inheritance mode, then disease code, and renumber the rows.
df_variants_case = df_variants_case.sort_values(
    ["INHERITANCE", "ORPHACODE"],
    ignore_index=True,
)
df_variants_case

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE
0,4,1807371,16337,C,A,FGFR3,15,AD
1,4,1805658,2664079,C,G,FGFR3,15,AD
2,4,1806104,16330,G,T,FGFR3,15,AD
3,4,1806119,16328,G,C,FGFR3,15,AD
4,7,138391446,1344704,T,C,ATP6V0A4,18,AD
...,...,...,...,...,...,...,...,...
32878,2,172582561,635400,A,G,DYNC1I2,544469,AR
32879,11,17167409,599543,C,A,PIK3C2A,557003,AR
32880,5,74055190,225205,T,C,GFM2,565624,AR
32881,5,74054703,440788,T,G,GFM2,565624,AR


In [141]:
# Save every case variant together with its inheritance mode.
# NOTE(review): unlike the other exports in this notebook, `index=False` is not passed,
# so the row index is written as an extra unnamed first column — confirm that
# whatever reads this file expects it.
df_variants_case.to_csv('case_variants_all.csv')

In [193]:
# Keep only the first row listed for each ORPHACODE value; later rows with the
# same code are discarded. The original row labels are retained.
df_variants_case_unique = df_variants_case.drop_duplicates(subset="ORPHACODE", keep="first")
df_variants_case_unique

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE
0,4,1807371,16337,C,A,FGFR3,15,AD
4,7,138391446,1344704,T,C,ATP6V0A4,18,AD
6,1,154574727,2942810,G,A,ADAR,41,AD
24,2,163128866,812537,G,C,IFIH1,51,AD
105,16,1505761,1012215,A,G,CLCN7,53,AD
...,...,...,...,...,...,...,...,...
32849,15,52425575,2575918,C,T,GNB5,542306,AR
32858,17,8076848,929285,A,C,TMEM107,542310,AR
32873,1,150990382,932732,T,C,PRUNE1,544469,AR
32879,11,17167409,599543,C,A,PIK3C2A,557003,AR


In [195]:
# Save the one-variant-per-ORPHACODE table, without the row index.
df_variants_case_unique.to_csv('case_variants_gene_orpha_inher.csv', index=False)

In [147]:
# Tally how many representative variants fall under each inheritance mode
inheritance_counts = df_variants_case_unique.loc[:, "INHERITANCE"].value_counts()
print(inheritance_counts)

INHERITANCE
AR    814
AD    694
Name: count, dtype: int64


In [149]:
# Anti-join: rows of df_variants_case with no match (on all shared columns) in
# df_variants_case_unique, then ordered by inheritance mode and disease code.
merged_with_flag = df_variants_case.merge(df_variants_case_unique, how='outer', indicator=True)
only_in_case = merged_with_flag.query('_merge == "left_only"')
df_variants_case_others = (
    only_in_case
    .drop(columns=['_merge'])
    .sort_values(by=["INHERITANCE", "ORPHACODE"])
    .reset_index(drop=True)
)

In [151]:
# Display the case variants that were not selected as a disease's representative row.
df_variants_case_others

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE
0,4,1805658,2664079,C,G,FGFR3,15,AD
1,4,1806104,16330,G,T,FGFR3,15,AD
2,4,1806119,16328,G,C,FGFR3,15,AD
3,2,71185243,12228,T,C,ATP6V1B1,18,AD
4,1,154557469,14820,A,G,ADAR,41,AD
...,...,...,...,...,...,...,...,...
31370,1,150998010,1711747,T,A,PRUNE1,544469,AR
31371,2,172582561,635400,A,G,DYNC1I2,544469,AR
31372,2,172582801,635401,C,T,DYNC1I2,544469,AR
31373,5,74018387,225206,C,T,GFM2,565624,AR


In [153]:
# Save the non-representative case variants.
# NOTE(review): `index=False` is not passed, so the row index is written as an extra
# unnamed first column — confirm that whatever reads this file expects it.
df_variants_case_others.to_csv('case_variants_others.csv')

In [155]:
# Reload the representative variants from disk; this replaces the in-memory frame
# and gives it a fresh 0..n-1 index.
# NOTE(review): no cell visible in this notebook writes 'case_variants_unique.csv'
# (the export above goes to 'case_variants_gene_orpha_inher.csv'), so a fresh
# Restart & Run All depends on this file already existing — confirm its provenance.
# The displayed result carries an "Unnamed: 0" column, i.e. the file was saved with its index.
df_variants_case_unique = pd.read_csv('case_variants_unique.csv')

In [157]:
# Add the constant VCF fields in a single `assign` call, which returns a new
# frame and therefore avoids SettingWithCopyWarning.
df_variants_case_unique = df_variants_case_unique.assign(
    **{"QUAL": '.', "FILTER": '.', "INFO": 'PR', "FORMAT": 'GT'}
)
df_variants_case_unique

Unnamed: 0.1,Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE,QUAL,FILTER,INFO,FORMAT
0,0,4,1807371,16337,C,A,FGFR3,15,AD,.,.,PR,GT
1,4,7,138391446,1344704,T,C,ATP6V0A4,18,AD,.,.,PR,GT
2,6,1,154574727,2942810,G,A,ADAR,41,AD,.,.,PR,GT
3,24,2,163128866,812537,G,C,IFIH1,51,AD,.,.,PR,GT
4,105,16,1505761,1012215,A,G,CLCN7,53,AD,.,.,PR,GT
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,32849,15,52425575,2575918,C,T,GNB5,542306,AR,.,.,PR,GT
1504,32858,17,8076848,929285,A,C,TMEM107,542310,AR,.,.,PR,GT
1505,32873,1,150990382,932732,T,C,PRUNE1,544469,AR,.,.,PR,GT
1506,32879,11,17167409,599543,C,A,PIK3C2A,557003,AR,.,.,PR,GT


In [159]:
# Each of the new columns follows a specific pattern: exactly one row holds a
# non-reference genotype and every other row holds "0/0".
# That one row contains "0/1" or "1/0" if its INHERITANCE value is "AD",
# and "1/1" if its INHERITANCE value is "AR".
# The values for each of the 1508 new columns are generated programmatically:
# for column "1", the value at row 1 is "0/1" or "1/0" if INHERITANCE is "AD",
# or "1/1" if INHERITANCE is "AR";
# for column "2", the value at row 2 is "1/0" or "0/1" if INHERITANCE is "AD",
# or "1/1" if INHERITANCE is "AR"; and so on.

In [161]:
# Build one synthetic sample column per variant row: column `col_k` holds the
# non-reference genotype of the k-th row and "0/0" everywhere else.

# Fixed seed so the random AD phasing ("0/1" vs "1/0") is identical on every re-run.
GENOTYPE_SEED = 42
rng = np.random.default_rng(GENOTYPE_SEED)

# One new column per row. Deriving the count from the data (instead of a hard-coded
# number) guarantees that every column carries exactly one variant and no trailing
# all-"0/0" column is produced when the row count changes.
num_rows = len(df_variants_case_unique)
num_new_columns = num_rows

# Row positions (not index labels) decide which column a row fills, so the result
# does not depend on the frame having a 0..n-1 index.
is_ad = (df_variants_case_unique["INHERITANCE"] == "AD").to_numpy()
is_ar = (df_variants_case_unique["INHERITANCE"] == "AR").to_numpy()
positions = np.arange(num_rows)

# Start from an all-reference matrix and fill the diagonal in two vectorized steps.
genotypes = np.full((num_rows, num_new_columns), "0/0", dtype=object)
# "AD" rows: heterozygous, phase chosen at random
genotypes[positions[is_ad], positions[is_ad]] = rng.choice(["0/1", "1/0"], size=int(is_ad.sum()))
# "AR" rows: homozygous alternate
genotypes[positions[is_ar], positions[is_ar]] = "1/1"

# Attach all new columns in a single concat so the frame is not fragmented.
new_columns = pd.DataFrame(
    genotypes,
    index=df_variants_case_unique.index,
    columns=[f'col_{i + 1}' for i in range(num_new_columns)],
)
df_variants_case_unique = pd.concat([df_variants_case_unique, new_columns], axis=1)

# Remove the saved-index column left over from the CSV round trip. It is dropped by
# name (and ignored if absent) so a real data column is never removed by position.
df_variants_case_unique = df_variants_case_unique.drop(columns=["Unnamed: 0"], errors="ignore")

df_variants_case_unique

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE,INHERITANCE,QUAL,FILTER,...,col_1499,col_1500,col_1501,col_1502,col_1503,col_1504,col_1505,col_1506,col_1507,col_1508
0,4,1807371,16337,C,A,FGFR3,15,AD,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1,7,138391446,1344704,T,C,ATP6V0A4,18,AD,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2,1,154574727,2942810,G,A,ADAR,41,AD,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
3,2,163128866,812537,G,C,IFIH1,51,AD,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
4,16,1505761,1012215,A,G,CLCN7,53,AD,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,15,52425575,2575918,C,T,GNB5,542306,AR,.,.,...,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0,0/0
1504,17,8076848,929285,A,C,TMEM107,542310,AR,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0
1505,1,150990382,932732,T,C,PRUNE1,544469,AR,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0
1506,11,17167409,599543,C,A,PIK3C2A,557003,AR,.,.,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0


In [163]:
# Strip the annotation-only fields "GENEINFO", "ORPHACODE" and "INHERITANCE"
df_variants_case_unique = df_variants_case_unique.drop(
    ["GENEINFO", "ORPHACODE", "INHERITANCE"],
    axis=1,
)
df_variants_case_unique

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,col_1,...,col_1499,col_1500,col_1501,col_1502,col_1503,col_1504,col_1505,col_1506,col_1507,col_1508
0,4,1807371,16337,C,A,.,.,PR,GT,1/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
1,7,138391446,1344704,T,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
2,1,154574727,2942810,G,A,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
3,2,163128866,812537,G,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
4,16,1505761,1012215,A,G,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1503,15,52425575,2575918,C,T,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0,0/0
1504,17,8076848,929285,A,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0
1505,1,150990382,932732,T,C,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0,0/0
1506,11,17167409,599543,C,A,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,0/0,0/0,0/0,0/0,1/1,0/0


In [165]:
# Save the VCF-style table (fixed fields plus the per-sample genotype columns), without the row index.
df_variants_case_unique.to_csv('variants_case_vcf.csv', index=False)