In [204]:
import gseapy as gp
import pandas as pd
import random 

In [206]:
# The KEGG pathway information is avaiable on GSEA website
# Load GMT file using gseapy's read_gmt function
file_path = 'msigdb.v2024.1.Hs.symbols.gmt'
gmt_data = gp.read_gmt(file_path)

# Print a sample gene set
for gene_set, genes in gmt_data.items():
    print(f"Gene Set: {gene_set}, Number of Genes: {len(genes)}")
    print(f"Genes: {genes[:5]}...") 
    break# Print the first 5 genes as a preview # Only print the first gene set as an example

Gene Set: MT, Number of Genes: 37
Genes: ['MT-ATP6', 'MT-ATP8', 'MT-CO1', 'MT-CO2', 'MT-CO3']...


In [207]:
# Sample function to read a GMT file and extract only gene sets starting with "KEGG"
def extract_kegg_gene_sets(file_path):
    kegg_gene_sets = {}
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.strip().split("\t")
            gene_set_name = parts[0]
            if gene_set_name.startswith("KEGG"):
                description = parts[1]
                genes = parts[2:]
                kegg_gene_sets[gene_set_name] = {'description': description, 'genes': genes}
    return kegg_gene_sets

# Example usage
file_path = 'msigdb.v2024.1.Hs.symbols.gmt'  # Replace with your file path
kegg_gene_sets = extract_kegg_gene_sets(file_path)

# Print sample output
for name, details in kegg_gene_sets.items():
    print(f"{name}: {details['description']}")
    print("Genes:", details['genes'][:5], "...")  # Preview first 5 genes
    break

KEGG_ABC_TRANSPORTERS: https://www.gsea-msigdb.org/gsea/msigdb/human/geneset/KEGG_ABC_TRANSPORTERS
Genes: ['ABCA1', 'ABCA10', 'ABCA12', 'ABCA13', 'ABCA2'] ...


In [210]:
# Sample dictionary format: {"KEGG_ONE_CARBON_POOL_BY_FOLATE": {"description": "...", "genes": [...]}, ...}
# Convert dictionary to DataFrame

df_kegg = pd.DataFrame({
    'KeggName': list(kegg_gene_sets.keys()),
    'Genes': [", ".join(details['genes']) for details in kegg_gene_sets.values()]  # Join genes list as a single string
})

# Display the DataFrame
print(df_kegg.head())
print(len(df_kegg))

                                          KeggName  \
0                            KEGG_ABC_TRANSPORTERS   
1                      KEGG_ACUTE_MYELOID_LEUKEMIA   
2                           KEGG_ADHERENS_JUNCTION   
3             KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY   
4  KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM   

                                               Genes  
0  ABCA1, ABCA10, ABCA12, ABCA13, ABCA2, ABCA3, A...  
1  AKT1, AKT2, AKT3, ARAF, BAD, BRAF, CCNA1, CCND...  
2  ACP1, ACTB, ACTG1, ACTN1, ACTN2, ACTN3, ACTN4,...  
3  ACACB, ACSL1, ACSL3, ACSL4, ACSL5, ACSL6, ADIP...  
4  ABAT, ACY3, ADSL, ADSS1, ADSS2, AGXT, AGXT2, A...  
844


In [212]:
# read the pairs variants information
df = pd.read_csv('pairs_variants_gene_orpha_inher.csv')
print(df)

      CHROM        POS       ID REF ALT  GENEINFO  ORPHACODE INHERITANCE
0         4    1806104    16330   G   T     FGFR3         15          AD
1         4    1805658  2664079   C   G     FGFR3         15          AD
2         7  138391446  1344704   T   C  ATP6V0A4         18          AD
3         2   71185243    12228   T   C  ATP6V1B1         18          AD
4         1  154557469    14820   A   G      ADAR         41          AD
...     ...        ...      ...  ..  ..       ...        ...         ...
2589     17    8076766   929264   G   C   TMEM107     542310          AR
2590      1  150990382   932732   T   C    PRUNE1     544469          AR
2591      1  150990970   427231   C   A    PRUNE1     544469          AR
2592      5   74018387   225206   C   T      GFM2     565624          AR
2593      5   74054703   440788   T   G      GFM2     565624          AR

[2594 rows x 8 columns]


In [214]:
# Sort the GENEINFO DataFrame alphabetically
df_sorted = df.sort_values(by="GENEINFO").reset_index(drop=True)

In [216]:
# To keep only the one occurrences of each "GENEINFO" value and remove any further duplicates in a DataFrame.
df_sorted = df_sorted.groupby('ORPHACODE').head(1)
print(len(df_sorted))
print(df_sorted)

1297
      CHROM        POS       ID REF ALT  GENEINFO  ORPHACODE INHERITANCE
0        15   67524206    39732   G   A     AAGAB      79501          AD
2         7  121731848   100645   G   C      AASS       2203          AR
4         9  107566949     9496   G   A     ABCA1      31150          AR
6         9  107550750     9502   A   G     ABCA1        425          AD
7         2  215845332     2858   C   T    ABCA12        313          AD
...     ...        ...      ...  ..  ..       ...        ...         ...
2584     14   68238826    30933   G   A   ZFYVE26     100996          AR
2586      1   40735705  1709555   T   A  ZMPSTE24       1662          AD
2588      1   40737629   140536   G   T  ZMPSTE24       2457          AR
2590     10     226034   451936   C   T   ZMYND11     436151          AD
2592     20   44578004   375392   C   A    ZNF335     329228          AR

[1297 rows x 8 columns]


In [218]:
# Get all the gene in the pairs variants dataset
gene_list = df['GENEINFO'].tolist()

In [220]:
# Remove duplicate genes by converting to a set and back to a list
unique_orpha_genes_list = list(set(gene_list))
print(len(unique_orpha_genes_list))

1175


In [222]:
# Ensure the 'Genes' column is in list format
df_kegg['Genes'] = df_kegg['Genes'].apply(lambda x: x.split(", ") if isinstance(x, str) else x)

# Create the 'OrphaGene' column by finding common genes in each row's 'Genes' list and the gene_list
df_kegg['OrphaGene'] = df_kegg['Genes'].apply(lambda genes: ", ".join([gene for gene in genes if gene in gene_list]))

# Display the updated DataFrame
print(df_kegg)

                                              KeggName  \
0                                KEGG_ABC_TRANSPORTERS   
1                          KEGG_ACUTE_MYELOID_LEUKEMIA   
2                               KEGG_ADHERENS_JUNCTION   
3                 KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY   
4      KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM   
..                                                 ...   
839  KEGG_MEDICUS_VARIANT_SCRAPIE_CONFORMATION_PRPS...   
840  KEGG_MEDICUS_VARIANT_TEL_AML1_FUSION_TO_TRANSC...   
841  KEGG_MEDICUS_VARIANT_TGFA_OVEREXPRESSION_TO_PI...   
842  KEGG_MEDICUS_VARIANT_TMPRSS2_ERG_FUSION_TO_TRA...   
843  KEGG_MEDICUS_VARIANT_TRK_FUSION_KINASE_TO_RAS_...   

                                                 Genes  \
0    [ABCA1, ABCA10, ABCA12, ABCA13, ABCA2, ABCA3, ...   
1    [AKT1, AKT2, AKT3, ARAF, BAD, BRAF, CCNA1, CCN...   
2    [ACP1, ACTB, ACTG1, ACTN1, ACTN2, ACTN3, ACTN4...   
3    [ACACB, ACSL1, ACSL3, ACSL4, ACSL5, ACSL6, ADI...   
4    [ABAT, A

In [224]:
# Count the common genes in each pathway
df_kegg['GeneCount'] = df_kegg['OrphaGene'].apply(lambda x: len(x.split(", ")))
print(df_kegg)

                                              KeggName  \
0                                KEGG_ABC_TRANSPORTERS   
1                          KEGG_ACUTE_MYELOID_LEUKEMIA   
2                               KEGG_ADHERENS_JUNCTION   
3                 KEGG_ADIPOCYTOKINE_SIGNALING_PATHWAY   
4      KEGG_ALANINE_ASPARTATE_AND_GLUTAMATE_METABOLISM   
..                                                 ...   
839  KEGG_MEDICUS_VARIANT_SCRAPIE_CONFORMATION_PRPS...   
840  KEGG_MEDICUS_VARIANT_TEL_AML1_FUSION_TO_TRANSC...   
841  KEGG_MEDICUS_VARIANT_TGFA_OVEREXPRESSION_TO_PI...   
842  KEGG_MEDICUS_VARIANT_TMPRSS2_ERG_FUSION_TO_TRA...   
843  KEGG_MEDICUS_VARIANT_TRK_FUSION_KINASE_TO_RAS_...   

                                                 Genes  \
0    [ABCA1, ABCA10, ABCA12, ABCA13, ABCA2, ABCA3, ...   
1    [AKT1, AKT2, AKT3, ARAF, BAD, BRAF, CCNA1, CCN...   
2    [ACP1, ACTB, ACTG1, ACTN1, ACTN2, ACTN3, ACTN4...   
3    [ACACB, ACSL1, ACSL3, ACSL4, ACSL5, ACSL6, ADI...   
4    [ABAT, A

In [226]:
# Sort the dataset by the gene counts
df_kegg_sorted = df_kegg.sort_values(by="GeneCount", ascending=True).reset_index(drop=True)
print(df_kegg_sorted)

                                              KeggName  \
0    KEGG_MEDICUS_REFERENCE_ANTIGEN_PROCESSING_AND_...   
1    KEGG_MEDICUS_VARIANT_AMPLIFIED_PDGFR_TO_PLCG_C...   
2    KEGG_MEDICUS_PATHOGEN_HCV_CORE_TO_RXRA_PPARA_M...   
3    KEGG_MEDICUS_PATHOGEN_HCV_CORE_TO_RXRA_LXRA_ME...   
4     KEGG_MEDICUS_REFERENCE_ESTABLISHMENT_OF_COHESION   
..                                                 ...   
839              KEGG_REGULATION_OF_ACTIN_CYTOSKELETON   
840                                      KEGG_LYSOSOME   
841                        KEGG_MAPK_SIGNALING_PATHWAY   
842                                KEGG_FOCAL_ADHESION   
843                            KEGG_PATHWAYS_IN_CANCER   

                                                 Genes  \
0    [CD74, HLA-DMA, HLA-DMB, HLA-DOA, HLA-DOB, HLA...   
1    [CALM1, CALM2, CALM3, CAMK1, CAMK1D, CAMK1G, C...   
2                      [CCND1, CDK4, MYC, PPARA, RXRA]   
3               [CCND1, CDK4, MYC, NR1H3, PSME3, RXRA]   
4      [CDCA5

In [228]:
# Remove the pathways with on common genes
df_kegg_sorted = df_kegg_sorted[df_kegg_sorted['OrphaGene'] != ""].reset_index(drop=True)

# Display the filtered DataFrame
print(df_kegg_sorted)

                                              KeggName  \
0    KEGG_MEDICUS_VARIANT_AMPLIFIED_PDGFR_TO_PLCG_C...   
1     KEGG_MEDICUS_REFERENCE_ESTABLISHMENT_OF_COHESION   
2    KEGG_MEDICUS_REFERENCE_EXTRINSIC_APOPTOTIC_PAT...   
3    KEGG_MEDICUS_PATHOGEN_HCMV_US28_TO_GNAQ_PLCB_G...   
4    KEGG_MEDICUS_PATHOGEN_HCMV_US27_TO_CXCR4_GNB_G...   
..                                                 ...   
682              KEGG_REGULATION_OF_ACTIN_CYTOSKELETON   
683                                      KEGG_LYSOSOME   
684                        KEGG_MAPK_SIGNALING_PATHWAY   
685                                KEGG_FOCAL_ADHESION   
686                            KEGG_PATHWAYS_IN_CANCER   

                                                 Genes  \
0    [CALM1, CALM2, CALM3, CAMK1, CAMK1D, CAMK1G, C...   
1      [CDCA5, CTCF, PDS5A, PDS5B, STAG1, STAG2, WAPL]   
2    [CASP3, CASP7, CASP8, FADD, RIPK1, TNF, TNFRSF...   
3    [CALM1, CALM2, CALM3, CXCL8, GNAQ, NFATC1, NFA...   
4    [BCAR1, 

In [230]:
# Convert each OrphaGene entry into a list of individual genes
df_kegg_sorted['OrphaGene_List'] = df_kegg_sorted['OrphaGene'].apply(lambda x: x.split(", "))

In [232]:
# Reas all the pathogenic variants
df_variants = pd.read_csv('variants_pathogenic.csv')
df_variants

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,ORPHACODE
0,1,949523,183381,C,T,ISG15,319563
1,1,957748,3255595,C,A,AGRN,590
2,1,976962,126556,C,T,AGRN,590
3,1,977355,2749616,C,A,AGRN,590
4,1,977433,644955,C,G,AGRN,590
...,...,...,...,...,...,...,...
46762,22,51160726,975999,G,T,SHANK3,48652
46763,22,51169227,1803071,C,G,SHANK3,48652
46764,22,51169229,3069185,C,A,SHANK3,48652
46765,22,51169297,800506,G,T,SHANK3,48652


In [234]:
# To keep only the one occurrences of each "GENEINFO" value and remove any further duplicates in a DataFrame.
df_variants_unique = df_variants.groupby('GENEINFO').head(1)
print(len(df_variants_unique))
print(df_variants_unique)

2923
       CHROM       POS       ID REF ALT GENEINFO  ORPHACODE
0          1    949523   183381   C   T    ISG15     319563
1          1    957748  3255595   C   A     AGRN        590
12         1   1149118    96692   G   A  TNFRSF4     431149
13         1   1167659    60484   A   G  B3GALT6     536467
25         1   1451416   828051   T   G   ATAD3A     615954
...      ...       ...      ...  ..  ..      ...        ...
46620     22  50962167     5679   G   A     SCO2       1561
46627     22  50964236   223058   G   T     TYMP        298
46666     22  51018184  1075204   C   A     CHKB     280671
46681     22  51063656   941799   C   A     ARSA        512
46753     22  51117022  1029037   A   T   SHANK3      48652

[2923 rows x 7 columns]


In [236]:
# Write the unqiue gene as a list
variants_gene_list = df_variants_unique['GENEINFO'].tolist()

In [238]:
print(df_kegg_sorted)
print(len(df_kegg_sorted))

                                              KeggName  \
0    KEGG_MEDICUS_VARIANT_AMPLIFIED_PDGFR_TO_PLCG_C...   
1     KEGG_MEDICUS_REFERENCE_ESTABLISHMENT_OF_COHESION   
2    KEGG_MEDICUS_REFERENCE_EXTRINSIC_APOPTOTIC_PAT...   
3    KEGG_MEDICUS_PATHOGEN_HCMV_US28_TO_GNAQ_PLCB_G...   
4    KEGG_MEDICUS_PATHOGEN_HCMV_US27_TO_CXCR4_GNB_G...   
..                                                 ...   
682              KEGG_REGULATION_OF_ACTIN_CYTOSKELETON   
683                                      KEGG_LYSOSOME   
684                        KEGG_MAPK_SIGNALING_PATHWAY   
685                                KEGG_FOCAL_ADHESION   
686                            KEGG_PATHWAYS_IN_CANCER   

                                                 Genes  \
0    [CALM1, CALM2, CALM3, CAMK1, CAMK1D, CAMK1G, C...   
1      [CDCA5, CTCF, PDS5A, PDS5B, STAG1, STAG2, WAPL]   
2    [CASP3, CASP7, CASP8, FADD, RIPK1, TNF, TNFRSF...   
3    [CALM1, CALM2, CALM3, CXCL8, GNAQ, NFATC1, NFA...   
4    [BCAR1, 

In [240]:
# Include only the genes present in the provided gene_list within each row’s "Genes" column:
df_kegg_sorted['Pathway_Genes'] = df_kegg_sorted['Genes'].apply(lambda gene_set: [gene for gene in gene_set if gene in variants_gene_list])

# Display the resulting DataFrame with only relevant genes in 'Filtered_Genes'
print(df_kegg_sorted)

                                              KeggName  \
0    KEGG_MEDICUS_VARIANT_AMPLIFIED_PDGFR_TO_PLCG_C...   
1     KEGG_MEDICUS_REFERENCE_ESTABLISHMENT_OF_COHESION   
2    KEGG_MEDICUS_REFERENCE_EXTRINSIC_APOPTOTIC_PAT...   
3    KEGG_MEDICUS_PATHOGEN_HCMV_US28_TO_GNAQ_PLCB_G...   
4    KEGG_MEDICUS_PATHOGEN_HCMV_US27_TO_CXCR4_GNB_G...   
..                                                 ...   
682              KEGG_REGULATION_OF_ACTIN_CYTOSKELETON   
683                                      KEGG_LYSOSOME   
684                        KEGG_MAPK_SIGNALING_PATHWAY   
685                                KEGG_FOCAL_ADHESION   
686                            KEGG_PATHWAYS_IN_CANCER   

                                                 Genes  \
0    [CALM1, CALM2, CALM3, CAMK1, CAMK1D, CAMK1G, C...   
1      [CDCA5, CTCF, PDS5A, PDS5B, STAG1, STAG2, WAPL]   
2    [CASP3, CASP7, CASP8, FADD, RIPK1, TNF, TNFRSF...   
3    [CALM1, CALM2, CALM3, CXCL8, GNAQ, NFATC1, NFA...   
4    [BCAR1, 

In [242]:
# Select specified columns using double square brackets
df_gene_gene = df_kegg_sorted[['KeggName', 'OrphaGene_List', 'Pathway_Genes']]

# Display the resulting DataFrame
print(df_gene_gene)

                                              KeggName  \
0    KEGG_MEDICUS_VARIANT_AMPLIFIED_PDGFR_TO_PLCG_C...   
1     KEGG_MEDICUS_REFERENCE_ESTABLISHMENT_OF_COHESION   
2    KEGG_MEDICUS_REFERENCE_EXTRINSIC_APOPTOTIC_PAT...   
3    KEGG_MEDICUS_PATHOGEN_HCMV_US28_TO_GNAQ_PLCB_G...   
4    KEGG_MEDICUS_PATHOGEN_HCMV_US27_TO_CXCR4_GNB_G...   
..                                                 ...   
682              KEGG_REGULATION_OF_ACTIN_CYTOSKELETON   
683                                      KEGG_LYSOSOME   
684                        KEGG_MAPK_SIGNALING_PATHWAY   
685                                KEGG_FOCAL_ADHESION   
686                            KEGG_PATHWAYS_IN_CANCER   

                                        OrphaGene_List  \
0                                              [CALM2]   
1                                               [CTCF]   
2                                           [TNFRSF1A]   
3                                              [CALM2]   
4            

In [244]:
# Ensure df_gene_gene is a separate copy before making modifications
df_gene_gene = df_gene_gene.copy()
# Count the gene in the pathway gene list
df_gene_gene['GeneCount'] = df_gene_gene['Pathway_Genes'].apply(len)
df_gene_gene = df_gene_gene.sort_values(by="GeneCount", ascending=True).reset_index(drop=True)
print(df_gene_gene)

                                              KeggName  \
0    KEGG_MEDICUS_REFERENCE_NON_CANONICAL_INFLAMMAS...   
1    KEGG_MEDICUS_VARIANT_MUTATION_INACTIVATED_DJ1_...   
2    KEGG_MEDICUS_REFERENCE_REGULATION_OF_COMPLEMEN...   
3    KEGG_MEDICUS_VARIANT_MUTATION_INACTIVATED_ERBB...   
4    KEGG_MEDICUS_REFERENCE_BLOCKING_UBIQUITINATION...   
..                                                 ...   
682        KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION   
683              KEGG_REGULATION_OF_ACTIN_CYTOSKELETON   
684                        KEGG_MAPK_SIGNALING_PATHWAY   
685                                KEGG_FOCAL_ADHESION   
686                            KEGG_PATHWAYS_IN_CANCER   

                                        OrphaGene_List  \
0                                              [NLRP3]   
1                                              [PARK7]   
2                                                [CFH]   
3                                              [ERBB4]   
4            

In [246]:
# Remove rows where 'GeneCount' is 1
df_gene_gene = df_gene_gene[df_gene_gene['GeneCount'] > 1].reset_index(drop=True)
print(df_gene_gene)
print(len(df_gene_gene))

                                              KeggName  \
0    KEGG_MEDICUS_VARIANT_MUTATION_CAUSED_ABERRANT_...   
1    KEGG_MEDICUS_VARIANT_MUTATION_CAUSED_ABERRANT_...   
2              KEGG_MEDICUS_REFERENCE_CREATINE_PATHWAY   
3    KEGG_MEDICUS_REFERENCE_CLASSICAL_PATHWAY_OF_CO...   
4    KEGG_MEDICUS_PATHOGEN_SHIGELLA_IPGB1_TO_ITGA_B...   
..                                                 ...   
658        KEGG_CYTOKINE_CYTOKINE_RECEPTOR_INTERACTION   
659              KEGG_REGULATION_OF_ACTIN_CYTOSKELETON   
660                        KEGG_MAPK_SIGNALING_PATHWAY   
661                                KEGG_FOCAL_ADHESION   
662                            KEGG_PATHWAYS_IN_CANCER   

                                        OrphaGene_List  \
0                                            [SLC25A4]   
1                                            [EIF2AK3]   
2                                               [GAMT]   
3                                           [C1R, C1S]   
4            

In [248]:
# Combine all genes from the 'OrphaGene_List' column into a single list
all_genes_list = [gene for sublist in df_gene_gene['OrphaGene_List'] for gene in sublist]
print(len(all_genes_list))

3252


In [250]:
# Remove duplicate genes by converting to a set and back to a list
unique_genes_list = list(set(all_genes_list))
print(len(unique_genes_list))

651


In [252]:
print(df_gene_gene['Pathway_Genes'].head())
print(df_gene_gene['Pathway_Genes'].apply(type).value_counts())

0    [CYCS, SLC25A4]
1    [EIF2AK3, SOD1]
2       [GAMT, GATM]
3         [C1R, C1S]
4      [ELMO2, RAC1]
Name: Pathway_Genes, dtype: object
Pathway_Genes
<class 'list'>    663
Name: count, dtype: int64


In [254]:
# Initialize lists to store the expanded data for df_genes
genes_list = []
kegg_list = []

# Ensure the 'Pathway_Genes' column contains lists
for gene_list in df_gene_gene['Pathway_Genes']:
    if not isinstance(gene_list, list):
        continue  # Skip rows where 'Pathway_Genes' is not a list
    
    for gene in gene_list:
        # Add each gene as a single entry in 'GENE'
        genes_list.append(gene)
        
        # Filter other genes excluding the current gene
        other_genes = [g for g in gene_list if g != gene]
        
        # Randomly select 1 gene (if there are other genes) or leave KEGG empty
        if other_genes:
            kegg_list.append(random.choice(other_genes))
        else:
            kegg_list.append("")  # Add an empty string if no other genes are available

# Create the new DataFrame with 'GENE' and 'KEGG' columns
df_genes = pd.DataFrame({"GENE": genes_list, "KEGG": kegg_list})

# Display the resulting DataFrame and the number of rows
print(df_genes.head())

      GENE     KEGG
0     CYCS  SLC25A4
1  SLC25A4     CYCS
2  EIF2AK3     SOD1
3     SOD1  EIF2AK3
4     GAMT     GATM


In [256]:
df_genes_filtered = df_genes[df_genes['GENE'].isin(unique_genes_list)].reset_index(drop=True)

# Display the filtered DataFrame
df_genes_filtered

Unnamed: 0,GENE,KEGG
0,SLC25A4,CYCS
1,EIF2AK3,SOD1
2,GAMT,GATM
3,C1R,C1S
4,C1S,C1R
...,...,...
3247,VHL,CDK4
3248,WNT10A,STK4
3249,WNT4,MAPK1
3250,WNT5A,BRCA2


In [258]:
df_gene_gene_path = df_genes_filtered.groupby('GENE').head(1)
print(df_gene_gene_path)

           GENE      KEGG
0       SLC25A4      CYCS
1       EIF2AK3      SOD1
2          GAMT      GATM
3           C1R       C1S
4           C1S       C1R
...         ...       ...
3051      SUMF1     AP4E1
3072  TNFRSF11B       AMH
3081     DIAPH1     ACTG1
3119       FLNB  MAPKAPK3
3120       FLNC    MAP2K1

[651 rows x 2 columns]


In [260]:
# To keep only the one occurrences of each "GENEINFO" value and remove any further duplicates in a DataFrame.
print(df_variants_unique)

       CHROM       POS       ID REF ALT GENEINFO  ORPHACODE
0          1    949523   183381   C   T    ISG15     319563
1          1    957748  3255595   C   A     AGRN        590
12         1   1149118    96692   G   A  TNFRSF4     431149
13         1   1167659    60484   A   G  B3GALT6     536467
25         1   1451416   828051   T   G   ATAD3A     615954
...      ...       ...      ...  ..  ..      ...        ...
46620     22  50962167     5679   G   A     SCO2       1561
46627     22  50964236   223058   G   T     TYMP        298
46666     22  51018184  1075204   C   A     CHKB     280671
46681     22  51063656   941799   C   A     ARSA        512
46753     22  51117022  1029037   A   T   SHANK3      48652

[2923 rows x 7 columns]


In [262]:
# Merge the gene-gene pathway data and the pathogenic variants information
df_gene_gene_variants = df_variants_unique.merge(df_gene_gene_path, left_on="GENEINFO", right_on="KEGG", how="inner")

print(df_gene_gene_variants)

     CHROM       POS       ID REF ALT GENEINFO  ORPHACODE     GENE    KEGG
0        1    949523   183381   C   T    ISG15     319563     CYLD   ISG15
1        1    957748  3255595   C   A     AGRN        590  COL11A2    AGRN
2        1   1737942   224715   A   T     GNB1        513     GNB5    GNB1
3        1   9322320    31586   C   G     H6PD     168588   TALDO1    H6PD
4        1   9777666   578525   C   G   PIK3CD     397596   PIK3CA  PIK3CD
..     ...       ...      ...  ..  ..      ...        ...      ...     ...
646     22  37628897     7575   C   T     RAC2     183707    LAMA2    RAC2
647     22  38508510  2709169   C   G   PLA2G6      35069    HSPB1  PLA2G6
648     22  39621728    75241   C   G    PDGFB       1980    TGFB2   PDGFB
649     22  41513680   861851   C   A    EP300     353284    AUTS2   EP300
650     22  42456982  2725305   A   C     NAGA       3137     HEXB    NAGA

[651 rows x 9 columns]


In [264]:
df_gene_gene_variants = df_gene_gene_variants.drop(columns=["ORPHACODE", "KEGG"])
df_gene_gene_variants

Unnamed: 0,CHROM,POS,ID,REF,ALT,GENEINFO,GENE
0,1,949523,183381,C,T,ISG15,CYLD
1,1,957748,3255595,C,A,AGRN,COL11A2
2,1,1737942,224715,A,T,GNB1,GNB5
3,1,9322320,31586,C,G,H6PD,TALDO1
4,1,9777666,578525,C,G,PIK3CD,PIK3CA
...,...,...,...,...,...,...,...
646,22,37628897,7575,C,T,RAC2,LAMA2
647,22,38508510,2709169,C,G,PLA2G6,HSPB1
648,22,39621728,75241,C,G,PDGFB,TGFB2
649,22,41513680,861851,C,A,EP300,AUTS2


In [266]:
# Merge the pair rare disease variants and the pathway variants
df_gene_kegg_variants = df_sorted.merge(df_gene_gene_variants, left_on="GENEINFO", right_on="GENE", how="inner")
print(df_gene_kegg_variants)

     CHROM_x      POS_x     ID_x REF_x ALT_x GENEINFO_x  ORPHACODE  \
0          7  121731848   100645     G     C       AASS       2203   
1          9  107566949     9496     G     A      ABCA1      31150   
2          9  107550750     9502     A     G      ABCA1        425   
3          2  215845332     2858     C     T     ABCA12        313   
4          2  215807742  1693261     C     G     ABCA12        457   
..       ...        ...      ...   ...   ...        ...        ...   
731        5   82400865  1329084     T     A      XRCC4     436182   
732       16   17228331  2137795     G     A      XYLT1       1425   
733       12   32903678   102435     G     A      YARS2       2598   
734        2   98354447    13253     G     A      ZAP70        911   
735       10   31608166   488897     G     A       ZEB1      98974   

    INHERITANCE  CHROM_y      POS_y     ID_y REF_y ALT_y GENEINFO_y    GENE  
0            AR        1   12008042  2849163     T     A      PLOD1    AASS  
1  

In [268]:
# To keep only the first occurrences of ORPHACODE value and remove any further duplicates in a DataFrame.
df_gene_kegg_variants = df_gene_kegg_variants.groupby('ORPHACODE').head(1)
df_gene_kegg_variants

Unnamed: 0,CHROM_x,POS_x,ID_x,REF_x,ALT_x,GENEINFO_x,ORPHACODE,INHERITANCE,CHROM_y,POS_y,ID_y,REF_y,ALT_y,GENEINFO_y,GENE
0,7,121731848,100645,G,C,AASS,2203,AR,1,12008042,2849163,T,A,PLOD1,AASS
1,9,107566949,9496,G,A,ABCA1,31150,AR,2,169781229,1410817,G,A,ABCB11,ABCA1
2,9,107550750,9502,A,G,ABCA1,425,AD,2,169781229,1410817,G,A,ABCB11,ABCA1
3,2,215845332,2858,C,T,ABCA12,313,AD,6,32815380,2848869,G,A,TAP1,ABCA12
4,2,215807742,1693261,C,G,ABCA12,457,AR,6,32815380,2848869,G,A,TAP1,ABCA12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,5,82400865,1329084,T,A,XRCC4,436182,AR,13,108860927,1020928,G,C,LIG4,XRCC4
732,16,17228331,2137795,G,A,XYLT1,1425,AR,11,62383998,978467,G,A,B3GAT3,XYLT1
733,12,32903678,102435,G,A,YARS2,2598,AR,16,23546483,803228,G,T,EARS2,YARS2
734,2,98354447,13253,G,A,ZAP70,911,AR,16,81953154,39696,C,A,PLCG2,ZAP70


In [270]:
# Sorting the DataFrame by "INHERITANCE" and "ORPHACODE"
df_gene_kegg_variants = df_gene_kegg_variants.sort_values(by=["INHERITANCE", "ORPHACODE"]).reset_index(drop=True)
df_gene_kegg_variants

Unnamed: 0,CHROM_x,POS_x,ID_x,REF_x,ALT_x,GENEINFO_x,ORPHACODE,INHERITANCE,CHROM_y,POS_y,ID_y,REF_y,ALT_y,GENEINFO_y,GENE
0,4,1806104,16330,G,T,FGFR3,15,AD,15,66727442,13350,T,C,MAP2K1,FGFR3
1,7,138391446,1344704,T,C,ATP6V0A4,18,AD,7,117120149,53423,A,G,CFTR,ATP6V0A4
2,1,154557734,2023208,A,T,ADAR,41,AD,8,42146209,1357742,C,T,IKBKB,ADAR
3,1,154558656,1184494,C,T,ADAR,51,AD,8,42146209,1357742,C,T,IKBKB,ADAR
4,2,227872191,17408,G,T,COL4A4,63,AD,10,89624227,484600,A,G,PTEN,COL4A4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,1,1167660,1452181,T,C,B3GALT6,536467,AR,16,17211598,1977942,C,T,XYLT1,B3GALT6
732,9,94985660,253318,A,T,IARS1,541423,AR,6,30884022,3255217,G,T,VARS2,IARS1
733,15,52416686,1685857,C,G,GNB5,542306,AR,1,1737942,224715,A,T,GNB1,GNB5
734,17,7980230,929296,C,T,ALOX12B,542310,AR,4,108852800,989059,A,C,CYP2U1,ALOX12B


In [272]:
df_gene_kegg_variants.to_csv('pathway_genes_variants.csv', index=False)

In [274]:
# Counting the values in the "INHERITANCE" column
inheritance_counts = df_gene_kegg_variants["INHERITANCE"].value_counts()
print(inheritance_counts)

INHERITANCE
AR    394
AD    342
Name: count, dtype: int64


In [276]:
print(df_kegg_sorted)

                                              KeggName  \
0    KEGG_MEDICUS_VARIANT_AMPLIFIED_PDGFR_TO_PLCG_C...   
1     KEGG_MEDICUS_REFERENCE_ESTABLISHMENT_OF_COHESION   
2    KEGG_MEDICUS_REFERENCE_EXTRINSIC_APOPTOTIC_PAT...   
3    KEGG_MEDICUS_PATHOGEN_HCMV_US28_TO_GNAQ_PLCB_G...   
4    KEGG_MEDICUS_PATHOGEN_HCMV_US27_TO_CXCR4_GNB_G...   
..                                                 ...   
682              KEGG_REGULATION_OF_ACTIN_CYTOSKELETON   
683                                      KEGG_LYSOSOME   
684                        KEGG_MAPK_SIGNALING_PATHWAY   
685                                KEGG_FOCAL_ADHESION   
686                            KEGG_PATHWAYS_IN_CANCER   

                                                 Genes  \
0    [CALM1, CALM2, CALM3, CAMK1, CAMK1D, CAMK1G, C...   
1      [CDCA5, CTCF, PDS5A, PDS5B, STAG1, STAG2, WAPL]   
2    [CASP3, CASP7, CASP8, FADD, RIPK1, TNF, TNFRSF...   
3    [CALM1, CALM2, CALM3, CXCL8, GNAQ, NFATC1, NFA...   
4    [BCAR1, 

In [278]:
# Extract the required columns
df_kegg_sorted = df_kegg_sorted[["KeggName", "Genes", "OrphaGene_List", "Pathway_Genes"]]
df_kegg_sorted.to_csv('pathway_genes.csv', index=False)

In [280]:
# Filter rows that contain both 'FGFR3' and 'MAPK1' in the 'Genes' column
df_with_specific_genes = df_kegg_sorted[df_kegg_sorted['Genes'].apply(lambda x: 'FGFR3' in x and 'MAPK1' in x)]
print(df_with_specific_genes)

                                              KeggName  \
477  KEGG_MEDICUS_VARIANT_MUTATION_ACTIVATED_FGFR3_...   
597                                KEGG_BLADDER_CANCER   
617  KEGG_MEDICUS_REFERENCE_FGF_FGFR_RAS_ERK_SIGNAL...   
670  KEGG_MEDICUS_REFERENCE_GF_RTK_RAS_ERK_SIGNALIN...   
682              KEGG_REGULATION_OF_ACTIN_CYTOSKELETON   
684                        KEGG_MAPK_SIGNALING_PATHWAY   
686                            KEGG_PATHWAYS_IN_CANCER   

                                                 Genes  \
477  [ARAF, BRAF, FGFR3, GRB2, HRAS, KRAS, MAP2K1, ...   
597  [ARAF, BRAF, CCND1, CDH1, CDK4, CDKN1A, CDKN2A...   
617  [ARAF, BRAF, FGF1, FGF10, FGF16, FGF17, FGF18,...   
670  [ANGPT1, ANGPT2, ANGPT4, ARAF, AREG, ARTN, BDN...   
682  [ABI2, ACTB, ACTG1, ACTN1, ACTN2, ACTN3, ACTN4...   
684  [AKT1, AKT2, AKT3, ARRB1, ARRB2, ATF2, ATF4, B...   
686  [ABL1, AKT1, AKT2, AKT3, APC, APC2, APPL1, AR,...   

                                        OrphaGene_List  \
477         