In [2]:
import xml.etree.ElementTree as ET
import pandas as pd

In [4]:
# OrphaCode and Inheritance information "en_product9_ages.xml" is available on this website 
# https://github.com/Orphanet/Orphadata_aggregated/tree/master/Epidemiological%20data/Rare%20diseases%20natural%20history

tree = ET.parse('en_product9_ages.xml')
root = tree.getroot()

# Extract OrphaCode and Type of Inheritance Name (lang="en")
inheritance_data = []
for disorder in root.findall(".//Disorder"):
    orpha_code = disorder.find("OrphaCode").text
    inheritance_element = disorder.find(".//TypeOfInheritanceList/TypeOfInheritance/Name[@lang='en']")
    if inheritance_element is not None:
        inheritance_data.append({
            "OrphaCode": orpha_code,
            "TypeOfInheritance": inheritance_element.text
        })

In [6]:
# Convert list to DataFrame
df = pd.DataFrame(inheritance_data)

In [8]:
print(df)

     OrphaCode    TypeOfInheritance
0       166024  Autosomal recessive
1           58   Autosomal dominant
2           61  Autosomal recessive
3           93  Autosomal recessive
4          585  Autosomal recessive
...        ...                  ...
5529    641361  Autosomal recessive
5530    642747   Autosomal dominant
5531    617919   Autosomal dominant
5532    619363   Autosomal dominant
5533    619233   Autosomal dominant

[5534 rows x 2 columns]


In [14]:
# Filter rows where TypeOfInheritance is either "Autosomal recessive" or "Autosomal dominant"
df_filtered = df[df['TypeOfInheritance'].isin(['Autosomal recessive', 'Autosomal dominant'])]

# Display the filtered DataFrame
print(df_filtered)

     OrphaCode    TypeOfInheritance
0       166024  Autosomal recessive
1           58   Autosomal dominant
2           61  Autosomal recessive
3           93  Autosomal recessive
4          585  Autosomal recessive
...        ...                  ...
5529    641361  Autosomal recessive
5530    642747   Autosomal dominant
5531    617919   Autosomal dominant
5532    619363   Autosomal dominant
5533    619233   Autosomal dominant

[3617 rows x 2 columns]


In [18]:
# Use .loc to replace 'Autosomal recessive' with 'AR' and 'Autosomal dominant' with 'AD' directly
df_filtered 
df_filtered.loc[df_filtered['TypeOfInheritance'] == 'Autosomal recessive', 'TypeOfInheritance'] = 'AR'
df_filtered.loc[df_filtered['TypeOfInheritance'] == 'Autosomal dominant', 'TypeOfInheritance'] = 'AD'

# Display the modified DataFrame
print(df_filtered)

     OrphaCode TypeOfInheritance
0       166024                AR
1           58                AD
2           61                AR
3           93                AR
4          585                AR
...        ...               ...
5529    641361                AR
5530    642747                AD
5531    617919                AD
5532    619363                AD
5533    619233                AD

[3617 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.loc[df_filtered['TypeOfInheritance'] == 'Autosomal recessive', 'TypeOfInheritance'] = 'AR'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.loc[df_filtered['TypeOfInheritance'] == 'Autosomal dominant', 'TypeOfInheritance'] = 'AD'


In [57]:
# Group by TypeOfInheritance and count occurrences
inheritance_group = df_filtered.groupby('TypeOfInheritance').size().reset_index(name='Count')

# Display grouped data
print(inheritance_group)

     TypeOfInheritance  Count
0   Autosomal dominant   1708
1  Autosomal recessive   1909


In [59]:
# Save DataFrame as a CSV file
df_filtered.to_csv('Orpha_Inheritance.csv', index=False)