In [None]:
# All SCIVIAS data paths
# These are left blank here and must be filled in before the loading cell runs;
# each one is passed straight to pd.read_csv there.
# NOTE(review): subjects_path is not read by any cell visible in this notebook.
subjects_path = ''
hpo_path = ''      # HPO term table (read as ASCII, ';'-delimited)
diag_path = ''     # diagnosis table (read as ISO-8859-1, ';'-delimited)
gene_path = ''     # gene / CADD table (read as ASCII, ';'-delimited)
protein_path = ''  # protein table (read as ASCII, ';'-delimited)

In [2]:
import pandas as pd

def _read_scivias_table(path, encoding='ascii'):
    """Read one semicolon-separated SCIVIAS export into a DataFrame."""
    return pd.read_csv(path, sep=';', encoding=encoding)


# Only the diagnosis export needs Latin-1; the other files are read as ASCII.
hpo = _read_scivias_table(hpo_path)
diag = _read_scivias_table(diag_path, encoding='ISO-8859-1')
gene = _read_scivias_table(gene_path)
protein = _read_scivias_table(protein_path)

# The diagnosis table names the patient key 'subject_id'; rename it so it
# matches the 'SCIDPSEUDONYM' key the other tables are pivoted and merged on.
diag = diag.rename(columns={'subject_id': 'SCIDPSEUDONYM'})

In [3]:
# HPO binary encoding
#
# Build one row per patient and one 0/1 indicator column per HPO term.
# crosstab counts (patient, term) occurrences using only the two key columns;
# clipping at 1 turns repeated annotations into a plain presence flag.
# This replaces a pivot_table call that passed no `values`: in that form every
# other column of `hpo` is aggregated, which produces tuple column labels
# (value column, term) instead of one clean column per term. The statements
# were also pasted twice in this cell, so the table was built two times.
hpo_presence = pd.crosstab(hpo['SCIDPSEUDONYM'], hpo['hpoTermId']).clip(upper=1)

# Prefix the term IDs so they stay distinguishable after merging with the
# gene and protein columns.
hpo_presence.columns = [f"HPO_{col}" for col in hpo_presence.columns]

# Move the patient ID from the index into a regular column for merging.
hpo_binary = hpo_presence.reset_index()

In [None]:
# Preview the first rows of the per-patient HPO table.
hpo_binary.head()

In [None]:
def _label_diagnosis(row):
    """Return the diagnosis text, annotated with the confirmation flag when one is present."""
    if pd.isna(row['genetically_confirmed']):
        return row['diag']
    return f"{row['diag']} (Genetically confirmed: {row['genetically_confirmed']})"


# Work on a copy so the raw diagnosis table stays untouched.
diag_labeled = diag.copy()
diag_labeled['Combined'] = diag_labeled.apply(_label_diagnosis, axis=1)

# Collapse to one row per patient: distinct, non-missing labels joined by '; '.
diag_processed = (
    diag_labeled
    .groupby('SCIDPSEUDONYM')['Combined']
    .apply(lambda labels: '; '.join(labels.dropna().unique()))
    .reset_index()
    .rename(columns={'Combined': 'Diagnosis'})
)

diag_processed.head()

In [None]:
# One column per gene, holding the patient's CADD score for that gene.
#
# The CADD column is converted from decimal-comma text to float. The
# conversion overwrites gene['CADD'], so it is guarded by a dtype check:
# without the guard, re-running this cell (or loading a file whose CADD column
# was already parsed as numeric) fails because a float column has no `.str`
# accessor.
if not pd.api.types.is_numeric_dtype(gene['CADD']):
    gene['CADD'] = gene['CADD'].str.replace(',', '.', regex=False).astype(float)

# NOTE(review): aggfunc='first' keeps a single score per (patient, gene) pair;
# if a patient can carry several variants in one gene, confirm that keeping
# the first one is the intended choice.
gene_pivot = gene.pivot_table(index='SCIDPSEUDONYM', columns='gene', values='CADD', aggfunc='first')
gene_pivot.columns = [f"Gene_{col}" for col in gene_pivot.columns]

# Move the patient ID from the index into a regular column for merging.
gene_pivot = gene_pivot.reset_index()

gene_pivot.head()

In [None]:
# Wide protein table: one row per patient, one "Protein_<Gene>" column per
# entry in the 'Gene' column, filled with the first VALUE for that pair.
protein_wide = protein.pivot_table(
    values='VALUE',
    index='SCIDPSEUDONYM',
    columns='Gene',
    aggfunc='first',
)
protein_wide.columns = ["Protein_{}".format(name) for name in protein_wide.columns]

# Patient ID back to a regular column so the table can be merged on it.
protein_pivot = protein_wide.reset_index()

protein_pivot.head()

In [8]:
# Merge all
#
# Start from the diagnosis table and outer-join each remaining table on the
# patient ID, so a patient present in any one table is kept in the result.
merged_df = diag_processed
for table in (hpo_binary, gene_pivot, protein_pivot):
    merged_df = pd.merge(merged_df, table, on='SCIDPSEUDONYM', how='outer')

In [9]:
# Report the dimensions of the merged table.
row_count, column_count = merged_df.shape
print("Rows:", row_count)
print("Columns:", column_count)

Rows: 2345
Columns: 2522


In [10]:
# Number of distinct patient IDs in the merged table; if this equals the row
# count, the merge produced exactly one row per patient.
merged_df['SCIDPSEUDONYM'].nunique()

2345

In [11]:
# Row count vs. number of distinct patient IDs for each per-modality table.
# The patient ID lives in the 'SCIDPSEUDONYM' column (reset_index moved it out
# of the index), so uniqueness has to be measured on that column. Counting
# unique index values only counts the positional row labels, which always
# equals the row count and therefore can never reveal a duplicated patient.
for label, table in (("HPO", hpo_binary), ("Gene", gene_pivot), ("Protein", protein_pivot)):
    print(f"{label} rows:", table.shape[0], f"{label} unique IDs:", table['SCIDPSEUDONYM'].nunique())

HPO rows: 1158 HPO unique IDs: 1158
Gene rows: 82 Gene unique IDs: 82
Protein rows: 573 Protein unique IDs: 573


In [12]:
# Compare the merged row count against the number of distinct patient IDs.
merged_row_count = len(merged_df)
merged_id_count = merged_df['SCIDPSEUDONYM'].nunique()
print("Merged rows:", merged_row_count, "Unique IDs:", merged_id_count)

Merged rows: 2345 Unique IDs: 2345


In [None]:
# Check duplicates: flag every row whose patient ID occurs more than once
# (keep=False marks all occurrences, not just the repeats).
is_repeated_id = merged_df.duplicated(subset='SCIDPSEUDONYM', keep=False)
duplicates = merged_df.loc[is_repeated_id]
print("Duplicate rows:\n", duplicates)

In [None]:
# Preview the first rows of the merged per-patient table.
merged_df.head()

In [None]:
# List the dtype of every merged column as plain text.
print(merged_df.dtypes)

In [19]:
# Write the merged table to 'merged_data.csv', a path relative to the current
# working directory.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra leading, unnamed column; confirm downstream readers
# expect that before changing it.
merged_df.to_csv('merged_data.csv')