In [None]:
# All SCIVIAS data paths.
# Each path is handed to pd.read_csv (';'-delimited) by the loading cell below.
# They are empty placeholders here and must be filled in before running.
subjects_path = ''  # subjects table (read with ASCII encoding)
hpo_path = ''  # HPO table with the 'hpoTermId' column (read with ASCII encoding)
diag_path = ''  # diagnosis table (the only file read as ISO-8859-1)
gene_path = ''  # gene table with the 'gene' and 'CADD' columns (read with ASCII encoding)
protein_path = ''  # protein table with the 'Gene' and 'VALUE' columns (read with ASCII encoding)

In [28]:
import pandas as pd

# Name of the patient-ID column in the raw exports, and the standardized name
# that every dataframe carries after loading.
RAW_ID_COLUMN = 'SCIDPSEUDONYM'
ID_COLUMN = 'subject_id'


def load_table(path, encoding='ascii'):
    """Read one ';'-delimited export and standardize its patient-ID column.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.
    encoding : str, default 'ascii'
        Text encoding passed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The table with the patient ID stored in the ``ID_COLUMN`` column.

    Raises
    ------
    KeyError
        If the file has neither ``RAW_ID_COLUMN`` nor ``ID_COLUMN``.
    """
    table = pd.read_csv(path, encoding=encoding, delimiter=';')
    if ID_COLUMN not in table.columns:
        # rename() returns a new frame, so no dataframe is mutated in place.
        table = table.rename(columns={RAW_ID_COLUMN: ID_COLUMN})
    if ID_COLUMN not in table.columns:
        # Fail here instead of with an opaque KeyError in a later cell.
        raise KeyError(
            f"{path!r} has neither a {RAW_ID_COLUMN!r} nor a {ID_COLUMN!r} column"
        )
    return table


subjects = load_table(subjects_path)
hpo = load_table(hpo_path)
# The diagnosis export is the only file read as ISO-8859-1. The former
# rename of 'subject_id' to 'subject_id' on this table was a no-op;
# load_table leaves an existing 'subject_id' column as it is.
diag = load_table(diag_path, encoding='ISO-8859-1')
gene = load_table(gene_path)
protein = load_table(protein_path)


In [None]:
# Quick look at the subjects table: first rows, schema, a random sample and size.
print(subjects.head())
print(subjects.columns)
print(subjects.dtypes)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own; wrapping it in print() emitted a stray "Info: None".
print("Info:")
subjects.info()
# Cap the sample size so a table with fewer than 5 rows does not raise ValueError.
print(subjects.sample(min(5, len(subjects))))
print("Rows:", subjects.shape[0])
print("Columns:", subjects.shape[1])

In [None]:
# Quick look at the gene table: first rows, schema, a random sample and size.
print(gene.head())
print(gene.columns)
print(gene.dtypes)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own; wrapping it in print() emitted a stray "Info: None".
print("Info:")
gene.info()
# Cap the sample size so a table with fewer than 5 rows does not raise ValueError.
print(gene.sample(min(5, len(gene))))
print("Rows:", gene.shape[0])
print("Columns:", gene.shape[1])

In [None]:
# Quick look at the HPO table: first rows, schema, a random sample and size.
print(hpo.head())
print(hpo.columns)
print(hpo.dtypes)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own; wrapping it in print() emitted a stray "Info: None".
print("Info:")
hpo.info()
# Cap the sample size so a table with fewer than 5 rows does not raise ValueError.
print(hpo.sample(min(5, len(hpo))))
print("Rows:", hpo.shape[0])
print("Columns:", hpo.shape[1])

In [None]:
# Quick look at the diagnosis table: first rows, schema, a random sample and size.
# TODO: are the encodings passed to pd.read_csv in the loading cell correct?
# This table is the only one read as ISO-8859-1, so check its text columns here.
print(diag.head())
print(diag.columns)
print(diag.dtypes)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own; wrapping it in print() emitted a stray "Info None".
print("Info")
diag.info()
# Cap the sample size so a table with fewer than 5 rows does not raise ValueError.
print(diag.sample(min(5, len(diag))))
print("Rows:", diag.shape[0])
print("Columns:", diag.shape[1])


In [None]:
# Quick look at the protein table: first rows, schema, a random sample and size.
print(protein.head())
print(protein.columns)
print(protein.dtypes)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called on its own; wrapping it in print() emitted a stray "Info None".
print("Info")
protein.info()
# Cap the sample size so a table with fewer than 5 rows does not raise ValueError.
print(protein.sample(min(5, len(protein))))
print("Rows:", protein.shape[0])
print("Columns:", protein.shape[1])

In [8]:
# Report the (rows, columns) shape of every loaded table, one line per table.
for label, frame in (
    ("Subjects:", subjects),
    ("HPO:", hpo),
    ("Diag:", diag),
    ("Gene:", gene),
    ("Protein:", protein),
):
    print(label, frame.shape)

Subjects: (2444, 1)
HPO: (2438, 2)
Diag: (2483, 4)
Gene: (1284, 3)
Protein: (407003, 3)


In [9]:
# Number of distinct patient IDs per table (nunique() ignores missing IDs).
print("Unique Patient IDs in each file:")
for label, frame in (
    ("Subjects:", subjects),
    ("HPO:", hpo),
    ("Diag:", diag),
    ("Gene:", gene),
    ("Protein:", protein),
):
    print(label, frame['subject_id'].nunique())

Unique Patient IDs in each file:
Subjects: 2444
HPO: 1158
Diag: 2217
Gene: 82
Protein: 573


In [None]:
# Collect the patient IDs of each table as a Python set so that the overlap
# cells below can intersect them.
ids_subjects, ids_hpo, ids_diag, ids_gene, ids_protein = (
    set(frame['subject_id'])
    for frame in (subjects, hpo, diag, gene, protein)
)

In [None]:
# Patients that appear in every one of the five tables.
overlap_all = set.intersection(ids_subjects, ids_hpo, ids_diag, ids_gene, ids_protein)

# Overlap of the subjects table with each of the other tables.
overlap_subjects_hpo = ids_subjects.intersection(ids_hpo)
overlap_subjects_diag = ids_subjects.intersection(ids_diag)
overlap_subjects_gene = ids_subjects.intersection(ids_gene)
overlap_subjects_protein = ids_subjects.intersection(ids_protein)

# Patients that have HPO terms, a diagnosis and gene data.
overlap_hpo_diag_gene = set.intersection(ids_hpo, ids_diag, ids_gene)

# Report the size of each overlap.
print("Subjects in all files:", len(overlap_all))
for label, shared in (
    ("Subjects in Subjects and HPO:", overlap_subjects_hpo),
    ("Subjects in Subjects and Diag:", overlap_subjects_diag),
    ("Subjects in Subjects and Gene:", overlap_subjects_gene),
):
    print(label, len(shared))
# The trailing "\n" argument leaves a blank line before the three-way overlap.
print("Subjects in Subjects and Protein:", len(overlap_subjects_protein), "\n")
print("Subjects in HPO, Diag, and Gene:", len(overlap_hpo_diag_gene))

Subjects in all files: 28
Subjects in Subjects and HPO: 1158
Subjects in Subjects and Diag: 2217
Subjects in Subjects and Gene: 82
Subjects in Subjects and Protein: 573 

Subjects in HPO, Diag, and Gene: 67


In [12]:
# Calculate and report the overlap for every combination of two or more tables.
# The combinations are generated instead of being written out by hand, which
# removes 26 copy-pasted intersections and 26 copy-pasted print statements.
from itertools import combinations

# (variable name, display label, ID set) for each table. The order matters:
# combinations() walks this list left to right, which reproduces the original
# reporting order (all pairs, then triples, then quadruples, then all five).
id_sets = [
    ("ids_subjects", "Subjects", ids_subjects),
    ("ids_diag", "Diagnosis", ids_diag),
    ("ids_hpo", "HPO", ids_hpo),
    ("ids_gene", "Gene", ids_gene),
    ("ids_protein", "Protein", ids_protein),
]

# Maps e.g. "ids_diag_and_ids_hpo" to the set of patient IDs present in both tables.
overlap_by_name = {}
for size in range(2, len(id_sets) + 1):
    for combo in combinations(id_sets, size):
        var_names, labels, members = zip(*combo)
        shared = set.intersection(*members)
        overlap_by_name["_and_".join(var_names)] = shared
        print(f"{' and '.join(labels)}: {len(shared)} subjects")

# Keep the individual ids_..._and_ids_... variables defined, so that any cell
# still referring to them by name keeps working.
globals().update(overlap_by_name)

Subjects and Diagnosis: 2217 subjects
Subjects and HPO: 1158 subjects
Subjects and Gene: 82 subjects
Subjects and Protein: 573 subjects
Diagnosis and HPO: 1053 subjects
Diagnosis and Gene: 71 subjects
Diagnosis and Protein: 512 subjects
HPO and Gene: 76 subjects
HPO and Protein: 367 subjects
Gene and Protein: 35 subjects
Subjects and Diagnosis and HPO: 1053 subjects
Subjects and Diagnosis and Gene: 71 subjects
Subjects and Diagnosis and Protein: 512 subjects
Subjects and HPO and Gene: 76 subjects
Subjects and HPO and Protein: 367 subjects
Subjects and Gene and Protein: 35 subjects
Diagnosis and HPO and Gene: 67 subjects
Diagnosis and HPO and Protein: 327 subjects
Diagnosis and Gene and Protein: 29 subjects
HPO and Gene and Protein: 34 subjects
Subjects and Diagnosis and HPO and Gene: 67 subjects
Subjects and Diagnosis and HPO and Protein: 327 subjects
Subjects and Diagnosis and Gene and Protein: 29 subjects
Subjects and HPO and Gene and Protein: 34 subjects
Diagnosis and HPO and Gene a

In [None]:
# FURTHER ANALYSIS OF CATEGORICAL AND NUMERICAL VALUES
#Gene

categorical_columns = ['gene']
numeric_columns = ['CADD']

# CADD is reported as stored as object (text). Show the dtype as loaded, then
# build a numeric copy so describe() yields mean/std/quantiles instead of the
# count/unique/top/freq summary it produces for object columns.
print(gene['CADD'].dtypes)

# errors='coerce' turns unparseable entries into NaN instead of raising.
# The gene dataframe itself is left untouched.
gene_numeric = gene[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Entries that were present in the raw column but could not be parsed as numbers.
# NOTE(review): if this is non-zero, inspect the raw values (e.g. decimal commas
# or placeholder text) before trusting the statistics below.
not_converted = gene_numeric.isnull().sum() - gene[numeric_columns].isnull().sum()
print("Values that could not be converted to a number:\n", not_converted)

# Check value counts for each categorical column
for col in categorical_columns:
    value_counts = gene[col].value_counts()
    print(f"Value counts for {col}:\n", value_counts)
    print("-" * 40)

    # Values that occur exactly once, i.e. the least frequent ones
    least_frequent = value_counts[value_counts == 1]
    num_least_frequent = len(least_frequent)
    print(f"Number of unique {col} values: {len(value_counts)}")
    print(f"Number of {col} values that appear only once: {num_least_frequent}")
    print("=" * 40)

# Summary statistics computed on the numeric copy of the numeric columns
print("Summary statistics for numeric columns:\n", gene_numeric.describe())

# Check for missing values across all columns (raw table, before conversion)
print("Missing values per column:\n", gene.isnull().sum())


In [None]:
#HPO

categorical_columns = ['hpoTermId']

# For each categorical column: frequency table, then the number of distinct values.
for col in categorical_columns:
    column = hpo[col]
    unique_count = column.nunique()

    print(f"Value counts for {col}:\n", column.value_counts())
    print(f"Total unique values (Length): {unique_count}")
    print("-" * 40)

# Count of missing entries in every column of the HPO table.
missing_per_column = hpo.isna().sum()
print("Missing values per column:\n", missing_per_column)

In [None]:
# Protein

categorical_columns = ['Gene']
numeric_columns = ['VALUE']

# For each categorical column: frequency table, number of distinct values and
# number of values that occur exactly once.
for col in categorical_columns:
    value_counts = protein[col].value_counts()
    print(f"Value counts for {col}:\n", value_counts)
    print("-" * 40)

    # Keep only the entries whose frequency is exactly one.
    least_frequent = value_counts.loc[value_counts.eq(1)]
    num_least_frequent = len(least_frequent)
    print(f"Number of unique {col} values: {len(value_counts)}")
    print(f"Number of {col} values that appear only once: {num_least_frequent}")
    print("=" * 40)

# Summary statistics of the numeric columns.
numeric_summary = protein[numeric_columns].describe()
print("Summary statistics for numeric columns:\n", numeric_summary)

# Count of missing entries in every column of the protein table.
missing_per_column = protein.isna().sum()
print("Missing values per column:\n", missing_per_column)

In [None]:
# Diagnosis

# Frequency table of the diagnosis column, computed once and reused below.
diag_counts = diag['diag'].value_counts()
confirmed_counts = diag['genetically_confirmed'].value_counts()
testset_counts = diag['testset'].value_counts()

print("Diagnosis Category Distribution:\n", diag_counts)
print("\nYes/No Distribution in Genetically Confirmed:\n", confirmed_counts)
print("\nYes/No Distribution in Testset:\n", testset_counts)

# Diagnoses that occur exactly once in the table.
least_frequent_diag = diag_counts[diag_counts == 1]
num_least_frequent = len(least_frequent_diag)
print(f"\nNumber of diag values that appear only once: {num_least_frequent}")