In [None]:
# Study: Deep learning in the estimation of chronological age and biological sex using panoramic radiographs
# Author: Willian Oliveira
# Start: 31/03/2023
# Motivation: Exploratory data analysis of an odontological panoramic radiograph dataset.
# Study Status: Done

In [None]:
# Importing libraries

import pandas as pd
import numpy as np
import os
from glob import glob
from IPython.display import display, Markdown
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns
import re
import hashlib

# Silence warnings
# NOTE(review): the filter below has no category or module restriction, so it hides
# every warning raised by the following cells (including pandas/seaborn ones).
# Consider narrowing or removing it while debugging.

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the first batch of exam metadata from the Excel sheet.
# The month-based age column is renamed to English and then discarded.

id_df = (
    pd.read_excel('PATH_TO_EXCEL_FILE')
    .rename(columns={'idade_meses': 'age_in_months'})
    .drop(columns=['age_in_months'])
)
id_df.head()

In [None]:
# Load the second batch of exam metadata from the CSV file, without the 'PanoURL' column

id_df2 = pd.read_csv('PATH_TO_CSV_FILE').drop(columns=['PanoURL'])
id_df2.head()

In [None]:
# Show the rows of the second dataset whose id already appears in the first one

id_df2.loc[id_df2['id'].isin(id_df['id'])]

In [None]:
# Number of rows in each dataset before they are combined

print(f'First dataset size: {len(id_df)}')
print(f'Second dataset size: {len(id_df2)}')

In [None]:
# Stack both datasets into a single frame with a fresh index.
# Note: id_df is overwritten here, so re-running this cell appends id_df2 again.
id_df = pd.concat((id_df, id_df2), ignore_index=True)

# Size after concatenation
print(f'Concatenated dataset size: {len(id_df)}')
id_df.head()

In [None]:
# Converting string age to float
def convert_age_to_years(age_str):
    """
    Convert an age written as '<years> anos e <months> meses' to decimal years.

    Singular forms ('1 ano', '1 mês' / '1 mes'), any letter case and surrounding
    whitespace are accepted.

    :param age_str: age text; non-string values (e.g. NaN or None) are treated as unparseable
    :return: age in years rounded to 2 decimals, or None when the value cannot be parsed
             (a message describing the problem is printed in that case)
    """
    # Missing cells reach .apply() as NaN/None. Calling .split() on them raised an
    # AttributeError that the previous `except ValueError` did not catch.
    if not isinstance(age_str, str):
        print(f"Error on '{age_str}': expected a string, got {type(age_str).__name__}")
        return None

    # Explicit pattern instead of split + fixed-width slicing, so that the singular
    # spellings do not break the parsing.
    match = re.fullmatch(
        r"\s*(\d+)\s+anos?\s+e\s+(\d+)\s+(?:meses|m[eê]s)\s*",
        age_str,
        flags=re.IGNORECASE,
    )
    if match is None:
        print(f"Error on '{age_str}': unrecognized age format")
        return None

    years, months = (int(part) for part in match.groups())
    return round(years + (months / 12), 2)


# Parse every 'age' entry into decimal years (convert_age_to_years returns None for text it cannot parse)
id_df['age_in_years'] = id_df['age'].apply(convert_age_to_years)

In [None]:
# Attach the on-disk location of each exam image to the dataframe.
# Lookup table: file name -> path, for every .jpg found in the panoramics folder
all_image_paths = dict(
    (os.path.basename(jpg_path), jpg_path)
    for jpg_path in glob(os.path.join('PATH_TO_PANORAMICS', '*.jpg'))
)

print('PXR found:', len(all_image_paths), ', Total Headers', id_df.shape[0])

# Build the expected file name from the id, then look it up (ids without a file get None)
id_df['id_with_extension'] = id_df['id'].map('{}.jpg'.format)
id_df['path'] = id_df['id_with_extension'].map(all_image_paths.get)

# Keep only the columns used from here on
id_df = id_df[['id', 'path', 'sex', 'age_in_years']]

id_df.head()



In [None]:
# Count the rows whose id already occurred earlier in the frame

print(f"There are {id_df.duplicated(subset=['id']).sum()} repeated ids")

In [None]:
# Add hash identifier of the patient.
# Each non-blank line of the patients file is scanned for an exam id and for the
# patient's e-mail (preferred) or name; the identifier is stored only as a SHA-256 digest.
# Create a list to store the data
data = []

# Open the file and read the lines
with open('PATH_TO_PATIENTS_ID', 'r') as f:
    for line in f:
        # Ignore blank lines
        if line.strip() == '':
            continue

        # A record without an exam id cannot be joined to id_df, so it is skipped here.
        # (Previously such rows were kept: astype(str) turned the missing id into the
        # string 'nan' before dropna ran, so dropna never removed anything.)
        id_match = re.search("'id': '([^']+)", line)
        if not id_match:
            continue

        # First try to find the email; if no email is found, try to find the name
        identifier_match = (re.search("'email': '([^']+)", line)
                            or re.search("'name': '([^']+)", line))

        # With no identifier at all the hash is left missing. Hashing an empty message
        # would give every such patient the same digest, and de-duplicating on p_hash
        # would then merge unrelated patients into one.
        if identifier_match:
            p_hash = hashlib.sha256(identifier_match.group(1).encode()).hexdigest()
        else:
            p_hash = None

        # Append the record to the list
        data.append({'id': id_match.group(1), 'p_hash': p_hash})

# Convert the list to a Pandas DataFrame (explicit columns keep the schema even if no line matched)
hash_df = pd.DataFrame(data, columns=['id', 'p_hash'])

# Set the id as string; p_hash is left untouched so that missing hashes stay missing
hash_df['id'] = hash_df['id'].astype(str)

# Check duplicated ids

if hash_df['id'].duplicated().sum() > 0:
    display(Markdown("### Duplicated ids"))
    display(hash_df[hash_df['id'].duplicated(keep=False)].sort_values(by='id').style)


In [None]:
# The merge key must have the same type on both sides: cast the exam id to string
id_df['id'] = id_df['id'].astype(str)

# Left join: every exam is kept and receives its patient hash where one exists
id_df = pd.merge(id_df, hash_df, how='left', on='id')

# Sanity checks on the merged frame
print(f"There are {id_df.duplicated(subset=['id']).sum()} duplicated ids.")
print(f"There are {id_df['p_hash'].duplicated().sum()} duplicated hashes.")
print(f"There are {id_df['id'].nunique()} unique ids.")
print(f"There are {id_df['p_hash'].nunique()} unique hashes.")
print(f"There are {id_df['p_hash'].isna().sum()} missing hashes.")

# Preview of the merged frame
id_df.head().style


In [None]:
# Discard the exams that have no patient hash

id_df = id_df.dropna(subset=['p_hash'])

# Every exam whose patient hash occurs more than once
dup_id_df = id_df[id_df.duplicated(subset=['p_hash'], keep=False)]

# Summary of the remaining frame
print(f"There are {id_df['id'].duplicated().sum()} duplicated ids.")
print(f"There are {dup_id_df['p_hash'].nunique()} duplicated hashes.")
print(f"There are {len(dup_id_df)} exams with repeated hashes.")
print()
print(f"There are {id_df['id'].nunique()} unique ids.")
print(f"There are {id_df['p_hash'].nunique()} unique hashes.")
print(f"There are {id_df['p_hash'].isna().sum()} missing hashes.")

# Preview of the frame
id_df.head().style

In [None]:
# Verify duplicated hashes:
# dup_id_df holds every exam whose patient hash occurs more than once.

print(dup_id_df.shape)  # (rows, columns) of that subset
print(dup_id_df['p_hash'].nunique())  # number of distinct hashes inside it

dup_id_df.head().style


In [None]:
# Drop duplicated hashes
# Leaves one exam per patient hash. `keep` is not passed, so pandas' default decides
# which row survives — NOTE(review): confirm the surviving exam is the intended one.

id_df.drop_duplicates(subset=['p_hash'], inplace=True)


In [None]:
# Check if there are duplicated hashes
# (expected to report 0 once drop_duplicates on 'p_hash' has been run)

print(f"There are {id_df['p_hash'].duplicated().sum()} duplicated hashes.")

In [None]:
# Returning the image shape for the non-corrupted images to verify if there is inconsistency.

def get_image_shape(image_path):
    '''
    Return the size of an image, or None when the image cannot be opened.

    When opening fails (missing path, unreadable or corrupted file, ...), the
    error message is printed to help debugging and None is returned.

    The image is opened inside a context manager so the underlying file handle
    is closed immediately; without it every call left a file open, which adds
    up when the function is applied to a whole dataframe column.

    :param image_path: path of the image file
    :return: PIL's ``size`` tuple (width, height), or None on any error
    '''
    try:
        with Image.open(image_path) as img:
            return img.size
    except Exception as e:
        print(f"{e}")
        return None

# Store the shape of every image; get_image_shape returns None when the image cannot be opened
id_df["image_shape"] = id_df["path"].apply(get_image_shape)

In [None]:
# Number of images found for each distinct image shape

id_df.groupby('image_shape')[['path']].count()

In [None]:
# Verify images with different shapes:
# for each unusual shape, show the matching rows and then the example image(s).

unusual_shape_examples = [
    ((596, 474), ['data/panoramics/122606.jpg']),
    ((474, 596), ['data/panoramics/136554.jpg']),
    ((960, 768), ['data/panoramics/95298.jpg']),
    ((1163, 1600), ['data/panoramics/130771.jpg']),
    ((4781, 3781), ['data/panoramics/137704.jpg', 'data/panoramics/135025.jpg']),
]

for unusual_shape, example_paths in unusual_shape_examples:
    display(id_df[id_df['image_shape'] == unusual_shape])
    for example_path in example_paths:
        display(Image.open(example_path))


In [None]:
# Checking missing values

def display_missing_values(df):
    """
    Render a markdown table with the percentage of missing values of every
    column of a Pandas DataFrame, from the highest to the lowest percentage.

    :param df: DataFrame to inspect
    :return: None
    """
    pct_missing = (
        df.isnull().sum()
        .div(len(df))
        .mul(100)
        .sort_values(ascending=False)
        .rename("% Missing Values")
    )
    display(Markdown(pct_missing.to_markdown()))

display_missing_values(id_df)

In [None]:
# Checking feature unique values

def unique_values_table(df, uv=3):
    """
    Display a markdown table with one row per column of ``df``: the column name,
    its number of unique values and, when that number is at most ``uv``
    (3 by default), the relative frequency of each value.

    Relies on ``display`` and ``Markdown`` imported at the top of the notebook.

    :param df: DataFrame to summarise
    :param uv: int, largest number of unique values for which the value
               frequencies are listed
    :return: None
    """
    md_table_str = '|Column Name|Unique Values||\n|---|---|---|\n'
    for col_name, unique_values in df.nunique().items():
        if unique_values > uv:
            # Third cell left empty so the row has the three cells declared in the header
            md_table_str += '|{}|{}||\n'.format(col_name, unique_values)
        else:
            md_unique_str = ' '.join([
                # '\\%' produces the same backslash-percent text as before,
                # without relying on the invalid escape sequence '\%'
                f'{name}: {value*100:.2f}\\%'
                for name, value in
                df[col_name].value_counts(normalize=True).items()
            ])

            # Closing pipe added so the row is a well-formed table row
            md_table_str += '|{}|{}|{}|\n'.format(
                col_name, unique_values, md_unique_str)
    display(Markdown(md_table_str))

unique_values_table(id_df)

In [None]:
# Drop corrupted images (rows without an image shape)

id_df.dropna(subset=['image_shape'], inplace=True)

# Drop cropped images, unexpected and unprocessed images

for unwanted_shape in [(596, 474), (474, 596), (960, 768), (1163, 1600), (4781, 3781)]:
    id_df.drop(id_df.index[id_df['image_shape'] == unwanted_shape], inplace=True)

# Checking basic statistics

id_df.describe()

In [None]:
# Generate hash for each image to be used as a unique identifier of the image

import hashlib
from PIL import Image
import io

def get_image_hash(image_path):
    """
    Return the SHA-256 hex digest of the image re-encoded as JPEG in memory.

    :param image_path: path of the image file
    :return: hex digest string, or None (after printing the error) when the
             image cannot be opened or encoded
    """
    jpeg_buffer = io.BytesIO()
    try:
        with Image.open(image_path) as img:
            img.save(jpeg_buffer, format='JPEG')
    except Exception as e:
        print(f"Failed to process image at {image_path}: {e}")
        return None
    return hashlib.sha256(jpeg_buffer.getvalue()).hexdigest()

# Hash every image referenced by 'path'; get_image_hash returns None for files it fails to process
id_df['i_hash'] = id_df['path'].apply(get_image_hash)


In [None]:
# Check if inclusion of hash column was successful
# (the preview below should now contain the 'i_hash' column)
print(f"Dataset shape: {id_df.shape}")
id_df.head().style

In [None]:
# Use the image hash to look for exams that share the same image

print(f"Number of unique images: {id_df['i_hash'].nunique()}")
print(f"Number of duplicate images: {len(id_df) - id_df['i_hash'].nunique()}")

# List every exam whose image hash is shared, grouped by hash

id_df.loc[id_df['i_hash'].duplicated(keep=False)].sort_values(by=['i_hash']).style

In [None]:
# id_df['sex'] = id_df['sex'].map({0:'M', 1: 'F'})

In [None]:
# Preview the first rows of the current dataframe
id_df.head()

In [None]:
# Age distribution of the patients, stacked by sex, with a KDE curve
fig = plt.figure(figsize=(10, 5), dpi=300)
sns.set_style("whitegrid")
palette = sns.color_palette("pastel")
# The axes are created after the style is set, exactly as when histplot creates them itself
ax = fig.add_subplot()
sns.histplot(data=id_df, x='age_in_years', hue='sex', kde=True, multiple="stack", ax=ax)
#ax.set_title('Age distribution among the patients by gender')
ax.set_xlabel('Age in years', fontsize=15, labelpad=15)
ax.set_ylabel('Number of Patients', fontsize=15, labelpad=15)
fig.tight_layout()
plt.show()

In [None]:
# Drop patients with age == 0.00
# .copy() makes the filtered frame independent, so the column assignments done in
# later cells write to a real DataFrame instead of a slice of the previous one.
id_df = id_df[id_df['age_in_years'] != 0.00].copy()

print(f"Mean age: {id_df['age_in_years'].mean():.2f}")
print(f"Median age: {id_df['age_in_years'].median():.2f}")
print(f"Mode age: {id_df['age_in_years'].mode()[0]:.2f}")
print(f"Standard Deviation of age: {id_df['age_in_years'].std():.2f}")
print(f"Age range: {id_df['age_in_years'].min():.2f} to {id_df['age_in_years'].max():.2f}")

#Distribution of patients by sex: [Insert number] Male, [Insert number] Female
# Count patients by sex.
# Both encodings that appear in this notebook are accepted (0/1 and 'M'/'F').
# Comparing only against 0/1 reports zero patients whenever the column still
# holds the 'M'/'F' labels that the "Convert sex to binary values" cell expects.

num_female_patients = int(((id_df['sex'] == 1) | (id_df['sex'] == 'F')).sum())
num_male_patients = int(((id_df['sex'] == 0) | (id_df['sex'] == 'M')).sum())

print(f'Male: {num_male_patients}, Female: {num_female_patients}')


In [None]:
# Assign each patient to a 5-year age bracket (left-closed intervals), stored as text

id_df['age_group'] = pd.cut(
    id_df['age_in_years'], bins=range(0, 101, 5), right=False
).astype(str)

# Every patient aged 90 or more is pooled under a single label
id_df['age_group'] = id_df['age_group'].mask(id_df['age_in_years'] >= 90, '[90, 100)')

# Number of patients per bracket
id_df.groupby('age_group')[['age_in_years']].count()

In [None]:
# Convert sex to binary values (M -> 0, F -> 1).
# Values that are not keys of the mapping (e.g. a column that is already 0/1) are
# passed through unchanged. A plain .map({'M': 0, 'F': 1}) replaces every such value
# with NaN, which wipes the column if it is already numeric or if this cell runs twice.
sex_codes = {'M': 0, 'F': 1}
id_df['sex'] = id_df['sex'].map(lambda value: sex_codes.get(value, value))

In [None]:
# Write the treated dataframe to a csv file, restricted to the columns below and in this order

id_df = id_df.loc[:, ['id', 'path', 'p_hash', 'i_hash', 'sex', 'age_in_years', 'age_group']]
id_df.to_csv(path_or_buf='data/ccs_dataset.csv', index=False)