# SCIN Data Modeling
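This notebook loads the SCIN cases and labels datasets, merges them on `case_id`, derives a binary gradability label from the dermatologist ratings, and saves stratified train/test splits to `data/processed/`. It also contains the cells that download the raw data.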

In [1]:
import pandas as pd

## Load Datasets

In [None]:
# Read the SCIN cases CSV into a DataFrame
cases_csv_path = "data/raw/dataset/scin_cases.csv"
cases_df = pd.read_csv(cases_csv_path)

# Report the table size, then show the leading rows as the cell output
print(f"Cases dataset shape: {cases_df.shape}")
print(f"Number of cases: {len(cases_df)}")
cases_df.head()

In [None]:
# Read the SCIN labels CSV into a DataFrame
labels_csv_path = "data/raw/dataset/scin_labels.csv"
labels_df = pd.read_csv(labels_csv_path)

# Report the table size, then show the leading rows as the cell output
print(f"Labels dataset shape: {labels_df.shape}")
print(f"Number of labels: {len(labels_df)}")
labels_df.head()

## Merge Datasets
Combine the cases and labels datasets with an inner join on the `case_id` column, so only cases that appear in both files are kept.

In [4]:
# Merge datasets on case_id.
# how="inner" keeps only the case_ids that are present in both tables.
# validate="one_to_one" makes pandas raise a MergeError if case_id is duplicated
# in either table, instead of silently multiplying rows in the merged result.
combined_df = pd.merge(
    cases_df,
    labels_df,
    on="case_id",
    how="inner",
    validate="one_to_one",
)

# Summarise the merged table, then show the leading rows as the cell output
print(f"Combined dataset shape: {combined_df.shape}")
print(f"Number of rows: {len(combined_df)}")
print(f"Number of columns: {len(combined_df.columns)}")
combined_df.head()

Combined dataset shape: (5033, 73)
Number of rows: 5033
Number of columns: 73


Unnamed: 0,case_id,source,release,year,age_group,sex_at_birth,fitzpatrick_skin_type,race_ethnicity_american_indian_or_alaska_native,race_ethnicity_asian,race_ethnicity_black_or_african_american,...,dermatologist_gradable_for_fitzpatrick_skin_type_1,dermatologist_gradable_for_fitzpatrick_skin_type_2,dermatologist_gradable_for_fitzpatrick_skin_type_3,dermatologist_fitzpatrick_skin_type_label_1,dermatologist_fitzpatrick_skin_type_label_2,dermatologist_fitzpatrick_skin_type_label_3,gradable_for_monk_skin_tone_india,gradable_for_monk_skin_tone_us,monk_skin_tone_label_india,monk_skin_tone_label_us
0,-1000600354148496558,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,,,...,YES,,,FST2,,,True,True,2.0,1.0
1,-1002039107727665188,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,,,...,YES,,,FST1,,,True,True,3.0,3.0
2,-1003358831658393077,SCIN,1.0.0,2023,AGE_18_TO_29,MALE,NONE_IDENTIFIED,,,,...,YES,,,FST4,,,True,True,3.0,4.0
3,-1003826561155964328,SCIN,1.0.0,2023,AGE_UNKNOWN,OTHER_OR_UNSPECIFIED,,,,,...,NO,,,,,,True,True,2.0,4.0
4,-1003844406100696311,SCIN,1.0.0,2023,AGE_40_TO_49,FEMALE,FST3,,,,...,YES,,,FST1,,,True,True,1.0,1.0


In [5]:
# Collect the merged table's column labels into a plain list and print it
column_names = list(combined_df.columns)
print("Combined dataset columns:")
print(column_names)

Combined dataset columns:
['case_id', 'source', 'release', 'year', 'age_group', 'sex_at_birth', 'fitzpatrick_skin_type', 'race_ethnicity_american_indian_or_alaska_native', 'race_ethnicity_asian', 'race_ethnicity_black_or_african_american', 'race_ethnicity_hispanic_latino_or_spanish_origin', 'race_ethnicity_middle_eastern_or_north_african', 'race_ethnicity_native_hawaiian_or_pacific_islander', 'race_ethnicity_white', 'race_ethnicity_other_race', 'race_ethnicity_prefer_not_to_answer', 'textures_raised_or_bumpy', 'textures_flat', 'textures_rough_or_flaky', 'textures_fluid_filled', 'body_parts_head_or_neck', 'body_parts_arm', 'body_parts_palm', 'body_parts_back_of_hand', 'body_parts_torso_front', 'body_parts_torso_back', 'body_parts_genitalia_or_groin', 'body_parts_buttocks', 'body_parts_leg', 'body_parts_foot_top_or_side', 'body_parts_foot_sole', 'body_parts_other', 'condition_symptoms_bothersome_appearance', 'condition_symptoms_bleeding', 'condition_symptoms_increasing_size', 'condition_

In [6]:
# Basic info about the combined dataset:
# info() prints the index range plus every column's non-null count and dtype,
# which shows at a glance which columns are mostly empty.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5033 entries, 0 to 5032
Data columns (total 73 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   case_id                                             5033 non-null   int64  
 1   source                                              5033 non-null   object 
 2   release                                             5033 non-null   object 
 3   year                                                5033 non-null   int64  
 4   age_group                                           5032 non-null   object 
 5   sex_at_birth                                        5032 non-null   object 
 6   fitzpatrick_skin_type                               2503 non-null   object 
 7   race_ethnicity_american_indian_or_alaska_native     73 non-null     object 
 8   race_ethnicity_asian                                96 non-null     object 
 9

## Data Download

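Download the raw dataset from the public Google Cloud Storage bucket to `data/raw/`. On a fresh checkout, run this section before **Load Datasets**: the cells above read their CSVs from `data/raw/dataset/`.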

The bucket (`dx-scin-public-data`) is public, so no credentials are required.  
Image downloads can take several minutes — already-downloaded files are skipped automatically.

In [None]:
import sys

# Put the current directory first on the import search path so the
# project-local `scin_data_modeling` package imported below can be resolved.
# NOTE(review): presumably the notebook is run from the repository root — confirm.
sys.path.insert(0, ".")

from scin_data_modeling.data.download import download_csvs, download_images

# Step 1: download the four metadata CSVs (fast, < 10 MB)
download_csvs()

# Step 2: download all case images (~5 000 cases × up to 3 images, may take a few minutes)
download_images()

## Label Engineering

Each case is reviewed by **up to three dermatologists** who independently vote on whether the image quality is sufficient to grade the skin condition:

| Value | Meaning |
|-------|---------|
| `DEFAULT_YES_IMAGE_QUALITY_SUFFICIENT` | Image is gradable → **label = 1** |
| `NO_IMAGE_QUALITY_INSUFFICIENT` | Image is not gradable → **label = 0** |

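The final binary label is the **majority vote** across the available assessments (rater 1, 2, and 3); the vote is computed by `build_processed_df`, which therefore also decides how a tie between two raters is resolved.  
Cases where no rater provided an assessment are dropped.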

In [None]:
from scin_data_modeling.data.preprocess import (
    build_processed_df,
    create_train_test_split,
    save_splits,
)

# One raw gradability column per dermatologist rater (raters 1-3)
GRADABLE_COLS = [
    "dermatologist_gradable_for_skin_condition_1",
    "dermatologist_gradable_for_skin_condition_2",
    "dermatologist_gradable_for_skin_condition_3",
]

# For each rater column, show how many cases were rated and which values occur
for col in GRADABLE_COLS:
    ratings = combined_df[col]
    # Series.count() counts the non-null entries, i.e. the submitted ratings
    n_non_null = ratings.count()
    print(f"{col}  →  {n_non_null:,} ratings")
    # value_counts() leaves out missing values by default
    print(ratings.value_counts().to_string())
    print()

## Image Path Preparation

Each case can have **1 to 3 images** stored at `data/raw/dataset/images/<image_id>.png`.  
The `build_processed_df` helper collects the available image paths into a JSON list and computes a majority-vote binary label, producing a compact DataFrame ready for model input.

In [None]:
# Build the compact per-case table (image paths and majority-vote label)
processed_df = build_processed_df(combined_df)

print(
    f"Processed dataset: {len(processed_df):,} cases  (dropped {len(combined_df) - len(processed_df):,} with no label)"
)

# Class balance, with the 0/1 labels replaced by readable names
print("\nLabel distribution:")
label_names = {1: "Gradable (1)", 0: "Not gradable (0)"}
label_counts = processed_df["label"].value_counts().rename(label_names)
print(label_counts.to_string())

# Number of cases for each image count, in ascending order of image count
print("\nImages per case:")
images_per_case = processed_df["num_images"].value_counts().sort_index()
print(images_per_case.to_string())

# Preview the first ten processed rows as the cell output
processed_df.head(10)

## Train / Test Split

An **80 / 20 stratified split** is created on the binary label to preserve class proportions in both sets.  
The random seed is fixed at 42 for reproducibility.

In [None]:
train_df, test_df = create_train_test_split(processed_df, test_size=0.2, random_state=42)

# Report both splits through one code path so the train and test summaries
# cannot drift apart; the printed text is the same as writing each line out by hand.
splits = {"train": train_df, "test": test_df}

# Case count and per-class count/share for each split
for split_name, split_df in splits.items():
    # Pad the heading to six characters so "Train:" and "Test: " line up
    print(f"{split_name.capitalize() + ':':<6} {len(split_df):,} cases")
    for class_value, class_name in ((1, "gradable"), (0, "not gradable")):
        in_class = split_df["label"] == class_value
        caption = f"label={class_value} ({class_name}):"
        # Width 23 is the length of the longer caption, keeping the counts aligned
        print(f"  {caption:<23} {in_class.sum():,}  ({in_class.mean():.1%})")
    print()

# Images-per-case distribution for each split, separated by one blank line
for position, (split_name, split_df) in enumerate(splits.items()):
    if position:
        print()
    print(f"Image counts in {split_name} split:")
    print(split_df["num_images"].value_counts().sort_index().to_string())

## Save Processed Data

Save the processed train and test splits to `data/processed/`.  
The `image_paths` column is stored as a **JSON string** so it survives CSV round-trips. Use `load_split()` to reload and deserialise it automatically.

In [None]:
from scin_data_modeling.data.preprocess import load_split

save_splits(train_df, test_df)

# Verify round-trip: reload and deserialise image_paths
train_reloaded = load_split("train")

# Select the first row by position (.iloc), not by index label (.loc[0]):
# rows of a shuffled split keep their original labels, so label 0 is not
# guaranteed to exist in the reloaded train frame.
first_image_paths = train_reloaded["image_paths"].iloc[0]
print("Reloaded train split — first row image_paths:", first_image_paths)
print("Type:", type(first_image_paths))