In [117]:
import pandas as pd
import numpy as np

In [2]:
# NOTE(review): in the recorded run this import fails with ModuleNotFoundError (see the
# output below). DATA_DIR and RAWDATA_DIR are also defined locally in a later cell;
# WEIGHTS_DIR is not referenced anywhere else in the visible notebook — confirm whether
# this cell is still needed.
from diabetic_retinopathy.constants import DATA_DIR, RAWDATA_DIR, WEIGHTS_DIR

ModuleNotFoundError: No module named 'ehrapylat'

In [118]:
from pathlib import Path


# Directory layout used by the notebook; every path hangs off PROJECT_DIR,
# which is given relative to the current working directory.
PROJECT_DIR = Path("../project_folder")
DATA_DIR = PROJECT_DIR.joinpath("data")
RAWDATA_DIR = DATA_DIR.joinpath("raw")  # raw inputs (the label CSV is read from here)
PROCESSED_DIR = DATA_DIR.joinpath("processed")  # label tables written by this notebook
DATASPLIT_DIR = DATA_DIR.joinpath("data_split")  # train / val / test CSV files

In [119]:
# Load the per-image grading table (one row per eye image).
annotations_file = RAWDATA_DIR.joinpath("labels", "trainLabels.csv")
labels = pd.read_csv(annotations_file)
labels

Unnamed: 0,image,level
0,10_left,0
1,10_right,0
2,13_left,0
3,13_right,0
4,15_left,1
...,...,...
35121,44347_right,0
35122,44348_left,0
35123,44348_right,0
35124,44349_left,0


In [120]:
# Table size together with the number of images per diagnosis level.
(labels.shape, labels["level"].value_counts())

((35126, 2),
 level
 0    25810
 2     5292
 1     2443
 3      873
 4      708
 Name: count, dtype: int64)

In [121]:
# Image names look like "<patient_id>_<left|right>". Split them once with the
# vectorized string accessor instead of a Python-level row-wise apply.
image_parts = labels["image"].str.split("_")
labels["patient_id"] = image_parts.str[0]
# Laterality is the upper-cased first letter of the side: "L" or "R".
labels["laterality"] = image_parts.str[1].str[0].str.upper()
labels

Unnamed: 0,image,level,patient_id,laterality
0,10_left,0,10,L
1,10_right,0,10,R
2,13_left,0,13,L
3,13_right,0,13,R
4,15_left,1,15,L
...,...,...,...,...
35121,44347_right,0,44347,R
35122,44348_left,0,44348,L
35123,44348_right,0,44348,R
35124,44349_left,0,44349,L


In [122]:
# Human-readable names for the integer grades stored in the `level` column.
# NOTE(review): grade 2 is commonly called "Moderate" — confirm "Medium" is the intended label.
label_encoding = dict(
    enumerate(["Normal", "Mild", "Medium", "Severe", "Proliferative"])
)

In [123]:
# Add a text version of the grade next to the numeric `level` column.
labels["level_decoded"] = labels["level"].replace(label_encoding)
labels

Unnamed: 0,image,level,patient_id,laterality,level_decoded
0,10_left,0,10,L,Normal
1,10_right,0,10,R,Normal
2,13_left,0,13,L,Normal
3,13_right,0,13,R,Normal
4,15_left,1,15,L,Mild
...,...,...,...,...,...
35121,44347_right,0,44347,R,Normal
35122,44348_left,0,44348,L,Normal
35123,44348_right,0,44348,R,Normal
35124,44349_left,0,44349,L,Normal


In [124]:
# Let's see how many patients had a different diagnosis for each eye

In [125]:
# Rows whose patient_id occurs exactly once, i.e. patients with a single recorded eye.
is_repeated_patient = labels.duplicated(subset=["patient_id"], keep=False)
df_no_dups = labels.loc[~is_repeated_patient]
df_no_dups.shape

(0, 5)

In [126]:
# Patients with two recorded eyes: collect, per patient, the sorted set of
# distinct diagnosis levels seen across their images.
# keep=False marks every row of a repeated patient_id, not just the later ones.
t = labels.loc[labels.duplicated(subset=["patient_id"], keep=False)]
t_comb = pd.DataFrame(
    t.groupby("patient_id").apply(lambda eyes: np.sort(list(eyes.level.unique())))
)

In [127]:
# Turn each patient's level array into a label such as "0 2", then tally the labels.
t_comb[0] = t_comb[0].map(lambda levels: " ".join(str(level) for level in levels))
print("Value counts of response combinations of patients with both eyes present")
print(t_comb[0].value_counts())
print("__________________________________________________________")

Value counts of response combinations of patients with both eyes present
0
0      12155
2       1998
0 1      842
0 2      631
1        600
1 2      393
3        307
4        263
2 3      183
2 4       89
3 4       67
0 4       21
0 3        6
1 4        5
1 3        3
Name: count, dtype: int64
__________________________________________________________


# Save two versions: one that includes patients with a different diagnosis for each eye, and one without them

In [129]:
# Info on patients with 2 eyes:
def mark_groups(group):
    """
    Flag whether the rows of one patient disagree on the diagnosis level.

    :param group: DataFrame with the rows of a single patient; must contain a "level" column
    :return: a new DataFrame equal to ``group`` plus a boolean "different_diagnosis" column
        that is True on every row when the group holds more than one distinct level
    """
    # Build a new frame rather than assigning into `group`: pandas does not
    # support mutating the object it hands to a groupby UDF.
    return group.assign(different_diagnosis=group["level"].nunique() > 1)


# Tag every row with whether its patient has differing levels across images,
# then flatten the index produced by the group-wise apply.
grouped_by_patient = labels.groupby("patient_id", as_index=False)
labels = grouped_by_patient.apply(mark_groups).reset_index(drop=True)

In [130]:
# annotations_file_processed = PROCESSED_DIR
import os

# Persist the complete annotated label table.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
labels.to_csv(PROCESSED_DIR.joinpath("labels_full.csv"), index=False)

In [131]:
# annotations_file_processed = PROCESSED_DIR
import os

# Persist only the rows of patients whose images all share the same level.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
same_diagnosis_rows = labels.loc[~labels["different_diagnosis"]]
same_diagnosis_rows.to_csv(
    PROCESSED_DIR.joinpath("labels_wo_diff_diagnosis.csv"), index=False
)

# Data split: train 0.6 / val 0.1 / test 0.3

In [146]:
from sklearn.model_selection import train_test_split


def _assert_disjoint_patients(first_df, second_df, first_name, second_name):
    """Raise AssertionError if any patient_id occurs in both subsets."""
    shared_ids = set(first_df["patient_id"]) & set(second_df["patient_id"])
    assert not shared_ids, (
        f"{len(shared_ids)} patient_id value(s) appear in both the {first_name} "
        f"and {second_name} subsets; de-duplicate patients before splitting"
    )


def create_random_split(
    df,
    n_train=0.6,
    n_val=0.1,
    random_state=228,
    stratificaion_col="level",
):
    """
    Split a one-row-per-patient table into stratified train / val / test subsets.

    Pipeline:
        1. Split ``df`` into train+val and test, stratified on ``stratificaion_col``.
        2. If ``n_val > 0``, split train+val into train and val (again stratified);
           otherwise train+val becomes train and val is an empty frame.
        3. Check that no ``patient_id`` ends up in more than one subset.

    Note: the caller is expected to pass one row per patient (duplicates already
    dropped); a patient appearing on several rows may be spread across subsets,
    which step 3 reports as an AssertionError.

    :param df: dataframe to split; must contain "patient_id" and ``stratificaion_col``
    :param n_train: ratio of rows assigned to the train set
    :param n_val: ratio of rows assigned to the val set; the test set gets
        ``1 - n_train - n_val``
    :param random_state: seed forwarded to both ``train_test_split`` calls
    :param stratificaion_col: column whose distribution is preserved across subsets
    :return: dict with keys "train", "val", "test"; each value is a dataframe with
        the columns ``["patient_id", stratificaion_col]``
    :raises AssertionError: if the ratios are invalid or the subsets share a patient
    """
    assert n_train > 0 and n_val >= 0, "n_train must be positive and n_val non-negative"
    assert n_train + n_val < 1, "n_train + n_val must leave room for a test set"

    return_columns = ["patient_id", stratificaion_col]

    train_val_df, test_df = train_test_split(
        df,
        train_size=n_train + n_val,
        random_state=random_state,
        stratify=df[stratificaion_col].values,
    )

    if n_val > 0:
        train_df, val_df = train_test_split(
            train_val_df,
            # Re-scale so that train keeps n_train of the *original* table.
            train_size=n_train / (n_train + n_val),
            random_state=random_state,
            stratify=train_val_df[stratificaion_col].values,
        )
    else:
        train_df = train_val_df
        val_df = pd.DataFrame(columns=train_df.columns)

    # Leakage guards. Set intersections check the actual ids (and cope with an
    # empty val set), unlike asserting on a list of booleans.
    _assert_disjoint_patients(train_df, test_df, "train", "test")
    _assert_disjoint_patients(val_df, test_df, "val", "test")
    _assert_disjoint_patients(val_df, train_df, "val", "train")

    return {
        "train": train_df[return_columns],
        "val": val_df[return_columns],
        "test": test_df[return_columns],
    }


def split_sequences(
    input_path,
    result_folder,
    n_train=0.6,
    n_val=0.1,
    random_state=228,
    response_col="level",
    subset="wo_diff_diagnosis",
):
    """
    Split a processed label table into train / val / test CSV files by patient.

    Reads ``labels_{subset}.csv`` from ``input_path``, keeps the first row of each
    ``patient_id`` to obtain one stratification label per patient, splits those
    patients with ``create_random_split`` and then writes, for every subset, all
    rows of the assigned patients to ``{train,val,test}_df.csv`` in ``result_folder``.
    Class distributions are printed along the way.

    :param input_path: folder (str or Path) containing ``labels_{subset}.csv``
    :param result_folder: folder (str or Path) for the output CSVs; created if missing
    :param n_train: ratio of patients assigned to the train set
    :param n_val: ratio of patients assigned to the val set
    :param random_state: seed forwarded to ``create_random_split``
    :param response_col: column used for stratification and for the printed counts
    :param subset: suffix selecting the input file, e.g. "wo_diff_diagnosis" or "full"
    :return: None
    """
    # os.path.join accepts both str and Path, matching how result_folder is handled.
    raw_df = pd.read_csv(os.path.join(input_path, f"labels_{subset}.csv"))

    print(
        "Data distribution of the full dataset \n",
        raw_df[response_col].value_counts(normalize=True),
    )

    # One row per patient; the first row's label represents the patient.
    df_wo_dups = raw_df.drop_duplicates(["patient_id"], keep="first").reset_index(
        drop=True
    )
    print(
        "Data distribution of the dataset w/o duplicated patients  \n",
        df_wo_dups[response_col].value_counts(normalize=True),
    )

    split_dict = create_random_split(
        df_wo_dups,
        n_train=n_train,
        n_val=n_val,
        random_state=random_state,
        stratificaion_col=response_col,
    )

    os.makedirs(result_folder, exist_ok=True)
    for key, split_df in split_dict.items():
        # Bring back every row (both eyes) of the patients assigned to this subset.
        df_merged = pd.merge(
            raw_df, split_df[["patient_id"]], on="patient_id", how="inner"
        )

        print(
            f"Data distribution of the {key} dataset of len {df_merged.shape[0]}, ratio is {df_merged.shape[0]/raw_df.shape[0]}\n",
            df_merged[response_col].value_counts(),
        )

        df_merged.to_csv(os.path.join(result_folder, f"{key}_df.csv"), index=False)

In [147]:
# Write the train 0.6 / val 0.1 / test 0.3 patient-level split of the default subset.
split_kwargs = dict(
    input_path=PROCESSED_DIR,
    result_folder=DATASPLIT_DIR,
    n_train=0.6,
    n_val=0.1,
)
split_sequences(**split_kwargs)

Data distribution of the full dataset 
 level
0    0.793252
2    0.130392
1    0.039157
3    0.020035
4    0.017164
Name: proportion, dtype: float64
Data distribution of the dataset w/o duplicated patients  
 level
0    0.793252
2    0.130392
1    0.039157
3    0.020035
4    0.017164
Name: proportion, dtype: float64
Data distribution of the train dataset of len 18386, ratio is 0.5999477909025648
 level
0    14584
2     2398
1      720
3      368
4      316
Name: count, dtype: int64
Data distribution of the val dataset of len 3066, ratio is 0.10004568296025583
 level
0    2432
2     400
1     120
3      62
4      52
Name: count, dtype: int64
Data distribution of the test dataset of len 9194, ratio is 0.30000652613717943
 level
0    7294
2    1198
1     360
3     184
4     158
Name: count, dtype: int64
