In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [None]:
clinic = pd.read_csv("clinic.csv")
clinic

In [None]:
# Input columns selected from the clinic table. StudySubjectID is carried along
# because the clinical centre code is derived from it further down.
baseline_features = [
    "r_gender",
    "age",
    "r_ivtrom",
    "r_treall",
    "b_pvstr",
    "b_pvdm",
    "b_pvrr",
    "b_pvaf",
    "b_pvhc",
    "nihsco_abl_c",
    "StudySubjectID",
]
# Column holding the outcome label that is binarised in the next section.
target_feature = "mrs_d90d_c"

## Preprocess Y

In [None]:
baseline_y_pre = clinic[target_feature]
baseline_y_pre

In [None]:
y_pre_distribution = baseline_y_pre.value_counts()
y_pre_distribution

In [None]:
# Share of each raw outcome category; titled so the figure is readable on its own.
fig, ax = plt.subplots()
ax.pie(x=y_pre_distribution.values, labels=y_pre_distribution.index)
ax.set_title("Distribution of mrs_d90d_c (raw categories)")
plt.show()

In [None]:
# Binarise the outcome: mRS 0-2 -> 0, mRS 3-6 -> 1.
y_val_to_replace = {'mRS 0 - No symptoms (code 6)': 0,
                    'mRS 1 - Minor symptoms, no limitations (code 5)': 0,
                    'mRS 2 - Slight disability, no help needed (code 4)': 0,
                    'mRS 3 - Moderate disability, still independent (code 3)': 1,
                    'mRS 4 - Moderately severe disability (code 2)': 1,
                    'mRS 5 - Severe disability, completely dependent (code 1)': 1,
                    'mRS 6 - Death (code 0)': 1}
# Series.map yields a numeric column directly. Series.replace with a str -> int
# dict only becomes numeric through pandas' silent object downcasting, which is
# deprecated since pandas 2.2. Any label missing from the mapping becomes NaN
# instead of silently staying a string.
y_baseline = baseline_y_pre.map(y_val_to_replace)
y_baseline

In [None]:
y_distribution = y_baseline.value_counts()
y_distribution

In [None]:
# Class balance after binarisation; the wedges are only labelled 0 / 1, so the
# title spells out what is being shown.
fig, ax = plt.subplots()
ax.pie(x=y_distribution.values, labels=y_distribution.index)
ax.set_title("Distribution of binarised mrs_d90d_c (0 = mRS 0-2, 1 = mRS 3-6)")
plt.show()

### Result

The labels are not equally distributed; we might need to balance the data!

## Preprocess X

In [None]:
baseline_X_pre = clinic[baseline_features]
baseline_X_pre

In [None]:
# Encode the categorical answers as 0/1. The mapping is applied to every column
# of the frame, so any cell equal to one of these strings is replaced.
X_val_to_replace = {'Male' : 0,
                    'Female' : 1,
                    'Yes' : 1,
                    'No' : 0,
                    '1 - Intra-arterial treatment' : 1,
                    '0 - No intra-arterial treatment' : 0
                    }
X_baseline = (
    baseline_X_pre
    .replace(X_val_to_replace)
    # Make the object -> numeric conversion explicit: relying on replace() to
    # downcast silently is deprecated since pandas 2.2.
    .infer_objects()
    .rename({"r_gender": "female", 'r_treall': 'intra_arterial_treatment'}, axis="columns")
)
X_baseline

In [None]:
# Means over all rows of X_baseline (computed before any train/test split);
# used for the z-scoring a few cells below.
baseline_NIHSS_mean = X_baseline["nihsco_abl_c"].mean()
baseline_age_mean = X_baseline["age"].mean()
baseline_NIHSS_mean

In [None]:
# Standard deviations over all rows of X_baseline (pandas default, ddof=1);
# used together with the means above for z-scoring.
baseline_NIHSS_std = X_baseline["nihsco_abl_c"].std()
baseline_age_std = X_baseline["age"].std()
baseline_NIHSS_std

In [None]:
# Z-score the two continuous features with the full-cohort mean/std computed above.
# NOTE(review): X_baseline itself is overwritten, so everything derived from it
# afterwards (baseline_dataset and the CSV / pickle exports, including the ones
# labelled "org" or "without normalisation") holds standardised values, and the
# statistics include rows that later end up in validation/test splits - confirm
# that this is intended.
X_baseline = X_baseline.assign(
    nihsco_abl_c=(X_baseline["nihsco_abl_c"] - baseline_NIHSS_mean) / baseline_NIHSS_std,
    age=(X_baseline["age"] - baseline_age_mean) / baseline_age_std,
)
X_baseline

In [None]:
baseline_age_std, baseline_age_mean

In [None]:
X_baseline["nihsco_abl_c"].mean()

In [None]:
# Features and target side by side. The target is deliberately the LAST column:
# later cells separate X and y positionally with iloc[:, :-1] / iloc[:, -1].
baseline_dataset = pd.concat((X_baseline, y_baseline), axis="columns")
baseline_dataset

In [None]:
# Set clinical center: keep only the first three characters of the subject ID.
# The column keeps its name, but from here on it holds the centre code, not the
# individual subject ID. The vectorised .str slice leaves missing IDs as NaN,
# whereas apply(lambda s: s[:3]) raises TypeError on them.
baseline_dataset["StudySubjectID"] = baseline_dataset["StudySubjectID"].str[:3]
baseline_dataset

In [None]:
# Centre codes (first three characters of StudySubjectID) kept for the
# per-centre split. Presumably the six centres with the most subjects - verify
# against the value_counts() output in the next cell.
BIGGEST_CENTERS = ["ANT", "AMC", "LUM", "AZM", "RIJ", "MCH"]

In [None]:
baseline_dataset["StudySubjectID"].value_counts().index.values.tolist()

In [None]:
def keep_only_centers(df: pd.DataFrame, centers: list[str]) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``StudySubjectID`` is one of ``centers``.

    Args:
        df: Frame with a ``StudySubjectID`` column holding centre codes.
        centers: Centre codes to keep.

    Returns:
        The matching rows, in their original order and with their original index.
    """
    belongs_to_center = df["StudySubjectID"].isin(centers)
    return df.loc[belongs_to_center]

In [None]:
# Fractions of each centre's rows assigned to train / validation / test by
# split_df_centers.
# Alternative configuration with a validation set: 0.65 / 0.15 / 0.20.
TRAIN_RATIO, VAL_RATIO, TEST_RATIO = 0.75, 0.0, 0.25

def _split_holdout(X_holdout, y_holdout, random_state):
    """Divide the non-training rows into ``(X_val, X_test, y_val, y_test)``."""
    # train_test_split rejects float test sizes outside the open interval (0, 1),
    # so a ratio of exactly 0 is handled by returning an empty (but correctly
    # typed) slice for that part instead of calling it with 1.0 / 0.0.
    if VAL_RATIO <= 0:
        return X_holdout.iloc[:0], X_holdout, y_holdout.iloc[:0], y_holdout
    if TEST_RATIO <= 0:
        return X_holdout, X_holdout.iloc[:0], y_holdout, y_holdout.iloc[:0]
    return train_test_split(
        X_holdout,
        y_holdout,
        test_size=TEST_RATIO / (TEST_RATIO + VAL_RATIO),
        random_state=random_state,
    )


def _train_stats(X_train, columns):
    """Return ``{column: (mean, std)}`` computed on the training rows only."""
    stats = {}
    for column in columns:
        mean = X_train[column].mean()
        std = X_train[column].std()
        if not std > 0:
            # std is 0 for a constant column and NaN with fewer than two rows;
            # dividing by 1 keeps the centred values finite instead of inf/NaN.
            std = 1.0
        stats[column] = (mean, std)
    return stats


def _zscore_columns(X, stats):
    """Return a new frame with every column in ``stats`` z-scored; ``X`` is untouched."""
    return X.assign(**{column: (X[column] - mean) / std for column, (mean, std) in stats.items()})


def split_df_centers(df: pd.DataFrame, random_state=42):
    """Split every centre into train/val/test and write the splits to ``./datasets``.

    For each distinct ``StudySubjectID`` value (most frequent first) the rows are
    split according to the module-level ``TRAIN_RATIO`` / ``VAL_RATIO`` /
    ``TEST_RATIO``. Each split is written twice:

    * ``{split}_org_{center}.csv``  - the rows exactly as passed in;
    * ``{split}_norm_{center}.csv`` - ``nihsco_abl_c`` and ``age`` z-scored with the
      mean/std of that centre's *training* rows.

    The per-centre frames are also concatenated into ``{split}_org_full.csv`` and
    ``{split}_norm_full.csv``. All files are written without the index.

    NOTE(review): "org" means "as received by this function". In this notebook the
    frame passed in appears to have been z-scored with full-cohort statistics
    already - confirm that this is the intended baseline.

    Args:
        df: Frame with a ``StudySubjectID`` column, the numeric columns
            ``nihsco_abl_c`` and ``age``, and the target as its LAST column.
        random_state: Seed forwarded to ``train_test_split`` so the written files
            are reproducible between runs. Pass ``None`` for a different split on
            every call.

    Raises:
        ValueError: If ``df`` has no rows, or if ``train_test_split`` rejects the
            configured ratios for a centre.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    if df.empty:
        raise ValueError("split_df_centers needs at least one row to split")

    out_dir = Path("./datasets")
    out_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create directories

    normalised_columns = ("nihsco_abl_c", "age")
    collected = {
        (split, variant): []
        for split in ("train", "val", "test")
        for variant in ("org", "norm")
    }

    for center in df["StudySubjectID"].value_counts().index.values.tolist():
        df_center = df[df["StudySubjectID"] == center]
        X_center = df_center.iloc[:, :-1]
        y_center = df_center.iloc[:, -1]

        X_train, X_holdout, y_train, y_holdout = train_test_split(
            X_center, y_center, test_size=1 - TRAIN_RATIO, random_state=random_state
        )
        X_val, X_test, y_val, y_test = _split_holdout(X_holdout, y_holdout, random_state)

        # Normalisation statistics come from the training rows only, so nothing
        # about the validation/test rows leaks into the transform.
        stats = _train_stats(X_train, normalised_columns)

        parts = {"train": (X_train, y_train), "val": (X_val, y_val), "test": (X_test, y_test)}
        for split, (X_part, y_part) in parts.items():
            variants = {"org": X_part, "norm": _zscore_columns(X_part, stats)}
            for variant, X_variant in variants.items():
                frame = pd.concat((X_variant, y_part), axis="columns")
                frame.to_csv(out_dir / f"{split}_{variant}_{center}.csv", index=False)
                collected[(split, variant)].append(frame)

    for (split, variant), frames in collected.items():
        pd.concat(frames).to_csv(out_dir / f"{split}_{variant}_full.csv", index=False)




In [None]:
reduced = keep_only_centers(baseline_dataset, BIGGEST_CENTERS)
reduced

In [None]:
split_df_centers(reduced)

In [None]:
# NOTE(review): the file name says "without_normalisation", but baseline_dataset is
# built from X_baseline after its age / nihsco_abl_c columns were z-scored in place,
# so (when the notebook is run top to bottom) this file contains normalised values -
# confirm which is intended.
baseline_dataset.to_csv("./clinic_federated_preprocessed_without_normalisation.csv", index=False)

In [None]:
# Pooled (all centres together) 65 / 15 / 20 split of baseline_dataset.
train_ratio = 0.65
validation_ratio = 0.15
test_ratio = 0.20
# Fixed seed: without it every run yields different train/val/test files, so the
# CSVs exported below would not be reproducible.
split_seed = 42

# The target is the last column of baseline_dataset.
x_full = baseline_dataset.iloc[:, :-1]
y_full = baseline_dataset.iloc[:, -1]

# train is now 65% of the entire data set
X_train, x_test_temp, y_train, y_test_temp = train_test_split(
    x_full, y_full, test_size=1 - train_ratio, random_state=split_seed
)

# test is now 20% of the initial data set
# validation is now 15% of the initial data set
X_val, X_test, y_val, y_test = train_test_split(
    x_test_temp,
    y_test_temp,
    test_size=test_ratio / (test_ratio + validation_ratio),
    random_state=split_seed,
)

X_train.shape, X_val.shape, X_test.shape

In [None]:
X_train.StudySubjectID.value_counts()

In [None]:
# Write the pooled training split (features + target) to disk.
# NOTE(review): no index=False here, unlike most other to_csv calls in this
# notebook, so the row index is written as an extra unnamed first column - confirm
# that readers of this file expect it.
pd.concat((X_train, y_train), axis="columns").to_csv("./clinic_fed_train.csv")

In [None]:
# Write the pooled validation and test splits (features + target) to disk.
# NOTE(review): as with the training file, the row index is written as an extra
# unnamed first column because index=False is not passed - confirm intended.
pd.concat((X_val, y_val), axis="columns").to_csv("./clinic_fed_val.csv")
pd.concat((X_test, y_test), axis="columns").to_csv("./clinic_fed_test.csv")

In [None]:
baseline_dataset.drop("StudySubjectID", axis="columns").to_csv("./baseline_dataset_medium_in_one.csv", index=False)

In [None]:
baseline_dataset.to_pickle("./baseline_dataset.pkl", protocol=4)

In [None]:
# NOTE(review): the original remark here said "without normalisation", but
# X_baseline's age / nihsco_abl_c columns were z-scored in place earlier in the
# notebook, so this export contains normalised values when run top to bottom.
# The target is also the binary y_baseline despite the "three_cat" file name -
# confirm both.
pd.concat((X_baseline, y_baseline), axis="columns").drop("StudySubjectID", axis="columns").to_csv("./baseline_dataset_three_cat.csv", index=False)

In [None]:
# plot y per center
# -> bias?


In [None]:
plt.pie(x=y_pre_distribution.values, labels=y_pre_distribution.index)
plt.show()

In [None]:
baseline_dataset.iloc[:, :-2]

In [None]:
baseline_dataset

In [None]:
age = baseline_dataset.age
age

In [None]:
# Age against the binarised outcome; axes are labelled so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(age, baseline_dataset.mrs_d90d_c)
ax.set_xlabel("age")
ax.set_ylabel("mrs_d90d_c")
plt.show()

In [None]:
print(baseline_dataset[["age", "mrs_d90d_c"]].corr().iloc[0,-1])

In [None]:
baseline_dataset[["age", "mrs_d90d_c"]].corr()

In [None]:
import numpy as np

In [None]:
baseline_X_pre

In [None]:
# Split the row positions and index the frame with them instead of handing the
# DataFrame itself to np.array_split: that path goes through DataFrame.swapaxes,
# which is deprecated since pandas 2.1. Chunk sizes are the same as before.
[baseline_X_pre.iloc[rows] for rows in np.array_split(np.arange(len(baseline_X_pre)), 3)]

In [None]:
baseline_X_pre.iloc[3:, :]

In [None]:
baseline_X_pre.iloc[:3, :]

In [None]:
# Reduce StudySubjectID to its three-character centre code. baseline_X_pre is a
# column subset of `clinic`, so assigning into it can trigger
# SettingWithCopyWarning; assign() rebinds the name to a fresh frame instead.
# The .str slice also leaves missing IDs as NaN rather than raising.
baseline_X_pre = baseline_X_pre.assign(StudySubjectID=baseline_X_pre["StudySubjectID"].str[:3])

In [None]:
# Scale the row counts of the six most frequent centres by 273 / 370, round each
# to a whole number and add them up.
# NOTE(review): 370 and 273 are unexplained magic numbers - presumably a total
# row count and a target sample size; confirm and move them to named constants.
((np.array(baseline_X_pre["StudySubjectID"].value_counts().iloc[:6].to_list()) / 370) * 273).round(0).sum()

In [None]:
len(baseline_X_pre)

In [None]:
x = 60 / 100 * 273

In [None]:
x

In [None]:
round(x)

In [None]:
round(25 / 100 * 273)

In [None]:
round(15 / 100 * 273)

In [None]:
164 + 68 + 41

In [None]:
44.0
38.0
33.0
32.0
29.0
27.0