In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw clinic table from the CSV file located next to this notebook.
clinic = pd.read_csv("./clinic.csv")
# Bare expression: the notebook renders the frame as this cell's output.
clinic

# Goals

- NaNs per feature
- distinct values per feature
- one-hot encoding of enums and maybe text
- instances per clinic

In [None]:
# Summary statistics of the raw table. Note: this runs before blank strings are
# converted to NaN in the next cell.
clinic.describe()

In [None]:
# Turn cells that are empty or whitespace-only strings into NaN so that
# isna() counts them as missing values.
clinic = clinic.replace(r'^\s*$', np.nan, regex=True)


In [None]:
# Number of missing values per column, stored as a one-column frame
# (index = feature name, column "count").
nan_per_feature = clinic.isna().sum().to_frame(name="count")
nan_per_feature

In [None]:
# Fraction of rows that are missing for each feature (count / number of rows).
nan_per_feature["share"] = nan_per_feature["count"].div(len(clinic))
nan_per_feature

=> remove columns with share of NaNs > 20%?

In [None]:
# How many features have more than NaN_threshold of their rows missing?
NaN_threshold = 0.2
NaN_threshold_value = len(clinic) * NaN_threshold  # threshold as an absolute row count
nan_per_feature["count"].gt(NaN_threshold_value).sum()

In [None]:
def calculate_number_of_informative_features(df: pd.DataFrame, threshold: float) -> int:
    """Count the features whose share of missing values is at most ``threshold``.

    Args:
        df: Frame with one row per feature and a ``"share"`` column holding the
            fraction of missing values of that feature.
        threshold: Largest NaN share a feature may have and still be counted.

    Returns:
        Number of rows with ``share <= threshold`` as a plain ``int``.
    """
    # Inclusive bound, matching the `share <= 0.05` filter this notebook uses to
    # select the columns. A strict `<` would report 0 features at threshold 0
    # even for columns that contain no NaN at all.
    return int((df["share"] <= threshold).sum())

In [None]:
# Number of features counted as informative for a 20 % NaN threshold.
calculate_number_of_informative_features(nan_per_feature, 0.2)

In [None]:
# Sweep the NaN-share threshold over [0, 1) in steps of 0.01 and plot how many
# features pass each threshold.
thresholds = np.arange(0, 1, 0.01)
relevant_features = [
    calculate_number_of_informative_features(nan_per_feature, threshold)
    for threshold in thresholds
]

fig, ax = plt.subplots()
ax.plot(thresholds, relevant_features)
ax.set_xlabel("Threshold")
ax.set_ylabel("# Features")
ax.grid()
plt.show()

From a pure data perspective, set the NaN threshold to 3 %, 10 %, or 20 %.

In [None]:
# Compare the candidate NaN thresholds side by side in one table instead of one
# copy-pasted cell per value (index = threshold, value = number of features).
candidate_thresholds = (0.05, 0.1, 0.2)
pd.Series(
    {
        threshold: calculate_number_of_informative_features(nan_per_feature, threshold)
        for threshold in candidate_thresholds
    },
    name="# features",
)


Show features to be removed

In [None]:
# Features whose NaN share exceeds 5 %.
nan_per_feature.loc[nan_per_feature["share"].gt(0.05)]

Now, we proceed with the filtered dataset with <= 5 % NaNs

In [None]:
# Keep only the rows (= features) with at most 5 % missing values and show
# their names.
features_to_use = nan_per_feature.loc[nan_per_feature["share"].le(0.05)]
features_to_use.index

In [None]:
# Names of the features with at most 5 % NaNs, derived directly from
# `nan_per_feature` so this cell can be re-run safely. Reassigning
# `features_to_use` from itself via `.T.columns` raises an AttributeError on a
# second run, because by then the name already holds an Index, not a DataFrame.
features_to_use = nan_per_feature.loc[nan_per_feature["share"] <= 0.05].index
features_to_use

In [None]:
# Restrict the table to the features selected in `features_to_use`.
clinic_reduced_NaNs = clinic[features_to_use]
clinic_reduced_NaNs

In [None]:
# Remaining missing values per feature after the NaN-share filter.
clinic_reduced_NaNs.isna().sum()

At most 20 values (= 4 %) of a feature are null -> no action required

Next:
- Encode YES/NO to 1/0
- gender female = 1,  male = 0
- one-hot-encode 1 to 5 enums
- remove timestamps "merge_date", "r_time" and the ID column "StudySubjectID"

In [None]:
# Remove columns that must not be used as features. errors='ignore' skips any
# of these names that are not present in the frame.
columns_to_drop = ["r_time", "merge_date", "StudySubjectID", "Unnamed: 0"]
clinic_reduced_NaNs = clinic_reduced_NaNs.drop(columns=columns_to_drop, errors='ignore')
clinic_reduced_NaNs

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every column of the reduced table. With drop='if_binary' a
# feature with exactly two categories is encoded as a single indicator column.
drop_binary_enc = OneHotEncoder(drop='if_binary')
drop_binary_enc.fit(clinic_reduced_NaNs)
drop_binary_enc.transform(clinic_reduced_NaNs)

In [None]:
# Preview: indicator column(s) for 'r_gender'; drop_first=True omits the first category.
pd.get_dummies(clinic_reduced_NaNs['r_gender'], prefix='gender', drop_first=True)

In [None]:
# use pd.concat to join the new columns with your original dataframe
clinic_encoded = pd.concat([clinic_reduced_NaNs,pd.get_dummies(clinic_reduced_NaNs['r_gender'], prefix='gender', drop_first=True)], axis=1)

# now drop the original 'country' column (you don't need it anymore)
clinic_encoded.drop(['r_gender'],axis=1, inplace=True)
clinic_encoded

In [None]:
# Item identifiers shared by the "nih<item>_abl" and "nih<item>_b24h" column families.
_nih_items = ["1a", "1b", "1c", "2", "3", "4", "5a", "5b", "6a", "6b", "7", "8", "9", "10", "11"]
# Codes of the "SAE_<code>_1stonly" columns.
_sae_codes = ["IS", "ICH", "EH", "CI", "AR", "PN", "INF", "OC", "PS"]

# Columns treated as two-valued features.
binary_features = [
    "r_gender", "r_treall", "r_ivtrom",
    "b_pvstr", "b_pvmi", "b_pvpad", "b_pvdm", "b_pvrr", "b_pvaf", "b_pvhc",
    "b_smoke", "b_medap", "b_medcou", "b_medhep", "b_medsta", "b_medahs",
    "i_iatrt", "sympt_side", "sICH", "anysae",
    *[f"SAE_{code}_1stonly" for code in _sae_codes],
    "ph2", "ph1", "ph2_c", "ph1_c", "hi1_c", "hi2_c",
]

# Columns treated as multi-level categorical features (one-hot encoded later).
categorical_features = [
    *[f"nih{item}_abl" for item in _nih_items],
    "premrs",
    *[f"nih{item}_b24h" for item in _nih_items],
    "mrs_d90d_c", "loc_cta_abl", "cgsc_cta_abl_c",
]

In [None]:
# Map the textual two-level answers onto 0/1 across the whole table, applying
# one replace per label in the listed order.
binary_codes = {
    "No": 0,
    "Yes": 1,
    "Male": 0,
    "Female": 1,
    "0 - No intra-arterial treatment": 0,
    "1 - Intra-arterial treatment": 1,
}
clinic_binary = clinic_reduced_NaNs
for label, code in binary_codes.items():
    clinic_binary = clinic_binary.replace(label, code)

# Rename the recoded columns after what the value 1 now stands for.
clinic_binary = clinic_binary.rename(
    columns={"r_gender": "female", "r_treall": "intra_arterial_treatment"}
)

clinic_binary

https://machinelearningmastery.com/one-hot-encoding-for-categorical-data/
Reference explaining why we do that.

=> If we use tree-based methods, keep all n features. If not, stay with n-1 to prevent linearly dependent features.

In [None]:
# Select the categorical columns; this raises a KeyError if any name in
# `categorical_features` is no longer a column of `clinic_binary`.
clinic_categorical = clinic_binary[categorical_features]
clinic_categorical

In [None]:
# Shorten the verbose level labels to identifier-like names, applying one
# replace per label in the listed order.
level_names = {
    "0 - Alert": "Alert",
    "1 - Not alert, but arousable": "No_Alert_but_arousable",
    "2 - Not alert, requires repeated stimulation": "No_Alert_but_repeated_stimulation",
    "3 - Comatose": "Comatose",
}
for raw_label, short_label in level_names.items():
    clinic_categorical = clinic_categorical.replace(raw_label, short_label)

clinic_categorical

In [None]:
# Dense one-hot encoding of a single categorical column.
# The encoder keeps its default (sparse) output and the result is densified
# explicitly. The `sparse=False` constructor argument was renamed to
# `sparse_output` in scikit-learn 1.2 and later removed, so passing it raises a
# TypeError on current versions; `.toarray()` works on old and new versions.
onehot_encoder = OneHotEncoder()
# Double brackets keep the column 2-D, i.e. shape (n_samples, 1), as the encoder expects.
transformed = onehot_encoder.fit_transform(clinic_categorical[["nih1a_abl"]].to_numpy()).toarray()
transformed

In [None]:
# Preview: indicator columns for 'nih1b_abl'; drop_first=True omits the first category.
pd.get_dummies(clinic_categorical["nih1b_abl"], prefix='nih1b_abl', drop_first=True)

In [None]:
# Preview: indicator columns for 'nih1a_abl'; drop_first=True omits the first category.
pd.get_dummies(clinic_categorical["nih1a_abl"], prefix='nih1a_abl', drop_first=True)


To encode categorical features: loop through all categorical features and apply pd.get_dummies(). I need a drop_first parameter to choose between n-1 indicator columns (first category dropped) and n indicator columns per feature.


In [None]:
def get_dummies(df: pd.DataFrame, categorical_feature_list: list, drop_first: bool=True, inplace: bool=False) -> pd.DataFrame:
    """One-hot encode the given columns of ``df`` with ``pd.get_dummies``.

    Args:
        df: Source frame; it is never modified.
        categorical_feature_list: Names of the columns to encode. Each one is
            replaced by indicator columns prefixed with the column name.
        drop_first: Passed to ``pd.get_dummies``; if True the first category
            of every feature is dropped (n-1 instead of n indicator columns).
        inplace: If True, the result also contains every column of ``df`` that
            is not in ``categorical_feature_list``, placed in front of the
            indicator columns. If False, only the indicator columns are
            returned. Despite its name, the flag does not mutate ``df``.

    Returns:
        A new DataFrame whose indicator columns follow the order of
        ``categorical_feature_list``.

    Raises:
        KeyError: If a listed feature is not a column of ``df``.
    """
    categories = list(categorical_feature_list)

    # Encode every feature separately so each keeps its own column prefix.
    encoded = [
        pd.get_dummies(df[category], prefix=category, drop_first=drop_first)
        for category in categories
    ]

    # The source columns are only part of the result when inplace=True, so that
    # is the only case in which they have to be dropped. Dropping them from a
    # frame that never contained them raised a KeyError for inplace=False.
    frames = [df.drop(columns=categories)] if inplace else []
    frames.extend(encoded)

    if not frames:
        # Nothing to encode and nothing to keep: empty frame aligned with df's rows.
        return pd.DataFrame(index=df.index)

    # Single concat instead of growing the frame inside the loop.
    return pd.concat(frames, axis=1)

In [None]:
# One-hot encode all categorical columns with the helper defined in this notebook
# (drop_first keeps its default, True).
clinic_one_hot_encoded = get_dummies(clinic_categorical, categorical_features, inplace=True)
clinic_one_hot_encoded

In [None]:
# Final table: the non-categorical columns of clinic_binary followed by the
# one-hot encoded categorical columns.
non_categorical = clinic_binary.drop(columns=categorical_features)
clinic_preprocessed = pd.concat([non_categorical, clinic_one_hot_encoded], axis=1)
clinic_preprocessed

In [None]:
# Correlate numeric/boolean columns only. From pandas 2.0 on, DataFrame.corr()
# no longer drops non-numeric columns silently and raises if a text column is
# still present, so the selection is made explicit here.
plt.matshow(clinic_preprocessed.select_dtypes(["number", "bool"]).corr())
plt.show()

In [None]:
# Compute the correlation matrix once, on numeric/boolean columns only
# (DataFrame.corr() raises on text columns from pandas 2.0 on).
corr_matrix = clinic_preprocessed.select_dtypes(["number", "bool"]).corr()

f = plt.figure(figsize=(100, 100))
plt.matshow(corr_matrix, fignum=f.number)
# Tick labels are taken from the matrix itself so they always line up with its
# rows/columns. Labelling with select_dtypes(['number']) excludes boolean
# columns that corr() includes, which can shift or truncate the labels.
tick_positions = range(len(corr_matrix.columns))
plt.xticks(tick_positions, corr_matrix.columns, fontsize=14, rotation=45)
plt.yticks(tick_positions, corr_matrix.columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);

In [None]:
# Pairwise correlations, largest first, with each unordered feature pair listed once.
# Only numeric/boolean columns are correlated (DataFrame.corr() raises on text
# columns from pandas 2.0 on).
corr = clinic_preprocessed.select_dtypes(["number", "bool"]).corr()
# Take each pair from the strict upper triangle (k=1 excludes the diagonal).
# Filtering with `corr.abs() < 1` and de-duplicating by value would also remove
# perfectly correlated feature pairs and distinct pairs that share a value.
upper_triangle = np.triu(np.ones(corr.shape, dtype=bool), k=1)
corr.where(upper_triangle).unstack().dropna().sort_values(ascending=False)

highly correlated
- ph2_c - ph2 - SAE_ICH_1stonly
- i_iatrt - intra_arterial_treatment
- dur_orp_i - dur_ornd

mrs_d90d_c -> outcome


- SEX
- AGE
- THLYSE -> thrombolysis (treatment) (r_ivtrom, r_treall)
- PRE_CARE -> (situation before At home, etc)
- NIHSS_1D -> nihsco_abl_c
- HYPERCHOL -> hypercholesterolemia
- DIAB -> diabetes
- ARTER_HYPERT -> hypertension
- PREV_STROKE -> previous stroke
- AF -> atrial fibrillation
- TIME_TO_IMAGE: (acute scan date - stroke onset date)

https://git-ext.charite.de/claim/machine-learning/outcome-prediction/mmop/-/blob/paper_experiments/Data_Preprocessing/Clinical_features_extraction.ipynb




to do
- check which features to use (check with Jana)
- few features as baseline (see Slack)
- study side (as label for federated learning)
- exclusion criteria?

ct_abl_time
dur_... -> duration
dur_oa ->