In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
from datetime import date

In [None]:
# Load the raw patient table; the relative path is resolved against the
# current working directory.
data = pd.read_csv("sestro_data.csv")
print(data.shape)  # printed explicitly: only the cell's last expression is auto-displayed
data.head()

In [None]:
# List the column headers exactly as they were read from the CSV (before renaming).
data.columns

In [None]:
# Mapping from the raw (Russian) column headers of the source table to short
# English identifiers used throughout the rest of the notebook.
# The numeric suffixes of the raw headers (1, 3, 5, 7, 9-10) are carried over
# into the English names; presumably they denote the day of the measurement
# -- TODO confirm with the data owner.
# NOTE(review): "Л3" maps to "l3" (no underscore) while its siblings are
# "l_1", "l_5", "l_7". The later column lists use the same "l3" spelling, so
# renaming it requires updating those lists as well.
cols_to_eng = {
    "от куда пациент": "from",
    "N ПАЦИЕНТА(КРОВЬ)": "blood_id",
    "Пол ( 1-м, 2-ж)": "sex",
    "КАРТА ПАЦ-ТА": "man_id",
    "Количество суток с начала заболевания до госпитализации": "ill_to_hosp",
    "Дата начала лечения": "date_start_treat",
    "День заболевания": "date_ill",
    "Исходы (1 умер, 2 выписка)": "death",
    "Степень тяжести заболевания (1-легкая, 2-средняя, 3-тяжелая и кр.тяжелая)": "severity",
    "ДАТА РОЖДЕНИЯ (ВОЗРАСТ)": "age",
    "СЕКВЕНАТОР": "seq_machine",
    "НАЗВАНИЕ БИБЛИОТЕКИ (ЗАПУСКА)": "launch_name",
    "ДАТА ЗАПУСКА": "launch_date",
    "ИНДЕКС": "index",
    "ГРУППА": "group",
    "ЛЕЧЕНИЕ": "treatment",
    "Койко-дней": "days_at_hosp",
    "Мутаци в гене ITGAM": "ITGAM_mut",
    "rs мутации": "rs_mut",
    "Повышение температуры": "temp",
    "Слабость": "weakness",
    "Потеря обонания и вкуса": "smell_taste_loss",
    "ИБС": "chd",  # ischemic (coronary) heart disease
    "АКС": "aks",  # ?? (abbreviation not expanded anywhere in this notebook)
    "ЦВБ": "cvd",  # cerebrovascular disease
    "Количество суток на ИВЛ": "imv_days",
    "МазокПЦР1": "pcr_1",
    "Лейк1": "leuk_1",
    "Нейтр1": "neut_1",  # neutrophils(?)
    "Л1": "l_1",  # ?? (presumably lymphocytes -- TODO confirm)
    "ЛДГ1": "ldh_1",  # lactate dehydrogenase
    "СРБ1": "crp_1",  # C-reactive protein
    "Фер1": "fer_1",
    "Д-димер1": "d_dym_1",
    "ИЛ-6_1": "il6_1",
    "Креатинин1": "creat_1",
    "Глюкоза1": "glu_1",
    "МазокПЦР3": "pcr_3",
    "Лейк3": "leuk_3",
    "Нейтр3": "neut_3",
    "Л3": "l3",
    "ЛДГ3": "ldh_3",
    "СРБ3": "crp_3",
    "Фер3": "fer_3",
    "Д-димер3": "d_dym_3",
    "ИЛ-6_3": "il6_3",
    "Креатинин3": "creat_3",
    "Глюкоза3": "glu_3",
    "Мазок ПЦР5": "pcr_5",
    "Лейк5": "leuk_5",
    "Нейтр5": "neut_5",
    "Л5": "l_5",
    "ЛДГ5": "ldh_5",
    "СРБ_5": "crp_5",
    "Фер_5": "fer_5",
    "Д-димер5": "d_dym_5",
    "ИЛ-6_5": "il6_5",
    "Креатинин5": "creat_5",
    "Глюкоза5": "glu_5",
    "Лейк7": "leuk_7",
    "Нейтр7": "neut_7",
    "Л7": "l_7",
    "ЛДГ7": "ldh_7",
    "СРБ_7": "crp_7",
    "Фер_7": "fer_7",
    "Д-димер7": "d_dym_7",
    "ИЛ-6_7": "il6_7",
    "Креатинин7": "creat_7",
    "Глюкоза7": "glu_7",
    "Мазок ПЦР 9-10": "pcr_9",
    "Прокальцитонин": "pct",  # Procalcitonin (hehh???)
    "NEWS при поступлении (сумма баллов):": "news_1",  # National Early Warning Score (NEWS)
    "Степень дыхательной недостаточности на 5й день от поступления: ": "resp_fail_5",
    "NEWS на 5й день от поступления": "news_5",
    "Степень дыхательной недостаточности при выписке": "resp_fail_out",
    "NEWS при выписке (сумма баллов):": "news_out",
}

In [None]:
# Switch to the English column names, then dump the distinct values of every
# column so malformed entries can be spotted before cleaning.
data = data.rename(columns=cols_to_eng)
# DataFrame.items() is the supported spelling: iteritems() was deprecated in
# pandas 1.5 and removed in pandas 2.0.
for key, value in data.items():
    distinct = value.unique()  # computed once, reused for size and contents
    print("--------------------")
    print(f"СТОЛБЕЦ: {key}")
    print(f"РАЗМЕР: {distinct.shape}")
    print(f"ЗНАЧЕНИЯ: {distinct}")

    print("--------------------")

## Correcting values

In [None]:
from datetime import date

# Sex
# Source encoding: 1 = male, 2 = female
# Target encoding: 0 = male, 1 = female
def get_sex(x):
    """Recode one raw sex cell to 0 (male) / 1 (female).

    Parameters
    ----------
    x : anything accepted by ``int()`` (number or numeric string); other
        values are treated as missing.

    Returns
    -------
    0 for source code 1, 1 for source code 2, ``np.nan`` for any other code
    or for a value that cannot be converted to an integer (in which case a
    ``sex error`` line is printed).
    """
    try:
        code = int(x)
    except (ValueError, TypeError, OverflowError):
        # ValueError: NaN or non-numeric text; TypeError: None and other
        # non-numeric objects; OverflowError: +/-inf.
        print(f"sex error: {x}")
        return np.nan
    if code == 2:
        return 1
    if code == 1:
        return 0
    return np.nan


# Outcome
# Source encoding: 1 = died, 2 = discharged
# Target encoding: 1 = died, 0 = alive
def get_death(x):
    """Recode one raw outcome cell to 1 (died) / 0 (alive).

    Parameters
    ----------
    x : anything accepted by ``int()`` (number or numeric string); other
        values are treated as missing.

    Returns
    -------
    1 for source code 1, 0 for source code 2, ``np.nan`` for any other code
    or for a value that cannot be converted to an integer (in which case a
    ``death error`` line is printed).
    """
    try:
        code = int(x)
    except (ValueError, TypeError, OverflowError):
        # ValueError: NaN or non-numeric text; TypeError: None and other
        # non-numeric objects; OverflowError: +/-inf.
        print(f"death error: {x}")
        return np.nan
    outcome = 2 - code  # maps 1 -> 1 (died) and 2 -> 0 (alive)
    if outcome == 1 or outcome == 0:
        return outcome
    return np.nan


def get_severity(x):
    """Normalise one raw severity cell to a grade in {1, 2, 3}.

    The source column is coded 1 = mild, 2 = moderate, 3 = severe/critical.

    Parameters
    ----------
    x : anything accepted by ``int()`` (number or numeric string); other
        values are treated as missing.

    Returns
    -------
    The integer grade when it is 1, 2 or 3; 1 when the integer value is 0;
    ``np.nan`` for any other value or for a value that cannot be converted
    to an integer (in which case a ``sev error`` line is printed).
    """
    try:
        grade = int(x)
    except (ValueError, TypeError, OverflowError):
        # ValueError: NaN or non-numeric text; TypeError: None and other
        # non-numeric objects; OverflowError: +/-inf.
        print(f"sev error: {x}")
        return np.nan
    if 1 <= grade <= 3:
        return grade
    if grade == 0:
        # The only value the former `0 <= x <= 1` fallback could still match.
        return 1
    return np.nan


def calculate_age(birthDate, today=None):
    """Return the number of completed years between ``birthDate`` and ``today``.

    Parameters
    ----------
    birthDate : object with ``year``, ``month`` and ``day`` attributes
        (e.g. ``datetime.date``).
    today : same kind of object, optional
        Reference date. Defaults to ``date.today()``, i.e. the day the
        notebook is executed.
        NOTE(review): with the default, ages drift with the run date; pass a
        fixed reference (e.g. the admission date) for reproducible results.

    Returns
    -------
    int
        Year difference, reduced by one when the birthday has not yet
        occurred in the reference year.
    """
    if today is None:
        today = date.today()
    # True (== 1) when the birthday is still ahead in the reference year.
    birthday_pending = (today.month, today.day) < (birthDate.month, birthDate.day)
    return today.year - birthDate.year - birthday_pending


def get_age(x):
    """Parse one raw "birth date (age)" cell into an age in whole years.

    Accepted inputs:
      * numbers (``45``, ``45.0``) -- taken as the age itself;
      * text ages, optionally followed by the Russian words
        "год" / "года" / "лет" (``"45 лет"``, ``"22 года"``);
      * birth dates written ``month/day/year`` or ``day.month.year``, which
        are converted with ``calculate_age``.

    Returns
    -------
    int age, or ``np.nan`` (after printing an ``age error`` line with the raw
    value) when the cell cannot be interpreted.
    """
    if isinstance(x, (int, float, np.integer, np.floating)) and not isinstance(x, bool):
        # Numeric cells previously failed on `"/" in x` and were lost as NaN.
        try:
            return int(x)
        except (ValueError, OverflowError):
            pass  # NaN / inf: reported as an error below
    elif isinstance(x, str):
        # Strip the unit words and all spaces. "года" must go before "год",
        # otherwise a stray "а" is left behind and int() fails.
        text = (
            x.replace("года", "")
            .replace("год", "")
            .replace("лет", "")
            .replace(" ", "")
        )
        try:
            if "/" in text:
                month, day, year = [int(part) for part in text.split("/")]
                return calculate_age(date(year, month, day))
            if "." in text:
                day, month, year = [int(part) for part in text.split(".")]
                return calculate_age(date(year, month, day))
            return int(text)
        except ValueError:
            # Non-numeric piece, wrong number of date parts, or impossible date.
            pass
    print(f"age error: {x}")
    return np.nan


# Generic numeric parser, used e.g. for bed-days (days_at_hosp)
# and days on mechanical ventilation (imv_days).
def check_floats(x):
    """Convert one raw cell to ``float``, tolerating common formatting noise.

    Attempts, in order:
      1. ``float(x)`` as is;
      2. the same after removing newlines, non-breaking spaces and spaces;
      3. comma handling: if the text has both "," and ".", commas are treated
         as thousands separators and removed; otherwise "," is treated as the
         decimal separator.

    Returns
    -------
    float, or ``np.nan`` (after printing a ``num error`` line with the
    original and the stripped value) when every attempt fails. Non-string,
    non-numeric inputs such as ``None`` also yield ``np.nan``.
    """
    starting_x = x
    try:
        return float(x)
    except (ValueError, TypeError, OverflowError):
        pass
    # The text clean-up only makes sense for strings; anything else (e.g.
    # None) used to raise an uncaught AttributeError on `.replace`.
    if isinstance(x, str):
        x = x.replace("\n", "").replace("\xa0", " ").replace(" ", "")
        try:
            return float(x)
        except ValueError:
            pass
        try:
            if "," in x and "." in x:
                return float(x.replace(",", ""))
            return float(x.replace(",", "."))
        except ValueError:
            pass
    print(f"num error: {starting_x, x}")
    return np.nan

In [None]:
# Run each dedicated cleaner over its column, one column at a time.
for column, cleaner in (
    ("sex", get_sex),
    ("death", get_death),
    ("severity", get_severity),
    ("age", get_age),
):
    data[column] = data[column].apply(cleaner)

In [None]:
# Columns whose cells are passed through check_floats so that they end up as
# floats (or NaN when a cell cannot be parsed).
cols_to_check = [
    "ill_to_hosp",
    "date_ill",
    "days_at_hosp",
    "temp",
    "weakness",
    "smell_taste_loss",
    "chd",
    "aks",
    "cvd",
    "imv_days",
    "pcr_1",
    "leuk_1",
    "neut_1",
    "l_1",
    "ldh_1",
    "crp_1",
    "fer_1",
    "d_dym_1",
    "il6_1",
    "creat_1",
    "glu_1",
    "pcr_3",
    "leuk_3",
    "neut_3",
    "l3",
    "ldh_3",
    "crp_3",
    "fer_3",
    "d_dym_3",
    "il6_3",
    "creat_3",
    "glu_3",
    "pcr_5",
    "leuk_5",
    "neut_5",
    "l_5",
    "ldh_5",
    "crp_5",
    "fer_5",
    "d_dym_5",
    "il6_5",
    "creat_5",
    "glu_5",
    "leuk_7",
    "neut_7",
    "l_7",
    "ldh_7",
    "crp_7",
    "fer_7",
    "d_dym_7",
    "il6_7",
    "creat_7",
    "glu_7",
    "pcr_9",
    "pct",
    "news_1",
    "resp_fail_5",
    "news_5",
    "resp_fail_out",
    "news_out",
]
# Element-wise conversion; check_floats prints a "num error" line for every
# cell it cannot parse.
for col in cols_to_check:
    data[col] = data[col].apply(check_floats)

In [None]:
# Show the column names of the frame after renaming and cleaning.
data.columns

In [None]:
# Dump the distinct values of every column again to check the result of the
# cleaning step. rename() ignores mapping keys that are not present among the
# columns by default, so repeating it on an already renamed frame is harmless.
data = data.rename(columns=cols_to_eng)
# DataFrame.items() is the supported spelling: iteritems() was deprecated in
# pandas 1.5 and removed in pandas 2.0.
for key, value in data.items():
    distinct = value.unique()  # computed once, reused for size and contents
    print("--------------------")
    print(f"СТОЛБЕЦ: {key}")
    print(f"РАЗМЕР: {distinct.shape}")
    print(f"ЗНАЧЕНИЯ: {distinct}")

    print("--------------------")

## Imputing

In [None]:
# Columns handed to the KNN imputer.
# NOTE(review): the list includes "death" and "severity"; if these are later
# used as prediction targets, imputing features with them may leak target
# information -- confirm this is intended.
imputing_cols = [
    "sex",
    "ill_to_hosp",
    "death",
    "severity",
    "age",
    "days_at_hosp",  # not for machine learning
    "temp",
    "weakness",
    "smell_taste_loss",
    "chd",
    "aks",
    "cvd",
    "imv_days",
    "pcr_1",
    "leuk_1",
    "neut_1",
    "l_1",
    "ldh_1",
    "crp_1",
    "fer_1",
    "d_dym_1",
    "il6_1",
    "creat_1",
    "glu_1",
    "pcr_3",
    "leuk_3",
    "neut_3",
    "l3",
    "ldh_3",
    "crp_3",
    "fer_3",
    "d_dym_3",
    "il6_3",
    "creat_3",
    "glu_3",
    "pcr_5",
    "leuk_5",
    "neut_5",
    "l_5",
    "ldh_5",
    "crp_5",
    "fer_5",
    "d_dym_5",
    "il6_5",
    "creat_5",
    "glu_5",
    "leuk_7",
    "neut_7",
    "l_7",
    "ldh_7",
    "crp_7",
    "fer_7",
    "d_dym_7",
    "il6_7",
    "creat_7",
    "glu_7",
    "pcr_9",
    "pct",
    "news_1",
    "resp_fail_5",
    "news_5",
    "resp_fail_out",
    "news_out",
]
# Columns kept alongside the imputed ones but not imputed themselves.
# NOTE(review): this list is not referenced anywhere else in the visible notebook.
additional_cols = [
    "group",  # not for machine learning
    "treatment",  # not for machine learning
]
def impute(data, n_neighbors=5):
    """Fill missing values of a numeric frame with k-nearest-neighbour imputation.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric columns only; NaN marks a missing value.
    n_neighbors : int, default 5
        Number of neighbours passed to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        New float frame with the same index and columns as ``data``. Columns
        without a single observed value cannot be imputed and are returned
        as all-NaN.
    """
    # KNNImputer leaves columns that are entirely NaN out of its output by
    # default, which made the column count disagree with data.columns and
    # raised a shape error. Impute only the columns that have data and write
    # them back by position.
    has_values = data.notna().any(axis=0).to_numpy()
    filled = np.full(data.shape, np.nan)
    if has_values.any():
        imputer = KNNImputer(n_neighbors=n_neighbors)
        filled[:, has_values] = imputer.fit_transform(data.iloc[:, has_values])
    return pd.DataFrame(filled, columns=data.columns, index=data.index)

In [None]:
# First imputation pass over the selected columns; the result is displayed.
imputing_data = impute(data[imputing_cols])
imputing_data

## Filter and impute again

In [None]:
# One histogram per imputed column, each shown as its own figure.
for col, series in imputing_data.items():
    sns.histplot(series)
    plt.show()

In [None]:
# Columns exempt from the outlier filter.
do_not_cols = ['sex', 'death', 'severity']
for col in imputing_data.columns:
    if col in do_not_cols:
        continue
    column = imputing_data[col]
    center = column.mean()
    spread = column.std()
    # Keep values within mean +/- 3 standard deviations (bounds inclusive);
    # everything outside becomes NaN.
    in_range = column.between(center - 3 * spread, center + 3 * spread)
    imputing_data[col] = column.where(in_range)
# Second imputation pass fills the cells blanked by the filter.
imputing_data = impute(imputing_data)
# Re-plot the distributions after filtering and re-imputation.
for col in imputing_data.columns:
    sns.histplot(imputing_data[col])
    plt.show()

In [None]:
# Write every imputed column back into the main frame under the same name.
for col, values in imputing_data.items():
    data[col] = values

## Saving

In [None]:
# Columns exported to the "big" dataset: every imputed column plus
# "group" and "treatment".
needed_cols = [
    "sex",
    "ill_to_hosp",
    "death",
    "severity",
    "age",
    "group",  # not for machine learning
    "treatment",  # not for machine learning
    "days_at_hosp",  # not for machine learning
    "temp",
    "weakness",
    "smell_taste_loss",
    "chd",
    "aks",
    "cvd",
    "imv_days",
    "pcr_1",
    "leuk_1",
    "neut_1",
    "l_1",
    "ldh_1",
    "crp_1",
    "fer_1",
    "d_dym_1",
    "il6_1",
    "creat_1",
    "glu_1",
    "pcr_3",
    "leuk_3",
    "neut_3",
    "l3",
    "ldh_3",
    "crp_3",
    "fer_3",
    "d_dym_3",
    "il6_3",
    "creat_3",
    "glu_3",
    "pcr_5",
    "leuk_5",
    "neut_5",
    "l_5",
    "ldh_5",
    "crp_5",
    "fer_5",
    "d_dym_5",
    "il6_5",
    "creat_5",
    "glu_5",
    "leuk_7",
    "neut_7",
    "l_7",
    "ldh_7",
    "crp_7",
    "fer_7",
    "d_dym_7",
    "il6_7",
    "creat_7",
    "glu_7",
    "pcr_9",
    "pct",
    "news_1",
    "resp_fail_5",
    "news_5",
    "resp_fail_out",
    "news_out",
]
# Reduced export: the general columns and the "_1" measurements, plus
# "pct", "news_1" and "resp_fail_5".
small_needed_cols = [
    "sex",
    "ill_to_hosp",
    "death",
    "severity",
    "age",
    "group",  # not for machine learning
    "treatment",  # not for machine learning
    "days_at_hosp",  # not for machine learning
    "temp",
    "weakness",
    "smell_taste_loss",
    "chd",
    "aks",
    "cvd",
    "imv_days",
    "pcr_1",
    "leuk_1",
    "neut_1",
    "l_1",
    "ldh_1",
    "crp_1",
    "fer_1",
    "d_dym_1",
    "il6_1",
    "creat_1",
    "glu_1",
    "pct",
    "news_1",
    "resp_fail_5",
]

In [None]:
# Full export: preview, then write without the index column.
data_big = data[needed_cols]
display(data_big.head())
data_big.to_csv('big_data.csv', index=False)

# Reduced export: previewed with group/treatment, written without them.
data_small = data[small_needed_cols]
display(data_small.head())
small_export = data_small.drop(columns=['group', 'treatment'])
small_export.to_csv('small_data.csv', index=False)