In [12]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm

In [13]:
DATASET_PATH = r"archive\covid.csv"


features = ['age',
            'diabetes', 'copd', 'asthma',
            'inmsupr', 'hypertension', 'other_disease',
            'cardiovascular', 'obesity', 'renal_chronic',
            'tobacco']

final_outcomes = ['patient_type',
                 'intubed',
                 'pneumonia',
                 'icu',
                'date_died'
                 ]
date_outcomes = ['entry_date', 'date_symptoms']


def calc_error(row, row_not):
    age_1 = row['age']
    age_2 = row_not['age']
    age_dif = age_1 - age_2
    age_err = np.power(age_dif,2)
    row = row.to_numpy()
    row_not = row_not.to_numpy()
    err = (row != row_not).sum()
    return err + age_err


def search_for_closest(row, not_pregnant, K):
    row_ft = row.loc[features]
    #age = row.loc["age"]

    lst_errors = []
    for index, row_not_ft in not_pregnant.iterrows():
        err = calc_error(row_ft, row_not_ft)
        if len(lst_errors) < K:
            lst_errors.append([err, index])
            lst_errors.sort(key=lambda x: x[0])
        else:
            if err >= lst_errors[-1][0]:
                continue
            lst_errors.pop(len(lst_errors)-1)
            lst_errors.append([err, index])
            lst_errors.sort(key=lambda x: x[0])


    return lst_errors



def calc_avg(lst_errors, not_pregnant):
    try:
        size = len(final_outcomes)
        ITE = np.zeros(size)
        cnt = np.zeros(size)

        for err, index in lst_errors:
            outcomes = not_pregnant.iloc[index]
            outcomes = outcomes.loc[final_outcomes]
            outcomes = outcomes.to_numpy()
            for idx, feat in enumerate(outcomes):
                if feat == 97 or feat == 99 or feat == 98:
                    print(f" Encountered Weird Value! feat = {feat}")
                    continue
                if feat not in [1,2]:
                    print(f"bad feat {feat}")
                ITE[idx] += feat
                cnt[idx] += 1

        for idx, c in enumerate(cnt):
            if c == 0:
                cnt[idx] = 1

        ITE = np.divide(ITE, cnt)
        return ITE

    except:
        return None


def print_and_log(st: str):
    with open("log.txt", "a") as f:
        f.write(st+"\n")
        print(st)

In [15]:
K = 11
# def main(K):
df = pd.read_csv(DATASET_PATH)
df = df.loc[df['sex'] == 1]  # take only females
df = df.loc[df['covid_res'] == 1]  # take only positives
df = df.drop('sex', axis=1)  # drop sex
df = df.drop('id', axis=1)  # drop id
df = df.loc[(df['age'] <= 45) &  (df['age'] >= 18)]
age_range = 45 - 18

In [4]:
# means = df.mean(axis = 0)
# age_avarage = means['age']
# df['age'] = df['age'] - age_avarage
# df['age'] = df['age'].div(age_range)

In [16]:
"""
final_outcomes = ['patient_type',
             'intubed',
             'pneumonia',
             'icu',
            'date_died'
             ]"""

#  Not pregnant data adaptation
not_pregnant = df.loc[(df['pregnancy'] == 2)]
not_pregnant.loc[not_pregnant.date_died == "9999-99-99", 'date_died'] = 2
not_pregnant.loc[not_pregnant.date_died != 2, 'date_died'] = 1
not_pregnant.loc[not_pregnant.intubed == 97, 'intubed'] = 2

#  Pregnant data adaptation
pregnant = df.loc[(df['pregnancy'] == 1)]
pregnant.loc[pregnant.date_died == "9999-99-99", 'date_died'] = 2
pregnant.loc[pregnant.date_died != 2, 'date_died'] = 1
pregnant.loc[pregnant.intubed == 97, 'intubed'] = 2



In [6]:
# print(df.columns.tolist())
not_pregnant = not_pregnant.loc[(not_pregnant['patient_type'].isin([1, 2])) & (not_pregnant['intubed'].isin([1, 2]))
                      & (not_pregnant['pneumonia'].isin([1, 2])) & (not_pregnant['icu'].isin([1, 2]))]
pregnant = pregnant.loc[(pregnant['patient_type'].isin([1,2])) & (pregnant['intubed'].isin([1,2]))
                  & (pregnant['pneumonia'].isin([1,2])) & (pregnant['icu'].isin([1,2]))]


# pregnant.loc[pregnant.date_died == "9999-99-99", 'date_died'] = 2
# pregnant.loc[pregnant.date_died != 2, 'date_died'] = 1
# not_pregnant.loc[not_pregnant.date_died == "9999-99-99", 'date_died'] = 2
# not_pregnant.loc[not_pregnant.date_died != 2, 'date_died'] = 1


In [8]:

ATE = np.zeros(len(final_outcomes))
cnt = 0
num_rows = pregnant.shape[0]

now = datetime.now()
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
print_and_log("\n\n---------------")
print_and_log(f"Time: {dt_string}")
print_and_log(f"K: {K}")
print_and_log(f"Total pregnant rows: {num_rows}")
print_and_log(f"Total not_pregnant rows: {not_pregnant.shape[0]}")
not_pregnant_features = not_pregnant[features]
cont = np.zeros(len(final_outcomes))
for index, row in tqdm(pregnant.iterrows()):
    lst_errors = search_for_closest(row, not_pregnant_features, K)
    ite = calc_avg(lst_errors, not_pregnant)
    # print(f" The current ite is: {ite}")
    if ite is None:
        continue
    pregnant_outcome = row.loc[final_outcomes]
    pregnant_outcome = pregnant_outcome.to_numpy()
    poc_ite = np.zeros(len(final_outcomes))

    for idx, feat in enumerate(pregnant_outcome):
        if feat == 97 or feat == 99 or feat == 98:
            continue
        if feat not in [1,2]:
            print(f"bad feat {feat}")
        poc_ite[idx] += feat - ite[idx]
        cont[idx] += 1

    for idx, c in enumerate(cont):
        if c == 0:
            cont[idx] = 1
    ATE = np.add(ATE, poc_ite)
    cnt += 1
    if cnt % 10 == 0:
        temp = np.divide(ATE, cont)
        
pregnant_features = pregnant[features]
for index, row in tqdm(pregnant.iterrows()):
    lst_errors = search_for_closest(row, pregnant_features, K)
    ite = calc_avg(lst_errors, pregnant)
    # print(f" The current ite is: {ite}")
    if ite is None:
        continue
    not_pregnant_outcome = row.loc[final_outcomes]
    not_pregnant_outcome = not_pregnant_outcome.to_numpy()
    poc_ite = np.zeros(len(final_outcomes))
    for idx, feat in enumerate(not_pregnant_outcome):
        if feat == 97 or feat == 99 or feat == 98:
            continue
        if feat not in [1,2]:
            print(f"bad feat {feat}")
        poc_ite[idx] += feat - ite[idx]
        cont[idx] += 1

    for idx, c in enumerate(cont):
        if c == 0:
            cont[idx] = 1
    ATE = np.add(ATE, poc_ite)
    cnt += 1
    if cnt % 10 == 0:
        temp = np.divide(ATE, cont)
        #print(f"temp outcome ({cnt}/{num_rows}: {temp}")

ATE = np.divide(ATE, cont)
print_and_log(f"final outcome: {ATE}")
    # return ATE



---------------
Time: 16/03/2021 11:37:01
K: 11
Total pregnant rows: 416
Total not_pregnant rows: 5653
final outcome: [nan nan nan nan nan]



0it [00:00, ?it/s]
1it [00:00,  1.81it/s]
2it [00:01,  1.79it/s]
3it [00:01,  1.75it/s]
4it [00:02,  1.81it/s]
5it [00:02,  1.79it/s]
6it [00:03,  1.76it/s]
7it [00:03,  1.78it/s]
8it [00:04,  1.81it/s]
9it [00:04,  1.86it/s]
10it [00:05,  1.85it/s]
11it [00:06,  1.91it/s]
12it [00:06,  1.88it/s]
13it [00:07,  1.90it/s]
14it [00:07,  1.95it/s]
15it [00:08,  1.94it/s]
16it [00:08,  1.94it/s]
17it [00:09,  1.92it/s]
18it [00:09,  1.90it/s]
19it [00:10,  1.89it/s]
20it [00:10,  1.88it/s]
21it [00:11,  1.91it/s]
22it [00:11,  1.94it/s]
23it [00:12,  1.94it/s]
24it [00:12,  1.92it/s]
25it [00:13,  1.90it/s]
26it [00:13,  1.86it/s]
27it [00:14,  1.84it/s]
28it [00:14,  1.86it/s]
29it [00:15,  1.91it/s]
30it [00:15,  1.93it/s]
31it [00:16,  1.94it/s]
32it [00:16,  1.97it/s]
33it [00:17,  1.94it/s]
34it [00:18,  1.80it/s]
35it [00:18,  1.75it/s]
36it [00:19,  1.66it/s]
37it [00:20,  1.63it/s]
38it [00:20,  1.63it/s]
39it [00:21,  1.59it/s]
40it [00:21,

In [None]:
if __name__ == "__main__":
    for K in range(3, 21):
        main(K)