## Jan 2023
## Process the demo table

In [1]:
!pip install fastparquet



In [2]:
import pandas as pd
import numpy as np
import os as os
import datetime   

pd.set_option('display.max_rows', 500)

import seaborn as sns
import matplotlib as plt

datadir = '/challenge/seeing-through-the-fog/data/train_data'

In [3]:
df_dem = pd.read_parquet(datadir + "/" + "demo.parquet")
df_tar = pd.read_parquet(datadir + "/" + "target.parquet")

In [4]:
df_dem.head()

Unnamed: 0,patientid,birth_yr,gender,race,ethnicity,index_month_year
0,RAADC3-395646,1981,Male,Caucasian,Not Hispanic,2021-09-01
1,RAADC3-242960,1991,Female,Caucasian,Not Hispanic,2020-09-01
2,RAADC3-542300,1966,Male,Caucasian,Not Hispanic,2020-11-01
3,RAADC3-542620,1963,Female,Caucasian,Not Hispanic,2020-09-01
4,RAADC3-468258,1945,Male,Caucasian,Not Hispanic,2022-01-01


In [5]:
df_tar.head()

Unnamed: 0,patientid,has_long_covid_diag
0,RAADC3-395646,0
1,RAADC3-242960,0
2,RAADC3-542300,0
3,RAADC3-542620,0
4,RAADC3-468258,0


In [6]:
df_dem_tar = df_dem.merge(df_tar, on='patientid')

In [7]:
print(df_dem_tar.shape)
print(df_dem.shape)

(395364, 7)
(395364, 6)


In [8]:
df_dem_tar.head()

Unnamed: 0,patientid,birth_yr,gender,race,ethnicity,index_month_year,has_long_covid_diag
0,RAADC3-395646,1981,Male,Caucasian,Not Hispanic,2021-09-01,0
1,RAADC3-242960,1991,Female,Caucasian,Not Hispanic,2020-09-01,0
2,RAADC3-542300,1966,Male,Caucasian,Not Hispanic,2020-11-01,0
3,RAADC3-542620,1963,Female,Caucasian,Not Hispanic,2020-09-01,0
4,RAADC3-468258,1945,Male,Caucasian,Not Hispanic,2022-01-01,0


In [9]:
df_dem_tar[df_dem_tar['patientid']=='RAADC3-395646']['index_month_year']

0   2021-09-01
Name: index_month_year, dtype: datetime64[ns]

## Process index_month_year
### cases after 2021-06-01 tend to have higher COVID rate
### Make a new numerical column as "index_month_year_from 2020-01-01"

In [10]:
## Create datetime64 object for YYYY-MM-DD
date_covid_breakout_string = '2020-01-01'
date_covid_breakout = pd.to_datetime(date_covid_breakout_string).to_numpy()

## 
date_long_covid_string = '2021-06-01'
date_long_covid = pd.to_datetime(date_long_covid_string).to_numpy()

In [12]:
df_dem_tar['index_month_year_from_begin'] = df_dem_tar['index_month_year'] - date_covid_breakout
df_dem_tar['after_long_covid_start'] = df_dem_tar['index_month_year'] > date_long_covid

In [13]:
df_dem_tar.head()

Unnamed: 0,patientid,birth_yr,gender,race,ethnicity,index_month_year,has_long_covid_diag,index_month_year_from_begin,after_long_covid_start
0,RAADC3-395646,1981,Male,Caucasian,Not Hispanic,2021-09-01,0,609 days,True
1,RAADC3-242960,1991,Female,Caucasian,Not Hispanic,2020-09-01,0,244 days,False
2,RAADC3-542300,1966,Male,Caucasian,Not Hispanic,2020-11-01,0,305 days,False
3,RAADC3-542620,1963,Female,Caucasian,Not Hispanic,2020-09-01,0,244 days,False
4,RAADC3-468258,1945,Male,Caucasian,Not Hispanic,2022-01-01,0,731 days,True


In [14]:
print(pd.crosstab(df_dem_tar.after_long_covid_start, df_dem_tar.has_long_covid_diag))

has_long_covid_diag          0     1
after_long_covid_start              
False                   325070    86
True                     68688  1520


In [15]:
df_dem_tar['birth_yr'].unique()

array(['1981', '1991', '1966', '1963', '1945', '1952', '1953', '1956',
       '1989', '1976', '1984', '1982', '1988', '1977', '1951', '1987',
       '1961', '1947', '1970', '1973', '1979', '1968', '1960', '1996',
       '2001', '1962', '1957', '1950', '1985', '1967', '1995', '1974',
       '1937', '1959', '1993', '1965', '1948', '1997', '1980', '1954',
       '1972', '1955', '1998', '2002', '1964', '1944', '1983', '1958',
       '1975', '1990', '1978', '2000', '1942', '1999', '1986', '1992',
       '1994', '1933', '1949', '1971', '1941', '1932 and Earlier', '1939',
       '1946', '1938', '1934', '1943', '1936', '1969', '2003', '1935',
       '1940', 'Unknown', '2004'], dtype=object)

### 1404 patients have unknown birthyr

In [16]:
df_dem_tar[df_dem_tar['birth_yr']=='Unknown'].shape

(1404, 9)

In [17]:
df_dem_tar[df_dem_tar['birth_yr']=='1932 and Earlier'].shape

(5801, 9)

In [18]:
df_dem_tar['birth_yr_processed'] = df_dem_tar['birth_yr']

In [25]:
df_dem_tar['birth_yr_processed'].unique()

array(['1981', '1991', '1966', '1963', '1945', '1952', '1953', '1956',
       '1989', '1976', '1984', '1982', '1988', '1977', '1951', '1987',
       '1961', '1947', '1970', '1973', '1979', '1968', '1960', '1996',
       '2001', '1962', '1957', '1950', '1985', '1967', '1995', '1974',
       '1937', '1959', '1993', '1965', '1948', '1997', '1980', '1954',
       '1972', '1955', '1998', '2002', '1964', '1944', '1983', '1958',
       '1975', '1990', '1978', '2000', '1942', '1999', '1986', '1992',
       '1994', '1933', '1949', '1971', '1941', 1932, '1939', '1946',
       '1938', '1934', '1943', '1936', '1969', '2003', '1935', '1940',
       'Unknown', '2004'], dtype=object)

In [26]:
for row in df_dem_tar:
    if row['birth_yr_processed'].isnull():
        row['birth_yr_processed'] = 1968
    else:
        row['birth_yr_processed'] = row['birth_yr']

TypeError: string indices must be integers

In [28]:
df_dem_tar['birth_yr_processed'] = df_dem_tar['birth_yr_processed'].fillna(1968)

df_dem_tar.loc[df_dem_tar['birth_yr_processed'].str.contains('Earlier', na=False), 'birth_yr_processed'] = 1932
df_dem_tar.loc[df_dem_tar['birth_yr_processed'].str.contains('Un', na=False), 'birth_yr_processed'] = 1968 # median age

In [29]:
df_dem_tar.shape

(395364, 10)

In [31]:
## Process age, take 2021 as median index_month_year
df_dem_tar['age'] = 2021 - df_dem_tar['birth_yr_processed'].astype(int)

In [32]:
df_dem_tar['age']

0         40
1         30
2         55
3         58
4         76
          ..
395359    46
395360    64
395361    56
395362    50
395363    35
Name: age, Length: 395364, dtype: int64

In [33]:
print(df_dem_tar.shape)
print(df_dem.shape)

(395364, 11)
(395364, 6)


In [34]:
df_dem_tar.head()

Unnamed: 0,patientid,birth_yr,gender,race,ethnicity,index_month_year,has_long_covid_diag,index_month_year_from_begin,after_long_covid_start,birth_yr_processed,age
0,RAADC3-395646,1981,Male,Caucasian,Not Hispanic,2021-09-01,0,609 days,True,1981,40
1,RAADC3-242960,1991,Female,Caucasian,Not Hispanic,2020-09-01,0,244 days,False,1991,30
2,RAADC3-542300,1966,Male,Caucasian,Not Hispanic,2020-11-01,0,305 days,False,1966,55
3,RAADC3-542620,1963,Female,Caucasian,Not Hispanic,2020-09-01,0,244 days,False,1963,58
4,RAADC3-468258,1945,Male,Caucasian,Not Hispanic,2022-01-01,0,731 days,True,1945,76


In [35]:
df_dem_tar.columns

Index(['patientid', 'birth_yr', 'gender', 'race', 'ethnicity',
       'index_month_year', 'has_long_covid_diag',
       'index_month_year_from_begin', 'after_long_covid_start',
       'birth_yr_processed', 'age'],
      dtype='object')

In [36]:
df_dem_tar_processed = df_dem_tar[['patientid', 'birth_yr', 'gender', 'race', 'ethnicity',\
                                  'index_month_year', 'has_long_covid_diag', \
                                  'index_month_year_from_begin', 'after_long_covid_start','age']]

In [None]:
df_dem_tar_processed.to_csv("/home/huangz36/dem_t", sep='\t', \
                                index=False, header=True)