# Load saved data

In [3]:
import pandas as pd
symptoms = pd.read_csv('symptoms_survey_dump.csv')
diagnosis = pd.read_csv('diagnosis_survey_dump.csv')

In [4]:
symptoms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148561 entries, 0 to 148560
Data columns (total 26 columns):
 #   Column                                                Non-Null Count   Dtype  
---  ------                                                --------------   -----  
 0   Unnamed: 0                                            148561 non-null  int64  
 1   pid                                                   148561 non-null  object 
 2   version                                               148561 non-null  int64  
 3   name                                                  148561 non-null  object 
 4   time_scheduled                                        0 non-null       float64
 5   time_received                                         148561 non-null  object 
 6   time                                                  148561 non-null  object 
 7   time_start                                            148561 non-null  object 
 8   id                                          

In [5]:
diagnosis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2420 entries, 0 to 2419
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Unnamed: 0                      2420 non-null   int64  
 1   time_received                   2420 non-null   object 
 2   time                            2420 non-null   object 
 3   time_start                      2420 non-null   object 
 4   pid                             2420 non-null   object 
 5   version                         2420 non-null   int64  
 6   name                            2420 non-null   object 
 7   time_scheduled                  0 non-null      float64
 8   id                              2420 non-null   object 
 9   task_who_time_start             0 non-null      float64
 10  task_who_time_end               428 non-null    object 
 11  task_who_value                  2296 non-null   float64
 12  task_how_time_start             42

# Process data

In [51]:
# filter symptom data where the user has covid diagnosis data

In [6]:
len(diagnosis['id'].unique())

1547

In [7]:
# Invalid data points
len(diagnosis[(pd.isnull(diagnosis['task_who_value'])) | (pd.isnull(diagnosis['task_how_value']))]['id'].unique())

107

In [8]:
# Filter for only self and PCR test.
diagnosis_filtered = diagnosis[(diagnosis['task_who_value'] == 0.0) & (diagnosis['task_how_value'] == 0.0)]

In [9]:
len(diagnosis_filtered['id'].unique())

855

In [10]:
diagnosis_filtered['time'].describe()

count                                 1241
unique                                1241
top       2020-11-29 15:42:02.464000+00:00
freq                                     1
Name: time, dtype: object

In [11]:
# Filter symptoms for only the ids that have valid diagnosis
symptoms_filtered = symptoms[symptoms['id'].isin(diagnosis_filtered['id'].unique())]

In [12]:
symptoms_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7926 entries, 193 to 148560
Data columns (total 26 columns):
 #   Column                                                Non-Null Count  Dtype  
---  ------                                                --------------  -----  
 0   Unnamed: 0                                            7926 non-null   int64  
 1   pid                                                   7926 non-null   object 
 2   version                                               7926 non-null   int64  
 3   name                                                  7926 non-null   object 
 4   time_scheduled                                        0 non-null      float64
 5   time_received                                         7926 non-null   object 
 6   time                                                  7926 non-null   object 
 7   time_start                                            7926 non-null   object 
 8   id                                                    

In [11]:
# Merge covid symptoms data with 10 days prior to covid diagnosis data

In [13]:
symptoms_filtered = symptoms_filtered.set_index(pd.to_datetime(symptoms_filtered['time'], infer_datetime_format=True))

In [14]:
symptoms_filtered.head()

Unnamed: 0_level_0,Unnamed: 0,pid,version,name,time_scheduled,time_received,time,time_start,id,task_mood_time_start,...,task_symptoms_value_Cough,task_symptoms_value_Difficulties breathing,task_symptoms_value_Fever,task_symptoms_value_Fatigue,task_symptoms_value_Muscle or joint pain,task_symptoms_value_Headache,task_symptoms_value_Congestion,task_symptoms_value_Nausea,task_symptoms_value_Chills,task_symptoms_value_Confusion
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-11-03 12:25:24.380000+00:00,193,symptoms,1,CoViD symptoms,,2020-11-03 12:25:20.938000+00:00,2020-11-03 12:25:24.380000+00:00,2020-11-03 12:25:06.435000+00:00,02KIBYpUG9QJ9yL0w7fU1oHUysu1,,...,0.0,1.0,0.0,,,,,,,
2020-10-18 20:39:24.957000+00:00,194,symptoms,1,CoViD symptoms,,2020-10-18 20:39:23.830000+00:00,2020-10-18 20:39:24.957000+00:00,2020-10-18 20:39:13.561000+00:00,02KIBYpUG9QJ9yL0w7fU1oHUysu1,,...,0.0,0.0,0.0,,,,,,,
2020-11-05 10:11:26.093000+00:00,195,symptoms,1,CoViD symptoms,,2020-11-05 10:11:21.946000+00:00,2020-11-05 10:11:26.093000+00:00,2020-11-05 10:11:14.060000+00:00,02KIBYpUG9QJ9yL0w7fU1oHUysu1,,...,0.0,0.0,0.0,,,,,,,
2020-10-28 17:07:54.955000+00:00,196,symptoms,1,CoViD symptoms,,2020-10-28 17:07:51.222000+00:00,2020-10-28 17:07:54.955000+00:00,2020-10-28 17:07:45.169000+00:00,02KIBYpUG9QJ9yL0w7fU1oHUysu1,,...,0.0,1.0,0.0,,,,,,,
2020-12-08 10:01:07+00:00,197,symptoms,1,CoViD symptoms,,2020-12-08 10:01:00.120000+00:00,2020-12-08 10:01:07+00:00,2020-12-08 10:00:43.328000+00:00,02KIBYpUG9QJ9yL0w7fU1oHUysu1,,...,0.0,0.0,0.0,,,,,,,


In [15]:
symptoms_filtered.loc['2020-11-03':'2020-11-20']

Unnamed: 0_level_0,Unnamed: 0,pid,version,name,time_scheduled,time_received,time,time_start,id,task_mood_time_start,...,task_symptoms_value_Cough,task_symptoms_value_Difficulties breathing,task_symptoms_value_Fever,task_symptoms_value_Fatigue,task_symptoms_value_Muscle or joint pain,task_symptoms_value_Headache,task_symptoms_value_Congestion,task_symptoms_value_Nausea,task_symptoms_value_Chills,task_symptoms_value_Confusion
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-11-03 12:25:24.380000+00:00,193,symptoms,1,CoViD symptoms,,2020-11-03 12:25:20.938000+00:00,2020-11-03 12:25:24.380000+00:00,2020-11-03 12:25:06.435000+00:00,02KIBYpUG9QJ9yL0w7fU1oHUysu1,,...,0.0,1.0,0.0,,,,,,,
2020-11-05 10:11:26.093000+00:00,195,symptoms,1,CoViD symptoms,,2020-11-05 10:11:21.946000+00:00,2020-11-05 10:11:26.093000+00:00,2020-11-05 10:11:14.060000+00:00,02KIBYpUG9QJ9yL0w7fU1oHUysu1,,...,0.0,0.0,0.0,,,,,,,
2020-11-20 10:15:34.082000+00:00,198,symptoms,1,CoViD symptoms,,2020-11-20 10:15:34.405000+00:00,2020-11-20 10:15:34.082000+00:00,2020-11-20 10:15:25.672000+00:00,02KIBYpUG9QJ9yL0w7fU1oHUysu1,,...,0.0,1.0,0.0,,,,,,,
2020-11-17 10:05:44.983000+00:00,215,symptoms,1,CoViD symptoms,,2020-11-17 10:05:42.219000+00:00,2020-11-17 10:05:44.983000+00:00,2020-11-17 10:04:06.733000+00:00,02KIBYpUG9QJ9yL0w7fU1oHUysu1,,...,0.0,1.0,0.0,,,,,,,
2020-11-09 13:26:00.689000+00:00,222,symptoms,1,CoViD symptoms,,2020-11-09 13:26:00.982000+00:00,2020-11-09 13:26:00.689000+00:00,2020-11-09 13:25:47.786000+00:00,02KIBYpUG9QJ9yL0w7fU1oHUysu1,,...,0.0,1.0,0.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-16 21:25:19.891000+00:00,148426,symptoms,1,CoViD symptoms,,2020-11-16 21:25:16.891000+00:00,2020-11-16 21:25:19.891000+00:00,2020-11-16 21:25:06.006000+00:00,zv7BUkCSS9eatY96QyEc1wKlqJE3,,...,1.0,0.0,0.0,,,,,,,
2020-11-15 08:49:26.224000+00:00,148439,symptoms,1,CoViD symptoms,,2020-11-15 08:49:23.246000+00:00,2020-11-15 08:49:26.224000+00:00,2020-11-15 08:49:14.958000+00:00,zv7BUkCSS9eatY96QyEc1wKlqJE3,,...,1.0,0.0,0.0,,,,,,,
2020-11-13 10:39:10.326000+00:00,148446,symptoms,1,CoViD symptoms,,2020-11-13 10:39:07.405000+00:00,2020-11-13 10:39:10.326000+00:00,2020-11-13 10:38:49.179000+00:00,zv7BUkCSS9eatY96QyEc1wKlqJE3,,...,1.0,0.0,0.0,,,,,,,
2020-11-03 20:28:10.480000+00:00,148448,symptoms,1,CoViD symptoms,,2020-11-03 20:28:10.635000+00:00,2020-11-03 20:28:10.480000+00:00,2020-11-03 20:27:50.068000+00:00,zv7BUkCSS9eatY96QyEc1wKlqJE3,,...,1.0,0.0,0.0,,,,,,,


In [16]:
symptoms_filtered['task_mood_value_valence'][0]

0.5

In [17]:
diagnosis_filtered = diagnosis_filtered.set_index(pd.to_datetime(diagnosis_filtered['time'], infer_datetime_format=True))

In [18]:
diagnosis_filtered[pd.notnull(diagnosis_filtered['task_date_illness_value'])]

Unnamed: 0_level_0,Unnamed: 0,time_received,time,time_start,pid,version,name,time_scheduled,id,task_who_time_start,...,task_who_value,task_how_time_start,task_how_time_end,task_how_value,task_date_diagnosed_time_start,task_date_diagnosed_time_end,task_date_diagnosed_value,task_date_illness_time_start,task_date_illness_time_end,task_date_illness_value
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-26 11:24:13.039000+00:00,29,2021-01-26 11:24:12.241000+00:00,2021-01-26 11:24:13.039000+00:00,2021-01-26 11:23:49.983000+00:00,diagnosis,2,Covid Diagnosis,,0YXuBKrbltQO5FZGv5GHsoF4Ssk2,,...,0.0,2021-01-26 11:23:54.225000+00:00,2021-01-26 11:23:59.540000+00:00,0.0,2021-01-26 11:23:59.540000+00:00,2021-01-26 11:24:05.530000+00:00,2021-01-22 00:00:00+00:00,2021-01-26 11:24:05.530000+00:00,2021-01-26 11:24:13.038000+00:00,2021-01-17 00:00:00+00:00
2021-01-25 21:41:21.773000+00:00,34,2021-01-25 21:41:22.007000+00:00,2021-01-25 21:41:21.773000+00:00,2021-01-25 21:40:03.289000+00:00,diagnosis,2,Covid Diagnosis,,0vTPhZqW0sQCrbGqyvUXtdO8lHz1,,...,0.0,2021-01-25 21:40:10.526000+00:00,2021-01-25 21:40:20.051000+00:00,0.0,2021-01-25 21:40:20.051000+00:00,2021-01-25 21:40:34.473000+00:00,2020-11-11 00:00:00+00:00,2021-01-25 21:40:34.473000+00:00,2021-01-25 21:41:21.773000+00:00,2020-11-11 00:00:00+00:00
2021-01-23 15:04:44.116000+00:00,50,2021-01-23 15:04:42.527000+00:00,2021-01-23 15:04:44.116000+00:00,2021-01-23 15:04:01.725000+00:00,diagnosis,2,Covid Diagnosis,,1JSHZlbCwAgx0CFNSQRkhdAw1Kt1,,...,0.0,2021-01-23 15:04:10.404000+00:00,2021-01-23 15:04:24.328000+00:00,0.0,2021-01-23 15:04:24.328000+00:00,2021-01-23 15:04:33.043000+00:00,2021-01-21 00:00:00+00:00,2021-01-23 15:04:33.043000+00:00,2021-01-23 15:04:44.116000+00:00,2021-01-20 00:00:00+00:00
2021-01-22 23:45:28.571000+00:00,66,2021-01-22 23:45:27.839000+00:00,2021-01-22 23:45:28.571000+00:00,2021-01-22 23:42:37.897000+00:00,diagnosis,2,Covid Diagnosis,,1f70DzlxzeVmYIJQanrVWfNoV5d2,,...,0.0,2021-01-22 23:42:43.327000+00:00,2021-01-22 23:42:56.398000+00:00,0.0,2021-01-22 23:42:56.398000+00:00,2021-01-22 23:43:38.541000+00:00,2021-01-16 00:00:00+00:00,2021-01-22 23:43:38.541000+00:00,2021-01-22 23:45:28.571000+00:00,2021-01-19 00:00:00+00:00
2021-02-07 13:25:28.932000+00:00,81,2021-02-07 13:22:53.857000+00:00,2021-02-07 13:25:28.932000+00:00,2021-02-07 13:25:09.112000+00:00,diagnosis,2,Covid Diagnosis,,24N0DS429CeGFy5p7Dg4JZ0zBGa2,,...,0.0,2021-02-07 13:25:11.778000+00:00,2021-02-07 13:25:16.227000+00:00,0.0,2021-02-07 13:25:16.227000+00:00,2021-02-07 13:25:22.830000+00:00,2021-02-02 00:00:00+00:00,2021-02-07 13:25:22.830000+00:00,2021-02-07 13:25:28.932000+00:00,2021-02-01 00:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-02-04 07:53:32.009000+00:00,2354,2021-02-04 07:53:29.584000+00:00,2021-02-04 07:53:32.009000+00:00,2021-02-04 07:52:59.383000+00:00,diagnosis,2,Covid Diagnosis,,yD5QOGx7b4Y6o4O9u1NqedOFD8y2,,...,0.0,2021-02-04 07:53:05.792000+00:00,2021-02-04 07:53:14.500000+00:00,0.0,2021-02-04 07:53:14.500000+00:00,2021-02-04 07:53:21.527000+00:00,2021-01-14 00:00:00+00:00,2021-02-04 07:53:21.527000+00:00,2021-02-04 07:53:32.009000+00:00,2021-01-11 00:00:00+00:00
2021-03-02 10:25:08.023000+00:00,2355,2021-03-02 10:25:06.317000+00:00,2021-03-02 10:25:08.023000+00:00,2021-03-02 10:24:39.891000+00:00,diagnosis,2,Covid Diagnosis,,yD5QOGx7b4Y6o4O9u1NqedOFD8y2,,...,0.0,2021-03-02 10:24:45.950000+00:00,2021-03-02 10:24:47.890000+00:00,0.0,2021-03-02 10:24:47.890000+00:00,2021-03-02 10:25:06.843000+00:00,2021-01-14 00:00:00+00:00,2021-03-02 10:25:06.843000+00:00,2021-03-02 10:25:08.023000+00:00,2021-01-11 00:00:00+00:00
2021-02-15 17:39:05.045000+00:00,2392,2021-02-15 17:39:03.642000+00:00,2021-02-15 17:39:05.045000+00:00,2021-02-15 17:38:08.323000+00:00,diagnosis,2,Covid Diagnosis,,z220LGMuCdZL4F9IxkNi277F5Kt2,,...,0.0,2021-02-15 17:38:17.114000+00:00,2021-02-15 17:38:28.128000+00:00,0.0,2021-02-15 17:38:28.128000+00:00,2021-02-15 17:38:47.376000+00:00,2020-10-28 00:00:00+00:00,2021-02-15 17:38:47.376000+00:00,2021-02-15 17:39:05.045000+00:00,2020-10-23 23:00:00+00:00
2021-01-21 22:56:56.350000+00:00,2417,2021-01-21 22:56:53.408000+00:00,2021-01-21 22:56:56.350000+00:00,2021-01-21 22:56:36.790000+00:00,diagnosis,2,Covid Diagnosis,,zzvksuOwiSNtBglkEr8z5nctTz53,,...,0.0,2021-01-21 22:56:40.926000+00:00,2021-01-21 22:56:46.378000+00:00,0.0,2021-01-21 22:56:46.378000+00:00,2021-01-21 22:56:51.237000+00:00,2021-01-21 00:00:00+00:00,2021-01-21 22:56:51.237000+00:00,2021-01-21 22:56:56.350000+00:00,2021-01-15 00:00:00+00:00
