## From Database back into df (querying, etc.)

Querying and EDA (groupbys, etc.)
https://docs.sqlalchemy.org/en/14/tutorial/data.html

In [1]:
import pandas as pd
from sqlalchemy import create_engine, inspect

In [2]:
# Connection to the local SQLite file that holds the VAERS tables.
DB_URL = "sqlite:///vaers.db"
engine = create_engine(DB_URL)
engine

Engine(sqlite:///vaers.db)

In [3]:
# List the tables available in the database.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API works on both versions.
all_tables = inspect(engine).get_table_names()
all_tables

['data', 'symptoms', 'vax']

### SQL statements

https://vaers.hhs.gov/docs/VAERSDataUseGuide_en_September2021.pdf

In [4]:
# Query listing each distinct vax_type value in the vax table, ordered by vax_type.
# NOTE(review): s1 is defined here but not executed in the cells shown.
s1 = '''
SELECT DISTINCT(vax_type) FROM vax
ORDER BY vax_type
'''

In [28]:
# Row-level query: joins vax to data on vaers_id and keeps only rows whose
# vax_type is 'COVID19'. Selected columns: vax_manu, vax_dose_series,
# v_adminby, numdays, age_yrs and vaers_id.
s2 = '''
SELECT v.vax_manu, v.vax_dose_series, d.v_adminby, d.numdays, d.age_yrs, v.vaers_id
FROM vax v 
INNER JOIN data d 
ON v.vaers_id = d.vaers_id
WHERE v.vax_type = 'COVID19';
'''

In [48]:
# Run the s2 query against the database and load the result into a DataFrame.
df1 = pd.read_sql(sql=s2, con=engine)


In [49]:
# Inspect column dtypes and non-null counts of the loaded query result.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 811629 entries, 0 to 811628
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   vax_manu         811629 non-null  object 
 1   vax_dose_series  808680 non-null  object 
 2   v_adminby        811629 non-null  object 
 3   numdays          718031 non-null  float64
 4   age_yrs          735131 non-null  float64
 5   vaers_id         811629 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 37.2+ MB


In [50]:
# Preview the first five rows.
df1.head(n=5)

Unnamed: 0,vax_manu,vax_dose_series,v_adminby,numdays,age_yrs,vaers_id
0,PFIZER\BIONTECH,1,PVT,0.0,56.0,902418
1,PFIZER\BIONTECH,1,PVT,0.0,35.0,902440
2,PFIZER\BIONTECH,1,OTH,0.0,55.0,902446
3,PFIZER\BIONTECH,UNK,PVT,0.0,42.0,902464
4,PFIZER\BIONTECH,1,PUB,0.0,60.0,902465


In [51]:
# Repeat of the earlier info() check: non-null counts per column before nulls are dropped.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 811629 entries, 0 to 811628
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   vax_manu         811629 non-null  object 
 1   vax_dose_series  808680 non-null  object 
 2   v_adminby        811629 non-null  object 
 3   numdays          718031 non-null  float64
 4   age_yrs          735131 non-null  float64
 5   vaers_id         811629 non-null  int64  
dtypes: float64(2), int64(1), object(3)
memory usage: 37.2+ MB


In [52]:
# Drop every row that is missing a value in any of the analysis columns.
df1 = df1.dropna(
    how='any',
    subset=['vax_dose_series', 'v_adminby', 'numdays', 'age_yrs'],
)

In [53]:
# Frequency of each v_adminby code.
df1['v_adminby'].value_counts()

PVT    175303
PHM    152990
UNK    150065
OTH     81239
PUB     76136
WRK     25477
SEN     10960
SCH     10770
MIL      9067
Name: v_adminby, dtype: int64

In [54]:
# Maps each v_adminby code to a readable label.
# The original literal listed 'OTH' twice; the duplicate key is removed
# (the resulting dict is unchanged).
dict_location = {
    'PVT': 'Private',
    'PHM': 'Pharmacy',
    'MIL': 'Military',
    'OTH': 'Other',
    'PUB': 'Public',
    'WRK': 'Workplace Clinic',
    'UNK': 'Unknown',
    'SEN': 'Nursing Home',
    'SCH': 'School',
}

In [55]:
# Swap the v_adminby codes for their readable labels from dict_location.
# Rebinding the result (instead of inplace=True) avoids mutating a frame that
# earlier cells already displayed; values without a mapping are left as they are.
df1 = df1.replace({'v_adminby': dict_location})

In [56]:
# Frequency of each vax_dose_series value before grouping.
df1['vax_dose_series'].value_counts()

1      335737
2      210814
UNK     92444
3       51618
4         782
7+        309
5         194
6         109
Name: vax_dose_series, dtype: int64

In [57]:
def group_multdoses(dose):
    """Collapse the dose labels '5', '6' and '7+' into a single '5+' bucket.

    Any other value is returned unchanged.
    """
    return '5+' if dose in ('5', '6', '7+') else dose

In [58]:
# Re-label the dose series so that '5', '6' and '7+' share one '5+' bucket.
df1['vax_dose_series'] = df1['vax_dose_series'].apply(group_multdoses)

In [60]:
# Frequency of each vax_dose_series value after grouping.
df1['vax_dose_series'].value_counts()

1      335737
2      210814
UNK     92444
3       51618
4         782
5+        612
Name: vax_dose_series, dtype: int64

In [61]:
# Persist the cleaned row-level frame to disk.
df1.to_pickle(path='df1.pkl')

In [62]:
# Preview the first ten rows of the cleaned frame.
df1.head(10)

Unnamed: 0,vax_manu,vax_dose_series,v_adminby,numdays,age_yrs,vaers_id
0,PFIZER\BIONTECH,1,Private,0.0,56.0,902418
1,PFIZER\BIONTECH,1,Private,0.0,35.0,902440
2,PFIZER\BIONTECH,1,Other,0.0,55.0,902446
3,PFIZER\BIONTECH,UNK,Private,0.0,42.0,902464
4,PFIZER\BIONTECH,1,Public,0.0,60.0,902465
5,PFIZER\BIONTECH,1,Private,0.0,59.0,902468
6,PFIZER\BIONTECH,1,Other,0.0,46.0,902479
7,PFIZER\BIONTECH,1,Private,0.0,37.0,902490
8,PFIZER\BIONTECH,1,Private,0.0,41.0,902491
9,PFIZER\BIONTECH,1,Public,0.0,44.0,902492


In [65]:
import numpy as np

pandas sources:
https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
https://proinsias.github.io/til/Pandas-Named-Aggregation/

In [71]:
# Summary statistics per (vax_manu, vax_dose_series, v_adminby) group:
# mean and standard deviation of numdays and age_yrs, plus a row count.
# NOTE(review): 'count' tallies rows, not distinct vaers_id values, whereas the
# SQL draft near the end of this notebook uses COUNT(DISTINCT ...) — confirm
# which definition of "cases" is intended.
summary_df1 = (
    df1
    .groupby(['vax_manu', 'vax_dose_series', 'v_adminby'])
    .agg(
        avg_onset_days=('numdays', 'mean'),
        std_onset_days=('numdays', 'std'),
        avg_age=('age_yrs', 'mean'),
        std_age=('age_yrs', 'std'),
        cases=('vaers_id', 'count'),
    )
    .reset_index()
)


In [75]:
# Persist the grouped summary to disk.
summary_df1.to_pickle(path='summary_df1.pkl')

In [77]:
# Distinct vax_manu values that appear in the summary.
summary_df1['vax_manu'].unique()

array(['JANSSEN', 'MODERNA', 'PFIZER\\BIONTECH', 'UNKNOWN MANUFACTURER'],
      dtype=object)

In [69]:
# TODO: query for all the side effects


#back to sql

## Cleaned dataframe back to SQL as table (for easier processing/visualizing later, also in case steps need to be changed later):
https://datatofish.com/pandas-dataframe-to-sql/

In [None]:
# SQL-side draft of the grouped summary (the original note says to do this in
# pandas instead): per (vax_manu, vax_dose_series, v_adminby) group of COVID19
# rows, the average numdays, the average age_yrs and the number of distinct
# vaers_id values.
# NOTE(review): this rebinds `s2`, replacing the row-level query defined earlier
# in the notebook — re-running the earlier read_sql cell after this one would
# load the aggregated result instead. Consider a distinct name.
# NOTE(review): SQL AVG skips NULLs column by column, while the pandas path
# drops whole rows with any missing analysis field first, so the two averages
# are not expected to match exactly — confirm which is wanted.

s2 = '''
SELECT v.vax_manu, v.vax_dose_series, d.v_adminby, AVG(d.numdays) AS avg_onset_interval, AVG(d.age_yrs) as avg_age, COUNT(DISTINCT(v.vaers_id)) AS cases
FROM vax v 
INNER JOIN data d 
ON v.vaers_id = d.vaers_id
WHERE v.vax_type = 'COVID19'
GROUP BY v.vax_manu, v.vax_dose_series, d.v_adminby
'''

In [None]:
# Run the current s2 query against the database and load the result.
df = pd.read_sql(sql=s2, con=engine)