# WS_ch02A.ipynb
# WESmith 11/03/22
## WS created this notebook to follow along chap 2 code from book 
# 'Bioinformatics with Python Cookbook' by Tiago Antao
### Each recipe will have its own notebook, suffixed by A, B, etc.

#### data at https://vaers.hhs.gov/data/datasets.html
#### downloaded 2021VAERSData.zip

In [None]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import os

# USING PANDAS TO PROCESS VACCINE-ADVERSE EVENTS

In [None]:
# Locations of the 2021 VAERS extracts, relative to the notebook's working directory.
data_dir      = 'data/ch02_data'
data_file     = '2021VAERSDATA.csv.gz'
vax_file      = '2021VAERSVAX.csv.gz'
symptoms_file = '2021VAERSSYMPTOMS.csv.gz'
# Backward-compatible alias for the original, misspelled name.
symtoms_file  = symptoms_file
# Encoding handed to pd.read_csv() when these files are loaded.
encoding      = 'iso-8859-1'

## GET THE PATIENT DATA

In [None]:
# Read the patient-level table; low_memory=False has pandas parse the file in one pass
# instead of in chunks.
patient_csv = os.path.join(data_dir, data_file)
vdata = pd.read_csv(patient_csv, encoding=encoding, low_memory=False)

In [None]:
# Column names, dtypes, non-null counts and memory usage
vdata.info()

In [None]:
# Column labels only
vdata.columns

In [None]:
# dtype of every column
vdata.dtypes

In [None]:
# (number of rows, number of columns)
vdata.shape

In [None]:
# First row by position, returned as a Series
vdata.iloc[0]

In [None]:
# Re-index by the report id so rows can be looked up with .loc.
# NOTE: this cell rebinds vdata; re-running it raises KeyError because VAERS_ID is
# no longer a column after the first run.
vdata = vdata.set_index("VAERS_ID")

In [None]:
# Label-based lookup of a single report through the new index
vdata.loc[916600]

In [None]:
# First three rows
vdata.head(3)

In [None]:
vdata.iloc[:3]  # WS same as above, using numpy slicing format (ie, rows 0,1,2 included)

In [None]:
vdata.iloc[:5, 2:4]  # WS cols 2,3 included; numpy slicing for iloc; VAERS_ID is no longer a column

In [None]:
# Largest reported age (bracket-style column access)
vdata['AGE_YRS'].max()

In [None]:
# Same value using attribute-style column access
vdata.AGE_YRS.max()

In [None]:
vdata['AGE_YRS'].sort_values().head()  # sorting by age keeps each row's VAERS_ID label, so the index is no longer in order

In [None]:
# WS ages in ascending order plotted against rank position (use_index=False); grid and figsize added
ages_sorted = vdata['AGE_YRS'].sort_values()
ages_sorted.plot(use_index=False, grid=True, figsize=[10,5])
plt.show()

In [None]:
# Distribution of reported ages in 40 bins
age_col = vdata['AGE_YRS']
age_col.plot(kind='hist', bins=40, grid=True, figsize=[10,5])
plt.show()

In [None]:
# Side-by-side views of age: sorted values (left) and a horizontal histogram (right).
# sharey=True makes both panels use the same age axis.
fig, ax = plt.subplots(1, 2, sharey=True, figsize=[10,5])
fig.suptitle("Age of adverse events")
vdata["AGE_YRS"].sort_values().plot(use_index=False, ax=ax[0], xlabel="Observation", ylabel="Age", grid=True)
# Target the right-hand panel explicitly instead of relying on matplotlib's implicit
# "current axes".
vdata["AGE_YRS"].plot.hist(ax=ax[1], bins=40, orientation='horizontal', grid=True)
plt.show()

In [None]:
# Truncate ages to whole years, then count reports per age.
# astype(int) is the vectorized equivalent of calling int() on each value.
vdata['AGE_YRS'].dropna().astype(int).value_counts()

In [None]:
vdata['AGE_YRS'].value_counts()  # WS most ages have no fractional part
# WS in the int-truncated counts of the previous cell, fractional ages below 1 may all collapse
# to 0, so 0 has more than 1 instance and does not show in the tail of that output

In [None]:
# Counts of each DIED value, with missing values reported as their own NaN row
vdata['DIED'].value_counts(dropna=False)

In [None]:
# Same counts with missing values excluded
vdata['DIED'].value_counts(dropna=True)

In [None]:
vdata['is_dead'] = (vdata['DIED'] == 'Y')

In [None]:
vdata['is_dead'].value_counts()

In [None]:
dead = vdata[vdata['is_dead']]
dead.head(3)

## GET THE VACCINE DATA

In [None]:
# Load the per-report vaccine table and index it by report id, like vdata
vax_csv = os.path.join(data_dir, vax_file)
vax = pd.read_csv(vax_csv, encoding=encoding)
vax = vax.set_index('VAERS_ID')

In [None]:
# (number of rows, number of columns) of the vaccine table
vax.shape

In [None]:
# Column labels of the vaccine table
vax.columns

In [None]:
vax['VAX_TYPE'].describe()  # WS 69 vax types

In [None]:
vax['VAX_TYPE'].unique()   # WS and here they are

In [None]:
vt = vax.groupby('VAX_TYPE').size().sort_values()  # WS to see the frequency of vaccines
vt # a series object

In [None]:
vt.reset_index()  # turn series object into a dataframe object

In [None]:
# Restrict the vaccine table to rows whose VAX_TYPE is 'COVID19'
is_covid19 = vax['VAX_TYPE'] == 'COVID19'
vax19 = vax[is_covid19]

In [None]:
# Column labels of the COVID19-only vaccine table
vax19.columns

In [None]:
vax19['VAX_MANU'].unique()  # WS manufacturers appearing in the COVID19 rows

In [None]:
# Preview both sides of the upcoming join; both frames are indexed by VAERS_ID
dead.head(3)

In [None]:
vax19.head(3)

## JOINING DATAFRAMES

In [None]:
vax19_dead = dead.join(vax19) # WS join on the indexex (VAERS_ID for both dataframes): left join by default

In [None]:
vax19_dead.info()

In [None]:
# WS equivalent with merge(): explicitly set index matching for left and right
vax19_dead_merge = pd.merge(dead, vax19, how='left', left_index=True, right_index=True)

In [None]:
vax19_dead_merge.info()

In [None]:
vax19_dead.compare(vax19_dead_merge)

In [None]:
vax19_dead.equals(vax19_dead_merge)

In [None]:
vax19_dead.columns

In [None]:
baddies = vax19_dead.groupby('VAX_LOT').size().sort_values(ascending=False)
baddies.reset_index().head(10)  # WS show as dataframe with reset_index()

In [None]:
vax19_dead[vax19_dead['VAX_LOT']=='ER8727'].head(50)

In [None]:
type(baddies)

In [None]:
# For the 11 most frequent lots print: lot id, row count, and the number of STATE groups.
top_lots = baddies.iloc[:11]
if len(top_lots) > 0:
    print('LOT, NUM DEATHS  NUM STATES WITH THAT LOT')  # WS header, printed once
for lot, cnt in top_lots.items():
    lot_rows = vax19_dead[vax19_dead['VAX_LOT'] == lot]
    print(lot, cnt, lot_rows.groupby("STATE").ngroups)

In [None]:
# WS isolate groupby
# Label rows whose STATE is NaN as 'state_missing' so they form their own group and are
# counted. Only the STATE column is filled: a frame-wide fillna() would also overwrite
# the NaNs of every other column with this string.
gg = vax19_dead[vax19_dead['VAX_LOT']=='EN6201'].fillna({'STATE': 'state_missing'}).groupby("STATE")
len(gg), gg.size().sum(), type(gg)
# length of gg (number of states) may be 1 more than result derived from baddies, since some lots
# show NaN for the state, and this is counted as 1 state

In [None]:
gg.size().sort_values(ascending=False)

In [None]:
baddies

In [None]:
# WS It doesn't seem possible to turn a groupby object into a series or dataframe object
# directly. The groupby object appears to be iterable (but it is NOT an iterator according to 
# online discussion: ie, next() doesn't work on it). It is necessary to do an operation on the 
# group (eg, .size(), len(), ...) to get a series object out.
len(gg), gg.size().sort_values(ascending=False).sum()

## GROUPBY ANALYSIS (WS ADDED THIS)
### a good source of info:
### https://towardsdatascience.com/all-pandas-groupby-you-should-know-for-grouping-data-and-performing-operations-2a8ec1327b5

In [None]:
# WS make a subset of data
vax19_dead.columns

In [None]:
# WS make a subset of data to simplify
vax19_dead_subset = vax19_dead.loc[:, ['STATE', 'VAX_TYPE', 'VAX_MANU', 'VAX_LOT', 'VAX_NAME']]
vax19_dead_subset

In [None]:
aa = vax19_dead.groupby('VAX_MANU').size().sort_values(ascending=False).reset_index()
aa

In [None]:
# see how many manufacturers are in all patients
vax19_all = vdata.join(vax19)  # WS this is all patients, alive or dead, that took covid19 vaccine

In [None]:
bb = vax19_all.groupby('VAX_MANU').size().sort_values(ascending=False).reset_index()
bb

In [None]:
mm = pd.merge(aa, bb, on='VAX_MANU')
mm.rename(columns={'0_x': 'died', '0_y': 'total'}, inplace=True)
mm

In [None]:
mm['%'] = mm.apply(lambda x: 100 * x['died']/x['total'], axis=1)
mm

In [None]:
vax19_dead.groupby('NUMDAYS').size().sort_values(ascending=False).plot.hist(bins=400, 
                                                                            figsize=[15,5], 
                                                                            grid=True,
                                                                            )
plt.show()

In [None]:
vax19_dead.groupby('NUMDAYS').size().sort_values(ascending=False).plot.hist(bins=100, 
                                                                            figsize=[15,5], 
                                                                            grid=True,
                                                                            range=[0,50])
plt.show()

In [None]:
vax19_dead.groupby('STATE').size().sort_values(ascending=False).reset_index()

In [None]:
aa = vax19_dead.groupby('STATE').size() #.plot(x_compat=True, figsize=[20,5], grid=True,)
# plot with matplotlib directly to get all of the states to show
fig = plt.figure(figsize=[20,5])
plt.plot(aa)
plt.grid(True)

In [None]:
# WS demonstration that a groupby object is an iterable (it is NOT an iterator: next() doesn't work)
jj = [(state, dframe) for (state, dframe) in gg]

In [None]:
jj[0][0]  # the state

In [None]:
jj[0][1]  # the dataframe for this state

In [None]:
# WS can do the above groupby a different way: the type is different, but the length is the same
dd = vax19_dead[vax19_dead['VAX_LOT']=='EN6201']['STATE'].value_counts()
len(dd), type(dd)

In [None]:
dd

In [None]:
# who made lot EN6201?  several manufacturers
vax[vax['VAX_LOT']=='EN6201']['VAX_MANU'].unique()

In [None]:
# who made lot EN6200?  several manufacturers
vax[vax['VAX_LOT']=='EN6200']['VAX_MANU'].unique()

In [None]:
# who made lot EL0140?  several manufacturers
vax[vax['VAX_LOT']=='EL0140']['VAX_MANU'].unique()