# WS_ch02B.ipynb
# WESmith 11/03/22
## WS created this notebook to follow along with the chapter 2 code
### Each recipe will have its own notebook, suffixed by A, B, etc.

#### data at https://vaers.hhs.gov/data/datasets.html
#### downloaded 2021VAERSData.zip

# Dealing with the pitfalls of joining pandas DataFrames

In [None]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import os

In [None]:
# Directory that holds every input file named below.
data_dir = '../data'

# File names of the full 2021 VAERS tables and the text encoding name kept
# alongside them (none of these are read by the cells visible in this notebook).
data_file = '2021VAERSDATA.csv.gz'
vax_file = '2021VAERSVAX.csv.gz'
symptoms_file = '2021VAERSSYMPTOMS.csv.gz'
symtoms_file = symptoms_file  # original misspelled name, kept so existing references keep working
encoding = 'iso-8859-1'

# File names of the reduced samples that the cells below actually load.
sampled_data = "vdata_sample.csv.gz"
sampled_vax = "vax_sample.csv.gz"

## 1. INNER JOIN

In [None]:
# Load the reduced VAERS data sample.
# low_memory=False asks pandas to parse the file in one pass rather than in
# chunks, which avoids mixed-type inference within a column.
vdata_path = os.path.join(data_dir, sampled_data)
vdata = pd.read_csv(vdata_path, low_memory=False)

In [None]:
# Show the column labels of the sampled data table.
vdata.columns

In [None]:
# Load the reduced vaccine sample with pandas' default parsing options.
vax_path = os.path.join(data_dir, sampled_vax)
vax = pd.read_csv(vax_path)

In [None]:
# Show the column labels of the sampled vaccine table.
vax.columns

In [None]:
# Inner join: match vdata's VAERS_ID column against vax re-indexed by VAERS_ID,
# keeping only the IDs that are present on both sides.
vax_by_id = vax.set_index("VAERS_ID")
vdata_with_vax = vdata.join(vax_by_id, how="inner", on="VAERS_ID")

In [None]:
# Row counts of the two inputs and of the inner-join result, in that order.
tuple(len(frame) for frame in (vdata, vax, vdata_with_vax))

In [None]:
# Show the column labels of the inner-join result.
vdata_with_vax.columns

In [None]:
# Show the row labels of the inner-join result.
vdata_with_vax.index

## 2. FIND LOST DATA AFTER THE JOIN

In [None]:
# Rows of each input that did not make it into the inner-join result.
# vdata side: compared by row label, so this relies on the joined frame
# carrying vdata's row labels (TODO confirm for the pandas version in use).
kept_labels = vdata_with_vax.index
lost_vdata = vdata[~vdata.index.isin(kept_labels)]
# vax side: compared by VAERS_ID value.
kept_ids = vdata_with_vax["VAERS_ID"]
lost_vax = vax.loc[~vax["VAERS_ID"].isin(kept_ids)]

In [None]:
# Number of rows lost from vdata and from vax, in that order.
tuple(map(len, (lost_vdata, lost_vax)))

## 3. LEFT OUTER JOIN (THE DEFAULT)

In [None]:
# Same join without `how`: DataFrame.join then performs a left join, so every
# vdata row is kept whether or not vax has a matching VAERS_ID.
vax_indexed = vax.set_index("VAERS_ID")
vdata_with_vax_left = vdata.join(vax_indexed, on="VAERS_ID")

In [None]:
# Column labels of the left-join result.
vdata_with_vax_left.columns  # VAERS_ID is still a column, not an index

In [None]:
# Row count of the left-join result.
vdata_with_vax_left.shape[0]

In [None]:
# After the left join a VAERS_ID can occupy several rows, which is not desirable.
# Count the rows per ID and list them from least to most repeated.
rows_per_id = vdata_with_vax_left.groupby("VAERS_ID").size()
rows_per_id.sort_values()

In [None]:
# WS: pull every joined row for one VAERS_ID to look at its duplicates.
# WS observation: the entire rows are identical, even the index.
is_sample_id = vdata_with_vax_left['VAERS_ID'].eq(962303)
vdata_with_vax_left[is_sample_id]

In [None]:
# WS: alternative check. Flag every occurrence of a VAERS_ID after its first
# one, keep just those rows, and count them.
repeat_mask = vdata_with_vax_left['VAERS_ID'].duplicated(keep='first')
ee = vdata_with_vax_left.loc[repeat_mask]
len(ee)

In [None]:
# WS: how many more rows the left join has than vdata.
vdata_with_vax_left.shape[0] - vdata.shape[0]

## 4. RIGHT OUTER JOIN

In [None]:
# Distinct values present in the DIED column before filtering.
vdata['DIED'].unique()  # WS

In [None]:
# Keep only the reports whose DIED column equals 'Y'.
died_flag = vdata['DIED'].eq('Y')
dead = vdata.loc[died_flag]

In [None]:
# Sanity check: list the DIED values that remain after the filter.
dead['DIED'].unique()  # WS

In [None]:
# Keep only the vaccine rows whose VAX_TYPE equals 'COVID19'.
is_covid19 = vax['VAX_TYPE'].eq('COVID19')
vax19 = vax.loc[is_covid19]

In [None]:
# WS: row counts of all vaccine rows and of the COVID19 subset.
vax.shape[0], vax19.shape[0]

In [None]:
# Right join: every row of `dead` is kept, matched on VAERS_ID against the
# COVID19 vaccine rows.
dead_by_id = dead.set_index("VAERS_ID")
vax19_dead = vax19.join(dead_by_id, how='right', on="VAERS_ID")

In [None]:
# Rows of the right-join result whose VAERS_ID has already appeared earlier
# in the frame, and how many such rows there are.
seen_before = vax19_dead['VAERS_ID'].duplicated(keep='first')
dd = vax19_dead.loc[seen_before]
len(dd)

In [None]:
# How many more rows the right join has than `dead`.
vax19_dead.shape[0] - dead.shape[0]

In [None]:
# dd holds only the repeat occurrences of each VAERS_ID (duplicated() does not
# flag the first occurrence), so a count of 1 here means the ID appears twice
# in vax19_dead.
dd['VAERS_ID'].value_counts()  # WS  a '1' presumably means duplicated once

## 5. REVISIT COVID-LOT CALCS SINCE WE MAY BE OVERCOUNTING

In [None]:
# Upper-case the STATE column so the same state is not counted under two spellings.
# WS: this appears to change nothing unless some entries were lower case.
state_upper = vax19_dead['STATE'].str.upper()
vax19_dead['STATE'] = state_upper

In [None]:
# WS: index on the (VAERS_ID, VAX_LOT) pair so repeated pairs can be removed,
# and keep only the columns that are used afterwards.
pair_key = ['VAERS_ID', 'VAX_LOT']
dead_lot = vax19_dead[pair_key + ['STATE']].set_index(pair_key)

In [None]:
# WS: drop every row whose (VAERS_ID, VAX_LOT) index pair already occurred earlier.
repeated_pair = dead_lot.index.duplicated(keep='first')
dead_lot_clean = dead_lot.loc[~repeated_pair]

In [None]:
# Number of rows removed as repeated (VAERS_ID, VAX_LOT) pairs.
dead_lot.shape[0] - dead_lot_clean.shape[0]

In [None]:
# Turn the index levels back into ordinary columns.
# Note: this rebinds dead_lot_clean, so run the cell once per rebuild of the frame.
dead_lot_clean = dead_lot_clean.reset_index(drop=False)

In [None]:
# WS: rows with a missing VAERS_ID; this selection should be empty.
id_missing = dead_lot_clean['VAERS_ID'].isna()
dead_lot_clean.loc[id_missing]

In [None]:
# Preview the first five rows of the de-duplicated frame.
dead_lot_clean.head(5)

In [None]:
# Number of de-duplicated rows per VAX_LOT, largest count first.
rows_per_lot = dead_lot_clean.groupby('VAX_LOT').size()
baddies = rows_per_lot.sort_values(ascending=False)

In [None]:
# Preview the five lots with the highest counts.
baddies.head(5)

In [None]:
# Print the first 11 lots (positions 0 through 10): lot label, its row count,
# and the number of STATE groups found among that lot's rows.
for lot, cnt in baddies.iloc[:11].items():
    lot_rows = dead_lot_clean[dead_lot_clean['VAX_LOT'] == lot]
    print(lot, cnt, len(lot_rows.groupby("STATE")))