# WS_ch02B.ipynb
# WESmith 11/03/22
## WS created this notebook to follow along chap 2 code
### Each recipe will have its own notebook, suffixed by A, B, etc.

#### data at https://vaers.hhs.gov/data/datasets.html
#### downloaded 2021VAERSData.zip

# Dealing with the pitfalls of joining pandas DataFrames

In [None]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import os

In [None]:
# Directory holding the VAERS files. Set the VAERS_DATA_DIR environment variable
# to point somewhere else; when it is unset the original hard-coded path is used.
data_dir     = os.environ.get('VAERS_DATA_DIR',
                              '/home/smithw/Downloads/bioinformatics/ch02_data')
# Names of the raw VAERS files expected inside data_dir
data_file    = '2021VAERSDATA.csv.gz'
vax_file     = '2021VAERSVAX.csv.gz'
symptoms_file = '2021VAERSSYMPTOMS.csv.gz'
symtoms_file = symptoms_file  # misspelled original name, kept so existing uses still work
# Text encoding passed to pd.read_csv when the raw files are read
encoding     = 'iso-8859-1'
# Names of the randomly sampled subsets that get written into data_dir
sampled_data = "vdata_sample.csv.gz"
sampled_vax  = "vax_sample.csv.gz"

In [None]:
# create randomly sampled data: turn off these lines after sampled data created
vdata = pd.read_csv(os.path.join(data_dir, data_file), encoding=encoding, low_memory=False)

In [None]:
vdata.sample(frac=0.9).to_csv(os.path.join(data_dir, sampled_data), index=False)

In [None]:
vax = pd.read_csv(os.path.join(data_dir, vax_file), encoding=encoding)

In [None]:
vax.sample(frac=0.9).to_csv(os.path.join(data_dir, sampled_vax), index=False)

## 1. INNER JOIN

In [None]:
# get reduced datasets
vdata = pd.read_csv(os.path.join(data_dir, sampled_data), low_memory=False)

In [None]:
# WS look at duplicated VAERS_ID
len(vdata[vdata['VAERS_ID'].duplicated()])  # no patient records duplicated

In [None]:
vdata.columns

In [None]:
vax = pd.read_csv(os.path.join(data_dir, sampled_vax))

In [None]:
# WS look at duplicated VAERS_ID
dd = vax[vax['VAERS_ID'].duplicated()]
len(dd)  # lots of dupes

In [None]:
dd['VAERS_ID'].value_counts()

In [None]:
vax[vax['VAERS_ID']==962303]  # WS the most-duplicated VAERS_ID
# same patient (962303) has different info about the VAX_LOT, VAX_SITE, etc; 11 differences

In [None]:
vax.shape

In [None]:
vax.columns

In [None]:
# WS join needs to  match on an index, hence setting column to index for vax df
vdata_with_vax = vdata.join(vax.set_index("VAERS_ID"), on='VAERS_ID', how="inner")

In [None]:
vdata_with_vax.describe()

In [None]:
vdata_with_vax.info()

In [None]:
# WS equiv with merge(): merging on columns 'VAERS_ID' in both dfs; default left_indes, right_index = False
vdata_with_vax_merge = pd.merge(vdata, vax, how='inner', on='VAERS_ID')

In [None]:
vdata_with_vax_merge.describe()

In [None]:
vdata_with_vax_merge.info()

In [None]:
# WS was getting disagreement because of the index apparently
vdata_with_vax.reset_index(drop=True).compare(vdata_with_vax_merge.reset_index(drop=True))  # WS

In [None]:
vdata_with_vax.reset_index(drop=True).equals(vdata_with_vax_merge.reset_index(drop=True))  # WS

In [None]:
len(vdata), len(vax), len(vdata_with_vax)

In [None]:
vdata_with_vax.columns

In [None]:
vdata_with_vax.index

## 2. FIND LOST DATA AFTER THE JOIN

In [None]:
vdata.head(1)

In [None]:
lost_vdata_1 = vdata.loc[~vdata.index.isin(vdata_with_vax.index)] # WS one way to do it
lost_vdata   = vdata[~vdata['VAERS_ID'].isin(vdata_with_vax['VAERS_ID'])]  # ws another way to do it
lost_vax     = vax[~vax["VAERS_ID"].isin(vdata_with_vax["VAERS_ID"])]

In [None]:
lost_vdata_1.reset_index(drop=True).compare(lost_vdata.reset_index(drop=True))

In [None]:
lost_vdata_1.reset_index(drop=True).equals(lost_vdata.reset_index(drop=True))

In [None]:
len(lost_vdata), len(lost_vax)

## 3. LEFT OUTER JOIN (THE DEFAULT)

In [None]:
vdata_with_vax_left = vdata.join(vax.set_index("VAERS_ID"), on="VAERS_ID")

In [None]:
# WS merge equivalent
vdata_with_vax_left_merge = pd.merge(vdata, vax, how='left', on='VAERS_ID')

In [None]:
vdata_with_vax_left.reset_index(drop=True).equals(vdata_with_vax_left_merge.reset_index(drop=True))

In [None]:
vdata_with_vax_left.columns  # VAERS_ID is still a column, not an index

In [None]:
len(vdata_with_vax_left)

In [None]:
# KEY POINT with this left-outer join, the VAERS_ID can be repeated: not desirable
vdata_with_vax_left.groupby("VAERS_ID").size().sort_values()

In [None]:
vdata_with_vax_left.loc[vdata_with_vax_left['VAERS_ID']==962303]  # WS to see duplicates
# rows are identical for patient info, but vax info is DIFFERENT for same patient, in 12 different ways

In [None]:
len(vdata_with_vax_left) - len(vdata)  # WS number of duplicated VAERS_ID

In [None]:
# WS another way
len(vdata_with_vax_left[vdata_with_vax_left['VAERS_ID'].duplicated()])

## 4. RIGHT JOIN

In [None]:
vdata['DIED'].unique()  # WS

In [None]:
dead = vdata[vdata['DIED'] == 'Y']

In [None]:
dead['DIED'].unique()  # WS

In [None]:
vax19 = vax[vax['VAX_TYPE'] == 'COVID19']

In [None]:
len(vax), len(vax19)  # WS

In [None]:
vax19_dead = vax19.join(dead.set_index("VAERS_ID"), on="VAERS_ID", how='right')

In [None]:
# WS merge equivalent
vax19_dead_merge = pd.merge(vax19, dead, on='VAERS_ID', how='right')

In [None]:
vax19_dead.reset_index(drop=True).equals(vax19_dead_merge.reset_index(drop=True))

In [None]:
dd = vax19_dead[vax19_dead['VAERS_ID'].duplicated()]
len(dd)

In [None]:
len(vax19_dead) - len(dead)

In [None]:
dd['VAERS_ID'].value_counts()  # WS  a '1' presumably means duplicated once

In [None]:
# WS look at id with most VAERS_ID dupes: 9 rows since original and 8 dupes
vax19_dead.loc[vax19_dead['VAERS_ID']==1215401]  # look at id with most VAERS_ID dupes: 9 rows

## 5. REVISIT COVID-LOT CALCS SINCE WE MAY BE OVERCOUNTING

In [None]:
# WS this doesn't change anything, unless some were lowercase
vax19_dead['STATE'] = vax19_dead['STATE'].str.upper()

In [None]:
vax19_dead.shape

In [None]:
# WS make a double index: to remove duplicates of VAERS_ID and VAX_LOT paired together
# also limit result to just the columns we're using
dead_lot = vax19_dead[['VAERS_ID', 'VAX_LOT', 'STATE']].set_index(['VAERS_ID', 'VAX_LOT'])

In [None]:
dead_lot.shape

In [None]:
# WS encapsulate index value as a list since double index: 8 dupes plus orig = 9 rows
dead_lot.loc[[1215401]]

In [None]:
~dead_lot.loc[[1215401]].index.duplicated() # note that NaN shows unique double index

In [None]:
# WS remove repeated indexes
dead_lot_clean = dead_lot[~dead_lot.index.duplicated()]

In [None]:
len(dead_lot) - len(dead_lot_clean)

In [None]:
dead_lot_clean = dead_lot_clean.reset_index()

In [None]:
dead_lot_clean[dead_lot_clean['VAERS_ID'].isna()]  # WS this should be empty

In [None]:
# WS since VAX_TYPE is NaN (originally part of double index), see how many VAERS_ID dupes remain now
ee = dead_lot_clean[dead_lot_clean['VAERS_ID'].duplicated()]
len(ee)

In [None]:
ee['VAERS_ID'].value_counts()

In [None]:
# still dupes because of different VAX_LOTs: still seems to over-represent the patients
dead_lot_clean[dead_lot_clean['VAERS_ID']==1874377]

In [None]:
dead_lot_clean.head()

In [None]:
baddies = dead_lot_clean.groupby('VAX_LOT').size().sort_values(ascending=False)

In [None]:
baddies.head()

In [None]:
for i, (lot, cnt) in enumerate(baddies.items()):
    print(lot, cnt, len(dead_lot_clean[dead_lot_clean['VAX_LOT'] == lot].groupby("STATE")))
    if i == 10:
        break

In [None]:
# SUMMARY: THESE NUMBERS ARE REDUCED FROM THOSE IN WS_CH02A.IPYNB, BECAUSE OF REMOVING DUPES
# NOTE: THE DATA MAY STILL HAVE OTHER DUPES NOT HANDLED YET