# REPORT

To transfer the file from mimir, use scp

```bash
scp mnz2108@mimir.dbmi.columbia.edu:/home/mnz2108/nsides/data/PATIENT.csv.xz data/meta_unformatted/
```

## Table schema

```mysql
CREATE TABLE REPORT (
    report_id int,
    report_year int,
    person_age int,
    person_sex char(1)
)
```

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse

## Load report IDs that were used in our analysis

In [2]:
# Report IDs that define which reports are part of this analysis
report_id_vector = np.load('../../data/meta_formatted/report_id_vector.npy')

# The same IDs as a one-column frame, used as the left side of the merge below
report_id_df = pd.DataFrame({'report_id': report_id_vector})

# Display the raw vector as a quick sanity check
report_id_vector

array([  4572294,   4440060,   4456349, ...,  87896661, 101667231,
       100823751])

## Load maps from report IDs to demographic information

These maps were provided by Rami

In [3]:
def _load_wrapped_object(path):
    """Load a ``.npy`` file and unwrap the single object stored in it.

    Returns a pair: the array exactly as ``np.load`` returned it, and the
    object obtained by calling ``.item()`` on that array (presumably a dict
    keyed by report ID -- confirm with the data provider).
    """
    # NOTE(review): allow_pickle=True unpickles arbitrary Python objects, which
    # can execute code. Only load files from a trusted source this way.
    wrapped = np.load(path, allow_pickle=True)
    return wrapped, wrapped.item()


# report_id -> age(s) (from Rami)
ages, report_id_to_age = _load_wrapped_object(
    '../../data/meta_unformatted/all_ages.npy')

# report_id -> report year (from Rami)
years, report_id_to_year = _load_wrapped_object(
    '../../data/meta_unformatted/all_years.npy')

# report_id -> sex code(s) (from Rami)
sexes, report_id_to_sex = _load_wrapped_object(
    '../../data/meta_unformatted/all_sexes.npy')

## Load tables from the `effect_faers` database

In [4]:
# Ages table from the `effect_faers` database (xz-compressed CSV)
faers_ages_path = '../../data/meta_unformatted/faers_ages.csv.xz'
age_df = pd.read_csv(faers_ages_path)

# Preview the first two rows
age_df.head(n=2)

Unnamed: 0,primaryid,age,age_code,report_year
0,34483284,0.0,,2012
1,35676563,86.62,YR,2012


In [5]:
# Demographics table from the `effect_faers` database (xz-compressed CSV).
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk. The warning in this cell's saved output appears
# to be the mixed-type DtypeWarning that chunked inference produces.
demo_df = pd.read_csv(
    '../../data/meta_unformatted/faers_demographics.csv.xz',
    low_memory=False,
)

demo_df.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,isr_report_id,case_id,init_followup_code,foll_seq_id,image_id,event_date,manfr_informed_date,fda_informed_date,report_code,manfr_id,...,e_submit,weight,weight_code,report_date,reporter_occp_code,death_date,notified_manfr,confid,reporter_country,duplicate
0,4204616.0,5657190.0,I,,4204616-7,2003-08-15,2003-09-18,2003-10-06,EXP,DL2003174,...,N,0.0,,2003-10-02,MD,,,,,
1,,0.0,,,,,,,,,...,,,,,,,,,,


## Merge `report_id_vector` with the `faers_demographics` table

In [6]:
# Demographics columns that are not used later in this notebook
unused_demo_columns = [
    'init_followup_code', 'foll_seq_id', 'image_id',
    'manfr_informed_date', 'report_code', 'manfr_id', 'manfr_sendreport',
    'reporter_occp_code', 'death_date', 'notified_manfr', 'confid',
    'reporter_country', 'duplicate', 'e_submit', 'weight', 'weight_code',
]

# Left join: start from the analysed report IDs and attach their demographics
merged = pd.merge(
    report_id_df,
    demo_df,
    how='left',
    left_on='report_id',
    right_on='isr_report_id',
).drop(columns=unused_demo_columns)

# After a left join, a null `isr_report_id` marks a report_id with no match in
# the demographics table; require that there are none.
assert merged['isr_report_id'].notnull().all()

merged.head(2)

Unnamed: 0,report_id,isr_report_id,case_id,event_date,fda_informed_date,age,age_code,gender_code,report_date
0,4572294,4572294.0,5731006.0,2004-12-09,2005-02-03,76.0,YR,F,2005-02-02
1,4440060,4440060.0,4188710.0,2004-05-25,2004-09-02,78.0,YR,M,2004-09-02


## Merge with data vectors 

The age and year vectors merged here come from Rami's work (the maps loaded above).

In [7]:
# Attach the year and age from Rami's report_id maps to each report.
#
# Each entry of `report_id_to_age` is a sequence: keep its first element, or
# NaN when the sequence is empty. This is done with a Series-level apply on the
# mapped values instead of a row-wise DataFrame.apply, which avoids building a
# row object per report and needs no temporary helper columns.
merged = (
    merged
    .assign(
        vector_year=lambda df: df['report_id'].map(report_id_to_year),
        cleaned_vector_age=lambda df: (
            df['report_id']
            .map(report_id_to_age)
            .apply(lambda ages: ages[0] if len(ages) else np.nan)
        ),
    )
    # These join keys are no longer needed. errors='ignore' keeps the cell
    # re-runnable once they have already been dropped from `merged`.
    .drop(columns=['isr_report_id', 'case_id'], errors='ignore')
)

merged.head(2)

Unnamed: 0,report_id,event_date,fda_informed_date,age,age_code,gender_code,report_date,vector_year,cleaned_vector_age
0,4572294,2004-12-09,2005-02-03,76.0,YR,F,2005-02-02,2005,76.0
1,4440060,2004-05-25,2004-09-02,78.0,YR,M,2004-09-02,2004,78.0


## Clean patient sex field

In [8]:
# Collapse the raw per-report sex entries (strings or short lists of strings)
# into a single code: 'M', 'F', or 'U' (unknown). Steps run in order:
#   1. empty list                         -> 'U'
#   2. single-element list                -> that element
#   3. two identical elements             -> that element
#   4. unknown markers / conflicting M+F  -> 'U'
#   5. one informative value plus a blank or 'NS' -> the informative value
cleaned_vector_sex = (
    merged['report_id']
    .map(report_id_to_sex)
    .apply(lambda x: 'U' if isinstance(x, list) and len(x) == 0 else x)
    .apply(lambda x: x[0] if isinstance(x, list) and len(x) == 1 else x)
    .apply(lambda x: x[0] if isinstance(x, list) and len(x) == 2 and x[0] == x[1] else x)
    .apply(lambda x: 'U' if x == 'NS' or x == 'UNK' or x == ''
           or x == ['NS', ''] or x == ['M', 'F'] or x == ['F', 'M']
           or x == ['', 'NS'] or x == 'YR' else x)
    .apply(lambda x: 'M' if x == ['', 'M'] or x == ['NS', 'M'] else x)
    .apply(lambda x: 'F' if x == ['', 'F'] else x)
)

# Ensure that no report_ids map to anything other than 'M', 'F', or 'U'.
# The isinstance guard matters: a list left over from an unhandled pattern is
# unhashable, so a bare `x in {...}` would raise TypeError instead of failing
# the assertion with a useful message.
is_valid_sex = cleaned_vector_sex.apply(
    lambda x: isinstance(x, str) and x in {'M', 'F', 'U'})
assert is_valid_sex.all(), (
    'Unhandled sex values after cleaning: {}'.format(
        cleaned_vector_sex[~is_valid_sex].map(repr).unique()[:10].tolist())
)

merged['cleaned_vector_sex'] = cleaned_vector_sex

merged.head(2)

Unnamed: 0,report_id,event_date,fda_informed_date,age,age_code,gender_code,report_date,vector_year,cleaned_vector_age,cleaned_vector_sex
0,4572294,2004-12-09,2005-02-03,76.0,YR,F,2005-02-02,2005,76.0,F
1,4440060,2004-05-25,2004-09-02,78.0,YR,M,2004-09-02,2004,78.0,M


## Investigate errors in the data

In [9]:
# Find reports whose database age (`age`, in units of `age_code`) cannot be
# reconciled with the vector age, even after converting months, weeks, or days
# to whole years.
errors_df = (
    merged
    # Both ages present, database age non-zero, and the two values differ
    .loc[
        lambda df: (
            df['age'].notnull()
            & df['cleaned_vector_age'].notnull()
            & (df['age'] != 0)
            & (df['cleaned_vector_age'] != df['age'])
        ),
        ['report_id', 'age', 'age_code', 'cleaned_vector_age'],
    ]
    # Compare on whole numbers only
    .assign(
        age=lambda df: df['age'].astype(int),
        cleaned_vector_age=lambda df: df['cleaned_vector_age'].astype(int),
    )
    .loc[lambda df: df['age'] != df['cleaned_vector_age']]
    # Database age re-expressed in whole years, for each possible unit
    .assign(
        age_mon=lambda df: df['age'].div(12).apply(int),
        age_wk=lambda df: df['age'].div(52).apply(int),
        age_dy=lambda df: df['age'].div(365).apply(int),
    )
    # Keep rows where the conversion matching `age_code` still disagrees
    .loc[
        lambda df: (
            ((df['age_code'] == 'MON') & (df['age_mon'] != df['cleaned_vector_age']))
            | ((df['age_code'] == 'WK') & (df['age_wk'] != df['cleaned_vector_age']))
            | ((df['age_code'] == 'DY') & (df['age_dy'] != df['cleaned_vector_age']))
        )
    ]
)

errors_df.head(2)

Unnamed: 0,report_id,age,age_code,cleaned_vector_age,age_mon,age_wk,age_dy
6732,8581655,999,DY,59,83,19,2
6766,5112633,999,DY,69,83,19,2


In [10]:
# List the reports whose vector age is greater than 120
merged.loc[
    merged['cleaned_vector_age'] > 120,
    ['report_id', 'age', 'age_code', 'cleaned_vector_age'],
]

Unnamed: 0,report_id,age,age_code,cleaned_vector_age
20986,6974212,999.99,YR,1054.0
60987,4321047,869.0,YR,869.0
398290,4352281,999.99,YR,5200.0
440645,5512138,566.0,YR,566.0
517301,6986289,999.99,YR,1045.0
534086,5641092,190.0,YR,190.0
560270,4381869,999.99,YR,7200.0
623075,7104578,953.0,YR,953.0
639257,4315400,999.99,YR,6800.0
727172,6905005,352.0,YR,352.0


## Finalize and save table

Note that the errors above are included in the saved file, as they are also present in the `effect_faers` database.

In [11]:
# Build the final REPORT table: one row per analysed report, with columns named
# as in the table schema at the top of this notebook.
report = (
    merged
    .rename(columns={'vector_year': 'report_year', 'cleaned_vector_age': 'person_age',
                     'cleaned_vector_sex': 'person_sex'})
    # Explicit selection raises a KeyError if an expected column is missing.
    # `.filter(items=...)` would silently skip it and write an incomplete table.
    .loc[:, ['report_id', 'report_year', 'person_age', 'person_sex']]
)

# The table must have exactly one row per entry of the report ID vector
assert report.shape[0] == report_id_vector.shape[0]

report.to_csv('../../data/tables/report.csv.xz', index=False, compression='xz')

report.head(2)

Unnamed: 0,report_id,report_year,person_age,person_sex
0,4572294,2005,76.0,F
1,4440060,2004,78.0,M
