In [1]:
import numpy as np
import pandas as pd
import scipy.sparse

In [2]:
report_id_vector = np.load('../../data/meta_formatted/report_id_vector.npy')
drug_id_vector = np.load('../../data/meta_formatted/drug_id_vector.npy')
drug_exposure = scipy.sparse.load_npz('../../data/meta_formatted/drug_exposure_matrix.npz')

report = pd.read_csv('../../data/tables/report.csv.xz')

age_df = (
    pd.DataFrame(report_id_vector, columns=['report_id'])
    .merge(report, on='report_id', how='left')
)

age_df.head(2)

Unnamed: 0,report_id,report_year,person_age,person_sex
0,4572294,2005,76.0,F
1,4440060,2004,78.0,M


# Age differences

Differences in average age between exposed and unexposed reports (where age is known) for all drugs.

In [3]:
age = age_df['person_age'].values

# Indices of records where age is known
indices = np.where(~np.isnan(age))[0]

# Sum of ages of exposed reports for each drug
sum_exposed_ages = age[indices] @ drug_exposure[indices]

# Num exposed with a known age
n_exposed = drug_exposure[indices].sum(axis=0)

# Set drugs with no exposures to 1 exposure, so division works
n_exposed[n_exposed == 0] = 1

# Mean age of exposed reports (among those with known age)
mean_exposed_ages = sum_exposed_ages / n_exposed

# Sum of ages for all known ages
sum_ages = age[indices].sum()

# The sum of ages for unexposed reports is total - sum_exposed
sum_unexposed_ages = sum_ages - sum_exposed_ages

# Mean of unexposed reports is sum of unexposed ages divided by number
#  not exposed (total number reports with age minus number exposed)
mean_unexposed_ages = sum_unexposed_ages / (len(indices) - n_exposed)

# Average age exposed - average age unexposed (among those with known ages)
exposed_minus_unexposed = mean_exposed_ages - mean_unexposed_ages
exposed_minus_unexposed = np.array(exposed_minus_unexposed).flatten()

# Sex differences

Fraction of reports that are male (among those with male or female reported) among exposed minus unexposed.

In [4]:
sex = age_df['person_sex'].values

indices_known = np.where((sex == 'F') | (sex == 'M'))
indices_male = np.where(sex == 'M')

num_known_sex_exposed = drug_exposure[indices_known].sum(axis=0)
num_known_sex_unexposed = (len(indices_known) - num_known_sex_exposed)

num_males_exposed = drug_exposure[indices_male].sum(axis=0)
num_males_unexposed = num_known_sex_exposed - num_males_exposed

# Formatting to enable division
num_known_sex_exposed[num_known_sex_exposed == 0] = 1
num_known_sex_unexposed[num_known_sex_unexposed == 0] = 1

# Fraction of exposed and unexposed that are male
frac_exposed_male = num_males_exposed / num_known_sex_exposed
frac_unexposed_male = num_males_unexposed / num_known_sex_unexposed

frac_male_exp_minus_unexp = np.array(frac_exposed_male - frac_unexposed_male).flatten()

# Save table

In [5]:
raw_diff_df = pd.DataFrame({
    'drug_id': drug_id_vector,
    'age_diff': exposed_minus_unexposed,
    'male_dif': frac_male_exp_minus_unexp
})

raw_diff_df.to_csv('../../data/post_analysis/raw_diff.csv.xz', index=False,
                   compression='xz')

raw_diff_df.head(2)

Unnamed: 0,drug_id,age_diff,male_dif
0,314826,-10.504228,1.15
1,8167,-5.254216,-1.0
