In [130]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [148]:
df = pd.read_csv('data/warfarin.csv')
df = df.dropna(subset = ['Therapeutic Dose of Warfarin'])

def get_bucket(dosage):
    if dosage < 3:
        return 'low'
    elif dosage < 7:
        return 'medium'
    else:
        return 'high'
    
df['dosage_bucket'] = (df['Therapeutic Dose of Warfarin'] / 7).apply(get_bucket)

In [159]:
def get_accuracy(x):
    return sum(x) / len(x)

In [161]:
# baseline return 5
is_medium = df['dosage_bucket'] == 'medium'
get_accuracy(is_medium)

0.6027496382054993

In [102]:
# function to extract decade from df
def get_decade(x):
    if type(x) == str:
        return int(x[:2]) // 10
    return 0

In [163]:
# create features for clinical baseline
df['Bias'] = 1
df['Age in decades'] = df['Age'].apply(get_decade)
df['Asian'] = [1 if race == 'Asian' else 0 for race in df['Race']]
df['Black or African American'] = [1 if race == 'Black or African American' else 0 for race in df['Race']]
df['Missing or Mixed Race'] = [1 if race == 'Unknown' else 0 for race in df['Race']]
df['Enzyme inducer status'] = ((df['Carbamazepine (Tegretol)'] == 1) | \
                               (df['Phenytoin (Dilantin)'] == 1) | (df['Rifampin or Rifampicin'] == 1)).astype(int)
df['Amiodarone status'] = (df['Amiodarone (Cordarone)'] == 1).astype(int)

features = df[['Bias', 'Age in decades', 'Height (cm)', 'Weight (kg)', 'Asian', 'Black or African American', 
               'Missing or Mixed Race','Enzyme inducer status', 'Amiodarone status']]
# save to a csv
df.to_csv('data/clinical_dosing_features.csv')

In [121]:
features.head()

Unnamed: 0,Bias,Age in decades,Height (cm),Weight (kg),Asian,Black or African American,Missing or Mixed Race,Enzyme inducer status,Amiodarone status
0,1,6,193.04,115.7,0,0,0,0,0
1,1,5,176.53,144.2,0,0,0,0,0
2,1,4,162.56,77.1,0,0,0,0,0
3,1,6,182.24,90.7,0,0,0,0,0
4,1,5,167.64,72.6,0,0,0,0,0


In [162]:
# run the clinical dosing predicted dosage
beta = np.array([4.0376, -0.2546, 0.0118, 0.0134, -0.6752, 0.406, 0.0443, 1.2799, -0.5695])

def get_dosage_bucket(x):
    dosage = x.dot(beta)**2 / 7
    return get_bucket(dosage)

dosage_buckets = features.apply(get_dosage_bucket, axis=1)
correct_dosages = dosage_buckets == df['dosage_bucket']
get_accuracy(correct_dosages)

0.5365412445730825