In [275]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [283]:
# Load the warfarin table and keep only the rows that have a recorded
# therapeutic dose (rows where that column is missing are dropped).
df = pd.read_csv('data/warfarin.csv').dropna(subset=['Therapeutic Dose of Warfarin'])

def get_bucket(dosage):
    """Map a numeric dosage onto one of three labels.

    Returns 'low' when dosage is below 3, 'medium' when it is between 3 and 7
    (both inclusive), and 'high' otherwise. A value for which both comparisons
    are false (e.g. NaN) therefore also ends up as 'high'.
    """
    if dosage < 3:
        return 'low'
    if dosage <= 7:
        return 'medium'
    return 'high'

In [284]:
def get_accuracy(correct, pred):
    """Return the fraction of entries of `correct` that equal `pred`.

    The comparison is whatever `correct == pred` yields; the result must be
    iterable and sized. In this notebook `correct` is a pandas Series and
    `pred` is either a single label or a Series of labels.
    """
    matches = (correct == pred)
    hit_count = sum(matches)
    return hit_count / len(matches)

In [285]:
# Baseline: always predict a fixed dose of 5, which get_bucket labels 'medium'.
#
# The true-label column is derived here when it is not already present so that
# this cell also works on a fresh kernel (nothing else in the notebook creates it).
# NOTE(review): assumes 'Therapeutic Dose of Warfarin' is a weekly dose that must
# be divided by 7 to match get_bucket's thresholds, mirroring the `/ 7` applied to
# the clinical-formula prediction further down — confirm against the data
# dictionary. The accuracy previously recorded for this cell was ~0.6118; a
# different value after recomputing means this derivation needs adjusting.
if 'dosage_bucket' not in df.columns:
    df['dosage_bucket'] = (df['Therapeutic Dose of Warfarin'] / 7).apply(get_bucket)

is_medium = 'medium'
get_accuracy(df['dosage_bucket'], is_medium)

0.611794500723589

In [264]:
# Extract the decade from a single age label (applied element-wise to df['Age']).
def get_decade(x):
    """Return the decade encoded by the first two characters of an age label.

    Parameters
    ----------
    x : object
        An age label. Strings are expected to start with digits
        (presumably labels such as '60 - 69' — confirm against the data).

    Returns
    -------
    int
        ``int(x[:2]) // 10`` for any string (including ``str`` subclasses such
        as ``numpy.str_``), and 0 for every non-string value (e.g. a NaN float).

    Raises
    ------
    ValueError
        If ``x`` is a string whose first two characters are not an integer.
    """
    # isinstance rather than an exact type comparison, so that str subclasses
    # are parsed instead of silently being mapped to decade 0.
    if isinstance(x, str):
        return int(x[:2]) // 10
    return 0

In [279]:
# Build the feature columns used by the clinical dosing baseline.
enzyme_inducer_columns = ['Carbamazepine (Tegretol)', 'Phenytoin (Dilantin)', 'Rifampin or Rifampicin']

# Keep rows that have age, height and weight, and at least one of the
# enzyme-inducer columns recorded.
df = df.dropna(subset=['Age', 'Height (cm)', "Weight (kg)"])
df = df.dropna(how='all', subset=enzyme_inducer_columns)

df['Bias'] = 1
df['Age in decades'] = df['Age'].apply(get_decade)

# 0/1 race indicators; the 'Unknown' race value is what feeds the
# missing-or-mixed column. Explicit int64 keeps the dtype platform-independent.
df['Asian'] = df['Race'].eq('Asian').astype('int64')
df['Black or African American'] = df['Race'].eq('Black or African American').astype('int64')
df['Missing or Mixed Race'] = df['Race'].eq('Unknown').astype('int64')

# 1 when any of the inducer columns equals 1, otherwise 0.
df['Enzyme inducer status'] = df[enzyme_inducer_columns].eq(1).any(axis=1).astype(int)
df['Amiodarone status'] = df['Amiodarone (Cordarone)'].eq(1).astype(int)

clinical_feature_columns = [
    'Bias',
    'Age in decades',
    'Height (cm)',
    'Weight (kg)',
    'Asian',
    'Black or African American',
    'Missing or Mixed Race',
    'Enzyme inducer status',
    'Amiodarone status',
]
features = df[clinical_feature_columns]

# Save to a csv.
# NOTE(review): this writes the whole frame `df` (all columns plus the index),
# not just `features`, despite the file name — confirm that is intended.
df.to_csv('data/clinical_dosing_features.csv')

In [280]:
# Preview the feature matrix, then report its row count.
# Only a cell's last expression is rendered automatically, so the preview is
# sent through display() explicitly instead of being silently discarded.
display(features.head())
len(features)

2130

In [281]:
# Coefficients for the clinical dosing prediction, listed in the same order as
# the columns of `features` so that a row-wise dot product pairs them up.
beta = np.array([
    4.0376,   # Bias
    -0.2546,  # Age in decades
    0.0118,   # Height (cm)
    0.0134,   # Weight (kg)
    -0.6752,  # Asian
    0.406,    # Black or African American
    0.0443,   # Missing or Mixed Race
    1.2799,   # Enzyme inducer status
    -0.5695,  # Amiodarone status
])

def get_dosage_bucket(x):
    """Return the bucket label for one feature row.

    The row is dotted with the module-level `beta` coefficients, the result is
    squared and divided by 7, and that value is passed to `get_bucket`.
    (The division by 7 presumably converts a weekly dose to a daily one —
    confirm against the formula's source.)
    """
    linear_term = x.dot(beta)
    return get_bucket(linear_term ** 2 / 7)

# Bucket the formula's prediction for every row of `features` (the function is
# applied once per row), then score those labels against df['dosage_bucket'].
dosage_buckets = features.apply(get_dosage_bucket, axis='columns')
get_accuracy(df['dosage_bucket'], dosage_buckets)

0.6610328638497652