In [None]:
import pandas

# Notebook-wide display settings: never hide columns, but cap printed rows at 25
pandas.options.display.max_columns = None
pandas.options.display.max_rows = 25

# Setting up the environment

The next couple of cells define global variables used throughout the notebook and ingest the training data that will be used to build a data model.

In [None]:
import os

# Directory layout: <notebook dir>/../data/train/<csv files>
BASE_DATA_PATH = os.path.join('..', 'data')
TRAIN_DATA_PATH = os.path.join(BASE_DATA_PATH, 'train')

# One CSV per training table
TRAINING_PROVIDERS_PATH = os.path.join(
    TRAIN_DATA_PATH, 'Train-1542865627584.csv')
TRAINING_INPATIENT_PATH = os.path.join(
    TRAIN_DATA_PATH, 'Train_Inpatientdata-1542865627584.csv')
TRAINING_OUTPATIENT_PATH = os.path.join(
    TRAIN_DATA_PATH, 'Train_Outpatientdata-1542865627584.csv')
TRAINING_BENEFICIARY_PATH = os.path.join(
    TRAIN_DATA_PATH, 'Train_Beneficiarydata-1542865627584.csv')

# Placeholder value intended for cells that are missing from the data set
DEFAULT_NA_VALUE = "None"



In [None]:
# Load the four training tables: providers, inpatient claims, outpatient
# claims and beneficiaries (read in that order)
train_providers, train_inpatient, train_outpatient, train_beneficiary = map(
    pandas.read_csv,
    (
        TRAINING_PROVIDERS_PATH,
        TRAINING_INPATIENT_PATH,
        TRAINING_OUTPATIENT_PATH,
        TRAINING_BENEFICIARY_PATH,
    ),
)

In [None]:
# Compare the schemas of the two claim tables: list the columns that appear
# in one table but not in the other (original column order is preserved)
inpatient_columns = train_inpatient.columns
outpatient_columns = train_outpatient.columns

inp_unique_cols = inpatient_columns[~inpatient_columns.isin(outpatient_columns)].tolist()
print(f'Unique columns in inpatient: {inp_unique_cols}')

outp_unique_cols = outpatient_columns[~outpatient_columns.isin(inpatient_columns)].tolist()
print(f'Unique columns in outpatient: {outp_unique_cols}')

# Merging claim data sets

Based on what we know about the columns in the inpatient/outpatient tables, we can safely merge those datasets with minimal disruption. We recommend combining all of the claims into a single dataframe that has all of the data from each table, with the following columns added:
- Additional column for inpatient vs outpatient flag
- Columns unique to inpatient (i.e., AdmissionDt, DischargeDt, DiagnosisGroupCode)

In [None]:
# Tag every claim with the table it came from ('In' or 'Out') before the two
# frames are stacked, so the origin survives the concatenation
for source_frame, source_flag in ((train_inpatient, 'In'), (train_outpatient, 'Out')):
    source_frame['In/Out'] = source_flag

# concat builds the union of both frames; columns that exist in only one of
# them are filled with NaN for rows from the other, so the outpatient frame
# needs no manual padding first.
train_claims = pandas.concat(
    objs=[train_inpatient, train_outpatient],
    ignore_index=True,
)

train_claims

# Merging patients with claims

Using the beneficiary column in the claim data, we can also populate details about the patient in each of the transactions.

This potentially allows us to engineer some additional features that are specific to beneficiaries across transactions. For example, how long has it been since a beneficiary's last claim?

In [None]:
# Attach beneficiary details to every claim. A left join keeps claims whose
# BeneID has no match in the beneficiary table (their beneficiary columns are NaN).
train_claims = pandas.merge(
    left=train_claims,
    right=train_beneficiary,
    how='left',
    on='BeneID',
)

# TODO: Are there any claims that don't have corresponding beneficiary definitions?

train_claims

# Cleaning data

There are some fields that have empty data. We need a way to properly deal with those fields so that analysis can be performed.

It's probably too early to throw data out, so instead give these fields a friendly value like "None" and we can revisit whether we can throw them out entirely later.

## Cleanup in the next cell:
- In/Out column - Changed to boolean value, where 0 -> outpatient, 1 -> inpatient
- (*)Physician columns - Remove PHY prefix from physician IDs

In [None]:
def _encode_in_out(flag):
    """Map the 'In' flag to 1 and anything else to 0."""
    if flag == 'In':
        return 1
    return 0


def _strip_phy_prefix(physician_id):
    """Return the text after the last 'PHY' in the ID; null cells pass through untouched."""
    if pandas.isnull(physician_id):
        return physician_id
    return physician_id.split('PHY')[-1]


# Turn the string flag into a numeric one
train_claims['In/Out'] = train_claims['In/Out'].apply(_encode_in_out)

# The physician columns contain empty cells, hence the null guard in the helper
for physician_col in ('AttendingPhysician', 'OperatingPhysician', 'OtherPhysician'):
    train_claims[physician_col] = train_claims[physician_col].apply(_strip_phy_prefix)

train_claims

## Cleanup in the next cell:
- Some of the diagnosis code values are prefixed with a character. Remove them to make values numeric.

> **TODO: Decide whether we need to get all fancy pants and save off the various character flags
          that appear alongside the charge codes. Looks like 'E' and 'V' are the ones that show
          up most, but maybe there are others?**

> **TODO: Some resources online recommend using one-hot encoding to represent categorical data for decision tree algorithms:
https://stackoverflow.com/questions/38108832/passing-categorical-data-to-sklearn-decision-tree
Answer suggests one-hot is not super performant, but that might be acceptable since this data set isn't enormous...**

In [None]:
import string

# Column names ClmDiagnosisCode_1 through ClmDiagnosisCode_10, generated rather than hard-coded
diagnosis_code_cols = [f'ClmDiagnosisCode_{x+1}' for x in range(10)]


def _strip_letters(code):
    """Trim leading and trailing ASCII letters from a code; null cells pass through untouched."""
    if pandas.isnull(code):
        return code
    return code.strip(string.ascii_letters)


for col in diagnosis_code_cols:
    train_claims[col] = train_claims[col].apply(_strip_letters)

# Sanity check: every non-null code must now be a digit-only string
non_digit_count = 0
for col in diagnosis_code_cols:
    for code in train_claims[col]:
        if not pandas.isnull(code) and not str(code).isdigit():
            non_digit_count += 1
assert non_digit_count == 0

train_claims


## Cleanup in the next cell:

Some of the deductible information has NaN. For now, set these values to 0.

> TODO: Is setting to 0 the right approach?

In [None]:
# Treat a missing deductible as "nothing paid". fillna is the vectorized
# equivalent of testing every cell with pandas.isnull() in a Python lambda.
train_claims['DeductibleAmtPaid'] = train_claims['DeductibleAmtPaid'].fillna(0)

train_claims

# Feature Engineering

Features we can derive in the inpatient and outpatient datasets
- Claim duration
- Age of patient
- Is Dead flag
- Number of diagnoses
- Number of procedures
- Admission duration
- Has attending phys
- Has operating phys

In [None]:
from datetime import timedelta

# Parse every date column exactly once (ClaimStartDt feeds two features)
date_format = '%Y-%m-%d'
claim_start = pandas.to_datetime(train_claims['ClaimStartDt'], format=date_format)
claim_end = pandas.to_datetime(train_claims['ClaimEndDt'], format=date_format)
birth_date = pandas.to_datetime(train_claims['DOB'], format=date_format)
admission_date = pandas.to_datetime(train_claims['AdmissionDt'], format=date_format)
discharge_date = pandas.to_datetime(train_claims['DischargeDt'], format=date_format)

one_day = timedelta(days=1)

# Age of the patient, in years, at the time their claim was initiated.
# NOTE: a year is taken as exactly 365 days, so leap days are not accounted for.
train_claims['PatientAge'] = (claim_start - birth_date) / timedelta(days=365)

# Duration of the patient's stay in days; rows without admission/discharge
# dates (NaN difference) are set to 0
train_claims['PatientStayDur'] = ((discharge_date - admission_date) / one_day).fillna(0)

# Duration of the claim in days, counting both the start and the end day;
# rows with a missing date are set to 0
train_claims['ClaimDur'] = ((claim_end - claim_start + one_day) / one_day).fillna(0)

# 1 when a date of death is recorded, 0 otherwise
train_claims['PatientDead'] = train_claims['DOD'].notnull().astype('int64')



# Merging claim data with potential fraud indicators

Once we have all of our features in place, we should merge with the potential fraud data set.
This will let us set up independent/dependent variables for the decision tree to examine.

In [None]:
def _strip_prv_prefix(provider_id):
    """Return the text after the last 'PRV' in the ID; null cells pass through untouched."""
    if pandas.isnull(provider_id):
        return provider_id
    return provider_id.split('PRV')[-1]


# Left join from the provider table, so providers that might not have any
# claims are kept rather than dropped.
train_providers = pandas.merge(
    left=train_providers,
    right=train_claims,
    how='left',
    on='Provider',
)

# TODO: Sanity check that all providers have associated claim data.

# Drop the PRV prefix from the provider IDs
train_providers['Provider'] = train_providers['Provider'].apply(_strip_prv_prefix)

train_providers

# Define independent variables

Based on prior statistical evaluation and experimentation, we select independent variables to include in the model.

In [None]:
# Independent variables fed to the model
features = [
    'InscClaimAmtReimbursed',
    'DeductibleAmtPaid',
    'PatientAge',
    'PatientStayDur',
    'ClaimDur',
    'IPAnnualDeductibleAmt',
    'IPAnnualReimbursementAmt',
    'OPAnnualDeductibleAmt',
    'OPAnnualReimbursementAmt',
    'PatientDead',
]

# Select with [] rather than reindex(columns=...): reindex silently creates an
# all-NaN column for any name that is misspelled or missing, whereas explicit
# selection raises a KeyError naming the missing column.
X = train_providers[features]
X

# Define dependent variables

Define the dependent variables that we are trying to predict

In [None]:
# Target labels: the PotentialFraud column of the merged frame
Y = train_providers.loc[:, 'PotentialFraud']
Y

# Compute decision tree

Use the sklearn implementation of the decision tree classifier algorithm to build a decision tree from training data

In [None]:
# Import scikit-learn and report which version this notebook ran against
import sklearn
from sklearn import tree

print('The scikit-learn version is {}.'.format(sklearn.__version__))

# Every feature column must be numeric (no character data) for the fit to work.
# random_state is pinned so repeated runs build the same tree.
clf = tree.DecisionTreeClassifier(random_state=0).fit(X, Y)
clf

In [None]:
import matplotlib.pyplot as plt
import numpy as np  # import retained from the original cell; no longer used here

# Enlarge the default figure (20 x 20 inches) so the tree is legible
fig_size = [20, 20]
plt.rcParams["figure.figsize"] = fig_size

# Class labels as strings, in the classifier's own class order. This replaces
# a variable named `sorted`, which shadowed the builtin sorted() for every
# cell executed afterwards.
class_names = [str(label) for label in clf.classes_]

# Total number of leaves in the fitted tree
print(clf.get_n_leaves())

# Draw only the top five levels; the full tree is too large to read
plot = tree.plot_tree(clf, rounded=True, filled=True, class_names=class_names, feature_names=features, max_depth=5)