In [10]:
import pandas as pd
import numpy as np

In [11]:
df_train = pd.read_csv('train_data.csv')
df_test = pd.read_csv('test_data.csv')

In [12]:
# Rename the columns from the Label to their Variable Name
df_train = df_train.rename(columns={"County Code": "BENE_COUNTY_CD",
                                     "DESYNPUF: End stage renal disease Indicator": "BENE_ESRD_IND",
                                     "DESYNPUF: State Code": "SP_STATE_CODE",
                                     "Total number of months of part A coverage for the bene": "BENE_HI_CVRAGE_TOT_MONS",
                                     "Total number of months of part B coverage for the bene": "BENE_SMI_CVRAGE_TOT_MONS",
                                     "Total number of months of part D plan coverage for the": "PLAN_CVRG_MOS_NUM",
                                     "Chronic Condition: Alzheimer or related disorders or s": "SP_ALZHDMTA",
                                     "Chronic Condition: Heart Failure": "SP_CHF",
                                     "Chronic Condition: Chronic Kidney Disease": "SP_CHRNKIDN",
                                     "Chronic Condition: Cancer": "SP_CNCR",
                                     "Chronic Condition: Chronic Obstructive Pulmonary Disea": "SP_COPD",
                                     "Chronic Condition: Depression": "SP_DEPRESSN",
                                     "Chronic Condition: Diabetes": "SP_DIABETES",
                                     "Chronic Condition: Ischemic Heart Disease": "SP_ISCHMCHT",
                                     "Chronic Condition: Osteoporosis": "SP_OSTEOPRS",
                                     "Chronic Condition: RA/OA": "SP_RA_OA",
                                     "Chronic Condition: Stroke/transient Ischemic Attack": "SP_STRKETIA",
                                     "Inpatient annual Medicare reimbursement amount": "MEDREIMB_IP",
                                     "Inpatient annual beneficiary responsibility amount": "BENRES_IP",
                                     "Inpatient annual primary payer reimbursement amount": "PPPYMT_IP",
                                     "Outpatient Institutional annual Medicare reimbursement": "MEDREIMB_OP",
                                     "Outpatient Institutional annual beneficiary responsibi": "BENRES_OP",
                                     "Outpatient Institutional annual primary payer reimburs": "PPPYMT_OP",
                                     "Carrier annual Medicare reimbursement amount": "MEDREIMB_CAR",
                                     "Carrier annual beneficiary responsibility amount": "BENRES_CAR",
                                     "Carrier annual primary payer reimbursement amount": "PPPYMT_CAR"
                                     })
df_train.head(10)

Unnamed: 0,AdverseOpioidEvent,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,PLAN_CVRG_MOS_NUM,SP_ALZHDMTA,SP_CHF,SP_CHRNKIDN,...,PPPYMT_CAR,RX_Filled,Total_Qty_Disp,Tot_Days_Sup,Patient_Pay,Tot_MME,Sex,Race,Age,MonthDiffFill
0,0,0,38,170,12,12,12,0,0,0,...,0,4,220,70,10,595.0,F,White,76,20.0
1,0,0,44,460,12,12,12,0,0,0,...,20,1,30,90,10,2070.0,M,White,58,0.0
2,0,0,11,381,12,12,12,0,0,0,...,0,3,130,40,0,255.0,F,White,66,16.0
3,0,0,10,280,12,12,12,0,0,0,...,0,0,0,0,0,0.0,M,White,54,
4,0,0,24,710,12,12,12,0,0,0,...,0,3,70,50,10,474.5,F,White,89,21.0
5,0,0,45,20,12,12,12,0,0,0,...,0,4,310,90,0,437.0,F,White,60,17.0
6,1,0,5,470,12,12,12,0,0,0,...,0,1,60,30,0,150.0,M,White,44,0.0
7,0,0,52,150,12,12,12,0,0,0,...,0,1,30,30,0,225.0,F,White,82,0.0
8,0,Y,50,260,12,12,12,1,1,1,...,0,2,130,40,0,195.0,F,White,66,15.0
9,0,0,15,480,12,12,12,0,1,1,...,0,6,270,130,30,3149.5,M,White,85,21.0


# Preprocessing of the Columns

## Binary Columns and Dropped Columns

In [13]:
# Make 'BENE_ESRD_IND' and 'Sex' proper binary columns
df_train.loc[(df_train.BENE_ESRD_IND == 'Y'), 'BENE_ESRD_IND']=1
df_train.loc[(df_train.Sex == 'F'), 'Sex']=0
df_train.loc[(df_train.Sex == 'M'), 'Sex']=1

# Use a binary variable of equal to 0 or not equal to 0 for 'PPPYMT_IP'
df_train['PPPYMT_IP'] = np.where(df_train['PPPYMT_IP'] == 0, 0, 1)

# Use a binary variable of equal to 0 or not equal to 0 for 'PPPYMT_OP'
df_train['PPPYMT_OP'] = np.where(df_train['PPPYMT_OP'] == 0, 0, 1)

# Use a binary variable of equal to 0 or not equal to 0 for 'PPPYMT_CAR'
df_train['PPPYMT_CAR'] = np.where(df_train['PPPYMT_CAR'] == 0, 0, 1)

# Drop the 'SP_STATE_CODE' column and the 'BENE_COUNTY_CD' column
df_train = df_train.drop('SP_STATE_CODE', axis=1)
df_train = df_train.drop('BENE_COUNTY_CD', axis=1)

## Scaling

In [14]:
# Scale the number of months of each part coverage of the beneficiary based on the maximum
max_BENE_HI_CVRAGE_TOT_MONS = np.amax(df_train.BENE_HI_CVRAGE_TOT_MONS.unique())
df_train['BENE_HI_CVRAGE_TOT_MONS'] = df_train['BENE_HI_CVRAGE_TOT_MONS'].div(max_BENE_HI_CVRAGE_TOT_MONS)
max_BENE_SMI_CVRAGE_TOT_MONS = np.amax(df_train.BENE_SMI_CVRAGE_TOT_MONS.unique())
df_train['BENE_SMI_CVRAGE_TOT_MONS'] = df_train['BENE_SMI_CVRAGE_TOT_MONS'].div(max_BENE_SMI_CVRAGE_TOT_MONS)
max_PLAN_CVRG_MOS_NUM = np.amax(df_train.PLAN_CVRG_MOS_NUM.unique())
df_train['PLAN_CVRG_MOS_NUM'] = df_train['PLAN_CVRG_MOS_NUM'].div(max_PLAN_CVRG_MOS_NUM)

# Scale down RX_Filled
max_RX_Filled = np.amax(df_train.RX_Filled.unique())
df_train['RX_Filled'] = df_train['RX_Filled'].div(max_RX_Filled)

# Scale 'Total_Qty_Disp', 'Tot_Days_Sup', 'Tot_MME', and 'Age' based on max value
max_Total_Qty_Disp = np.amax(df_train.Total_Qty_Disp.unique())
df_train['Total_Qty_Disp'] = df_train['Total_Qty_Disp'].div(max_Total_Qty_Disp)
max_Tot_Days_Sup = np.amax(df_train.Tot_Days_Sup.unique())
df_train['Tot_Days_Sup'] = df_train['Tot_Days_Sup'].div(max_Tot_Days_Sup)
max_Tot_MME = np.amax(df_train.Tot_MME.unique())
df_train['Tot_MME'] = df_train['Tot_MME'].div(max_Tot_MME)
max_Age = np.amax(df_train.Age.unique())
df_train['Age'] = df_train['Age'].div(max_Age)

# Remove NaN's and scale MonthDiffFill
df_train['MonthDiffFill'] = df_train['MonthDiffFill'].fillna(0)
max_MonthDiffFill = np.amax(df_train.MonthDiffFill.unique())
df_train['MonthDiffFill'] = df_train['MonthDiffFill'].div(max_MonthDiffFill)

## Bins

In [15]:
# Use bins of 0 or less, between 1 and 10000, and over 10000 for 'MEDREIMB_IP'
df_train['MEDREIMB_IP'] = np.where(df_train['MEDREIMB_IP'] <= 0, 0, df_train['MEDREIMB_IP'])
df_train['MEDREIMB_IP'] = np.where(
                            (df_train['MEDREIMB_IP'] > 0) & (df_train['MEDREIMB_IP'] <= 10000), 
                            1, 
                            df_train['MEDREIMB_IP'])
df_train['MEDREIMB_IP'] = np.where(df_train['MEDREIMB_IP'] > 10000, 2, df_train['MEDREIMB_IP'])
onehotencoded_MEDREIMB_IP = pd.get_dummies(df_train.MEDREIMB_IP)
onehotencoded_MEDREIMB_IP = onehotencoded_MEDREIMB_IP.rename(columns={0: "Zero_MEDREIMB_IP",
                                                                      1: "Small_MEDREIMB_IP",
                                                                      2: "Large_MEDREIMB_IP"
                                                                     })
df_train = df_train.join(onehotencoded_MEDREIMB_IP)
df_train = df_train.drop('MEDREIMB_IP', axis=1)

# Use bins of 0 or less, between 1 and 1100, and over 1100 for 'BENRES_IP'
df_train['BENRES_IP'] = np.where(df_train['BENRES_IP'] <= 0, 0, df_train['BENRES_IP'])
df_train['BENRES_IP'] = np.where(
                            (df_train['BENRES_IP'] > 0) & (df_train['BENRES_IP'] <= 1100), 
                            1, 
                            df_train['BENRES_IP'])
df_train['BENRES_IP'] = np.where(df_train['BENRES_IP'] > 1100, 2, df_train['BENRES_IP'])
onehotencoded_BENRES_IP = pd.get_dummies(df_train.BENRES_IP)
onehotencoded_BENRES_IP = onehotencoded_BENRES_IP.rename(columns={0: "Zero_BENRES_IP",
                                                                  1: "Small_BENRES_IP",
                                                                  2: "Large_BENRES_IP"
                                                                  })
df_train = df_train.join(onehotencoded_BENRES_IP)
df_train = df_train.drop('BENRES_IP', axis=1)

# Use bins of 0 or less, between 1 and 600, and over 600 for 'MEDREIMB_OP'
df_train['MEDREIMB_OP'] = np.where(df_train['MEDREIMB_OP'] <= 0, 0, df_train['MEDREIMB_OP'])
df_train['MEDREIMB_OP'] = np.where(
                            (df_train['MEDREIMB_OP'] > 0) & (df_train['MEDREIMB_OP'] <= 600), 
                            1, 
                            df_train['MEDREIMB_OP'])
df_train['MEDREIMB_OP'] = np.where(df_train['MEDREIMB_OP'] > 600, 2, df_train['MEDREIMB_OP'])
onehotencoded_MEDREIMB_OP = pd.get_dummies(df_train.MEDREIMB_OP)
onehotencoded_MEDREIMB_OP = onehotencoded_MEDREIMB_OP.rename(columns={0: "Zero_MEDREIMB_OP",
                                                                      1: "Small_MEDREIMB_OP",
                                                                      2: "Large_MEDREIMB_OP"
                                                                     })
df_train = df_train.join(onehotencoded_MEDREIMB_OP)
df_train = df_train.drop('MEDREIMB_OP', axis=1)

# Use bins of 0 or less, between 1 and 200, and over 200 for 'BENRES_OP'
df_train['BENRES_OP'] = np.where(df_train['BENRES_OP'] <= 0, 0, df_train['BENRES_OP'])
df_train['BENRES_OP'] = np.where(
                            (df_train['BENRES_OP'] > 0) & (df_train['BENRES_OP'] <= 200), 
                            1,
                            df_train['BENRES_OP'])
df_train['BENRES_OP'] = np.where(df_train['BENRES_OP'] > 200, 2, df_train['BENRES_OP'])
onehotencoded_BENRES_OP = pd.get_dummies(df_train.BENRES_OP)
onehotencoded_BENRES_OP = onehotencoded_BENRES_OP.rename(columns={0: "Zero_BENRES_OP",
                                                                      1: "Small_BENRES_OP",
                                                                      2: "Large_BENRES_OP"
                                                                     })
df_train = df_train.join(onehotencoded_BENRES_OP)
df_train = df_train.drop('BENRES_OP', axis=1)

# Use bins of 0 or less, between 1 and 1100, and over 1100 for 'MEDREIMB_CAR'
df_train['MEDREIMB_CAR'] = np.where(df_train['MEDREIMB_CAR'] <= 0, 0, df_train['MEDREIMB_CAR'])
df_train['MEDREIMB_CAR'] = np.where(
                            (df_train['MEDREIMB_CAR'] > 0) & (df_train['MEDREIMB_CAR'] <= 1100), 
                            1,
                            df_train['MEDREIMB_CAR'])
df_train['MEDREIMB_CAR'] = np.where(df_train['MEDREIMB_CAR'] > 1100, 2, df_train['MEDREIMB_CAR'])
onehotencoded_MEDREIMB_CAR = pd.get_dummies(df_train.MEDREIMB_CAR)
onehotencoded_MEDREIMB_CAR = onehotencoded_MEDREIMB_CAR.rename(columns={0: "Zero_MEDREIMB_CAR",
                                                                      1: "Small_MEDREIMB_CAR",
                                                                      2: "Large_MEDREIMB_CAR"
                                                                     })
df_train = df_train.join(onehotencoded_MEDREIMB_CAR)
df_train = df_train.drop('MEDREIMB_CAR', axis=1)

# Use bins of 0 or less, between 1 and 300, and over 300 for 'BENRES_CAR'
df_train['BENRES_CAR'] = np.where(df_train['BENRES_CAR'] <= 0, 0, df_train['BENRES_CAR'])
df_train['BENRES_CAR'] = np.where(
                            (df_train['BENRES_CAR'] > 0) & (df_train['BENRES_CAR'] <= 300), 
                            1,
                            df_train['BENRES_CAR'])
df_train['BENRES_CAR'] = np.where(df_train['BENRES_CAR'] > 300, 2, df_train['BENRES_CAR'])
onehotencoded_BENRES_CAR = pd.get_dummies(df_train.BENRES_CAR)
onehotencoded_BENRES_CAR = onehotencoded_BENRES_CAR.rename(columns={0: "Zero_BENRES_CAR",
                                                                      1: "Small_BENRES_CAR",
                                                                      2: "Large_BENRES_CAR"
                                                                     })
df_train = df_train.join(onehotencoded_BENRES_CAR)
df_train = df_train.drop('BENRES_CAR', axis=1)

# Use one hot encoding for the categorical column 'Race'
onehotencoded_Race = pd.get_dummies(df_train.Race)
df_train = df_train.join(onehotencoded_Race)
df_train = df_train.drop('Race', axis=1)

In [16]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df_train.head())

   AdverseOpioidEvent BENE_ESRD_IND  BENE_HI_CVRAGE_TOT_MONS  \
0                   0             0                      1.0   
1                   0             0                      1.0   
2                   0             0                      1.0   
3                   0             0                      1.0   
4                   0             0                      1.0   

   BENE_SMI_CVRAGE_TOT_MONS  PLAN_CVRG_MOS_NUM  SP_ALZHDMTA  SP_CHF  \
0                       1.0                1.0            0       0   
1                       1.0                1.0            0       0   
2                       1.0                1.0            0       0   
3                       1.0                1.0            0       0   
4                       1.0                1.0            0       0   

   SP_CHRNKIDN  SP_CNCR  SP_COPD  SP_DEPRESSN  SP_DIABETES  SP_ISCHMCHT  \
0            0        0        0            0            0            0   
1            0        0        0      

# Use Models on the Data and Determine How Good the Model Is

In [17]:
y = df_train['AdverseOpioidEvent']
X = df_train.drop('AdverseOpioidEvent', axis=1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
# Use a random forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

rfc = RandomForestClassifier(n_estimators = 200)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
print(classification_report(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     14245
           1       0.91      0.47      0.62      1169

    accuracy                           0.96     15414
   macro avg       0.94      0.73      0.80     15414
weighted avg       0.95      0.96      0.95     15414



In [19]:
# Use a perceptron-based Neural Network
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(48, 96, 12, 12), max_iter=300)
mlp.fit(X_train, y_train)
pred_mlp = mlp.predict(X_test)
print(classification_report(y_test, pred_mlp))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96     14245
           1       0.54      0.47      0.50      1169

    accuracy                           0.93     15414
   macro avg       0.75      0.72      0.73     15414
weighted avg       0.93      0.93      0.93     15414



In [20]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(X_train, y_train)
pred_clf = clf.predict(X_test)
print(classification_report(y_test, pred_clf))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     14245
           1       1.00      0.01      0.01      1169

    accuracy                           0.92     15414
   macro avg       0.96      0.50      0.49     15414
weighted avg       0.93      0.92      0.89     15414



In [21]:
from sklearn.linear_model import LogisticRegression
log = LogisticRegression(random_state=0, solver='lbfgs', max_iter=500)
log.fit(X_train, y_train)
pred_log = log.predict(X_test)
print(classification_report(y_test, pred_log))

              precision    recall  f1-score   support

           0       0.93      0.99      0.96     14245
           1       0.57      0.09      0.16      1169

    accuracy                           0.93     15414
   macro avg       0.75      0.54      0.56     15414
weighted avg       0.90      0.93      0.90     15414



In [22]:
from sklearn.ensemble import GradientBoostingClassifier
xgb = GradientBoostingClassifier()
xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)
print(classification_report(y_test, pred_xgb))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97     14245
           1       0.90      0.28      0.42      1169

    accuracy                           0.94     15414
   macro avg       0.92      0.64      0.70     15414
weighted avg       0.94      0.94      0.93     15414



In [23]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, pred_rfc))
print(confusion_matrix(y_test, pred_mlp))
print(confusion_matrix(y_test, pred_clf))
print(confusion_matrix(y_test, pred_log))
print(confusion_matrix(y_test, pred_xgb))

[[14193    52]
 [  622   547]]
[[13766   479]
 [  616   553]]
[[14245     0]
 [ 1163     6]]
[[14164    81]
 [ 1062   107]]
[[14209    36]
 [  846   323]]
