In [1]:
# Dependencies

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
%matplotlib inline

In [2]:
# Import modules for feature engineering and modelling
from sklearn.model_selection import train_test_split  # not used here, we use <2015 and >=2015 to split data
# from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
# from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion  # not sure which one to choose, the featureunion or the columntransformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression 

from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import PCA
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
# for integer encoding using sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV

# Fixed random seeds, kept in one place so stochastic steps can be reproduced
seed1 = 101
seed2 = 67
seed3 = 42

In [3]:
# Read the four raw training CSV files (provider, beneficiary, inpatient and
# outpatient tables) into pandas DataFrames, in that order.
(
    Healthcare_Provider_raw,
    Healthcare_Beneficiary_raw,
    Healthcare_Inpatient_raw,
    Healthcare_Outpatient_raw,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/Train-1542865627584.csv",
        "data/Train_Beneficiarydata-1542865627584.csv",
        "data/Train_Inpatientdata-1542865627584.csv",
        "data/Train_Outpatientdata-1542865627584.csv",
    )
)

In [4]:
# Tag every claim with its origin before the two tables are stacked:
# In_Out = 1 for inpatient claims, In_Out = 0 for outpatient claims.

for claims_frame, in_out_flag in (
    (Healthcare_Inpatient_raw, 1),
    (Healthcare_Outpatient_raw, 0),
):
    claims_frame['In_Out'] = in_out_flag

In [5]:
# Stack the inpatient and outpatient claims row-wise into one claim-level table.
concat_df = pd.concat(
    [Healthcare_Inpatient_raw, Healthcare_Outpatient_raw],
    axis=0,
)

# Left joins keep every claim row: first attach the beneficiary columns via
# 'BeneID', then the provider columns via 'Provider'.
merge_bene_df = concat_df.merge(Healthcare_Beneficiary_raw, how='left', on='BeneID')

# Exact duplicate rows are removed from the fully merged table.
merge_provider_df = (
    merge_bene_df
    .merge(Healthcare_Provider_raw, how='left', on='Provider')
    .drop_duplicates()
)
merge_provider_df

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,...,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,...,1,1,2,1,1,36000,3204,60,70,Yes
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,...,1,1,2,1,1,36000,3204,60,70,No
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,...,1,1,2,1,1,36000,3204,60,70,No
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,...,1,2,2,1,1,5000,1068,250,320,No
4,BENE11014,CLM63689,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,...,2,1,2,2,2,21260,2136,120,100,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558206,BENE159198,CLM510792,2009-08-06,2009-08-06,PRV53699,800,PHY364188,PHY364188,PHY385752,,...,1,2,2,1,2,0,0,5470,1870,No
558207,BENE159198,CLM551294,2009-08-29,2009-08-29,PRV53702,400,PHY423019,PHY332284,,,...,1,2,2,1,2,0,0,5470,1870,No
558208,BENE159198,CLM596444,2009-09-24,2009-09-24,PRV53676,60,PHY361063,,,,...,1,2,2,1,2,0,0,5470,1870,No
558209,BENE159198,CLM636992,2009-10-18,2009-10-18,PRV53689,70,PHY403198,,PHY419379,,...,1,2,2,1,2,0,0,5470,1870,No


In [6]:
# Separate the merged columns into target, numeric, datetime and categorical groups

target = ['PotentialFraud']
colnames = merge_provider_df.columns.to_list()

# Numeric columns are read straight from the dtypes. This avoids computing
# summary statistics over the whole frame just to obtain column names, and it
# yields an empty list (instead of every column) if no numeric column exists.
num_col_list = merge_provider_df.select_dtypes(include='number').columns.tolist()

# Columns that are parsed as dates later on.
# NOTE(review): 'AdmissionDt' also appears to hold dates (e.g. 2009-04-12 in the
# merged output) but is not listed here, so it ends up in cat_col_list and is
# label-encoded instead of parsed. Confirm whether that is intentional.
datetime_col_list = ['ClaimStartDt', 'ClaimEndDt', 'DischargeDt', 'DOB', 'DOD']

# Everything that is neither numeric, target nor datetime is categorical.
# Filtering colnames (rather than taking a set difference) keeps the original
# column order, so the list is identical on every run instead of depending on
# set iteration order.
_non_categorical = set(num_col_list) | set(target) | set(datetime_col_list)
cat_col_list = [col for col in colnames if col not in _non_categorical]

In [7]:
# Work on a copy of the merged data so merge_provider_df keeps its raw values
# while every column here is converted towards a numeric representation.
df_num = merge_provider_df.copy()

# Parse the date columns from strings into pandas datetimes.
for datetime_col_name in datetime_col_list:
    df_num[datetime_col_name] = pd.to_datetime(df_num[datetime_col_name])


# Label-encode every categorical (object) column to integer codes.
# The same encoder object is refit for each column, which overwrites its
# previous mapping. The fitted classes are therefore stored per column in
# label_classes, so a code can be turned back into its original value with
# label_classes[column][code] and the mapping can be inspected or reused later.
# Missing values receive an integer code of their own (visible in the output
# below, where empty physician fields share one code).
le = LabelEncoder()
label_classes = {}

for cat_col_name in cat_col_list:
    df_num[cat_col_name] = le.fit_transform(df_num[cat_col_name])
    label_classes[cat_col_name] = le.classes_

# Binarise the target: 'Yes' (or an already-encoded 1) -> 1, anything else -> 0.
df_num['PotentialFraud'] = df_num['PotentialFraud'].map(lambda x: 1 if (x == 'Yes' or x == 1) else 0)

df_num.head()

Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,...,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud
0,9381,299052,2009-04-12,2009-04-18,3928,26000,53275,35315,46457,134,...,1,1,2,1,1,36000,3204,60,70,1
1,9381,471344,2009-08-31,2009-09-02,3923,5000,4951,2132,46457,275,...,1,1,2,1,1,36000,3204,60,70,0
2,9381,491764,2009-09-17,2009-09-20,4034,5000,40843,35315,5201,292,...,1,1,2,1,1,36000,3204,60,70,0
3,9489,226452,2009-02-14,2009-02-22,1125,5000,39020,23575,14717,77,...,1,2,2,1,1,5000,1068,250,320,0
4,9518,450337,2009-08-13,2009-08-30,4490,10000,45475,25118,46457,257,...,2,1,2,2,2,21260,2136,120,100,0
