In [1]:
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [32]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler

In [38]:
from sklearn.compose import ColumnTransformer

In [42]:
import numpy as np

### Load and merge data

In [2]:
# load the data from csv to pandas dataframe
Provider_raw = pd.read_csv("data/Train-1542865627584.csv")
Beneficiary_raw = pd.read_csv("data/Train_Beneficiarydata-1542865627584.csv")
Inpatient_raw = pd.read_csv("data/Train_Inpatientdata-1542865627584.csv")
Outpatient_raw = pd.read_csv("data/Train_Outpatientdata-1542865627584.csv")

In [3]:
# Assign inpatient data as 1, outpatient data as 0
Inpatient_raw['In_Out']=1
Outpatient_raw['In_Out']=0

# union/concat the inpatient and outpatient data
concat_df=pd.concat([Inpatient_raw,Outpatient_raw],axis=0)

# left join tables
merge_bene_df=concat_df.merge(Beneficiary_raw, on='BeneID', how='left')
merge_provider_df=merge_bene_df.merge(Provider_raw, on = 'Provider', how ='left')
merge_provider_df.drop_duplicates(inplace=True)

# remove columns with all null values
merge_provider_df.dropna(axis=1, how='all',inplace=True)

# drop ClaimID since it behaves as index
merge_provider_df.drop('ClaimID',axis=1, inplace=True)

# check the final merged dataframe
merge_provider_df

Unnamed: 0,BeneID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,...,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud
0,BENE11001,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,7866,...,1,1,2,1,1,36000,3204,60,70,Yes
1,BENE11001,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,6186,...,1,1,2,1,1,36000,3204,60,70,No
2,BENE11001,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,29590,...,1,1,2,1,1,36000,3204,60,70,No
3,BENE11011,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,431,...,1,2,2,1,1,5000,1068,250,320,No
4,BENE11014,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,78321,...,2,1,2,2,2,21260,2136,120,100,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558206,BENE159198,2009-08-06,2009-08-06,PRV53699,800,PHY364188,PHY364188,PHY385752,,,...,1,2,2,1,2,0,0,5470,1870,No
558207,BENE159198,2009-08-29,2009-08-29,PRV53702,400,PHY423019,PHY332284,,,,...,1,2,2,1,2,0,0,5470,1870,No
558208,BENE159198,2009-09-24,2009-09-24,PRV53676,60,PHY361063,,,,,...,1,2,2,1,2,0,0,5470,1870,No
558209,BENE159198,2009-10-18,2009-10-18,PRV53689,70,PHY403198,,PHY419379,,,...,1,2,2,1,2,0,0,5470,1870,No


## label 1 as fraud, 0 as genuine

In [98]:
# get a copy of the merged data, prepare change all to numerical
df_num=merge_provider_df.copy(deep=True)

# transform target to binary code

df_num['PotentialFraud']=df_num['PotentialFraud'].map(lambda x:1 if (x=='Yes' or x==1) else 0)

df_num.head()

Unnamed: 0,BeneID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,...,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud
0,BENE11001,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,7866,...,1,1,2,1,1,36000,3204,60,70,1
1,BENE11001,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,6186,...,1,1,2,1,1,36000,3204,60,70,0
2,BENE11001,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,29590,...,1,1,2,1,1,36000,3204,60,70,0
3,BENE11011,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,431,...,1,2,2,1,1,5000,1068,250,320,0
4,BENE11014,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,78321,...,2,1,2,2,2,21260,2136,120,100,0


In [99]:
df_num.isna().sum()

BeneID                                  0
ClaimStartDt                            0
ClaimEndDt                              0
Provider                                0
InscClaimAmtReimbursed                  0
AttendingPhysician                   1508
OperatingPhysician                 443764
OtherPhysician                     358475
AdmissionDt                        517737
ClmAdmitDiagnosisCode              412312
DeductibleAmtPaid                     899
DischargeDt                        517737
DiagnosisGroupCode                 517737
ClmDiagnosisCode_1                  10453
ClmDiagnosisCode_2                 195606
ClmDiagnosisCode_3                 315156
ClmDiagnosisCode_4                 393675
ClmDiagnosisCode_5                 446287
ClmDiagnosisCode_6                 473819
ClmDiagnosisCode_7                 492034
ClmDiagnosisCode_8                 504767
ClmDiagnosisCode_9                 516396
ClmDiagnosisCode_10                553201
ClmProcedureCode_1                

In [100]:
# seperate columns into target, num, datetime, categorical
target=['PotentialFraud','Provider']
colnames=df_num.columns.to_list()

datetime_col_list=['ClaimStartDt','ClaimEndDt','AdmissionDt','DischargeDt','DOB','DOD']

diagnosis_code_columns = [x for x in colnames if 'ClmDiag' in x]
procedure_code_columns = [x for x in colnames if 'ClmProc' in x]
physican_columns = [x for x in colnames if 'Physician' in x]
Chronic_columns = [x for x in colnames if 'ChronicCond' in x]
Amt_columns=[x for x in colnames if 'Amt' in x]
NoOfMonths_columns=[x for x in colnames if 'NoOfMonth' in x]

# Datetime Columns

In [101]:
# change date column datatype to datetime
for datetime_col_name in datetime_col_list:
    df_num[datetime_col_name]=pd.to_datetime(df_num[datetime_col_name])

# Numerical Columns

In [102]:
numerical_columns=Amt_columns+NoOfMonths_columns+['In_Out']

df_num[numerical_columns].agg(['size','count','nunique','min','max']).T

Unnamed: 0,size,count,nunique,min,max
InscClaimAmtReimbursed,558211.0,558211.0,438.0,0.0,125000.0
DeductibleAmtPaid,558211.0,557312.0,17.0,0.0,1068.0
IPAnnualReimbursementAmt,558211.0,558211.0,3004.0,-8000.0,161470.0
IPAnnualDeductibleAmt,558211.0,558211.0,147.0,0.0,38272.0
OPAnnualReimbursementAmt,558211.0,558211.0,2078.0,-70.0,102960.0
OPAnnualDeductibleAmt,558211.0,558211.0,789.0,0.0,13840.0
NoOfMonths_PartACov,558211.0,558211.0,13.0,0.0,12.0
NoOfMonths_PartBCov,558211.0,558211.0,13.0,0.0,12.0
In_Out,558211.0,558211.0,2.0,0.0,1.0


## Binary Categorical Columns

In [103]:
cat_col_list=list(set(colnames)-set(numerical_columns)-set(target)-set(datetime_col_list))
cat_col_list.sort()
cat_nunique=df_num[cat_col_list].nunique()
cat_binary_list=list(cat_nunique[cat_nunique==2].index)

df_num[cat_binary_list].agg(['size','count','nunique','min','max']).T

Unnamed: 0,size,count,nunique,min,max
ChronicCond_Alzheimer,558211,558211,2,1,2
ChronicCond_Cancer,558211,558211,2,1,2
ChronicCond_Depression,558211,558211,2,1,2
ChronicCond_Diabetes,558211,558211,2,1,2
ChronicCond_Heartfailure,558211,558211,2,1,2
ChronicCond_IschemicHeart,558211,558211,2,1,2
ChronicCond_KidneyDisease,558211,558211,2,1,2
ChronicCond_ObstrPulmonary,558211,558211,2,1,2
ChronicCond_Osteoporasis,558211,558211,2,1,2
ChronicCond_rheumatoidarthritis,558211,558211,2,1,2


## Categorical Columns (to be transformed by dummy)

In [104]:
cat_ohe_list=list(cat_nunique[cat_nunique!=2].index)

df_num[cat_ohe_list].agg(['size','count','nunique','min','max']).T

  df_num[cat_ohe_list].agg(['size','count','nunique','min','max']).T


Unnamed: 0,size,count,nunique,min,max
AttendingPhysician,558211.0,556703.0,82063.0,,
BeneID,558211.0,558211.0,138556.0,BENE100000,BENE99999
ClmAdmitDiagnosisCode,558211.0,145899.0,4098.0,,
ClmDiagnosisCode_1,558211.0,547758.0,10450.0,,
ClmDiagnosisCode_10,558211.0,5010.0,1158.0,,
ClmDiagnosisCode_2,558211.0,362605.0,5300.0,,
ClmDiagnosisCode_3,558211.0,243055.0,4756.0,,
ClmDiagnosisCode_4,558211.0,164536.0,4359.0,,
ClmDiagnosisCode_5,558211.0,111924.0,3970.0,,
ClmDiagnosisCode_6,558211.0,84392.0,3607.0,,


# Transformation Classes

In [105]:
class Freq_Transformer(object):
    def __init__(self):
        self.freq={}
    
    def fit(self, X, y=None):
        self.cat_colnames=X.columns
        for column_name in self.cat_colnames:
            self.freq[column_name] = X[column_name].value_counts()
    
    def transform(self, X, y=None):
        df = pd.DataFrame()
        for column_name in self.cat_colnames:
            df[column_name + '_freq']=X[column_name].map(self.freq[column_name])
        return df.fillna(0)
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)    

In [106]:
class Binary_Transformer(object):
    def __init__(self):
        self.freq={}
    
    def fit(self, X, y=None):
        self.cat_colnames=X.columns
        df=pd.get_dummies(X[self.cat_colnames], 
                          prefix={i for i in self.cat_colnames}, 
                          columns=self.cat_colnames, 
                          drop_first=True)
    
    def transform(self, X, y=None):
        df=pd.get_dummies(X[self.cat_colnames], 
                          prefix={i for i in self.cat_colnames}, 
                          columns=self.cat_colnames, 
                          drop_first=True)
        return df.fillna(0)
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)    

In [107]:
bi_pipe = Pipeline([('bixf',Binary_Transformer())])
test_bi=bi_pipe.fit_transform(df_num[cat_binary_list])
test_bi

Unnamed: 0,ChronicCond_Cancer_2,ChronicCond_Diabetes_2,ChronicCond_KidneyDisease_2,ChronicCond_rheumatoidarthritis_2,ChronicCond_IschemicHeart_2,RenalDiseaseIndicator_2,Gender_2,ChronicCond_Osteoporasis_2,ChronicCond_Heartfailure_2,ChronicCond_ObstrPulmonary_2,ChronicCond_Depression_2,ChronicCond_Alzheimer_2,ChronicCond_stroke_Y
0,0,1,0,0,1,0,0,1,1,0,0,0,0
1,0,1,0,0,1,0,0,1,1,0,0,0,0
2,0,1,0,0,1,0,0,1,1,0,0,0,0
3,1,1,0,0,0,1,0,1,1,0,0,1,0
4,1,1,0,1,0,0,0,0,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
558206,0,1,0,0,0,1,1,1,1,0,1,1,0
558207,0,1,0,0,0,1,1,1,1,0,1,1,0
558208,0,1,0,0,0,1,1,1,1,0,1,1,0
558209,0,1,0,0,0,1,1,1,1,0,1,1,0


In [108]:
class OHE_Transformer(object):
    def __init__(self):
        self.dummy = {}
    
    def fit(self, X, y=None):
        ohe = OneHotEncoder(categories='auto', drop='if_binary',sparse=False, max_categories=15)
        ohe_df = pd.DataFrame(ohe.fit_transform(X[cat_dummy_list]))
        ohe_df.columns=ohe.get_feature_names_out(cat_dummy_list)
        self.columns=ohe_df.columns
        
    def transform(self,X,y=None):
        ohe = OneHotEncoder(categories='auto', drop='if_binary',sparse=False, max_categories=15)
        ohe_df = pd.DataFrame(ohe.fit_transform(X[cat_dummy_list]))
        ohe_df.columns=ohe.get_feature_names_out(cat_dummy_list)
        df=pd.DataFrame(columns=ohe_df.columns)
        common_col_list=list(set(self.columns).intersection(ohe_df.columns))
        df=pd.concat([df,ohe_df[common_col_list]],axis=0)
        return df.fillna(-1)
    
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

In [109]:
# One Hot Encoder Pipeline

ohe_pipe = Pipeline([('ohexf',OHE_Transformer())])
test_dummy=ohe_pipe.fit_transform(df_num[cat_dummy_list])
test_dummy

Unnamed: 0,AttendingPhysician_PHY314027,AttendingPhysician_PHY327046,AttendingPhysician_PHY330576,AttendingPhysician_PHY337425,AttendingPhysician_PHY338032,AttendingPhysician_PHY341578,AttendingPhysician_PHY347064,AttendingPhysician_PHY350277,AttendingPhysician_PHY351121,AttendingPhysician_PHY357120,...,State_23,State_31,State_33,State_34,State_36,State_39,State_44,State_45,State_49,State_infrequent_sklearn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558206,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
558207,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
558208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
558209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Preprocessor

In [110]:
# Combine the above pipelines
preprocessor = ColumnTransformer([
    ('bi', bi_pipe,cat_binary_list),
    ('ohe',ohe_pipe,cat_dummy_list)
])

In [111]:
model= Pipeline([
    ('preprocessor', preprocessor),
])

In [112]:
temp_df=model.fit_transform(df_num)

In [113]:
len(temp_df[1]),len(temp_df)

(354, 558211)

In [114]:
temp_df

array([[0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [10]:
class DateTransform(BaseEstimator, TransformerMixin):
    # initializer 
    def __init__(self, start, end, newColumn):
        # save the features list internally in the class
        self.start = start
        self.end = end
        self.newColumn = newColumn
    
    def fit(self, X, y=None):
        return self
      
    def transform(self, X, y=None):
        X[self.newColumn] = self.convertDateToPeriod(X, self.start, self.end)
        X[self.newColumn] = X[self.newColumn].fillna(0)
        return X
        
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)    
    
    def convertDateToPeriod(self, df, startDate, endDate):
        return (pd.to_datetime(df[endDate]) - pd.to_datetime(df[startDate])).dt.days + 1

In [11]:
Date_Xfer=DateTransform()
Date_Xfer.fit_transform(df_num)

TypeError: DateTransform.__init__() missing 3 required positional arguments: 'start', 'end', and 'newColumn'

In [27]:
from datetime import date, datetime

class AgeTransform(BaseEstimator, TransformerMixin):
    # initializer 
    def __init__(self, dob, dod, ageColumn, deceasedColumn):
        # save the features list internally in the class
        self.dob = dob
        self.dod = dod
        self.ageColumn = ageColumn
        self.deceasedColumn = deceasedColumn
    
    def fit(self, X, y=None):
        return self
      
    def transform(self, X, y=None):
        X[self.ageColumn] = X.apply(lambda x: self.calculateAge(dob = x[self.dob], dod = x[self.dod], calulationDate = '2009-12-01'), axis = 1)
        X[self.deceasedColumn] = X[self.dod].apply(lambda x : 0 if pd.isna(x) else 1)
        return X
        
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)    
    
    def calculateAge(self, dob, dod, calulationDate):
        born = datetime.strptime(dob, "%Y-%m-%d").date()
        if not pd.isna(dod):
            calulationDate = datetime.strptime(dod, "%Y-%m-%d").date()
        else:
            calulationDate = datetime.strptime(calulationDate, "%Y-%m-%d").date()
        return calulationDate.year - born.year - ((calulationDate.month, calulationDate.day) < (born.month, born.day))  

In [30]:
diagnosis_code_columns = ['ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 
           'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10' ]
procedure_code_columns = ['ClmProcedureCode_1', 'ClmProcedureCode_2', 'ClmProcedureCode_3',
       'ClmProcedureCode_4', 'ClmProcedureCode_5', 'ClmProcedureCode_6']

In [66]:
class CodeCountTransform(BaseEstimator, TransformerMixin):
    # initializer 
    def __init__(self, colunmsToCount, newColumn):
        # save the features list internally in the class
        self.colunmsToCount = colunmsToCount
        self.newColumn = newColumn
        
    def fit(self, X, y=None):
        return self
      
    def transform(self, X, y=None):
        X[self.newColumn] = self.countCodeNumber(X, self.colunmsToCount)
        return X
        
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)    
    
    def countCodeNumber(self, df, colunmsToCount):
        df_codes = df.loc[:, colunmsToCount]
        codecount = df_codes.notnull().sum(axis=1)
        return codecount

In [83]:
class CodeFrequencyGroupTransform(BaseEstimator, TransformerMixin):
    '''code_columns: the column list containing the claim codes
       new_column: list of new_columns
       '''
    def __init__(self, code_columns, new_columns_prefix, high, medium_high, medium, low):
        # save the features list internally in the class
        self.code_columns = code_columns
        self.new_columns_prefix = new_columns_prefix
        self.high = high
        self.medium_high = medium_high
        self.medium = medium
        self.low = low
    
    def fit(self, X, y=None):
        self.frequency_counts = self.getTotalCodeCounts(X, self.code_columns)
        self.frequency_groups = self.getFrequencyGroups(self.frequency_counts, self.high, self.medium_high, 
                                                        self.medium, self.low)
        
    def transform(self, X, y=None):
        X[self.new_columns_prefix+'HighFreqCount'] = self.codeForfrequencyGroupCounts(X, self.code_columns, self.frequency_groups[0])
        X[self.new_columns_prefix+'MediumHighFreqCount'] = self.codeForfrequencyGroupCounts(X, self.code_columns, self.frequency_groups[1])
        X[self.new_columns_prefix+'MediumFreqCount'] = self.codeForfrequencyGroupCounts(X, self.code_columns, self.frequency_groups[2])
        X[self.new_columns_prefix+'LowFreqCount'] = self.codeForfrequencyGroupCounts(X, self.code_columns, self.frequency_groups[3])
        X[self.new_columns_prefix+'RareFreqCount'] = self.codeForfrequencyGroupCounts(X, self.code_columns, self.frequency_groups[4])
        return X
        
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)    
    
    def mergeDictionaryWithUpdate(self, dict_1, dict_2):
        for key in dict_2:
            value = dict_2[key]
       
            if key not in dict_1:
                dict_1[key] = value
            else:
                old_value = int(dict_1[key])
                dict_1[key] = old_value + value
        return dict_1    
    
    '''Get total counts of each code'''
    def getTotalCodeCounts(self, df, columns):
        code_counts = {}
        for column in columns:
            value_counts = df[column].value_counts().to_dict()
            code_counts = self.mergeDictionaryWithUpdate(code_counts, value_counts)
        sorted_counts = dict(sorted(code_counts.items(), key=lambda item: item[1], reverse=True))
        return sorted_counts  
    
    def getFrequencyGroups(self, dictionary, high, medium_high, medium, low):
        high_frequency = []
        medium_high_frequency = []
        medium_frequency = []
        low_fequency = []
        rare_frequency = []
        for key in dictionary:
            value = dictionary[key]
            if value >= high:
                high_frequency.append(key)
            elif value < high and value >= medium_high:
                medium_high_frequency.append(key)
            elif value < medium_high and value >= medium:
                medium_frequency.append(key)
            elif value < medium and value >= low:
                low_fequency.append(key)
            else:
                rare_frequency.append(key)
        results = []
        results.extend((high_frequency, medium_high_frequency, medium_frequency, low_fequency, rare_frequency))
        return results

    def codeForfrequencyGroupCounts(self, df, columns, frequency_group):
        df_codes = df.loc[:, columns]
        codecount = df_codes.isin(frequency_group).sum(axis=1)
        return codecount

In [98]:
class Top15OneHotTransform(BaseEstimator, TransformerMixin):
    def __init__(self, column_list, top_15_codes, new_column_prefix):
        self.column_list = column_list
        self.top_15_codes = top_15_codes
        self.new_column_prefix = new_column_prefix
        self.codes_df = pd.DataFrame()
    
    def fit(self, X, y=None):
        #change codes not in top15 with 'Other'
        codes_df = X[self.column_list]
        for column in self.column_list:
            codes_df[column] =  codes_df[column].apply(lambda x : 'Other' if x not in self.top_15_codes else x )
        self.codes_df = codes_df    
        
    def transform(self, X, y=None):
        for code in self.top_15_codes:
            column_name = self.new_column_prefix + code
            X[column_name] = self.codeForNHotCounts(self.codes_df, self.column_list, code)
        return X
        
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)  
    
    def codeForNHotCounts(self, df, columns, code):
        df_codes = df.loc[:, columns]
        codecount = (df_codes==code).sum(axis=1)
        return codecount
    

In [None]:
# import category_encoders as ce
# class TargetCodingTransform(BaseEstimator, TransformerMixin):
#     def __init__(self, column_list, new_column):
#         self.column_list = column_list
#         self.new_column = new_column
#         self.codes_df = pd.DataFrame()
    
#     def fit(self, X, y=None):
#         #Change NA to -9999 and change PotentialFraud to 0/1
#         codes_df = X[self.column_list]
#         for column in self.column_list:
#             codes_df[column] = codes_df[column].apply(lambda x: '-9999' if pd.isna(x) else x)
#         codes_df['PotentialFraud'] = X['PotentialFraud'].apply(lambda x: 1 if x == 'Yes' else 0)     
#         self.codes_df = codes_df
        
        
#     def transform(self, X, y=None):
        
#         return X
        
#     def fit_transform(self, X, y=None):
#         self.fit(X)
#         return self.transform(X)  
    
    

In [142]:
class ProviderLevelAggregateTransform(BaseEstimator, TransformerMixin):
    def __init__(self, fraction_column_list):
        self.fraction_column_list = fraction_column_list
    
    def fit(self, X, y=None):
        #change codes not in top15 with 'Other'
        return self 
        
    def transform(self, X, y=None):
        agg_df = pd.DataFrame()
        
        agg_df = X.groupby('Provider').agg(     MedianAge = ('Age', 'median'),
                                                MeanInscClaimAmtReimbursed = ('InscClaimAmtReimbursed', 'mean'),
                                                MaxInscClaimAmtReimbursed = ('InscClaimAmtReimbursed', 'max'),
                                                TotalInscClaimAmtReimbursed = ('InscClaimAmtReimbursed', 'sum'),
                                                MeanDeductibleAmtPaid = ('DeductibleAmtPaid', 'mean'),
                                                MaxDeductibleAmtPaid = ('DeductibleAmtPaid', 'max'),
                                                MeanNumMonth_PartACov = ('NoOfMonths_PartACov','mean'),
                                                MeanNumMonth_PartBCov = ('NoOfMonths_PartBCov','mean'),
                                                MeanOPAnnualReimbursementAmt = ('OPAnnualReimbursementAmt', 'mean'),
                                                MaxOPAnnualReimbursementAmt = ('OPAnnualReimbursementAmt', 'max'),
                                                TotalOPAnnualReimbursementAmt = ('OPAnnualReimbursementAmt', 'sum'),
                                                MeanOPAnnualDeductibleAmt = ('OPAnnualDeductibleAmt', 'mean'),
                                                MaxOPAnnualDeductibleAmt = ('OPAnnualDeductibleAmt', 'max'),
                                                TotalOPAnnualDeductibleAmt = ('OPAnnualDeductibleAmt', 'sum'),
                                                MeanClaimPeriods = ('ClaimPeriod', 'mean'),
                                                MaxHospitalDays = ('HospitalDays', 'max'), 
                                                MedianHospitalDays = ('HospitalDays', 'median'),
                                                MeanHospitalDays = ('HospitalDays', 'mean'),
                                                MaxDiagCodeNumPerClaim = ('DiagCodeCounts', 'max'),
                                                MeanDiagCodeNumPerClaim = ('DiagCodeCounts', 'mean'),
                                                MaxProcCodeNumPerClaim = ('ProcCodeCounts', 'max'),
                                                MeanProcCodeNumPerClaim = ('ProcCodeCounts', 'mean'),
                                                TotalDiagCodeNum = ('DiagCodeCounts', 'sum'),
                                                TotalProcCodeNum = ('ProcCodeCounts', 'sum'),
                                                MeanHighFreqDiagCodeNumPerClaim = ('ClmDiagHighFreqCount', 'mean'),
                                                MeanMediumHighFreqDiagCodeNumPerClaim = ('ClmDiagMediumHighFreqCount', 'mean'),
                                                MeanMediumFreqDiagCodeNumPerClaim = ('ClmDiagMediumFreqCount', 'mean'),
                                                MeanLowFreqDiagCodeNumPerClaim = ('ClmDiagLowFreqCount', 'mean'),
                                                MeanRareFreqDiagCodeNumPerClaim = ('ClmDiagRareFreqCount', 'mean'),
                                                TotalHighFreqProcCodeNumPerClaim = ('ClmProcHighFreqCount', 'sum'),
                                                TotalMediumHighFreqProcCodeNumPerClaim = ('ClmProcMediumHighFreqCount', 'sum'),
                                                TotalMediumFreqProcCodeNumPerClaim = ('ClmProcMediumFreqCount', 'sum'),
                                                TotalLowFreqProcCodeNumPerClaim = ('ClmProcLowFreqCount', 'sum'),
                                                TotalRareFreqProcCodeNumPerClaim = ('ClmProcRareFreqCount', 'sum'),
                                                totalDiagCode_4019 = ('DiagCode_4019', 'sum'),      
                                                totalDiagCode_25000 = ('DiagCode_25000','sum'),
                                                totalDiagCode_2724 = ('DiagCode_2724', 'sum'),
                                                totalDiagCode_V5869 = ('DiagCode_V5869', 'sum'),
                                                totalDiagCode_4011 = ('DiagCode_4011', 'sum'),
                                                totalDiagCode_42731 =  ('DiagCode_42731', 'sum'),
                                                totalDiagCode_V5861 = ('DiagCode_V5861', 'sum'),
                                                totalDiagCode_2720 = ('DiagCode_2720', 'sum'),
                                                totalDiagCode_2449 = ('DiagCode_2449', 'sum'),
                                                totalDiagCode_4280 = ('DiagCode_4280', 'sum'),
                                                totalDiagCode_53081 = ('DiagCode_53081', 'sum'),
                                                totalDiagCode_41401 = ('DiagCode_41401', 'sum'),
                                                totalDiagCode_496 = ('DiagCode_496', 'sum'),
                                                totalDiagCode_2589 = ('DiagCode_2859', 'sum'),
                                                totalDiagCode_41400 = ('DiagCode_41400', 'sum'),
                                                totalDiagCode_Other = ('DiagCode_Other', 'sum'),
                                                totalProcCode_4019 = ('ProcCode_4019.0', 'sum'),
                                                totalProcCode_9904 = ('ProcCode_9904.0', 'sum'),
                                                totalProcCode_2724 = ('ProcCode_2724.0', 'sum'),
                                                totalProcCode_8154 = ('ProcCode_8154.0', 'sum'),
                                                totalProcCode_66 = ('ProcCode_66.0', 'sum'),
                                                totalProcCode_3893 = ('ProcCode_3893.0', 'sum'),
                                                totalProcCode_3995 = ('ProcCode_3995.0', 'sum'),
                                                totalProcCode_4516 = ('ProcCode_4516.0', 'sum'),
                                                totalProcCode_3722 = ('ProcCode_3722.0', 'sum'),
                                                totalProcCode_8151 = ('ProcCode_8151.0', 'sum'),
                                                totalProcCode_8872 = ('ProcCode_8872.0', 'sum'),
                                                totalProcCode_9671 = ('ProcCode_9671.0', 'sum'),
                                                totalProcCode_4513 = ('ProcCode_4513.0', 'sum'),
                                                totalProcCode_5849 = ('ProcCode_5849.0', 'sum'),
                                                totalProcCode_9390 = ('ProcCode_9390.0', 'sum'),
                                                totalProcCode_Other = ('ProcCode_Other', 'sum'))
        # Caculate aggregted fraction 
        for column in self.fraction_column_list:
            new_colunm = column + 'Frac_'
            agg_df[new_colunm] = (X.groupby('Provider').apply(lambda x: (x[column] == 1).sum()/x[column].count())).values                             
        
        # Total Claims per provider
        agg_df['ClaimNumbers'] = (X.groupby('Provider')[['ClaimID']].count()).values
        agg_df['uniqBeneCount'] = (X.groupby('Provider')[['BeneID']].nunique()).values
                                           
        return agg_df
        
    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)  
    
                                           

In [88]:
train_top_15_diag_codes = ['4019', '25000', '2724', 'V5869', '4011', '42731', 'V5861', '2720', '2449',
 '4280', '53081', '41401', '496', '2859', '41400', 'Other']
train_top_15_proc_codes = ['4019.0', '9904.0', '2724.0', '8154.0', '66.0', '3893.0', '3995.0', '4516.0',
 '3722.0', '8151.0', '8872.0', '9671.0','4513.0','5849.0', '9390.0', 'Other']

In [129]:
fraction_column_list= ['ChronicCond_Alzheimer','ChronicCond_Heartfailure','ChronicCond_KidneyDisease','ChronicCond_Cancer',
'ChronicCond_ObstrPulmonary','ChronicCond_Depression','ChronicCond_Diabetes','ChronicCond_IschemicHeart','ChronicCond_Osteoporasis',
'ChronicCond_rheumatoidarthritis','ChronicCond_stroke', 'RenalDiseaseIndicator', 'Deceased', 'Gender', 'Race']

In [None]:
steps = [('claim_period_transform', DateTransform(start='ClaimStartDt', end='ClaimEndDt', newColumn='ClaimPeriod')), 
         ('hospital_period_transform', DateTransform(start='AdmissionDt', end='DischargeDt', newColumn='HospitalDays')),
         ('age_transform', AgeTransform(dob='DOB', dod='DOD', ageColumn='Age', deceasedColumn='Deceased')),
         
         ('diag_code_count', CodeCountTransform(colunmsToCount = diagnosis_code_columns, newColumn='DiagCodeCounts')),
         ('proc_code_count', CodeCountTransform(colunmsToCount = procedure_code_columns, newColumn='ProcCodeCounts')),
         
         ('diag_code_frequency_group', CodeFrequencyGroupTransform(code_columns=diagnosis_code_columns, 
                                                                   new_columns_prefix='ClmDiag',
                                                                  high=10000, medium_high=5000, medium=800, low=500)),
         ('proc_code_frequency_group', CodeFrequencyGroupTransform(code_columns=procedure_code_columns, 
                                                                   new_columns_prefix='ClmProc',
                                                                  high=500, medium_high=100, medium=10, low=5)),
         
         ('diag_code_top15_onehot', Top15OneHotTransform(column_list=diagnosis_code_columns, 
                                                         top_15_codes=train_top_15_diag_codes,
                                                         new_column_prefix='DiagCode_')),
         ('proc_code_top15_onehot', Top15OneHotTransform(column_list=procedure_code_columns, 
                                                         top_15_codes=train_top_15_proc_codes,
                                                         new_column_prefix='ProcCode_')),
         
         ('aggregation', ProviderLevelAggregateTransform(fraction_column_list=fraction_column_list))]

pipe = Pipeline(steps)
pipe.fit_transform(merge_provider_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  codes_df[column] =  codes_df[column].apply(lambda x : 'Other' if x not in self.top_15_codes else x )
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  codes_df[column] =  codes_df[column].apply(lambda x : 'Other' if x not in self.top_15_codes else x )


In [None]:
drop_column_list = []

In [141]:
merge_provider_df.groupby('Provider')[['BeneID']].nunique()

Unnamed: 0_level_0,BeneID
Provider,Unnamed: 1_level_1
PRV51001,24
PRV51003,117
PRV51004,138
PRV51005,495
PRV51007,58
...,...
PRV57759,24
PRV57760,9
PRV57761,67
PRV57762,1
