In [275]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from tqdm.notebook import tqdm_notebook

import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier, plot_tree 
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_curve, auc , f1_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix, classification_report 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer , StandardScaler

## Read data 

In [276]:
# Provider table: one row per provider with its ID and label.
Train = pd.read_csv("data/Train-1542865627584.csv")

# Beneficiary KYC details such as health conditions and region.
Train_Beneficiarydata = pd.read_csv("data/Train_Beneficiarydata-1542865627584.csv")

# Claims filed for patients who were admitted to a hospital.
Train_Inpatientdata = pd.read_csv("data/Train_Inpatientdata-1542865627584.csv")

# Claims filed for patients who visited a hospital without being admitted.
Train_Outpatientdata = pd.read_csv("data/Train_Outpatientdata-1542865627584.csv")


In [277]:
# Report the (rows, columns) shape of each loaded table.
for _label, _frame in (
    ("Train\n", Train),
    ("Train_Beneficiarydata\n", Train_Beneficiarydata),
    ("Train_Inpatientdata\n", Train_Inpatientdata),
    ("Train_Outpatientdata\n", Train_Outpatientdata),
):
    print(_label, _frame.shape)


Train
 (5410, 2)
Train_Beneficiarydata
 (138556, 25)
Train_Inpatientdata
 (40474, 30)
Train_Outpatientdata
 (517737, 27)


## Functions

In [313]:
def agg_calculation(dataset,
                    groupby_col='',
                    agg_col='',
                    sort_col='',
                    agg_method_lst=[],
                    col_name_lst=[]):
    """Aggregate one column per group and return the result sorted descending.

    Parameters
    ----------
    dataset : DataFrame to aggregate.
    groupby_col : column whose values define the groups.
    agg_col : column the aggregation methods are applied to.
    sort_col : output column (a name from ``col_name_lst``) to sort by,
        largest value first.
    agg_method_lst : aggregation methods handed to ``.agg()``.
    col_name_lst : names assigned to the output columns, group column first.

    Returns
    -------
    DataFrame with the renamed columns, sorted by ``sort_col`` in descending
    order and carrying a fresh 0..n-1 index.
    """
    # The list defaults are only read, never mutated, inside this function.
    summary = dataset.groupby(groupby_col)[agg_col].agg(agg_method_lst).reset_index()
    summary.columns = col_name_lst

    ordered = summary.sort_values(by=[sort_col], ascending=False)
    return ordered.reset_index(drop=True)


In [349]:
def get_top_5_codes(group, col=''):
    """Return the (up to) five most frequent values of ``col`` in ``group``.

    The values are listed most frequent first. Missing values are not
    counted, because ``value_counts`` skips them by default.
    """
    frequencies = group[col].value_counts()
    most_common = frequencies.nlargest(5)
    return most_common.index.tolist()

In [366]:
def top_5_across_columns(df, columns):
    """Return the (up to) five most frequent values pooled over ``columns``.

    Every non-null cell in the given columns counts once. The result lists
    the values by descending count; values with equal counts keep the order
    in which they were first encountered (columns in the given order, rows
    top to bottom).
    """
    tally = {}
    for column in columns:
        for code in df[column]:
            if pd.isnull(code):
                continue
            tally[code] = tally.get(code, 0) + 1

    # sorted() is stable, so ties stay in first-seen order.
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    return [code for code, _ in ranked[:5]]

## Create Features

### Inpatient

In [278]:
# List the inpatient columns, then preview the first rows (head() defaults to 5).
print('inpatient columns:\n', Train_Inpatientdata.columns)
Train_Inpatientdata.head()

inpatient columns:
 Index(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
       'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode',
       'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
       'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
       'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
       'ClmDiagnosisCode_10', 'ClmProcedureCode_1', 'ClmProcedureCode_2',
       'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
       'ClmProcedureCode_6'],
      dtype='object')


Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,...,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6
0,BENE11001,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,...,2724.0,19889.0,5849.0,,,,,,,
1,BENE11001,CLM66048,2009-08-31,2009-09-02,PRV55907,5000,PHY318495,PHY318495,,2009-08-31,...,,,,,7092.0,,,,,
2,BENE11001,CLM68358,2009-09-17,2009-09-20,PRV56046,5000,PHY372395,,PHY324689,2009-09-17,...,,,,,,,,,,
3,BENE11011,CLM38412,2009-02-14,2009-02-22,PRV52405,5000,PHY369659,PHY392961,PHY349768,2009-02-14,...,25062.0,40390.0,4019.0,,331.0,,,,,
4,BENE11014,CLM63689,2009-08-13,2009-08-30,PRV56614,10000,PHY379376,PHY398258,,2009-08-13,...,5119.0,29620.0,20300.0,,3893.0,,,,,


In [291]:
# Check for duplicate claims: ClaimID is unique exactly when the number of
# rows equals the number of distinct ClaimID values.
# (The previous version printed the distinct count twice, so the two numbers
# always matched even if duplicates were present.)
print(len(Train_Inpatientdata))
print(Train_Inpatientdata['ClaimID'].nunique(dropna=False))

40474
40474


#### 1. Num of inpatient claims for each provider

In [322]:
# Number of inpatient claims filed by each provider.
df1 = agg_calculation(
    Train_Inpatientdata,
    groupby_col='Provider',
    agg_col='ClaimID',
    sort_col='ip_claims_num',
    agg_method_lst=['count'],
    col_name_lst=['Provider', 'ip_claims_num'],
)
df1

Unnamed: 0,Provider,ip_claims_num
0,PRV52019,516
1,PRV55462,386
2,PRV54367,322
3,PRV53706,282
4,PRV55209,275
...,...,...
2087,PRV53515,1
2088,PRV53516,1
2089,PRV53600,1
2090,PRV56474,1


#### 2. Num of inpatient patients for each provider

In [321]:
# Number of distinct inpatient beneficiaries seen by each provider.
df2 = agg_calculation(
    Train_Inpatientdata,
    groupby_col='Provider',
    agg_col='BeneID',
    sort_col='ip_patients_num',
    agg_method_lst=['nunique'],
    col_name_lst=['Provider', 'ip_patients_num'],
)

# Start the provider-level feature table from the first two features.
df_merged = df1.merge(df2, how='outer', on='Provider')
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num
0,PRV52019,516,458
1,PRV55462,386,308
2,PRV54367,322,279
3,PRV53706,282,262
4,PRV55209,275,243
...,...,...,...
2087,PRV53515,1,1
2088,PRV53516,1,1
2089,PRV53600,1,1
2090,PRV56474,1,1


#### 3. Total inpatient reimbursement for each provider

In [323]:
# Total reimbursed inpatient claim amount per provider.
df3 = agg_calculation(
    Train_Inpatientdata,
    groupby_col='Provider',
    agg_col='InscClaimAmtReimbursed',
    sort_col='ip_reimbursed',
    agg_method_lst=['sum'],
    col_name_lst=['Provider', 'ip_reimbursed'],
)

# Drop any 'ip_reimbursed' column left by an earlier run of this cell before
# merging, so re-running the cell does not create '_x'/'_y' duplicate columns.
df_merged = (
    df_merged
    .drop(columns=['ip_reimbursed'], errors='ignore')
    .merge(df3, how='outer', on='Provider')
)
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num,ip_reimbursed
0,PRV52019,516,458,5580870
1,PRV55462,386,308,4260100
2,PRV54367,322,279,3040900
3,PRV53706,282,262,2776000
4,PRV55209,275,243,2756100
...,...,...,...,...
2087,PRV53515,1,1,14000
2088,PRV53516,1,1,3000
2089,PRV53600,1,1,4000
2090,PRV56474,1,1,7000


#### 4. Total inpatient deductible amount for each provider

In [324]:
# Total inpatient deductible amount paid per provider.
df4 = agg_calculation(
    Train_Inpatientdata,
    groupby_col='Provider',
    agg_col='DeductibleAmtPaid',
    sort_col='ip_deductible',
    agg_method_lst=['sum'],
    col_name_lst=['Provider', 'ip_deductible'],
)

# Drop any 'ip_deductible' column left by an earlier run of this cell before
# merging, so re-running the cell does not create '_x'/'_y' duplicate columns.
df_merged = (
    df_merged
    .drop(columns=['ip_deductible'], errors='ignore')
    .merge(df4, how='outer', on='Provider')
)
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num,ip_reimbursed,ip_deductible
0,PRV52019,516,458,5580870,536136.0
1,PRV55462,386,308,4260100,405840.0
2,PRV54367,322,279,3040900,334284.0
3,PRV53706,282,262,2776000,296904.0
4,PRV55209,275,243,2756100,289428.0
...,...,...,...,...,...
2087,PRV53515,1,1,14000,1068.0
2088,PRV53516,1,1,3000,1068.0
2089,PRV53600,1,1,4000,1068.0
2090,PRV56474,1,1,7000,1068.0


#### 5. Total claim length for each provider

In [325]:
# Claim duration in days, counting both the start date and the end date.
startdate = pd.to_datetime(Train_Inpatientdata.ClaimStartDt)
enddate = pd.to_datetime(Train_Inpatientdata.ClaimEndDt)
Train_Inpatientdata['Cperiod'] = (enddate - startdate).dt.days + 1

# Total claim duration per provider.
df5 = agg_calculation(
    Train_Inpatientdata,
    groupby_col='Provider',
    agg_col='Cperiod',
    sort_col='ip_cperiod',
    agg_method_lst=['sum'],
    col_name_lst=['Provider', 'ip_cperiod'],
)

# Drop any 'ip_cperiod' column left by an earlier run of this cell before
# merging, so re-running the cell does not create '_x'/'_y' duplicate columns.
df_merged = (
    df_merged
    .drop(columns=['ip_cperiod'], errors='ignore')
    .merge(df5, how='outer', on='Provider')
)
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num,ip_reimbursed,ip_deductible,ip_cperiod
0,PRV52019,516,458,5580870,536136.0,3560
1,PRV55462,386,308,4260100,405840.0,2682
2,PRV54367,322,279,3040900,334284.0,2052
3,PRV53706,282,262,2776000,296904.0,1892
4,PRV55209,275,243,2756100,289428.0,1744
...,...,...,...,...,...,...
2087,PRV53515,1,1,14000,1068.0,6
2088,PRV53516,1,1,3000,1068.0,9
2089,PRV53600,1,1,4000,1068.0,3
2090,PRV56474,1,1,7000,1068.0,13


#### 6. Total hospital stay for each provider

In [326]:
# Hospital stay in days, counting both the admission and the discharge date.
# Note: startdate/enddate are reassigned here to the admission/discharge dates.
startdate = pd.to_datetime(Train_Inpatientdata.AdmissionDt)
enddate = pd.to_datetime(Train_Inpatientdata.DischargeDt)
Train_Inpatientdata['HospitalStay'] = (enddate - startdate).dt.days + 1

# Total hospital stay per provider.
df6 = agg_calculation(
    Train_Inpatientdata,
    groupby_col='Provider',
    agg_col='HospitalStay',
    sort_col='ip_hperiod',
    agg_method_lst=['sum'],
    col_name_lst=['Provider', 'ip_hperiod'],
)

# Drop any 'ip_hperiod' column left by an earlier run of this cell before
# merging, so re-running the cell does not create '_x'/'_y' duplicate columns.
df_merged = (
    df_merged
    .drop(columns=['ip_hperiod'], errors='ignore')
    .merge(df6, how='outer', on='Provider')
)
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num,ip_reimbursed,ip_deductible,ip_cperiod,ip_hperiod
0,PRV52019,516,458,5580870,536136.0,3560,3568
1,PRV55462,386,308,4260100,405840.0,2682,2682
2,PRV54367,322,279,3040900,334284.0,2052,2052
3,PRV53706,282,262,2776000,296904.0,1892,1919
4,PRV55209,275,243,2756100,289428.0,1744,1744
...,...,...,...,...,...,...,...
2087,PRV53515,1,1,14000,1068.0,6,6
2088,PRV53516,1,1,3000,1068.0,9,9
2089,PRV53600,1,1,4000,1068.0,3,3
2090,PRV56474,1,1,7000,1068.0,13,13


#### 7. Calculate a relationship score among the three physician columns for each claim and average it per provider

In [327]:
# Encode which of the three physician columns of a claim hold the same value.
def physician_same(row):
    """Return a 0-4 code describing which physician fields of ``row`` match.

    0: attending == operating == other
    1: attending == operating, operating != other
    2: attending == other, operating differs from both
    3: operating == other, attending differs from both
    4: no pair compares equal

    Comparisons use plain ``==``, so a missing value (NaN) never equals
    anything, including another missing value.
    """
    atten_oper = row['AttendingPhysician'] == row['OperatingPhysician']
    oper_other = row['OperatingPhysician'] == row['OtherPhysician']
    atten_other = row['AttendingPhysician'] == row['OtherPhysician']

    if atten_oper:
        return 0 if oper_other else 1

    if oper_other:
        # attending differs from operating here; 3 unless attending also
        # compares equal to other, which falls through to the catch-all 4.
        return 4 if atten_other else 3

    if atten_other:
        return 2

    return 4


# Score every inpatient claim row by row. physician_same compares the values
# of the three physician columns, so this has to run while those columns
# still hold the physician IDs.
Train_Inpatientdata['phy_same'] = Train_Inpatientdata.apply(physician_same, axis=1)

In [328]:
# Average physician-relationship score per provider.
df7 = agg_calculation(
    Train_Inpatientdata,
    groupby_col='Provider',
    agg_col='phy_same',
    sort_col='ip_phy_same',
    agg_method_lst=['mean'],
    col_name_lst=['Provider', 'ip_phy_same'],
)

# Drop any 'ip_phy_same' column left by an earlier run of this cell before
# merging, so re-running the cell does not create '_x'/'_y' duplicate columns.
df_merged = (
    df_merged
    .drop(columns=['ip_phy_same'], errors='ignore')
    .merge(df7, how='outer', on='Provider')
)
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num,ip_reimbursed,ip_deductible,ip_cperiod,ip_hperiod,ip_phy_same
0,PRV52019,516,458,5580870,536136.0,3560,3568,3.614341
1,PRV55462,386,308,4260100,405840.0,2682,2682,4.000000
2,PRV54367,322,279,3040900,334284.0,2052,2052,2.611801
3,PRV53706,282,262,2776000,296904.0,1892,1919,3.659574
4,PRV55209,275,243,2756100,289428.0,1744,1744,2.330909
...,...,...,...,...,...,...,...,...
2087,PRV53515,1,1,14000,1068.0,6,6,4.000000
2088,PRV53516,1,1,3000,1068.0,9,9,4.000000
2089,PRV53600,1,1,4000,1068.0,3,3,1.000000
2090,PRV56474,1,1,7000,1068.0,13,13,4.000000


#### 8. Number of physician roles (attending, operating, other) filled on a claim, averaged per provider

In [330]:
# Count how many of the three physician roles are filled on each claim
# (1 for every non-null physician column, summed across the row).
#
# The physician ID columns are deliberately left untouched. Overwriting them
# with 0/1 flags made this cell non-repeatable: on a second run no value is
# null any more, so every claim would get a count of 3. It also destroyed the
# IDs that the physician-comparison step relies on.
physician_cols = ['AttendingPhysician', 'OperatingPhysician', 'OtherPhysician']
Train_Inpatientdata['N_Types_Physicians'] = (
    Train_Inpatientdata[physician_cols].notnull().sum(axis=1)
)


In [332]:
# Average number of filled physician roles per claim, for each provider.
df8 = agg_calculation(
    Train_Inpatientdata,
    groupby_col='Provider',
    agg_col='N_Types_Physicians',
    sort_col='ip_N_Types_Physicians',
    agg_method_lst=['mean'],
    col_name_lst=['Provider', 'ip_N_Types_Physicians'],
)

# Drop any 'ip_N_Types_Physicians' column left by an earlier run of this cell
# before merging, so re-running the cell does not create '_x'/'_y' duplicates.
df_merged = (
    df_merged
    .drop(columns=['ip_N_Types_Physicians'], errors='ignore')
    .merge(df8, how='outer', on='Provider')
)
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num,ip_reimbursed,ip_deductible,ip_cperiod,ip_hperiod,ip_phy_same,ip_ClmAdmitDiagnosisCode,ip_N_Types_Physicians
0,PRV52019,516,458,5580870,536136.0,3560,3568,3.614341,226,1.812016
1,PRV55462,386,308,4260100,405840.0,2682,2682,4.000000,181,1.582902
2,PRV54367,322,279,3040900,334284.0,2052,2052,2.611801,157,1.577640
3,PRV53706,282,262,2776000,296904.0,1892,1919,3.659574,163,1.723404
4,PRV55209,275,243,2756100,289428.0,1744,1744,2.330909,151,1.552727
...,...,...,...,...,...,...,...,...,...,...
2087,PRV53515,1,1,14000,1068.0,6,6,4.000000,1,2.000000
2088,PRV53516,1,1,3000,1068.0,9,9,4.000000,1,1.000000
2089,PRV53600,1,1,4000,1068.0,3,3,1.000000,1,2.000000
2090,PRV56474,1,1,7000,1068.0,13,13,4.000000,1,1.000000


#### 9. top 5 ClmAdmitDiagnosisCode for each provider

In [354]:
# For each provider, list its (up to) five most frequent admit-diagnosis codes.
df9 = (
    Train_Inpatientdata
    .groupby('Provider')
    .apply(lambda claims: get_top_5_codes(claims, 'ClmAdmitDiagnosisCode'))
    .reset_index(name='Top_5_ClmAdmitDiagnosisCode')
)
df9

Unnamed: 0,Provider,Top_5_ClmAdmitDiagnosisCode
0,PRV51001,"[29623, 0389, 5362, 41401, 80121]"
1,PRV51003,"[78605, 4280, 42789, 78650, 78701]"
2,PRV51007,"[78097, 29570, 29633]"
3,PRV51008,"[7837, 920]"
4,PRV51011,[78906]
...,...,...
2087,PRV57719,"[68110, 5789, 78703, 4271]"
2088,PRV57728,[42789]
2089,PRV57729,"[486, 43491, 78605, 5990, 41401]"
2090,PRV57732,"[5119, 7907, 71535, V5789, 7242]"


In [355]:
# Attach the top admit-diagnosis codes to the provider feature table.
# Any 'Top_5_ClmAdmitDiagnosisCode' column left by an earlier run of this cell
# is dropped first, so re-running does not create '_x'/'_y' duplicate columns.
df_merged = (
    df_merged
    .drop(columns=['Top_5_ClmAdmitDiagnosisCode'], errors='ignore')
    .merge(df9, how='outer', on='Provider')
)
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num,ip_reimbursed,ip_deductible,ip_cperiod,ip_hperiod,ip_phy_same,ip_N_Types_Physicians,Top_5_ClmAdmitDiagnosisCode
0,PRV52019,516,458,5580870,536136.0,3560,3568,3.614341,1.812016,"[78650, 486, 42731, 78605, 4280]"
1,PRV55462,386,308,4260100,405840.0,2682,2682,4.000000,1.582902,"[78650, 486, 78605, V5789, 78097]"
2,PRV54367,322,279,3040900,334284.0,2052,2052,2.611801,1.577640,"[78650, 78605, V5789, 42731, 78659]"
3,PRV53706,282,262,2776000,296904.0,1892,1919,3.659574,1.723404,"[78605, 486, 4280, 78650, 71536]"
4,PRV55209,275,243,2756100,289428.0,1744,1744,2.330909,1.552727,"[78650, 486, 4280, 78605, 0389]"
...,...,...,...,...,...,...,...,...,...,...
2087,PRV53515,1,1,14000,1068.0,6,6,4.000000,2.000000,[72402]
2088,PRV53516,1,1,3000,1068.0,9,9,4.000000,1.000000,[49322]
2089,PRV53600,1,1,4000,1068.0,3,3,1.000000,2.000000,[V5413]
2090,PRV56474,1,1,7000,1068.0,13,13,4.000000,1.000000,[486]


#### 10. top 5 DiagnosisGroupCode for each provider

In [356]:
# For each provider, list its (up to) five most frequent diagnosis group codes.
df10 = (
    Train_Inpatientdata
    .groupby('Provider')
    .apply(lambda claims: get_top_5_codes(claims, 'DiagnosisGroupCode'))
    .reset_index(name='Top_5_DiagnosisGroupCode')
)
df10

Unnamed: 0,Provider,Top_5_DiagnosisGroupCode
0,PRV51001,"[882, 864, 353, 245, 062]"
1,PRV51003,"[203, 262, 241, 222, 627]"
2,PRV51007,"[085, 886, 887]"
3,PRV51008,"[623, 095]"
4,PRV51011,[414]
...,...,...
2087,PRV57719,"[605, 684, 341, 005]"
2088,PRV57728,[257]
2089,PRV57729,"[288, 183, 103, 669, 082]"
2090,PRV57732,"[941, 189, 473, 414, 939]"


In [357]:
# Attach the top diagnosis group codes to the provider feature table.
# Any 'Top_5_DiagnosisGroupCode' column left by an earlier run of this cell
# is dropped first, so re-running does not create '_x'/'_y' duplicate columns.
df_merged = (
    df_merged
    .drop(columns=['Top_5_DiagnosisGroupCode'], errors='ignore')
    .merge(df10, how='outer', on='Provider')
)
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num,ip_reimbursed,ip_deductible,ip_cperiod,ip_hperiod,ip_phy_same,ip_N_Types_Physicians,Top_5_ClmAdmitDiagnosisCode,Top_5_DiagnosisGroupCode
0,PRV52019,516,458,5580870,536136.0,3560,3568,3.614341,1.812016,"[78650, 486, 42731, 78605, 4280]","[181, 627, 205, 882, 694]"
1,PRV55462,386,308,4260100,405840.0,2682,2682,4.000000,1.582902,"[78650, 486, 78605, V5789, 78097]","[167, 950, 866, 304, 217]"
2,PRV54367,322,279,3040900,334284.0,2052,2052,2.611801,1.577640,"[78650, 78605, V5789, 42731, 78659]","[204, 196, 232, 941, 255]"
3,PRV53706,282,262,2776000,296904.0,1892,1919,3.659574,1.723404,"[78605, 486, 4280, 78650, 71536]","[864, 241, 245, 264, 949]"
4,PRV55209,275,243,2756100,289428.0,1744,1744,2.330909,1.552727,"[78650, 486, 4280, 78605, 0389]","[883, 183, 871, 887, 866]"
...,...,...,...,...,...,...,...,...,...,...,...
2087,PRV53515,1,1,14000,1068.0,6,6,4.000000,2.000000,[72402],[476]
2088,PRV53516,1,1,3000,1068.0,9,9,4.000000,1.000000,[49322],[207]
2089,PRV53600,1,1,4000,1068.0,3,3,1.000000,2.000000,[V5413],[514]
2090,PRV56474,1,1,7000,1068.0,13,13,4.000000,1.000000,[486],[188]


#### 11. top 5 ClmDiagnosisCode for each provider

In [367]:
# The ten claim-diagnosis columns: ClmDiagnosisCode_1 ... ClmDiagnosisCode_10.
diagnosis_col = [f'ClmDiagnosisCode_{position}' for position in range(1, 11)]

# For each provider, pool all diagnosis columns and keep the five most
# frequent codes.
df11 = (
    Train_Inpatientdata
    .groupby('Provider')
    .apply(lambda claims: top_5_across_columns(claims, diagnosis_col))
    .reset_index(name='Top_5_ClmDiagnosisCode')
)
df11

Unnamed: 0,Provider,Top_5_ClmDiagnosisCode
0,PRV51001,"[2724, 53081, 29650, 0388, 56211]"
1,PRV51003,"[4019, 4280, 2724, 41401, 5990]"
2,PRV51007,"[3310, 29620, 29622, 25000, 4019]"
3,PRV51008,"[27651, 1911, 4439, 2948, V1046]"
4,PRV51011,"[5770, 32723, 311, 5771, 2449]"
...,...,...
2087,PRV57719,"[25000, 6827, 5849, 56211, 0389]"
2088,PRV57728,"[42732, 49390, 2729, 78791, 7906]"
2089,PRV57729,"[4280, 2724, 41401, 4019, 5990]"
2090,PRV57732,"[4019, 486, 53081, 2724, 40390]"


In [368]:
# Attach the top claim-diagnosis codes to the provider feature table.
# Any 'Top_5_ClmDiagnosisCode' column left by an earlier run of this cell
# is dropped first, so re-running does not create '_x'/'_y' duplicate columns.
df_merged = (
    df_merged
    .drop(columns=['Top_5_ClmDiagnosisCode'], errors='ignore')
    .merge(df11, how='outer', on='Provider')
)
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num,ip_reimbursed,ip_deductible,ip_cperiod,ip_hperiod,ip_phy_same,ip_N_Types_Physicians,Top_5_ClmAdmitDiagnosisCode,Top_5_DiagnosisGroupCode,Top_5_ClmDiagnosisCode
0,PRV52019,516,458,5580870,536136.0,3560,3568,3.614341,1.812016,"[78650, 486, 42731, 78605, 4280]","[181, 627, 205, 882, 694]","[4019, 25000, 2724, 42731, 4280]"
1,PRV55462,386,308,4260100,405840.0,2682,2682,4.000000,1.582902,"[78650, 486, 78605, V5789, 78097]","[167, 950, 866, 304, 217]","[4019, 2724, 25000, 41401, 42731]"
2,PRV54367,322,279,3040900,334284.0,2052,2052,2.611801,1.577640,"[78650, 78605, V5789, 42731, 78659]","[204, 196, 232, 941, 255]","[4019, 2724, 25000, 41401, 53081]"
3,PRV53706,282,262,2776000,296904.0,1892,1919,3.659574,1.723404,"[78605, 486, 4280, 78650, 71536]","[864, 241, 245, 264, 949]","[4019, 2724, 41401, 42731, 25000]"
4,PRV55209,275,243,2756100,289428.0,1744,1744,2.330909,1.552727,"[78650, 486, 4280, 78605, 0389]","[883, 183, 871, 887, 866]","[4019, 2724, 4280, 25000, 42731]"
...,...,...,...,...,...,...,...,...,...,...,...,...
2087,PRV53515,1,1,14000,1068.0,6,6,4.000000,2.000000,[72402],[476],"[72402, 51883, 71690, 49320, 3051]"
2088,PRV53516,1,1,3000,1068.0,9,9,4.000000,1.000000,[49322],[207],"[49322, V433, 2449, 78650, 30522]"
2089,PRV53600,1,1,4000,1068.0,3,3,1.000000,2.000000,[V5413],[514],"[99666, 27801, V4365, 0414, 2724]"
2090,PRV56474,1,1,7000,1068.0,13,13,4.000000,1.000000,[486],[188],"[486, 04112, 71690, 5307, 2440]"


#### 12. top 5 ClmProcedureCode for each provider

In [369]:
# The six claim-procedure columns: ClmProcedureCode_1 ... ClmProcedureCode_6.
procedure_col = [f'ClmProcedureCode_{position}' for position in range(1, 7)]

# For each provider, pool all procedure columns and keep the five most
# frequent codes.
df12 = (
    Train_Inpatientdata
    .groupby('Provider')
    .apply(lambda claims: top_5_across_columns(claims, procedure_col))
    .reset_index(name='Top_5_ClmProcedureCode')
)
df12

Unnamed: 0,Provider,Top_5_ClmProcedureCode
0,PRV51001,"[3521.0, 8659.0, 2724.0]"
1,PRV51003,"[4019.0, 8622.0, 3491.0, 3995.0, 3893.0]"
2,PRV51007,[8627.0]
3,PRV51008,"[8954.0, 159.0]"
4,PRV51011,[]
...,...,...
2087,PRV57719,"[4516.0, 131.0, 8604.0]"
2088,PRV57728,[]
2089,PRV57729,"[3722.0, 3613.0, 9904.0, 8841.0, 66.0]"
2090,PRV57732,"[8154.0, 9960.0, 4233.0, 9338.0, 8838.0]"


In [370]:
# Attach the top procedure codes to the provider feature table.
# Any 'Top_5_ClmProcedureCode' column left by an earlier run of this cell
# is dropped first, so re-running does not create '_x'/'_y' duplicate columns.
df_merged = (
    df_merged
    .drop(columns=['Top_5_ClmProcedureCode'], errors='ignore')
    .merge(df12, how='outer', on='Provider')
)
df_merged

Unnamed: 0,Provider,ip_claims_num,ip_patients_num,ip_reimbursed,ip_deductible,ip_cperiod,ip_hperiod,ip_phy_same,ip_N_Types_Physicians,Top_5_ClmAdmitDiagnosisCode,Top_5_DiagnosisGroupCode,Top_5_ClmDiagnosisCode,Top_5_ClmProcedureCode
0,PRV52019,516,458,5580870,536136.0,3560,3568,3.614341,1.812016,"[78650, 486, 42731, 78605, 4280]","[181, 627, 205, 882, 694]","[4019, 25000, 2724, 42731, 4280]","[4019.0, 8154.0, 2724.0, 3893.0, 9904.0]"
1,PRV55462,386,308,4260100,405840.0,2682,2682,4.000000,1.582902,"[78650, 486, 78605, V5789, 78097]","[167, 950, 866, 304, 217]","[4019, 2724, 25000, 41401, 42731]","[4019.0, 9904.0, 66.0, 8154.0, 3893.0]"
2,PRV54367,322,279,3040900,334284.0,2052,2052,2.611801,1.577640,"[78650, 78605, V5789, 42731, 78659]","[204, 196, 232, 941, 255]","[4019, 2724, 25000, 41401, 53081]","[4019.0, 9904.0, 2724.0, 66.0, 8154.0]"
3,PRV53706,282,262,2776000,296904.0,1892,1919,3.659574,1.723404,"[78605, 486, 4280, 78650, 71536]","[864, 241, 245, 264, 949]","[4019, 2724, 41401, 42731, 25000]","[4019.0, 8154.0, 66.0, 2724.0, 9904.0]"
4,PRV55209,275,243,2756100,289428.0,1744,1744,2.330909,1.552727,"[78650, 486, 4280, 78605, 0389]","[883, 183, 871, 887, 866]","[4019, 2724, 4280, 25000, 42731]","[4019.0, 4513.0, 9904.0, 8154.0, 3893.0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2087,PRV53515,1,1,14000,1068.0,6,6,4.000000,2.000000,[72402],[476],"[72402, 51883, 71690, 49320, 3051]","[309.0, 1749.0]"
2088,PRV53516,1,1,3000,1068.0,9,9,4.000000,1.000000,[49322],[207],"[49322, V433, 2449, 78650, 30522]",[]
2089,PRV53600,1,1,4000,1068.0,3,3,1.000000,2.000000,[V5413],[514],"[99666, 27801, V4365, 0414, 2724]",[8944.0]
2090,PRV56474,1,1,7000,1068.0,13,13,4.000000,1.000000,[486],[188],"[486, 04112, 71690, 5307, 2440]",[]


### Outpatient

In [371]:
# List the outpatient columns, then preview the first rows (head() defaults to 5).
print('outpatient columns:\n', Train_Outpatientdata.columns)
Train_Outpatientdata.head()

outpatient columns:
 Index(['BeneID', 'ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2',
       'ClmDiagnosisCode_3', 'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5',
       'ClmDiagnosisCode_6', 'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8',
       'ClmDiagnosisCode_9', 'ClmDiagnosisCode_10', 'ClmProcedureCode_1',
       'ClmProcedureCode_2', 'ClmProcedureCode_3', 'ClmProcedureCode_4',
       'ClmProcedureCode_5', 'ClmProcedureCode_6', 'DeductibleAmtPaid',
       'ClmAdmitDiagnosisCode'],
      dtype='object')


Unnamed: 0,BeneID,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,ClmDiagnosisCode_1,...,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,DeductibleAmtPaid,ClmAdmitDiagnosisCode
0,BENE11002,CLM624349,2009-10-11,2009-10-11,PRV56011,30,PHY326117,,,78943,...,,,,,,,,,0,56409.0
1,BENE11003,CLM189947,2009-02-12,2009-02-12,PRV57610,80,PHY362868,,,6115,...,,,,,,,,,0,79380.0
2,BENE11003,CLM438021,2009-06-27,2009-06-27,PRV57595,10,PHY328821,,,2723,...,,,,,,,,,0,
3,BENE11004,CLM121801,2009-01-06,2009-01-06,PRV56011,40,PHY334319,,,71988,...,,,,,,,,,0,
4,BENE11004,CLM150998,2009-01-22,2009-01-22,PRV56011,200,PHY403831,,,82382,...,,,,,,,,,0,71947.0


#### 1. Num of outpatient claims for each provider

#### 2. Num of outpatient patients for each provider

#### 3. Total outpatient reimbursement for each provider

#### 4. Total outpatient deductible amount for each provider

#### 5. Total claim length for each provider

#### 6. Total hospital stay for each provider

#### 7. Calculate a relationship score among the three physician columns for each claim and average it per provider

#### 8. Number of physician roles (attending, operating, other) filled on a claim, averaged per provider

#### 9. top 5 ClmAdmitDiagnosisCode for each provider

#### 10. top 5 DiagnosisGroupCode for each provider

#### 11. top 5 ClmDiagnosisCode for each provider

#### 12. top 5 ClmProcedureCode for each provider