In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_curve, precision_score, recall_score, confusion_matrix, auc
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

## Utility functions

In [2]:
def generate_data(x, models):
    '''Build the meta-feature matrix that is fed to the custom model.

    Each of the k base learners in `models` predicts on `x`; the k prediction
    arrays are collected and transposed, so that (when every `predict` returns
    a 1-D array with one value per sample) row i holds the k predictions made
    for sample i and column j belongs to the j-th base learner.
    '''
    # One prediction array per base learner, kept in the order of `models`
    per_model_predictions = [learner.predict(x) for learner in models]

    # Transpose: (k learners, n samples) -> (n samples, k learners)
    return np.array(per_model_predictions).T

## Final pipeline

In [3]:
def final_fun_1(X, models_dir='../Saved_Models'):
    
    '''Predict whether each healthcare provider is a potential fraud.

    Parameters
    ----------
    X : 2-D array-like (ndarray, DataFrame, nested list)
        One row per provider. Column 0 is the provider id; the remaining 10
        columns are the model features, in the order listed in `top_feat`
        below: no. of inpatient claims, no. of claims with group codes, no. of
        claims with chronic conditions (rheumatoid arthritis, Alzheimer's,
        ischemic heart disease, stroke), no. of beneficiaries, avg. deductible
        amount paid, avg. insurance amount reimbursed to the provider and avg.
        no. of days a patient was admitted under the provider's care.
    models_dir : str, optional
        Directory holding the saved artifacts: `scaler.pkl`, the
        `base_learners2/` folder and `best_custom_model2.joblib`.

    Returns
    -------
    pandas.DataFrame
        A `Provider` column, the 10 (unscaled) feature columns, the predicted
        label `PotentialFraud` and `Probability_PotentialFraud` (second column
        of the custom model's `predict_proba`, presumably the fraud class -
        confirm against `custom_model.classes_`).

    Raises
    ------
    ValueError
        If `X` is not 2-D or does not have exactly 11 columns.
    FileNotFoundError
        If an artifact is missing or no base learner file is found.
    '''

    top_feat = ['Inpatient_Claims', 'Claims_with_groupcode', 'Claims_with_ChronicCond_rheumatoidarthritis',
                'Total_Beneficiaries_Count', 'Average_DeductibleAmtPaid','Average_InscClaimAmtReimbursed',
                'Claims_with_ChronicCond_Alzheimer', 'Claims_with_ChronicCond_IschemicHeart', 'Average_Days_Admitted',
                'Claims_with_ChronicCond_stroke']

    # Accept any 2-D array-like (e.g. a DataFrame) instead of requiring an ndarray
    X = np.asarray(X)
    if X.ndim != 2 or X.shape[1] != len(top_feat) + 1:
        raise ValueError(
            "X must be 2-D with %d columns (provider id + %d features), got shape %s"
            % (len(top_feat) + 1, len(top_feat), X.shape)
        )

    # NOTE: pickle / joblib files can execute arbitrary code while loading;
    # only point `models_dir` at artifacts you produced or trust.

    # Loading Standard Scaler model to scale the data
    with open(os.path.join(models_dir, 'scaler.pkl'), 'rb') as f:
        scaler = pickle.load(f)

    # Storing all provider ids separately
    provider_ids = X[:, 0]
    X = np.delete(X, 0, 1)

    # Scaling data
    X_scaled = scaler.transform(X)

    # Loading all base learners.
    # os.listdir returns entries in arbitrary order, but every base learner
    # contributes one column of the meta-features, so the order must be
    # reproducible. Hidden entries and sub-directories (e.g. .ipynb_checkpoints)
    # are skipped because they are not model files.
    # NOTE(review): sorted file names make the order deterministic - confirm it
    # matches the base-learner order used when the custom model was trained.
    base_learner_dir = os.path.join(models_dir, 'base_learners2')
    model_files = sorted(
        name for name in os.listdir(base_learner_dir)
        if not name.startswith('.') and os.path.isfile(os.path.join(base_learner_dir, name))
    )
    if not model_files:
        raise FileNotFoundError("No base learner files found in " + base_learner_dir)
    models = [load(os.path.join(base_learner_dir, name)) for name in model_files]

    # Loading custom model
    custom_model = load(os.path.join(models_dir, 'best_custom_model2.joblib'))

    # Predictions
    x_meta = generate_data(X_scaled, models)
    y_pred = custom_model.predict(x_meta)
    y_prob = custom_model.predict_proba(x_meta)

    # Generating dataframe with top 10 features (unscaled values) and predictions
    all_predictions = pd.DataFrame(X, columns=top_feat)
    all_predictions['PotentialFraud'] = y_pred
    all_predictions['Probability_PotentialFraud'] = y_prob[:, 1]
    all_predictions.insert(0, "Provider", provider_ids)

    return all_predictions

In [4]:
def final_fun_2(X, Y):
    
    '''Evaluate the pipeline's predictions against the actual labels.

    Runs final_fun_1 on X, then prints the F1 score, recall and precision of
    the predicted labels and the area under the ROC curve of the predicted
    probabilities, all measured against the true labels Y. Returns nothing.
    '''

    results = final_fun_1(X)
    predicted_labels = results['PotentialFraud']
    fraud_probabilities = results['Probability_PotentialFraud']

    # Label-based metrics, printed in a fixed order
    label_metrics = (
        ("F1 score", f1_score),
        ("Recall", recall_score),
        ("Precision", precision_score),
    )
    for metric_name, metric_fn in label_metrics:
        print(metric_name + " for data: ", metric_fn(Y, predicted_labels))

    # The ROC curve needs scores rather than hard labels; thresholds are not used
    false_pos_rate, true_pos_rate, _ = roc_curve(Y, fraud_probabilities)

    print("AUC score for data: ", auc(false_pos_rate, true_pos_rate))

In [5]:
# Load the processed training data, then keep the provider id followed by the
# ten feature columns (X) and the fraud labels as an array (Y)
df = pd.read_csv('../Data/Processed/final_train_data.csv')
X = df.loc[:, [
    'Provider',
    'is_inpatient', 'is_groupcode', 'ChronicCond_rheumatoidarthritis',
    'Beneficiaries_Count', 'DeductibleAmtPaid', 'InscClaimAmtReimbursed',
    'ChronicCond_Alzheimer', 'ChronicCond_IschemicHeart', 'Days_Admitted',
    'ChronicCond_stroke',
]]
Y = df.PotentialFraud.values

In [6]:
# Testing final_fun_1
# np.asarray (instead of X.values) keeps this cell re-runnable: once X has been
# replaced by an ndarray, running `X = X.values` again would raise
# AttributeError because ndarrays have no `.values` attribute.
X = np.asarray(X)
results = final_fun_1(X)
results.head()

Unnamed: 0,Provider,Inpatient_Claims,Claims_with_groupcode,Claims_with_ChronicCond_rheumatoidarthritis,Total_Beneficiaries_Count,Average_DeductibleAmtPaid,Average_InscClaimAmtReimbursed,Claims_with_ChronicCond_Alzheimer,Claims_with_ChronicCond_IschemicHeart,Average_Days_Admitted,Claims_with_ChronicCond_stroke,PotentialFraud,Probability_PotentialFraud
0,PRV51001,5,5,8,24,213.6,4185.6,15,23,1.0,6,0,0.147947
1,PRV51003,62,62,38,117,502.166667,4588.409091,56,112,2.424242,12,1,0.933284
2,PRV51004,0,0,46,138,2.080537,350.134228,64,108,0.0,17,0,0.147947
3,PRV51005,0,0,331,495,3.175966,241.124464,426,895,0.0,124,1,0.933284
4,PRV51007,3,3,22,58,45.333333,468.194444,26,51,0.222222,12,0,0.147947


In [7]:
# Evaluate the pipeline on the loaded data: prints F1, recall, precision and AUC
final_fun_2(X=X, Y=Y)

F1 score for data:  0.6131996658312449
Recall for data:  0.7252964426877471
Precision for data:  0.5311143270622286
AUC score for data:  0.8641147180006319
