In [1]:
import pandas as pd

# Training tables, keyed by the label used when their shapes are printed.
csv_paths = {
    "Beneficiary:": "fraud_data/Train_Beneficiarydata.csv",
    "Inpatient:": "fraud_data/Train_Inpatientdata.csv",
    "Outpatient:": "fraud_data/Train_Outpatientdata.csv",
    "Provider:": "fraud_data/Train_Provider.csv",
}

# Read every file first (dicts keep insertion order), then bind the usual names.
tables = {label: pd.read_csv(path) for label, path in csv_paths.items()}
benef, inpat, outpat, provider = tables.values()

# Quick sanity check: (rows, columns) of each table.
for label, table in tables.items():
    print(label, table.shape)

Beneficiary: (138556, 24)
Inpatient: (40474, 30)
Outpatient: (517737, 27)
Provider: (5410, 2)


In [2]:
# Stack inpatient and outpatient claims into a single claim-level table.
# Columns that exist in only one of the two inputs are NaN for the other's rows.
#
# ignore_index=True gives every claim its own row label. Without it each input
# keeps the labels it got from read_csv, so the same label appears twice and any
# later label-based lookup or index-aligned assignment on `claims` would be
# ambiguous. The shape and the per-provider aggregates are unaffected.
claims = pd.concat([inpat, outpat], axis=0, ignore_index=True)
print(claims.shape)

(558211, 30)


In [3]:
# Count claim rows per provider and expose the count as a "TotalClaims" column.
provider_sizes = claims.groupby("Provider").size()
claims_per_provider = provider_sizes.rename("TotalClaims").reset_index()

print(claims_per_provider.head())

   Provider  TotalClaims
0  PRV51001           25
1  PRV51003          132
2  PRV51004          149
3  PRV51005         1165
4  PRV51007           72


In [4]:
# Both aggregates come from the same per-provider view of the reimbursed amount.
reimbursed_by_provider = claims.groupby("Provider")["InscClaimAmtReimbursed"]

# Mean reimbursed amount per provider.
avg_claim = reimbursed_by_provider.mean().reset_index(name="AvgClaimAmount")

# Total reimbursed amount per provider.
total_claim = reimbursed_by_provider.sum().reset_index(name="TotalReimbursed")

In [5]:
# Assemble the provider-level feature table: claim count, then mean and total
# reimbursement, all joined on the provider id.
df = (
    claims_per_provider
    .merge(avg_claim, on="Provider", how="left")
    .merge(total_claim, on="Provider", how="left")
)

In [6]:
# Attach the provider table (which carries the PotentialFraud label) to the
# feature table.
#
# validate="one_to_one" makes pandas raise a MergeError if "Provider" is
# duplicated on either side. Without it, a duplicated key in `provider` would
# silently multiply feature rows and inflate the training set.
df = df.merge(provider, on="Provider", how="left", validate="one_to_one")

print(df.shape)

(5410, 5)


In [7]:
# Encode the label: "Yes" -> 1, "No" -> 0.
label_codes = {"Yes": 1, "No": 0}
raw_labels = df["PotentialFraud"]

# Series.map turns every value that is not a key of the dict into NaN, so running
# this cell a second time (when the column already holds 0/1) used to wipe the
# labels. Only encode while string labels are still present; that makes the cell
# safe to re-run.
if raw_labels.isin(list(label_codes)).any():
    df["PotentialFraud"] = raw_labels.map(label_codes)

In [8]:
# Class balance of the encoded label.
label_counts = df["PotentialFraud"].value_counts()
print(label_counts)

PotentialFraud
0    4904
1     506
Name: count, dtype: int64


In [9]:
# Target is the encoded fraud label; features are every remaining column except
# the provider identifier.
y = df["PotentialFraud"]
X = df.drop(columns=["Provider", "PotentialFraud"])

In [10]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split, stratified on the label, with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [11]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left as drawn.
sm = SMOTE(random_state=42)
resampled_train = sm.fit_resample(X_train, y_train)

# The resampled data replaces the original training split under the same names.
X_train, y_train = resampled_train

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: 300 trees, all other hyperparameters left at their defaults.
baseline_params = {"n_estimators": 300, "random_state": 42}
model = RandomForestClassifier(**baseline_params)

# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",300
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [13]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the hold-out split and report the confusion matrix followed by the
# per-class precision / recall / F1 table.
y_pred = model.predict(X_test)

test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(test_confusion)
print(test_report)

[[873 108]
 [ 25  76]]
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       981
           1       0.41      0.75      0.53       101

    accuracy                           0.88      1082
   macro avg       0.69      0.82      0.73      1082
weighted avg       0.92      0.88      0.89      1082



In [14]:
# Standard deviation of the reimbursed amount per provider, joined onto the
# provider-level table as "ClaimStd".
claim_std = (
    claims
    .groupby("Provider")["InscClaimAmtReimbursed"]
    .std()
    .reset_index(name="ClaimStd")
)

df = df.merge(claim_std, how="left", on="Provider")

In [15]:
# Missing standard deviations are replaced with 0; no other column is touched.
df = df.fillna({"ClaimStd": 0})

In [16]:
# Number of distinct beneficiaries seen by each provider, joined on as
# "UniquePatients".
unique_patients = (
    claims
    .groupby("Provider")["BeneID"]
    .nunique()
    .reset_index(name="UniquePatients")
)

df = df.merge(unique_patients, how="left", on="Provider")

In [17]:
# Average number of claims filed per distinct patient of the provider.
df["ClaimsPerPatient"] = df["TotalClaims"].div(df["UniquePatients"])

In [18]:
# Replace +/-inf and NaN with 0 in the feature columns only.
#
# The previous blanket df.fillna(0) also covered "PotentialFraud", so a provider
# with a missing label would have been silently turned into class 0 (not fraud).
# Leaving the identifier and the label alone lets a missing label surface
# downstream instead of being trained on.
feature_columns = [
    col for col in df.columns if col not in ("Provider", "PotentialFraud")
]
df[feature_columns] = (
    df[feature_columns]
    .replace([float("inf"), -float("inf")], 0)
    .fillna(0)
)

In [19]:
# Rebuild the design matrix from the extended table: label as target, everything
# but the provider id and the label as features.
y = df["PotentialFraud"]
X = df.drop(columns=["Provider", "PotentialFraud"])

In [20]:
from sklearn.model_selection import train_test_split

# Same 80/20 stratified split and seed as the earlier split, now on the extended
# feature set.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [21]:
import numpy as np

# Count non-finite entries in the training features before resampling.
nan_total = np.isnan(X_train).sum().sum()
inf_total = np.isinf(X_train).sum().sum()

print("NaN in X_train:", nan_total)
print("Inf in X_train:", inf_total)

NaN in X_train: 0
Inf in X_train: 0


In [22]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left as drawn.
sm = SMOTE(random_state=42)
resampled_train = sm.fit_resample(X_train, y_train)

# The resampled data replaces the original training split under the same names.
X_train, y_train = resampled_train

In [23]:
from sklearn.ensemble import RandomForestClassifier

# Second forest: more trees than the first, with depth and minimum split/leaf
# sizes set explicitly.
forest_params = {
    "n_estimators": 600,
    "max_depth": 12,
    "min_samples_split": 4,
    "min_samples_leaf": 2,
    "random_state": 42,
}
model = RandomForestClassifier(**forest_params)

# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",600
,"criterion  criterion: {""gini"", ""entropy"", ""log_loss""}, default=""gini"" The function to measure the quality of a split. Supported criteria are ""gini"" for the Gini impurity and ""log_loss"" and ""entropy"" both for the Shannon information gain, see :ref:`tree_mathematical_formulation`. Note: This parameter is tree-specific.",'gini'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",12
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",4
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=""sqrt"" The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to `""sqrt""`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",'sqrt'
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [24]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Score the hold-out split: accuracy, confusion matrix, then the per-class
# report.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print(test_confusion)
print(test_report)

Accuracy: 0.8826247689463955
[[873 108]
 [ 19  82]]
              precision    recall  f1-score   support

           0       0.98      0.89      0.93       981
           1       0.43      0.81      0.56       101

    accuracy                           0.88      1082
   macro avg       0.71      0.85      0.75      1082
weighted avg       0.93      0.88      0.90      1082



In [25]:
# Flag providers whose average claim amount lies more than two standard
# deviations above the mean across all providers.
avg_amounts = df["AvgClaimAmount"]
mean_amt = avg_amounts.mean()
std_amt = avg_amounts.std()

upcoding_cutoff = mean_amt + 2*std_amt
df["UpcodingFlag"] = avg_amounts.gt(upcoding_cutoff).astype(int)

print("Upcoding cases:", df["UpcodingFlag"].sum())

Upcoding cases: 265


In [26]:
# Flag providers whose claims-per-patient ratio is above the 97th percentile.
claims_per_patient = df["ClaimsPerPatient"]
ghost_cutoff = claims_per_patient.quantile(0.97)

df["GhostBillingFlag"] = claims_per_patient.gt(ghost_cutoff).astype(int)

print("Ghost billing cases:", df["GhostBillingFlag"].sum())

Ghost billing cases: 163


In [27]:
import numpy as np

# Flag providers whose claim count lies more than two standard deviations above
# the mean claim count.
claim_counts = df["TotalClaims"]
mean_claims = claim_counts.mean()
std_claims = claim_counts.std()

volume_cutoff = mean_claims + 2 * std_claims
df["HighVolumeFlag"] = claim_counts.gt(volume_cutoff).astype(int)

print("High volume cases:", df["HighVolumeFlag"].sum())

High volume cases: 140


In [28]:
# Flag providers whose claim-amount spread lies more than two standard
# deviations above the mean spread.
claim_spread = df["ClaimStd"]
mean_std = claim_spread.mean()
std_std = claim_spread.std()

variance_cutoff = mean_std + 2 * std_std
df["HighVarianceFlag"] = claim_spread.gt(variance_cutoff).astype(int)

print("High variance cases:", df["HighVarianceFlag"].sum())

High variance cases: 270


In [29]:
# Number of rule flags raised for each provider (0 to 4).
flag_columns = (
    "UpcodingFlag",
    "GhostBillingFlag",
    "HighVolumeFlag",
    "HighVarianceFlag",
)
df["MultiFlagCount"] = sum(df[flag] for flag in flag_columns)

In [30]:
# Points awarded per raised flag. The four weights add up to 100, so RuleScore
# ranges from 0 to 100.
rule_points = {
    "UpcodingFlag": 35,
    "GhostBillingFlag": 30,
    "HighVolumeFlag": 20,
    "HighVarianceFlag": 15,
}
df["RuleScore"] = sum(df[flag] * points for flag, points in rule_points.items())

In [31]:

# Columns the model is scored on. The literal list is only the fallback: when
# the fitted model recorded the column names it was trained on, those are used,
# so this cell cannot drift out of sync with (or reorder) the training features.
training_features = [
    "TotalClaims",
    "AvgClaimAmount",
    "TotalReimbursed",
    "ClaimStd",
    "UniquePatients",
    "ClaimsPerPatient"
]
fitted_feature_names = getattr(model, "feature_names_in_", None)
if fitted_feature_names is not None:
    training_features = list(fitted_feature_names)

X_full = df[training_features]

# Locate the probability column of class 1 through model.classes_ instead of
# assuming it sits at position 1.
fraud_class_position = list(model.classes_).index(1)

# NOTE(review): df also contains the providers the model was fitted on, so their
# probabilities are in-sample and likely optimistic. Only the held-out providers
# get an honest score here; consider out-of-fold predictions for ranking.
df["ML_Probability"] = model.predict_proba(X_full)[:, fraud_class_position] * 100

In [32]:
# Blend of the two signals: 65% model probability, 35% rule score.
ml_part = df["ML_Probability"] * 0.65
rule_part = df["RuleScore"] * 0.35

df["FinalFraudScore"] = ml_part + rule_part

In [33]:
# Rank providers from highest to lowest blended score.
top_risky = df.sort_values("FinalFraudScore", ascending=False)

# Columns shown in the summary: id, the scores, then the individual flags.
report_columns = [
    "Provider",
    "FinalFraudScore",
    "ML_Probability",
    "RuleScore",
    "MultiFlagCount",
    "UpcodingFlag",
    "GhostBillingFlag",
    "HighVolumeFlag",
    "HighVarianceFlag",
]

ten_riskiest = top_risky[report_columns].head(10)
print(ten_riskiest)

      Provider  FinalFraudScore  ML_Probability  RuleScore  MultiFlagCount  \
1490  PRV52846        82.468163       99.951020         50               2   
363   PRV51459        82.452728       99.927273         50               2   
397   PRV51501        82.335771       99.747340         50               2   
943   PRV52178        82.306487       99.702288         50               2   
3541  PRV55444        82.187875       99.519807         50               2   
3932  PRV55916        82.148370       99.459030         50               2   
669   PRV51836        81.710037       98.784672         50               2   
3946  PRV55934        81.666248       98.717304         50               2   
1070  PRV52337        81.502602       98.465542         50               2   
938   PRV52173        81.258412       98.089865         50               2   

      UpcodingFlag  GhostBillingFlag  HighVolumeFlag  HighVarianceFlag  
1490             1                 0               0                

In [34]:
import joblib
# Persist the fitted forest next to the notebook. Left as the last expression so
# the list of written files is displayed.
joblib.dump(value=model, filename="fraud_model.pkl")

['fraud_model.pkl']