# Insurance Claims Fraud Detection
### Author github.com/2704

### Importing the required Libraries


In [1]:
import pandas as pd
import pickle
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


In [2]:
# Read the claims data from the CSV file into a DataFrame
csv_path = '/content/fraud.csv'
df = pd.read_csv(csv_path)

# Summarise the columns: dtype and non-null count for each
df.info()

# Preview the leading rows (rendered as the cell's output)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15420 entries, 0 to 15419
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Month                 15420 non-null  object
 1   WeekOfMonth           15420 non-null  int64 
 2   DayOfWeek             15420 non-null  object
 3   Make                  15420 non-null  object
 4   AccidentArea          15420 non-null  object
 5   DayOfWeekClaimed      15420 non-null  object
 6   MonthClaimed          15420 non-null  object
 7   WeekOfMonthClaimed    15420 non-null  int64 
 8   Sex                   15420 non-null  object
 9   MaritalStatus         15420 non-null  object
 10  Age                   15420 non-null  int64 
 11  Fault                 15420 non-null  object
 12  PolicyType            15420 non-null  object
 13  VehicleCategory       15420 non-null  object
 14  VehiclePrice          15420 non-null  object
 15  FraudFound_P          15420 non-null

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,...,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange_Claim,NumberOfCars,Year,BasePolicy
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,...,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,...,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,...,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,...,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,...,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision


In [3]:

# First collapse the ' - ' separator in 'PolicyType' into a single underscore,
# so the blanket space replacement below does not turn it into '_-_'
policy_type = df['PolicyType'].str.replace(' - ', '_')
df['PolicyType'] = policy_type

# Then swap every remaining space for an underscore across the whole DataFrame
df = df.replace(to_replace=' ', value='_', regex=True)


In [4]:
# List the distinct values found in each column
for column in df:
    # The 'PolicyNumber' column is left out of the listing
    if column == 'PolicyNumber':
        continue

    distinct = df[column].unique()

    # Column name first, then its unique values in sorted order
    print("Column:", column)
    print("Unique Values:", sorted(distinct), "\n")


Column: Month
Unique Values: ['Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep'] 

Column: WeekOfMonth
Unique Values: [1, 2, 3, 4, 5] 

Column: DayOfWeek
Unique Values: ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'] 

Column: Make
Unique Values: ['Accura', 'BMW', 'Chevrolet', 'Dodge', 'Ferrari', 'Ford', 'Honda', 'Jaguar', 'Lexus', 'Mazda', 'Mecedes', 'Mercury', 'Nisson', 'Pontiac', 'Porche', 'Saab', 'Saturn', 'Toyota', 'VW'] 

Column: AccidentArea
Unique Values: ['Rural', 'Urban'] 

Column: DayOfWeekClaimed
Unique Values: ['0', 'Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday'] 

Column: MonthClaimed
Unique Values: ['0', 'Apr', 'Aug', 'Dec', 'Feb', 'Jan', 'Jul', 'Jun', 'Mar', 'May', 'Nov', 'Oct', 'Sep'] 

Column: WeekOfMonthClaimed
Unique Values: [1, 2, 3, 4, 5] 

Column: Sex
Unique Values: ['Female', 'Male'] 

Column: MaritalStatus
Unique Values: ['Divorced', 'Married', 'Single', 'Widow'] 

Col

In [5]:
# Collect every column name except the target 'FraudFound_P'.
# `columns=` replaces the positional axis argument (`drop(label, 1)`): the
# positional form raises a FutureWarning on pandas 1.x and is no longer
# accepted by pandas 2.x.
features = df.drop(columns='FraudFound_P').columns

# Persist the feature names (a pandas Index) to a pickle file
with open("features1.pkl", "wb") as file:
    pickle.dump(features, file)

  features = df.drop('FraudFound_P', 1).columns


In [6]:
# Function to extract categorical columns from a DataFrame
def get_categorical_columns(dataframe):
    """
    Return the names of all object-dtype columns of a DataFrame.

    Args:
        dataframe (pd.DataFrame): The DataFrame whose columns are inspected.

    Returns:
        list: Names of the columns whose dtype is 'object' (treated as categorical here).
    """
    # Narrow the frame down to its object-dtype columns, then read off their labels
    object_only = dataframe.select_dtypes(include='object')
    names = object_only.columns
    return names.tolist()

# Collect the object-dtype column names of 'df'
categorical_cols = get_categorical_columns(df)

# Show which columns were picked up
print("Categorical Columns:", categorical_cols)

Categorical Columns: ['Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed', 'Sex', 'MaritalStatus', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice', 'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle', 'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType', 'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy']


In [7]:
# Categorical columns to be one-hot encoded, listed explicitly so the input
# schema of the pickled encoder is visible in the notebook
columns_to_encode = [
    'Month', 'DayOfWeek', 'Make', 'AccidentArea', 'DayOfWeekClaimed', 'MonthClaimed',
    'Sex', 'MaritalStatus', 'Fault', 'PolicyType', 'VehicleCategory', 'VehiclePrice',
    'Days_Policy_Accident', 'Days_Policy_Claim', 'PastNumberOfClaims', 'AgeOfVehicle',
    'AgeOfPolicyHolder', 'PoliceReportFiled', 'WitnessPresent', 'AgentType',
    'NumberOfSuppliments', 'AddressChange_Claim', 'NumberOfCars', 'BasePolicy',
]

# Build an encoder that returns a dense array.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# releases dropped the old name, so try the new keyword first and fall back to
# the old one on releases that do not know it yet.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Learn the categories of each listed column
enc.fit(df.loc[:, columns_to_encode])

# Save the fitted encoder to a pickle file
with open("encoder1.pkl", "wb") as file:
    pickle.dump(enc, file)



In [8]:
from sklearn.model_selection import train_test_split

# Names of the one-hot columns produced by the fitted encoder
column_names = enc.get_feature_names_out(columns_to_encode)

# One-hot encode the categorical columns. The new frame reuses df's index so
# the column-wise concat below lines the rows up by label.
encoded_variables = pd.DataFrame(
    enc.transform(df.loc[:, columns_to_encode]),
    columns=column_names,
    index=df.index,
)

# Drop the raw categorical columns. `columns=` replaces the positional axis
# argument, which warns on pandas 1.x and is no longer accepted by pandas 2.x.
# NOTE: `df` is reassigned here, so running this cell a second time without
# re-running the earlier cells fails (the columns are already gone).
df = df.drop(columns=columns_to_encode)

# Append the encoded columns to the remaining columns
df = pd.concat([df, encoded_variables], axis=1)

# Separate features (X) and target variable (y)
# NOTE(review): every remaining column other than the target goes into X —
# confirm that identifier-like columns such as 'PolicyNumber' are meant to be features.
X = df.drop(columns='FraudFound_P')
y = df.loc[:, 'FraudFound_P']

# 70/30 shuffled split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=1)


  df = df.drop(columns_to_encode, 1)
  X, y = df.drop('FraudFound_P', 1), df.loc[:, 'FraudFound_P']


In [9]:
# Report (rows, columns) for the training and test feature matrices
for label, split in (("X_train", X_train), ("X_test", X_test)):
    print(f"Shape of {label}:", split.shape)


Shape of X_train: (10794, 148)
Shape of X_test: (4626, 148)


## Importing Modeling libraries

In [10]:
!pip install -q xgboost lightgbm

In [11]:
import xgboost as xgb
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

In [12]:
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    confusion_matrix,
    roc_auc_score,

    classification_report,
    precision_recall_curve
)



### XGBClassifier

In [None]:
# Base XGBoost classifier with the binary logistic objective
xgb_model = XGBClassifier(objective='binary:logistic')

# Hyperparameter grid: 3 values for each of 6 parameters = 729 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'gamma': [0, 0.1, 0.5],
    'subsample': [0.5, 0.75, 1],
    'colsample_bytree': [0.5, 0.75, 1]
}

# Exhaustive search with 3-fold cross-validation (729 x 3 fits), ranked by accuracy.
# NOTE(review): accuracy can be dominated by the majority class if 'FraudFound_P'
# is imbalanced — consider a recall- or F1-based scorer; confirm with the author.
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding mean cross-validated accuracy
print("Best hyperparameters:", grid_search.best_params_)
print("Accuracy score:", grid_search.best_score_)

# GridSearchCV refits the winning configuration on the whole of X_train by
# default (refit=True), so reuse that fitted estimator instead of building and
# training an identical model a second time.
best_xgb_model = grid_search.best_estimator_

# Make predictions on the test data
y_pred_test = best_xgb_model.predict(X_test)

# Generate a classification report to assess the model's performance
report = classification_report(y_test, y_pred_test)
print("Classification Report:\n", report)

### LGBMClassifier

In [15]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Hyperparameter grid: 3 values for each of 4 parameters = 81 combinations
param_grid = {
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200],
}

# Base LightGBM classifier with the binary objective
lgbm_model = LGBMClassifier(objective='binary')

# Exhaustive search with 5-fold cross-validation (81 x 5 fits), ranked by accuracy.
# NOTE(review): accuracy can be dominated by the majority class if 'FraudFound_P'
# is imbalanced — consider a recall- or F1-based scorer; confirm with the author.
grid_search = GridSearchCV(estimator=lgbm_model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# GridSearchCV refits the winning configuration on the whole of X_train by
# default (refit=True), so reuse that fitted estimator instead of building and
# training an identical model a second time.
best_lgbm_model = grid_search.best_estimator_

# Make predictions on the test data
y_pred_test = best_lgbm_model.predict(X_test)

# Generate a classification report to assess the model's performance
report = classification_report(y_test, y_pred_test)
print("Classification Report:\n", report)


Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      4317
           1       0.80      0.21      0.33       309

    accuracy                           0.94      4626
   macro avg       0.87      0.60      0.65      4626
weighted avg       0.94      0.94      0.93      4626



### Saving the model (`model1.pickle`)

In [16]:
# Save the best LightGBM model to a pickle file.
# The `with` block closes the file handle once the dump finishes, so the data
# is flushed to disk, matching how the other pickles in this notebook are written.
with open('model1.pickle', 'wb') as file:
    pickle.dump(best_lgbm_model, file)



In [17]:
# Column labels of `df` as it stands at this point in the notebook, i.e. after
# the categorical columns were replaced by their one-hot encoded counterparts
original_columns = list(df.columns)

# Show the collected labels
print("Original Columns:", original_columns)


Original Columns: ['WeekOfMonth', 'WeekOfMonthClaimed', 'Age', 'FraudFound_P', 'PolicyNumber', 'RepNumber', 'Deductible', 'DriverRating', 'Year', 'Month_Apr', 'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jan', 'Month_Jul', 'Month_Jun', 'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep', 'DayOfWeek_Friday', 'DayOfWeek_Monday', 'DayOfWeek_Saturday', 'DayOfWeek_Sunday', 'DayOfWeek_Thursday', 'DayOfWeek_Tuesday', 'DayOfWeek_Wednesday', 'Make_Accura', 'Make_BMW', 'Make_Chevrolet', 'Make_Dodge', 'Make_Ferrari', 'Make_Ford', 'Make_Honda', 'Make_Jaguar', 'Make_Lexus', 'Make_Mazda', 'Make_Mecedes', 'Make_Mercury', 'Make_Nisson', 'Make_Pontiac', 'Make_Porche', 'Make_Saab', 'Make_Saturn', 'Make_Toyota', 'Make_VW', 'AccidentArea_Rural', 'AccidentArea_Urban', 'DayOfWeekClaimed_0', 'DayOfWeekClaimed_Friday', 'DayOfWeekClaimed_Monday', 'DayOfWeekClaimed_Saturday', 'DayOfWeekClaimed_Sunday', 'DayOfWeekClaimed_Thursday', 'DayOfWeekClaimed_Tuesday', 'DayOfWeekClaimed_Wednesday', 'MonthClai