In [2]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


## Loading and Preprocessing the Insurance Claims Data

Load `insurance_claims.csv` into a pandas DataFrame called `df`.

In [55]:
# Load the raw insurance-claims records from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='insurance_claims.csv')

# Preview the first five rows to confirm the file loaded as expected
df.head(n=5)

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,NO,6500,1300,650,4550,Accura,RSX,2009,N,


In [56]:
# Columns left out of the modelling frame: policy details, occupation/hobby fields,
# incident date and place, vehicle details, the policy identifier and '_c39'
# (which appears empty in the preview above).
columns_to_drop = [
    'policy_bind_date',
    'policy_state',
    'policy_csl',
    'policy_deductable',
    'umbrella_limit',
    'insured_zip',
    'insured_occupation',
    'insured_hobbies',
    'incident_date',
    'auto_make',
    'auto_model',
    'auto_year',
    '_c39',
    'incident_state',
    'incident_city',
    'policy_number',
]

# Build a new frame without those columns; the raw `df` is left untouched
clean_df = df.drop(columns=columns_to_drop)

# List the columns that remain
clean_df.columns

Index(['months_as_customer', 'age', 'policy_annual_premium', 'insured_sex',
       'insured_education_level', 'insured_relationship', 'capital-gains',
       'capital-loss', 'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_location',
       'incident_hour_of_the_day', 'number_of_vehicles_involved',
       'property_damage', 'bodily_injuries', 'witnesses',
       'police_report_available', 'total_claim_amount', 'injury_claim',
       'property_claim', 'vehicle_claim', 'fraud_reported'],
      dtype='object')

In [57]:
# List the distinct collision_type labels to spot placeholder values such as '?'
clean_df['collision_type'].unique()

array(['Side Collision', '?', 'Rear Collision', 'Front Collision'],
      dtype=object)

In [58]:
# Recode the '?' placeholder in collision_type as the label 'NO'
clean_df['collision_type'] = clean_df['collision_type'].replace(to_replace='?', value='NO')
# Keep only the rows where authorities_contacted is present (reassign instead of inplace).
# NOTE(review): if the raw file spells a "no authority contacted" category as the text
# "None", read_csv may have parsed it as missing, in which case this drops real claims
# rather than bad records — confirm against the CSV before relying on this filter.
clean_df = clean_df.dropna(subset=['authorities_contacted'])

In [59]:
# Count the non-null entries in every column; identical counts across columns
# indicate that no missing values remain after the cleaning step above
clean_df.count()

months_as_customer             909
age                            909
policy_annual_premium          909
insured_sex                    909
insured_education_level        909
insured_relationship           909
capital-gains                  909
capital-loss                   909
incident_type                  909
collision_type                 909
incident_severity              909
authorities_contacted          909
incident_location              909
incident_hour_of_the_day       909
number_of_vehicles_involved    909
property_damage                909
bodily_injuries                909
witnesses                      909
police_report_available        909
total_claim_amount             909
injury_claim                   909
property_claim                 909
vehicle_claim                  909
fraud_reported                 909
dtype: int64

In [60]:
# Encode insured_sex numerically: 'FEMALE' -> 0, 'MALE' -> 1.
# Series.map returns NaN for any value that is not a key of the mapping, so mapping an
# already-encoded (numeric) column would wipe it out. The dtype guard makes this cell safe
# to re-run: the mapping is applied only while the column still holds the text labels.
if not pd.api.types.is_numeric_dtype(clean_df['insured_sex']):
    clean_df['insured_sex'] = clean_df['insured_sex'].map({'FEMALE': 0, 'MALE': 1})
# Replace the '?' placeholder with 'NO' in the two yes/no columns
clean_df['police_report_available'] = clean_df['police_report_available'].replace('?', 'NO')
clean_df['property_damage'] = clean_df['property_damage'].replace('?', 'NO')

In [62]:
# Inspect the first 20 cleaned rows to confirm the insured_sex encoding and the
# replacement of the '?' placeholders
clean_df.head(20)

Unnamed: 0,months_as_customer,age,policy_annual_premium,insured_sex,insured_education_level,insured_relationship,capital-gains,capital-loss,incident_type,collision_type,...,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,fraud_reported
0,328,48,1406.91,1,MD,husband,53300,0,Single Vehicle Collision,Side Collision,...,1,YES,1,2,YES,71610,6510,13020,52080,Y
1,228,42,1197.22,1,MD,other-relative,0,0,Vehicle Theft,NO,...,1,NO,0,0,NO,5070,780,780,3510,Y
2,134,29,1413.14,0,PhD,own-child,35100,0,Multi-vehicle Collision,Rear Collision,...,3,NO,2,3,NO,34650,7700,3850,23100,N
3,256,41,1415.74,0,PhD,unmarried,48900,-62400,Single Vehicle Collision,Front Collision,...,1,NO,1,2,NO,63400,6340,6340,50720,Y
5,256,39,1351.1,0,PhD,unmarried,0,0,Multi-vehicle Collision,Rear Collision,...,3,NO,0,2,NO,64100,6410,6410,51280,Y
6,137,34,1333.35,1,PhD,husband,0,-77000,Multi-vehicle Collision,Front Collision,...,3,NO,0,0,NO,78650,21450,7150,50050,N
7,165,37,1137.03,1,Associate,unmarried,0,0,Multi-vehicle Collision,Front Collision,...,3,NO,2,2,YES,51590,9380,9380,32830,N
8,27,33,1442.99,0,PhD,own-child,0,0,Single Vehicle Collision,Front Collision,...,1,NO,1,1,YES,27700,2770,2770,22160,N
9,212,42,1315.68,1,PhD,wife,0,-39300,Single Vehicle Collision,Rear Collision,...,1,NO,2,1,NO,42300,4700,4700,32900,N
10,235,42,1253.12,0,Masters,other-relative,38400,0,Single Vehicle Collision,Front Collision,...,1,YES,2,2,NO,87010,7910,15820,63280,N


Define the features set by copying the `clean_df` DataFrame and dropping the `fraud_reported` column.

In [63]:
# Features: every column of clean_df except the fraud_reported label
X = clean_df.drop(columns=['fraud_reported'])
# Target: the fraud_reported labels as a Series (redefined as a column vector further down)
y = clean_df['fraud_reported']
# Show the first five feature rows
X.head()

Unnamed: 0,months_as_customer,age,policy_annual_premium,insured_sex,insured_education_level,insured_relationship,capital-gains,capital-loss,incident_type,collision_type,...,incident_hour_of_the_day,number_of_vehicles_involved,property_damage,bodily_injuries,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim
0,328,48,1406.91,1,MD,husband,53300,0,Single Vehicle Collision,Side Collision,...,5,1,YES,1,2,YES,71610,6510,13020,52080
1,228,42,1197.22,1,MD,other-relative,0,0,Vehicle Theft,NO,...,8,1,NO,0,0,NO,5070,780,780,3510
2,134,29,1413.14,0,PhD,own-child,35100,0,Multi-vehicle Collision,Rear Collision,...,7,3,NO,2,3,NO,34650,7700,3850,23100
3,256,41,1415.74,0,PhD,unmarried,48900,-62400,Single Vehicle Collision,Front Collision,...,5,1,NO,1,2,NO,63400,6340,6340,50720
5,256,39,1351.1,0,PhD,unmarried,0,0,Multi-vehicle Collision,Rear Collision,...,19,3,NO,0,2,NO,64100,6410,6410,51280


Create the target vector by assigning the values of the `fraud_reported` column from the `clean_df` DataFrame.

In [64]:
# Build the target as an (n_samples, 1) column of 'Y'/'N' labels by selecting the
# label as a one-column frame and taking its underlying array
y = clean_df[['fraud_reported']].values
# Peek at the first five labels
y[:5]


array([['Y'],
       ['Y'],
       ['N'],
       ['Y'],
       ['Y']], dtype=object)

Split the data into training and testing sets.

In [65]:
# Splitting into Train and Test sets
# NOTE(review): this split runs before the get_dummies encoding below, so X_train/X_test
# here still contain the raw string columns. The same four names are reassigned by the
# later split on the encoded X, which is the one the scaler and the model actually use.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

### 3. Encode the categorical variables from the features data using `get_dummies`.

In [66]:
# Encode the categorical variables using get_dummies
# X is overwritten with the encoded frame: each text column is replaced by one indicator
# column per distinct value. The preview below shows incident_location expanding into one
# column per street address, which appears to account for most of the resulting columns.
X = pd.get_dummies(X)

In [67]:
# Review the features data
# (confirms the text columns were expanded into True/False indicator columns)
X.head()

Unnamed: 0,months_as_customer,age,policy_annual_premium,insured_sex,capital-gains,capital-loss,incident_hour_of_the_day,number_of_vehicles_involved,bodily_injuries,witnesses,...,incident_location_9911 Britain Lane,incident_location_9918 Andromedia Drive,incident_location_9929 Rock Drive,incident_location_9935 4th Drive,incident_location_9942 Tree Ave,incident_location_9980 Lincoln Ave,property_damage_NO,property_damage_YES,police_report_available_NO,police_report_available_YES
0,328,48,1406.91,1,53300,0,5,1,1,2,...,False,False,False,True,False,False,False,True,False,True
1,228,42,1197.22,1,0,0,8,1,0,0,...,False,False,False,False,False,False,True,False,True,False
2,134,29,1413.14,0,35100,0,7,3,2,3,...,False,False,False,False,False,False,True,False,True,False
3,256,41,1415.74,0,48900,-62400,5,1,1,2,...,False,False,False,False,False,False,True,False,True,False
5,256,39,1351.1,0,0,0,19,3,0,2,...,False,False,False,False,False,False,True,False,True,False


Use the `StandardScaler` to scale the features data; remember that only the `X_train` and `X_test` DataFrames should be scaled.

In [70]:
# Splitting into Train and Test sets
# Uses the encoded X. random_state=78 fixes the shuffle so the split is reproducible;
# no test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [71]:
# Create the StandardScaler instance
# (unfitted at this point; it is fitted on the training split in the next cell)
scaler = StandardScaler()

In [72]:
# Fit the Standard Scaler with the training data
# Only X_train is used for fitting, so no statistics from the test split enter the scaling.
# The fitted object is bound to X_scaler, which the next cell uses for both transforms.
X_scaler = scaler.fit(X_train)

In [74]:
# Apply the train-fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(X_train),
    X_scaler.transform(X_test),
)

## Fitting the Random Forest Model

Once the data is scaled, create a random forest instance and train it with the training data (`X_train_scaled` and `y_train`), define `n_estimators=500` and `random_state=78`.

In [75]:
# Configure the random forest: 500 trees, with a fixed seed so results are repeatable
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=78,
)

In [76]:
# Fit the model on the scaled training features.
# y_train is an (n, 1) column (see the reshape(-1, 1) above); .ravel() flattens it to 1-D
# before it is passed to fit().
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

## Making Predictions Using the Random Forest Model

Validate the trained model by predicting fraudulent claims using the testing data (`X_test_scaled`).

In [77]:
# Making predictions using the testing data
# (one predicted 'N'/'Y' label per row of the scaled test features)
predictions = rf_model.predict(X_test_scaled)

## Model Evaluation

Evaluate model's results, by using `sklearn` to calculate the confusion matrix, the accuracy score and to generate the classification report.

In [78]:
# Confusion matrix of true vs. predicted labels: rows are actual classes, columns are
# predicted classes. With the 'N'/'Y' labels used here, position 0 corresponds to 'N'
# (no fraud) and position 1 to 'Y' (fraud).
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Share of test rows whose predicted label matches the true label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [79]:
# Show the confusion matrix table, then the accuracy and the per-class report
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}", "Classification Report", sep="\n")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,168,13
Actual 1,29,18


Accuracy Score : 0.8157894736842105
Classification Report
              precision    recall  f1-score   support

           N       0.85      0.93      0.89       181
           Y       0.58      0.38      0.46        47

    accuracy                           0.82       228
   macro avg       0.72      0.66      0.68       228
weighted avg       0.80      0.82      0.80       228



## Feature Importance

In this section, you are asked to fetch the features' importance from the random forest model and display the top 10 most important features.

In [80]:
# Per-feature importance scores from the fitted forest
importances = rf_model.feature_importances_
# Pair each score with its column name and rank the pairs from highest to lowest score
importances_sorted = list(zip(importances, X.columns))
importances_sorted.sort(reverse=True)
# Show the ten highest-ranked (importance, feature) pairs
importances_sorted[:10]

[(0.09559873840931507, 'incident_severity_Major Damage'),
 (0.0477461195731404, 'property_claim'),
 (0.044403711635260024, 'vehicle_claim'),
 (0.04404802367489845, 'total_claim_amount'),
 (0.04055680929697879, 'injury_claim'),
 (0.03986561063245987, 'policy_annual_premium'),
 (0.03924149750962508, 'months_as_customer'),
 (0.036842979983327286, 'age'),
 (0.03473111403317143, 'incident_hour_of_the_day'),
 (0.033162398070438484, 'incident_severity_Minor Damage')]

## Analysis Questions

Finally, analyze the model's evaluation results and answer the following questions.

* **Question 1:** Would you trust this model to detect whether an insurance claim is fraudulent?

 * **Sample Answer:** Yes, I would trust this model to some extent to detect whether a claim is fraudulent. The model achieves an accuracy score of 81.58%, which indicates that it performs reasonably well overall. However, considering the nature of the problem, where correctly identifying fraudulent claims (class `Y`) is crucial, we should also look at the F1-score and recall for class `Y`. The model achieves an F1-score of 0.46 and a recall of 0.38 for class `Y`, which means it misses most of the fraudulent claims in the test set (29 of 47). While the accuracy score is relatively high, the F1-score and recall for fraudulent claims are not satisfactory, so I would be cautious about relying solely on this model for detecting fraud.


* **Question 2:** What are your insights about the top 10 most important features?

 * **Sample Answer:** The top 10 most important features are as follows:
incident_severity_Major Damage
property_claim
vehicle_claim
total_claim_amount
injury_claim
policy_annual_premium
months_as_customer
age
incident_hour_of_the_day
incident_severity_Minor Damage
These features seem to have a significant impact on the model's predictions. Notably, features related to the severity of the incident (incident_severity_Major Damage and incident_severity_Minor Damage) and the various claim amounts (property_claim, vehicle_claim, total_claim_amount, and injury_claim) are among the top features. This suggests that the severity of the incident and the amount of the claim play a crucial role in determining whether a claim is reported as fraudulent. Additionally, features such as policy_annual_premium, months_as_customer, age, and incident_hour_of_the_day also appear to be important in predicting fraud reports. Therefore, when using this model, focusing on these top features could improve its performance and efficiency, and collecting new data mainly on these features could be a reasonable starting point for piloting this model in a business environment. Note, however, that the ten features together account for only about 0.46 of the total importance, so this should be validated by retraining on the reduced feature set.