In [20]:
# Auto Insurance Fraud Detection - Logistic Regression


In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

df1 = pd.read_csv('Auto_Insurance_Fraud_Claims_File01.csv')
df2 = pd.read_csv('Auto_Insurance_Fraud_Claims_File02.csv')
df3 = pd.read_csv('Auto_Insurance_Fraud_Claims_File03.csv')

train_df = pd.concat([df1, df2], ignore_index=True)
test_df = df3.copy()

drop_cols = ['Claim_ID', 'Policy_Num', 'Policy_Start_Date', 'Policy_Expiry_Date',
             'Accident_Date', 'Claims_Date', 'Vehicle_Registration', 'DL_Expiry_Date',
             'Check_Point', 'Insured_Zip', 'Auto_Model', 'Accident_Location',
             'Vehicle_Color', 'Garage_Location']

train_df.drop(columns=drop_cols, inplace=True, errors='ignore')
test_df.drop(columns=drop_cols, inplace=True, errors='ignore')

train_df.fillna('Unknown', inplace=True)
test_df.fillna('Unknown', inplace=True)

label_encoder = LabelEncoder()
train_df['Fraud_Ind'] = label_encoder.fit_transform(train_df['Fraud_Ind'])  # Y/N → 1/0

common_cols = train_df.columns.intersection(test_df.columns)
X = train_df[common_cols].copy()
X_test = test_df[common_cols].copy()
y = train_df['Fraud_Ind']

cat_cols = X.select_dtypes(include='object').columns
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([X[col], X_test[col]], axis=0).astype(str)
    le.fit(combined)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

num_cols = X.select_dtypes(include=np.number).columns
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Validation Accuracy:", accuracy)
print("Classification Report:\n", report)

test_preds = model.predict(X_test)
pred_labels = label_encoder.inverse_transform(test_preds)

output = pd.DataFrame()
if 'Claim_ID' in df3.columns:
    output['Claim_ID'] = df3['Claim_ID']
output['Predicted_Fraud'] = pred_labels

print("\nSample Predictions:")
print(output.head())

output.to_csv("Predicted_Fraud_Claims.csv", index=False)


Validation Accuracy: 0.787
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.91      0.86      8912
           1       0.62      0.44      0.52      3088

    accuracy                           0.79     12000
   macro avg       0.72      0.67      0.69     12000
weighted avg       0.77      0.79      0.77     12000


Sample Predictions:
     Claim_ID Predicted_Fraud
0  CC00000001               N
1  CC00000002               N
2  CC00000003               Y
3  CC00000004               N
4  CC00000005               N


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
df1 = pd.read_csv('Auto_Insurance_Fraud_Claims_File01.csv')
df2 = pd.read_csv('Auto_Insurance_Fraud_Claims_File02.csv')
df3 = pd.read_csv('Auto_Insurance_Fraud_Claims_File03.csv')
train_df = pd.concat([df1, df2], ignore_index=True)
test_df = df3.copy()
drop_cols = ['Claim_ID', 'Policy_Num', 'Policy_Start_Date', 'Policy_Expiry_Date',
             'Accident_Date', 'Claims_Date', 'Vehicle_Registration', 'DL_Expiry_Date',
             'Check_Point', 'Insured_Zip', 'Auto_Model', 'Accident_Location',
             'Vehicle_Color', 'Garage_Location']
train_df.drop(columns=drop_cols, inplace=True, errors='ignore')
test_df.drop(columns=drop_cols, inplace=True, errors='ignore')
train_df.fillna('Unknown', inplace=True)
test_df.fillna('Unknown', inplace=True)
label_encoder = LabelEncoder()
train_df['Fraud_Ind'] = label_encoder.fit_transform(train_df['Fraud_Ind'])
common_cols = train_df.columns.intersection(test_df.columns)
X = train_df[common_cols].copy()
X_test = test_df[common_cols].copy()
y = train_df['Fraud_Ind']
cat_cols = X.select_dtypes(include='object').columns
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([X[col], X_test[col]], axis=0).astype(str)
    le.fit(combined)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))
num_cols = X.select_dtypes(include=np.number).columns
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])
corr_matrix = pd.concat([X, y], axis=1).corr()
high_corr = corr_matrix['Fraud_Ind'][abs(corr_matrix['Fraud_Ind']) > 0.95].drop('Fraud_Ind').index.tolist()
X.drop(columns=high_corr, inplace=True)
X_test.drop(columns=high_corr, inplace=True)
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
knn_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski')
knn_model.fit(X_resampled, y_resampled)
y_pred = knn_model.predict(X_val)
print(" Accuracy:", accuracy_score(y_val, y_pred))
print(" Precision:", precision_score(y_val, y_pred))
print(" Recall:", recall_score(y_val, y_pred))
print(" F1 Score:", f1_score(y_val, y_pred))
print(" ROC AUC:", roc_auc_score(y_val, y_pred))
print("\nClassification Report:\n", classification_report(y_val, y_pred))
test_preds = knn_model.predict(X_test)
pred_labels = label_encoder.inverse_transform(test_preds)
output = pd.DataFrame()
if 'Claim_ID' in df3.columns:
    output['Claim_ID'] = df3['Claim_ID']
output['Predicted_Fraud'] = pred_labels
print("\n Sample Predictions:")
print(output.head())
output.to_csv("KNN_Predicted_Fraud_Claims.csv", index=False)

 Accuracy: 0.9930833333333333
 Precision: 0.9777059773828756
 Recall: 0.9953947368421052
 F1 Score: 0.9864710676446617
 ROC AUC: 0.9938469219924813

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      8960
           1       0.98      1.00      0.99      3040

    accuracy                           0.99     12000
   macro avg       0.99      0.99      0.99     12000
weighted avg       0.99      0.99      0.99     12000


 Sample Predictions:
     Claim_ID Predicted_Fraud
0  CC00000001               N
1  CC00000002               N
2  CC00000003               Y
3  CC00000004               N
4  CC00000005               N
