In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import preprocessing as proc

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_classif
from sklearn.svm import SVC

# Show every column when a DataFrame is displayed instead of truncating wide frames.
# Uses the documented top-level pandas API rather than the incidental `pd.pandas` alias.
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw claims table; the path is relative to the notebook's working directory.
df = pd.read_csv('carclaims.csv')
print(df.shape)
# Last expression of the cell, so the first rows are rendered as the cell output.
df.head()

In [None]:
# Target column first, then every remaining column except the policy identifier
# becomes a predictor.
y = df['FraudFound']
X = df.drop(columns=['PolicyNumber', 'FraudFound'])
print(X.shape, y.shape)
X.head()

In [None]:
# Hold out 20% of the rows for testing. stratify=y asks scikit-learn to keep the
# class proportions of the target the same in both splits; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                   test_size=0.2, 
                                                   random_state=42,
                                                   stratify=y)
# Displayed as the cell output: (rows, columns) of each feature split.
X_train.shape, X_test.shape

In [None]:
# Replace the shuffled row labels left by train_test_split with a fresh 0..n-1 index.
# drop=True discards the old labels directly, so no temporary 'index' column has to be
# created and removed, and re-binding avoids mutating the split results in place.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Reset the targets the same way so each y stays label-aligned with its feature frame.
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# Plot the class balance of the training target.
sns.countplot(x=y_train)
plt.yticks(np.arange(0, 13000, 1000))

# Both figures are computed on y_train so they describe the same split as the plot.
# (Previously the ratio divided the *test* fraud count by the size of the full dataset.)
fraud_count = (y_train == 'Yes').sum()
fraud_ratio = round(fraud_count / len(y_train) * 100, ndigits=4)
print(f'Fraud ratio: {fraud_ratio}%')
print(f'Fraud count: {fraud_count}')

The severe class imbalance warrants resampling. For this purpose, random oversampling (ROS) will be used.

According to Pérez et al. (2022), applying feature selection before random oversampling (FS+ROS) is the best balancing configuration when using Random Forest (RF) as the classifier. [Link to the paper](https://www.sciencedirect.com/science/article/pii/S0957417421013622#:~:text=Some%20general%20conclusions%20of%20the,better%20results%20when%20applied%20afterwards.)

"RF: The balancing configuration FS+ROS in the area plots, and FS+RUS in the average ranks, showed better performance than the others. Unlike C4.5, the rankings showed that no use of balancing was by far the worst combination, a behavior also supported by the results of a previous study (Pes, 2020)."

"On the contrary, the results suggest that, in general, SMOTE and ROS perform better if applied after feature selection."

Feature selection will be conducted to find the best set of features. According to the same study, using ANOVA as the feature selector yields a competitive percentage of victories for FS+ROS compared to other selectors.

An SVM wrapped in SelectFromModel will be tested alongside the ANOVA selector to compare which features each method retains: the SVM ranks features by their learned weights, whereas ANOVA ranks them by F-value.

In [None]:
# --- Temporal columns -------------------------------------------------------
# Columns whose '0' placeholder entries get replaced by a default value.
TEMP_REPLACE = [
    'Month',
    'MonthClaimed',
    'DayOfWeek',
    'DayOfWeekClaimed',
    'WeekOfMonth',
    'WeekOfMonthClaimed',
]

# Columns that receive a sine/cosine (cyclical) encoding.
TEMP_SIN_COS = [
    'Month',
    'MonthClaimed',
    'DayOfWeek',
    'DayOfWeekClaimed',
]

# --- Numeric column handled separately ---------------------------------------
AGE = ['Age']

# --- One-hot encoded columns -------------------------------------------------
ONE_HOT_NUMERICAL = ['WeekOfMonth', 'WeekOfMonthClaimed', 'RepNumber']
ONE_HOT_CATEGORICAL = [
    'Make', 'PolicyType', 'MaritalStatus', 'VehicleCategory',
    'BasePolicy', 'AgentType', 'WitnessPresent',
    'PoliceReportFiled', 'Fault', 'Sex', 'AccidentArea',
]

# --- Ordinal columns (one single-item list per column) -----------------------
# Kept for reference only; the combined list below is not defined:
# ORDINAL_CATEGORICAL = ['AgeOfVehicle', 'AgeOfPolicyHolder', 'VehiclePrice',
#                        'Days:Policy-Accident', 'NumberOfCars', 'AddressChange-Claim',
#                        'NumberOfSuppliments', 'PastNumberOfClaims', 'Days:Policy-Claim']
AGE_OF_VEH_VAR = ['AgeOfVehicle']
AGE_OF_POL_VAR = ['AgeOfPolicyHolder']
VEH_PRICE_VAR = ['VehiclePrice']
DAYS_ACC_VAR = ['Days:Policy-Accident']
NUM_CAR_VAR = ['NumberOfCars']
ADD_CHANGE_VAR = ['AddressChange-Claim']
NUM_SUPP_VAR = ['NumberOfSuppliments']
PAST_CLAIM_VAR = ['PastNumberOfClaims']
DAYS_CLAIM_VAR = ['Days:Policy-Claim']

# Month abbreviation -> month number (1-12).
MONTH_MAP = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
    'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
    'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12,
}

# Day name -> integer code, Monday = 0 through Sunday = 6.
DAY_MAP = {
    'Sunday': 6,
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
}

# Vehicle age bucket -> rank; note the scale runs from newest (8) to oldest (1).
AGE_OF_VEH_MAP = {
    'new': 8,
    '2 years': 7,
    '3 years': 6,
    '4 years': 5,
    '5 years': 4,
    '6 years': 3,
    '7 years': 2,
    'more than 7': 1,
}

# Policy-holder age bucket -> rank, youngest (1) to oldest (9).
AGE_OF_POL_MAP = {
    '16 to 17': 1,
    '18 to 20': 2,
    '21 to 25': 3,
    '26 to 30': 4,
    '31 to 35': 5,
    '36 to 40': 6,
    '41 to 50': 7,
    '51 to 65': 8,
    'over 65': 9,
}

# Vehicle price bucket -> rank, cheapest (1) to most expensive (6).
VEH_PRICE_MAP = {
    'less than 20,000': 1,
    '20,000 to 29,000': 2,
    '30,000 to 39,000': 3,
    '40,000 to 59,000': 4,
    '60,000 to 69,000': 5,
    'more than 69,000': 6,
}

# Bucket -> rank for the remaining ordinal columns.
DAYS_ACC_MAP = {
    'none': 1,
    '1 to 7': 2,
    '8 to 15': 3,
    '15 to 30': 4,
    'more than 30': 5,
}
NUM_CAR_MAP = {
    '1 vehicle': 1,
    '2 vehicles': 2,
    '3 to 4': 3,
    '5 to 8': 4,
}
ADD_CHANGE_MAP = {
    'no change': 1,
    'under 6 months': 2,
    '1 year': 3,
    '2 to 3 years': 4,
    '4 to 8 years': 5,
}
NUM_SUPP_MAP = {
    'none': 1,
    '1 to 2': 2,
    '3 to 5': 3,
    'more than 5': 4,
}
PAST_CLAIM_MAP = {
    'none': 1,
    '1': 2,
    '2 to 4': 3,
    'more than 4': 4,
}
# NOTE(review): unlike DAYS_ACC_MAP this has no 'none' / '1 to 7' entries; confirm
# that 'Days:Policy-Claim' never contains them.
DAYS_CLAIM_MAP = {
    '8 to 15': 1,
    '15 to 30': 2,
    'more than 30': 3,
}

In [None]:
# Default value substituted for the '0' placeholder in each temporal column.
# NOTE(review): the pattern is the string '0'; if the week columns hold integers,
# an integer 0 would not be matched by this replacement — confirm the dtype.
zero_fill = {
    'DayOfWeek': 'Monday',
    'DayOfWeekClaimed': 'Monday',
    'WeekOfMonth': 1,
    'WeekOfMonthClaimed': 1,
    'Month': 'Jan',
    'MonthClaimed': 'Jan',
}

for feature in TEMP_REPLACE:
    if feature not in zero_fill:
        continue
    # Apply the same substitution to the training and the test frame.
    for frame in (X_train, X_test):
        frame[feature] = frame[feature].replace('0', zero_fill[feature])

In [9]:
# Which lookup table converts each temporal column from names to integers.
# Not idempotent: a second run would map the integers again and yield NaN.
value_maps = {
    'Month': MONTH_MAP,
    'MonthClaimed': MONTH_MAP,
    'DayOfWeek': DAY_MAP,
    'DayOfWeekClaimed': DAY_MAP,
}

for feature in TEMP_SIN_COS:
    lookup = value_maps.get(feature)
    if lookup is None:
        continue
    X_train[feature] = X_train[feature].map(lookup)
    X_test[feature] = X_test[feature].map(lookup)

In [10]:
# Length of one full cycle for each temporal column (12 months, 7 weekdays).
cycle_length = {
    'Month': 12,
    'MonthClaimed': 12,
    'DayOfWeek': 7,
    'DayOfWeekClaimed': 7,
}

for feature in TEMP_SIN_COS:
    if feature not in cycle_length:
        continue
    period = cycle_length[feature]
    for frame in (X_train, X_test):
        # Project the integer code onto the unit circle so the end of a cycle
        # sits next to its start.
        angle = 2 * np.pi * frame[feature] / period
        frame[feature + '_sin'] = np.sin(angle)
        frame[feature + '_cos'] = np.cos(angle)

In [11]:
# The integer-coded temporal columns are superseded by their *_sin / *_cos
# counterparts, so remove them from both splits.
for frame in (X_train, X_test):
    frame.drop(
        columns=['MonthClaimed', 'Month', 'DayOfWeekClaimed', 'DayOfWeek'],
        inplace=True,
    )

In [None]:
from scipy import stats

# Impute non-positive ages and apply a Box-Cox transform.
# Every statistic (fill value and Box-Cox lambda) is estimated on the training
# split only and then reused for the test split. Fitting them separately on the
# test data would put the two splits on different scales and leak test information.
for feature in AGE:
    # Box-Cox requires strictly positive input, so values <= 0 are replaced by the
    # mean of the positive training values first.
    mean_train = X_train.loc[X_train[feature] > 0, feature].mean()

    train_values = X_train[feature].astype(float)
    test_values = X_test[feature].astype(float)
    X_train[feature] = train_values.mask(train_values <= 0, mean_train)
    X_test[feature] = test_values.mask(test_values <= 0, mean_train)

    # Without `lmbda`, boxcox estimates the optimal lambda and returns it; with
    # `lmbda` it only applies the transform and returns the transformed array.
    X_train[feature], age_lambda = stats.boxcox(X_train[feature])
    X_test[feature] = stats.boxcox(X_test[feature], lmbda=age_lambda)

In [None]:
# Preview the training frame after the temporal and Age preprocessing above.
X_train.head()

In [None]:
def _build_preprocessor():
    """Return a fresh, unfitted ColumnTransformer for the claims features.

    The transformer one-hot encodes the columns listed in ONE_HOT_NUMERICAL and
    ONE_HOT_CATEGORICAL, applies ``proc.MapTransform`` with the matching lookup
    table to each ordinal column, and passes every remaining column through
    unchanged.

    Returns:
        ColumnTransformer: a new instance on every call, so the SVC and ANOVA
        experiments never share fitted state.
    """
    # (step name, column list, lookup table) for each ordinal column.
    ordinal_specs = [
        ('age_veh', AGE_OF_VEH_VAR, AGE_OF_VEH_MAP),
        ('age_pol', AGE_OF_POL_VAR, AGE_OF_POL_MAP),
        ('veh_price', VEH_PRICE_VAR, VEH_PRICE_MAP),
        ('day_acc', DAYS_ACC_VAR, DAYS_ACC_MAP),
        ('num_car', NUM_CAR_VAR, NUM_CAR_MAP),
        ('add_change', ADD_CHANGE_VAR, ADD_CHANGE_MAP),
        ('num_supp', NUM_SUPP_VAR, NUM_SUPP_MAP),
        ('past_claim', PAST_CLAIM_VAR, PAST_CLAIM_MAP),
        ('day_claim', DAYS_CLAIM_VAR, DAYS_CLAIM_MAP),
    ]

    # handle_unknown='ignore' encodes a category that was absent from the training
    # split as all zeros instead of raising when the test split is transformed.
    transformers = [
        ('hot_num', OneHotEncoder(handle_unknown='ignore'), ONE_HOT_NUMERICAL),
        ('hot_cat', OneHotEncoder(handle_unknown='ignore'), ONE_HOT_CATEGORICAL),
    ]
    transformers += [
        (step_name, proc.MapTransform(columns, mapping), columns)
        for step_name, columns, mapping in ordinal_specs
    ]
    return ColumnTransformer(transformers=transformers, remainder='passthrough')


# Same recipe for both feature-selection experiments, built from one definition.
svc_preprocessor = _build_preprocessor()
anova_preprocessor = _build_preprocessor()

In [None]:
# Fit each preprocessor on the training split only, then reuse the fitted
# transformer on the test split.
X_train_svc = svc_preprocessor.fit_transform(X_train)
X_test_svc = svc_preprocessor.transform(X_test)
X_train_anova = anova_preprocessor.fit_transform(X_train)
X_test_anova = anova_preprocessor.transform(X_test)

In [None]:
# Inspect the encoded training matrix used for the SVC-based selector.
X_train_svc

In [None]:
# Inspect the encoded training matrix used for the ANOVA selector.
X_train_anova

#### Target

In [None]:
# One binarizer per experiment; each is fitted on the training labels and then
# reused to encode the test labels.
svc_label_bin, anova_label_bin = LabelBinarizer(), LabelBinarizer()

y_train_svc, y_test_svc = (
    svc_label_bin.fit_transform(y_train),
    svc_label_bin.transform(y_test),
)
y_train_anova, y_test_anova = (
    anova_label_bin.fit_transform(y_train),
    anova_label_bin.transform(y_test),
)

In [None]:
# Inspect the binarized training labels for the SVC experiment.
y_train_svc

In [None]:
# Inspect the binarized training labels for the ANOVA experiment.
y_train_anova

#### Feature selection test

#### SVC

In [None]:
# SelectFromModel ranks features through the estimator's `coef_` (or
# `feature_importances_`). SVC only exposes `coef_` with a linear kernel; with the
# default RBF kernel, get_support() and `.estimator_.coef_` cannot work.
# probability=True is left out: selection uses only the weights, and probability
# calibration adds an internal cross-validation to every fit.
svc_selector = SelectFromModel(SVC(kernel='linear', random_state=42))

# LabelBinarizer returns a column vector; estimators expect a 1-D target.
svc_selector.fit(X_train_svc, np.ravel(y_train_svc))

In [None]:
# Boolean mask over the encoded features: True where the selector keeps the feature.
svc_support = svc_selector.get_support()
print(svc_support.sum())
svc_support

In [None]:
# The selector was fitted on the ColumnTransformer output, so its mask has one
# entry per *encoded* feature. Indexing the raw X_train.columns with it fails
# because one-hot encoding changes the number of columns; use the transformer's
# own output names instead.
svc_support = svc_selector.get_support()
try:
    svc_feature_names = pd.Index(svc_preprocessor.get_feature_names_out())
except AttributeError:
    # Raised when a custom transformer does not implement get_feature_names_out;
    # fall back to positional names so the report can still be produced.
    svc_feature_names = pd.Index([f'feature_{i}' for i in range(len(svc_support))])

svc_selected = svc_feature_names[svc_support]

# coef_ may be a sparse matrix when the model was trained on sparse input.
svc_coef = svc_selector.estimator_.coef_
svc_coef = svc_coef.toarray() if hasattr(svc_coef, 'toarray') else np.asarray(svc_coef)

print(f'total features: {len(svc_support)}')
print(f'selected features: {len(svc_selected)}')
print(f'features with coef shrank to 0: {np.sum(svc_coef == 0)}')

In [None]:
# List the features kept by the SVC-based selector.
svc_selected

#### ANOVA

In [None]:
# k='all' keeps every feature: the selector is used here only to compute the
# ANOVA F-score and p-value of each encoded feature.
anova_selector = SelectKBest(f_classif, k='all')

# LabelBinarizer returns a column vector; pass the 1-D target that f_classif expects.
anova_selector.fit(X_train_anova, np.ravel(y_train_anova))

In [None]:
# Dump the raw F-scores and p-values, one labelled line each.
for label, values in (
    ('feature importance: ', anova_selector.scores_),
    ('pvalues: ', anova_selector.pvalues_),
):
    print(label, values)

In [None]:
# scores_ / pvalues_ have one entry per *encoded* feature, so they must be labelled
# with the ColumnTransformer's output names. The raw X_train.columns has a
# different length, which makes the DataFrame construction fail.
try:
    anova_feature_names = pd.Index(anova_preprocessor.get_feature_names_out())
except AttributeError:
    # Raised when a custom transformer does not implement get_feature_names_out;
    # fall back to positional names so the table can still be built.
    anova_feature_names = pd.Index(
        [f'feature_{i}' for i in range(len(anova_selector.scores_))]
    )

# One row per encoded feature, strongest F-score first.
feature_stats = pd.DataFrame({
    'Feature': anova_feature_names,
    'Score': anova_selector.scores_,
    'P-Value': anova_selector.pvalues_
}).sort_values(by='Score', ascending=False)

In [None]:
# Boolean mask over the encoded features: True where the selector keeps the feature.
anova_support = anova_selector.get_support()
print(anova_support.sum())
anova_support

In [None]:
# The support mask has one entry per *encoded* feature, so it is applied to the
# ColumnTransformer's output names rather than to the raw X_train.columns.
anova_support = anova_selector.get_support()
try:
    anova_feature_names = pd.Index(anova_preprocessor.get_feature_names_out())
except AttributeError:
    # Raised when a custom transformer does not implement get_feature_names_out;
    # fall back to positional names so the report can still be produced.
    anova_feature_names = pd.Index([f'feature_{i}' for i in range(len(anova_support))])

anova_selected = anova_feature_names[anova_support]

# SelectKBest wraps a scoring function, not an estimator, so it has no
# `estimator_.coef_`. Report the number of features whose F-test is not
# significant at the chosen level instead.
anova_alpha = 0.05
print(f'total features: {len(anova_support)}')
print(f'selected features: {len(anova_selected)}')
print(f'features with p-value >= {anova_alpha}: {np.sum(anova_selector.pvalues_ >= anova_alpha)}')

In [None]:
# List the features kept by the ANOVA selector.
anova_selected

#### Sampling

In [None]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler

# Oversample the training split only; the seed keeps the resampling reproducible.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X_train, y_train)

# Class counts after resampling, followed by the resampled shapes.
class_counts = Counter(y_res)
print(sorted(class_counts.items()))
print(X_res.shape, y_res.shape)