In [159]:
import pandas as pd
import numpy as np

from sklearn.metrics import f1_score

In [160]:
train = pd.read_csv("tidy_train.csv")
train = train.drop(columns=('Unnamed: 0'))
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22228 entries, 0 to 22227
Data columns (total 35 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   policy_tenure                     22228 non-null  float64
 1   age_of_car                        22228 non-null  float64
 2   age_of_policyholder               22228 non-null  float64
 3   area_cluster                      22228 non-null  int64  
 4   population_density                22228 non-null  float64
 5   make                              22228 non-null  int64  
 6   segment                           22228 non-null  int64  
 7   model                             22228 non-null  int64  
 8   fuel_type                         22228 non-null  int64  
 9   airbags                           22228 non-null  float64
 10  is_esc                            22228 non-null  int64  
 11  is_adjustable_steering            22228 non-null  int64  
 12  is_t

In [161]:
train.describe()

Unnamed: 0,policy_tenure,age_of_car,age_of_policyholder,area_cluster,population_density,make,segment,model,fuel_type,airbags,...,is_brake_assist,is_power_door_locks,is_central_locking,is_power_steering,is_driver_seat_height_adjustable,is_day_night_rear_view_mirror,is_ecw,is_speed_alert,ncap_rating,is_claim
count,22228.0,22228.0,22228.0,22228.0,22228.0,22228.0,22228.0,22228.0,22228.0,22228.0,...,22228.0,22228.0,22228.0,22228.0,22228.0,22228.0,22228.0,22228.0,22228.0,22228.0
mean,0.918931,0.00072,0.351809,7.682877,0.419201,1.760122,2.926669,4.432518,1.949703,3.132311,...,0.546293,0.721387,0.721387,0.97935,0.582014,0.379386,0.721387,0.993297,1.7509,0.064198
std,0.862633,0.029988,0.527929,4.505899,0.611739,1.135428,1.567091,2.601827,0.804947,1.830441,...,0.497864,0.448326,0.448326,0.142212,0.493239,0.485245,0.448326,0.0816,1.394023,0.245111
min,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,3.0,0.0,1.0,1.0,1.0,1.0,2.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,1.0,0.0,0.0,8.0,0.0,1.0,3.0,4.0,2.0,2.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,2.0,0.0
75%,2.0,0.0,1.0,10.0,1.0,3.0,5.0,6.0,3.0,6.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,0.0
max,2.0,2.0,2.0,22.0,2.0,5.0,6.0,11.0,3.0,6.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.0,1.0


## split data

In [162]:
### split data
# 因為演算法設計上將 x y 是置於同一 df中因此拆分時不特別做 x、y切分
split = round(len(train)*0.8)
training_set = train[:split]
testing_set = train[split:]

## Navie Bayes

In [144]:
from navie_bayes import naive_bayes_train,naive_bayes_test

In [145]:
target_feature = 'is_claim'
feature_names = list(train.columns)
feature_names.remove(target_feature)

# put training set into naive_bayes_train, and return the probabilities of all features
prob = naive_bayes_train(training_set, target_feature, feature_names)

# use the prob to apply into each instance, and return the better class
result_nb =testing_set.apply(naive_bayes_test,axis=1, args=(feature_names,prob))

In [146]:
data = {'real':testing_set['is_claim'],
       'pred':result_nb}
acc = pd.DataFrame(data)

# criteria：Accuracy , F1-score
print('Accuracy on the test data is ' + str(round(sum(acc['real']==acc['pred'] ) / (1.0*len(testing_set)),2)))
print('F1-score on the test data is ' +str(round(f1_score(acc['real'], acc['pred'], average='weighted'),2)))

Accuracy on the test data is 0.94
F1-score on the test data is 0.91


## random_forest

In [147]:
from random_forest import random_forest_train,random_forest_test
from pprint import pprint

In [148]:
# n: the number of the tree
n = 10

# put training set into random_forest_train, and return all the tree into forest
forest =random_forest_train(training_set,'is_claim', n_estimators =n)

# use forest to pred the class, and bagging for the best class 
result_rf = random_forest_test(testing_set,forest, n_estimators =n)

In [149]:
data = {'real':testing_set['is_claim'],
       'pred':result_rf}
acc = pd.DataFrame(data)

# criteria：Accuracy , F1-score
print('Accuracy on the test data is ' + str(round(sum(acc['real']==acc['pred'] ) / (1.0*len(testing_set)),2)))
print('F1-score on the test data is ' +str(round(f1_score(acc['real'], acc['pred'], average='weighted'),2)))

Accuracy on the test data is 0.94
F1-score on the test data is 0.91


## RandomForestClassifier by sklearn

In [150]:
split = round(len(train)*0.8)
training_set = train[:split]
testing_set = train[split:]

training_X = training_set.loc[:,:'ncap_rating']
training_y = training_set['is_claim']
testing_X = testing_set.loc[:,:'ncap_rating']
testing_y = testing_set['is_claim']

In [151]:
from sklearn.ensemble import RandomForestClassifier

In [152]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(training_X, training_y)
result_sk_rf = clf.predict(testing_X)

In [153]:
data = {'real':testing_y,
       'pred':result_sk_rf}
acc = pd.DataFrame(data)

# criteria：Accuracy , F1-score
print('Accuracy on the test data is ' + str(round(sum(acc['real']==acc['pred'] ) / (1.0*len(testing_set)),2)))
print('F1-score on the test data is ' +str(round(f1_score(acc['real'], acc['pred'], average='weighted'),2)))

Accuracy on the test data is 0.94
F1-score on the test data is 0.91


## xgboost

In [154]:
from xgboost import XGBClassifier,XGBRegressor

In [155]:
model = XGBClassifier()
model.fit(training_X, training_y)

result_xg = model.predict(testing_X)

In [156]:
data = {'real':testing_y,
       'pred':result_xg}
acc = pd.DataFrame(data)

# criteria：Accuracy , F1-score
print('Accuracy on the test data is ' + str(round(sum(acc['real']==acc['pred'] ) / (1.0*len(testing_set)),2)))
print('F1-score on the test data is ' +str(round(f1_score(acc['real'], acc['pred'], average='weighted'),2)))

Accuracy on the test data is 0.94
F1-score on the test data is 0.91


# Cross-validation

In [134]:
split = round(len(train)*0.8)
training_set = train[:split]
testing_set = train[split:]

In [135]:
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score

In [136]:
###  naive_bayes  ###

from navie_bayes import naive_bayes_train,naive_bayes_test
acc_list = []
for i in ([3,5,10]):
    kf = KFold(n_splits=i)
    for train, valid in kf.split(training_set):
        
        # split train / validation
        target_feature = 'is_claim'
        feature_names = list(training_set.columns)
        feature_names.remove(target_feature)

        training = training_set.iloc[train]
        valid = training_set.iloc[valid]
        
        # main：training model and predict result
        prob = naive_bayes_train(training, target_feature, feature_names)
        result_nb =valid.apply(naive_bayes_test,axis=1, args=(feature_names,prob))
        
        # criteria：Accuracy 
        data = {'real':valid['is_claim'],
                'pred':result_nb}
        acc = pd.DataFrame(data)
        accuracy = round(f1_score(acc['real'], acc['pred'], average='weighted'),2)
        
        acc_list.append(accuracy)
    
    # criteria：Average accuracy 
    print(f'{i}-fold of F1-score on the test data is ' +str(np.mean(acc_list)))


3-fold of F1-score on the test data is 0.9
5-fold of F1-score on the test data is 0.9012500000000001
10-fold of F1-score on the test data is 0.9022222222222221


In [139]:
###  random forest   ###
from random_forest import random_forest_train,random_forest_test
acc_list = []
for i in ([3,5,10]):
    kf = KFold(n_splits=i)
    for train, valid in kf.split(training_set):
        
        # n : number of trees
        n = 100
        
        # split train / validation
        training = training_set.iloc[train]
        valid = training_set.iloc[valid]
        
        # main：training model and predict result
        forest =random_forest_train(training,'is_claim', n_estimators =n)
        result_rf = random_forest_test(valid,forest, n_estimators =n)
        
        # criteria：Accuracy 
        data = {'real':valid['is_claim'],
                    'pred':result_rf}
        acc = pd.DataFrame(data)
        accuracy = round(f1_score(acc['real'], acc['pred'], average='weighted'),2)

        acc_list.append(accuracy)
    
    # criteria：Average accuracy 
    print(f'{i}-fold of F1-score on the test data is ' +str(np.mean(acc_list)))

3-fold of F1-score on the test data is 0.9
5-fold of F1-score on the test data is 0.9012500000000001
10-fold of F1-score on the test data is 0.9022222222222221


In [137]:
###　RandomForestClassifier by sklearn  ###
from sklearn.ensemble import RandomForestClassifier

acc_list = []
for i in ([3,5,10]):
    kf = KFold(n_splits=i)
    for train, valid in kf.split(training_set):
        
        # split train / validation
        training = training_set.iloc[train]
        valid = training_set.iloc[valid]
        
        training_X = training.loc[:,:'ncap_rating']
        training_y = training['is_claim']
        valid_X = valid.loc[:,:'ncap_rating']
        valid_y = valid['is_claim']
        
        # main：training model and predict result
        clf = RandomForestClassifier(max_depth=2, random_state=0)
        clf.fit(training_X, training_y)
        result_sk_rf = clf.predict(valid_X)
        
        # criteria：Accuracy 
        data = {'real':valid_y,
               'pred': result_sk_rf}
        acc = pd.DataFrame(data)
        
        accuracy = round(f1_score(acc['real'], acc['pred'], average='weighted'),2)

        acc_list.append(accuracy)
    
    # criteria：Average accuracy 
    print(f'{i}-fold of F1-score on the test data is ' +str(np.mean(acc_list)))

3-fold of F1-score on the test data is 0.9
5-fold of F1-score on the test data is 0.9012500000000001
10-fold of F1-score on the test data is 0.9022222222222221


In [138]:
### xgboost ###

from xgboost import XGBClassifier,XGBRegressor

acc_list = []
for i in ([3,5,10]):
    kf = KFold(n_splits=i)
    for train, valid in kf.split(training_set):
        
        # split train / validation
        training = training_set.iloc[train]
        valid = training_set.iloc[valid]
        
        training_X = training.loc[:,:'ncap_rating']
        training_y = training['is_claim']
        valid_X = valid.loc[:,:'ncap_rating']
        valid_y = valid['is_claim']
        
        # main：training model and predict result
        model = XGBClassifier()
        model.fit(training_X, training_y)
        result_xg = model.predict(valid_X)
        
        # criteria：Accuracy
        data = {'real':valid_y,
               'pred': result_xg}
        acc = pd.DataFrame(data)
        
        accuracy = round(f1_score(acc['real'], acc['pred'], average='weighted'),2)

        acc_list.append(accuracy)
    
    # criteria：Average accuracy 
    print(f'{i}-fold of F1-score on the test data is ' +str(np.mean(acc_list)))

3-fold of F1-score on the test data is 0.9
5-fold of F1-score on the test data is 0.9
10-fold of F1-score on the test data is 0.9016666666666667


### Cross validation Q2

In [110]:
def bagging(x):
    return np.argmax(x.value_counts())

In [166]:
# easy conbine all the result, and bagging for the best result

acc_list = []
for i in ([3,5,10]):
    kf = KFold(n_splits=i)
    for train_, test_ in kf.split(train):
        
        target_feature = 'is_claim'
        feature_names = list(training_set.columns)
        feature_names.remove(target_feature)
        
        # split train / test
        training = train.iloc[train_]
        testing = train.iloc[test_]
        
        df_result = pd.DataFrame(index=testing.index)
        
        # main：training model and predict result
        prob = naive_bayes_train(training, target_feature, feature_names)
        result_nb =valid.apply(naive_bayes_test,axis=1, args=(feature_names,prob))
        df_result[0] = result_nb
        
        # random_forest #
        n = 10
        forest =random_forest_train(training_set,'is_claim', n_estimators =n)
        result_rf = random_forest_test(testing_set,forest, n_estimators =n)
        df_result[1] = result_rf
        
        
        training_X = training.loc[:,:'ncap_rating']
        training_y = training['is_claim']
        testing_X = testing.loc[:,:'ncap_rating']
        testing_y = testing['is_claim']
        
        # RandomForestClassifier by sklearn #
        clf = RandomForestClassifier(max_depth=2, random_state=0)
        clf.fit(training_X, training_y)
        result_sk_rf = clf.predict(testing_X)
        df_result[2] = result_sk_rf 
        
        # xgboost #
        model = XGBClassifier()
        model.fit(training_X, training_y)
        result_xg = model.predict(testing_X)
        df_result[3] = result_xg
        
        result= df_result.apply(bagging, axis=1)
        
        data = {'real':testing_y,
               'pred' :result}
        acc = pd.DataFrame(data)
        
        result = f1_score(acc['real'], acc['pred'], average='weighted')
        
        acc_list.append(result)
    # criteria：Average accuracy 
    print(f'{i}-fold of F1-score on the test data is ' +str(np.mean(acc_list)))

3-fold of F1-score on the test data is 0.9047700922639864
5-fold of F1-score on the test data is 0.9047693083596062
10-fold of F1-score on the test data is 0.904773278539197
