In [108]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from imblearn.over_sampling import ADASYN

In [110]:
data=pd.read_pickle('socar_origin_ko.p')

In [107]:
no_use_list=['b2b','sharing_type','has_previous_accident','car_model','accident_ratio','socarpass','socarsave','repair_cnt','insurance_site_aid_YN','total_prsn_cnt','car_part2','insurance_site_aid_YN']

num_col_list=['repair_cost','insure_cost']

no_cat_list = num_col_list + ['fraud_YN', 'test_set']

In [None]:
data=data[[col for col in data.columns if col not in no_use_list]]

In [80]:
one_hot_col_list= [col for col in data.columns if col not in no_cat_list]
OHE = ce.OneHotEncoder(cols=one_hot_col_list)
data = OHE.fit_transform(data)

In [56]:
scaler= RobustScaler()
data[num_col_list]=scaler.fit_transform(data[num_col_list])

In [64]:
test_y, train_y=data["fraud_YN"][data['test_set']==1],data["fraud_YN"][data['test_set']==0]
test_x, train_x=data[data['test_set']==1].drop(["fraud_YN",'test_set'], axis=1),data[(data['test_set']==0)].drop(["fraud_YN",'test_set'], axis=1)

In [65]:
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((12879, 107), (3121, 107), (12879,), (3121,))

In [85]:
smote=SMOTE(random_state=13)
x_train_over, y_train_over = smote.fit_resample(x_train, y_train)

In [90]:
rf_clf = RandomForestClassifier(random_state=13, n_jobs = -1 , n_estimators=100)
dt_clf = DecisionTreeClassifier(random_state=13, max_depth=4)
lr_clf = LogisticRegression(random_state=13, solver='liblinear')
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=64, n_jobs=-1, boost_from_average=False)


In [81]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score)
def get_clf_eval(y_test, pred):
    acc=accuracy_score(y_test, pred)
    pre=precision_score(y_test, pred)
    re=recall_score(y_test, pred)
    f1=f1_score(y_test, pred)
    auc=roc_auc_score(y_test, pred)
    return acc,pre,re,f1,auc

In [82]:
from sklearn.metrics import confusion_matrix
def print_clf_eval(y_test,pred):
    confusion = confusion_matrix(y_test, pred)
    acc, pre, re, f1, auc = get_clf_eval(y_test, pred)
    
    print(confusion)
    print(acc,pre)
    print(re,f1,auc)

In [102]:
def get_result(model, x_train, y_train, x_test, y_test):
    model.fit(x_train, y_train)
    pred=model.predict(x_test)
    return get_clf_eval(y_test, pred)

In [103]:
def get_result_pd(models, model_names, x_train, y_train, x_test, y_test):
    col_names = ['accuracy', 'precision','recall','f1','roc_auc']
    tmp = []
    for model in models:
        tmp.append(get_result(model, x_train, y_train, x_test, y_test))
    return pd.DataFrame(tmp, columns=col_names, index=model_names)

In [104]:
models = [lr_clf, dt_clf, rf_clf, lgbm_clf]
model_names = ['LinearReg','DecisionTree', 'RandomForest', 'LightGBM']

In [105]:
results = get_result_pd(models, model_names, x_train_over, y_train_over, x_test, y_test)

In [106]:
results

Unnamed: 0,accuracy,precision,recall,f1,roc_auc
LinearReg,0.997757,0.0,0.0,0.0,0.5
DecisionTree,0.731817,0.003589,0.428571,0.007117,0.580535
RandomForest,0.997757,0.0,0.0,0.0,0.5
LightGBM,0.997757,0.0,0.0,0.0,0.5


In [70]:
pre_process = ColumnTransformer(remainder = 'passthrough', 
                                transformers=[('drop_columns', 'drop',no_use_list),
                               ('scaler',scaler,num_col_list)])

In [74]:
full_pipeline = Pipeline(steps=[('pre_processing', pre_process), ('random_forest', RandomForestClassifier(max_depth=10,random_state=2))])

In [76]:
full_pipeline.fit(train_x, train_y)

Pipeline(steps=[('pre_processing',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('drop_columns', 'drop', []),
                                                 ('scaler', RobustScaler(),
                                                  ['repair_cost',
                                                   'insure_cost'])])),
                ('random_forest',
                 RandomForestClassifier(max_depth=10, random_state=2))])

In [77]:
full_pipeline.predict(test_x)

array([0, 0, 0, ..., 0, 0, 0])

In [13]:
# chat('police_site_aid_YN')

In [None]:
# def scaling(x):
#     data_num_tmp=data_num.copy()
#     transformer = x()
#     x_data = transformer.fit_transform(data_num_tmp)
#     data_num_tmp[data_num_tmp.columns]=x_data
#     pd.plotting.scatter_matrix(data_num_tmp, alpha=0.2)
#     data_num_tmp['fraud']=data['fraud_YN']
#     insure_cost_dist=data_num_tmp[data_num_tmp['fraud']==1][['repair_cost','insure_cost']]
#     insure_cost_dist.plot.box()
# for i in [RobustScaler, StandardScaler, MinMaxScaler]:
#     scaling(i)

In [None]:
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import RobustScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OrdinalEncoder
# from sklearn.preprocessing import OneHotEncoder

# num_col_list=['repair_cost','insure_cost']
# drop_list= num_col_list + ['fraud_YN']
# cat_col_list=[x for x in data.columns if x not in drop_list]

# num_pipeline = Pipeline([
#     ('robust_scaler', RobustScaler())
# ])

# full_pipeline = ColumnTransformer([
#     ("num", num_pipeline, num_col_list),
#     ("cat", OneHotEncoder(),cat_col_list)
# ])
# prepared_data = full_pipeline.fit_transform(x_train)