In [3]:
import numpy as np
# 訓練データと検証データの分割
from sklearn.model_selection import train_test_split
# データを扱う
import pandas as pd
# グラフ描画
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

In [4]:
# ロジスティック回帰
from sklearn.linear_model import LogisticRegression
# 評価関数（f1_score）
from sklearn.metrics import f1_score 
# 混同行列
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# 学習データ

In [5]:
class TrainDataset():
    def __init__(self):
        
        # CSVファイル読み込み（訓練データ）
        df = pd.read_csv('train.csv')
        
        # 不正データの補正 (レコード削除のパターンもあるため、ファイル読み込み直後に処理する)
        df = self.correction(df)
        
        # 説明変数と目的変数に分割する
        df_x = df.drop(['loan_status'], axis=1)
        self.df_y = df.loc[:,['loan_status']]
        self.targetConv()
        
        # 説明変数の量的データ、質的データ分割
        self.x_cate , self.x_cont, self.x_id = self.DataChange(df_x.copy())
        self.x_desc = self.x_cont.describe()
        
        self.x_cate_conv = self.x_cate.copy()
        self.x_cont_conv = self.x_cont.copy()
        
        # データの加工（標準化や対数化など）
        #self.std_scale = 0
        #self.max_scale = 0
        #self.dobule_scale = 0
        #self.polynomialFlg = 0
        #self.binSplitFlg = 0
        self.logFlg = 0
        self.DataConv()
        
        #self.NpToPy()
        #self.num = len(self.X_train)
        self.num = len(self.x_cont)
        self.coti_size = self.x_cont.shape[1]
        

    
    def correction(self, df):
        
        print('null check')
        print(df.isnull().sum())
        print('-------------------------')
        
        return df
    
    def viewDescribe(self):
        self.x_cont.describe()
        self.x_cate.describe(include='O')
    
    def dropCol(self):
        
        #self.x_cate_conv = self.x_cate_conv.drop(columns=['employment_length'], axis=1)
        self.x_cate_conv = self.x_cate_conv.drop(columns=['employment_length','application_type'], axis=1)
        # application_type  は使ってみる
        
        
        #self.x_cont_conv = self.x_cont_conv.drop(columns=['loan_amnt'], axis=1)
        self.x_cont_conv = self.x_cont_conv.drop(columns=['loan_amnt','credit_score'], axis=1)
        # credit_score  は使ってみる（差があるかは要確認）
        
    def DataChange(self, df_x):
        categ_cols = ['term' ,'grade' ,'employment_length', 'purpose' ,'application_type']
        contin_cols = ['loan_amnt', 'interest_rate', 'credit_score']
        index_cols = ['id']

        ####################################
        ##  データ分割
        ##  説明変数と目的変数に分ける。
        ##  説明変数はカテゴリデータと連続データに分ける。
        ####################################
        x_cate = df_x[categ_cols].copy()
        x_cont = df_x[contin_cols].copy()
        x_id = df_x[index_cols].copy()
        x_id = x_id.astype('int64')
        
        return x_cate , x_cont  ,x_id
    
    def targetConv(self):
        self.df_y = self.df_y.replace('FullyPaid', '0').replace('ChargedOff', '1')
        self.df_y.astype('float32')
        
    def NpToPy(self):
        
        #tmp = self.x_cont[['displacement_log', 'horsepower_log', 'weight_log', 'acceleration_log']]
        #tmp = self.x_cont[['horsepower_log', 'weight_log', 'acceleration_log']]
        
        x_temp = pd.concat([self.x_cate_conv , self.x_cont_conv] , axis=1)
        self.train_x_col_name = x_temp.columns.values
        #x_temp = self.x_cont.copy()
        x_temp = np.array(x_temp)
        y_temp = np.array(self.df_y, dtype=np.float64)
        self.train_y_col_name = self.df_y.columns.values
        
        # 訓練データと検証データに分割
        x_train,x_test,y_train,y_test=train_test_split(x_temp, y_temp, test_size=0.2, random_state=3) 
        
        # 
        self.X_train = x_train
        self.Y_train = y_train
        self.X_test = x_test
        self.Y_test = y_test
        
        self.downSampling()
        
        self.coti_size = self.x_cont.shape[1]
        self.in_size  = self.X_train.shape[1]
        self.out_size = self.Y_train.shape[1]  

    def downSampling(self):
        
        x = pd.DataFrame(self.X_train ,columns=self.train_x_col_name)
        y = pd.DataFrame(self.Y_train ,columns=self.train_y_col_name)
        
        train = pd.concat([x, y] ,axis=1)

        # 完済のデータと貸し倒れのデータを別々の変数に代入
        fp = train.loc[train['loan_status'] == 0, :]
        co = train[train['loan_status'] == 1]
        
        print('fp_cnt=', len(fp))
        print('co_cnt=', len(co))
        # 貸し倒れのデータ数と同じ数だけ完済のデータをランダムに取り出し
        fp = fp.sample(n=co.shape[0], random_state=0)
        print('fp_cnt=', len(fp))
        
        # 完済のデータと貸し倒れのデータを縦結合
        train = pd.concat([fp, co] ,axis=0)

        # 説明変数と目的変数をそれぞれ別の変数に代入
        self.X_train_sample = train.drop(columns=['loan_status'] ,axis=1)
        self.Y_train_sample = train['loan_status']

    
    def termConv(self):
        
        col = 'term'
        
        self.x_cate_conv[col + '_1'] = 0

        row_index = self.x_cate.index[self.x_cate[col]=='3 years']
        self.x_cate_conv.loc[row_index, col + '_1'] = 1
        
        # ALL ゼロが5 yearsなので処理しない
        row_index = self.x_cate.index[self.x_cate[col]=='5 years']
        
        self.x_cate_conv = self.x_cate_conv.drop(columns=col ,axis=1)
    
    def gradeConv(self):
        
        col = 'grade'
        
        self.x_cate_conv[col + '_01'] = 0
        self.x_cate_conv[col + '_02'] = 0
        self.x_cate_conv[col + '_03'] = 0
        self.x_cate_conv[col + '_04'] = 0
        self.x_cate_conv[col + '_05'] = 0
        
        self.x_cate_conv[col + '_06'] = 0
        self.x_cate_conv[col + '_07'] = 0
        self.x_cate_conv[col + '_08'] = 0
        self.x_cate_conv[col + '_09'] = 0
        self.x_cate_conv[col + '_10'] = 0
        
        self.x_cate_conv[col + '_11'] = 0
        self.x_cate_conv[col + '_12'] = 0
        self.x_cate_conv[col + '_13'] = 0
        self.x_cate_conv[col + '_14'] = 0
        self.x_cate_conv[col + '_15'] = 0
        
        self.x_cate_conv[col + '_16'] = 0
        self.x_cate_conv[col + '_17'] = 0
        self.x_cate_conv[col + '_18'] = 0
        self.x_cate_conv[col + '_19'] = 0
        self.x_cate_conv[col + '_20'] = 0        

        self.x_cate_conv[col + '_21'] = 0
        self.x_cate_conv[col + '_22'] = 0
        self.x_cate_conv[col + '_23'] = 0
        self.x_cate_conv[col + '_24'] = 0
        self.x_cate_conv[col + '_25'] = 0
        
        self.x_cate_conv[col + '_26'] = 0
        self.x_cate_conv[col + '_27'] = 0
        self.x_cate_conv[col + '_28'] = 0
        self.x_cate_conv[col + '_29'] = 0
            
        
        row_index = self.x_cate.index[self.x_cate[col]=='A1']
        self.x_cate_conv.loc[row_index, col + '_01'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='A2']
        self.x_cate_conv.loc[row_index, col + '_02'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='A3']
        self.x_cate_conv.loc[row_index, col + '_03'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='A4']
        self.x_cate_conv.loc[row_index, col + '_04'] = 1        
        row_index = self.x_cate.index[self.x_cate[col]=='A5']
        self.x_cate_conv.loc[row_index, col + '_05'] = 1     

        row_index = self.x_cate.index[self.x_cate[col]=='B1']
        self.x_cate_conv.loc[row_index, col + '_06'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='B2']
        self.x_cate_conv.loc[row_index, col + '_07'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='B3']
        self.x_cate_conv.loc[row_index, col + '_08'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='B4']
        self.x_cate_conv.loc[row_index, col + '_09'] = 1        
        row_index = self.x_cate.index[self.x_cate[col]=='B5']
        self.x_cate_conv.loc[row_index, col + '_10'] = 1  
 
        row_index = self.x_cate.index[self.x_cate[col]=='C1']
        self.x_cate_conv.loc[row_index, col + '_11'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='C2']
        self.x_cate_conv.loc[row_index, col + '_12'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='C3']
        self.x_cate_conv.loc[row_index, col + '_13'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='C4']
        self.x_cate_conv.loc[row_index, col + '_14'] = 1        
        row_index = self.x_cate.index[self.x_cate[col]=='C5']
        self.x_cate_conv.loc[row_index, col + '_15'] = 1     

        row_index = self.x_cate.index[self.x_cate[col]=='D1']
        self.x_cate_conv.loc[row_index, col + '_16'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='D2']
        self.x_cate_conv.loc[row_index, col + '_17'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='D3']
        self.x_cate_conv.loc[row_index, col + '_18'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='D4']
        self.x_cate_conv.loc[row_index, col + '_19'] = 1        
        row_index = self.x_cate.index[self.x_cate[col]=='D5']
        self.x_cate_conv.loc[row_index, col + '_20'] = 1  
        
        row_index = self.x_cate.index[self.x_cate[col]=='E1']
        self.x_cate_conv.loc[row_index, col + '_21'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='E2']
        self.x_cate_conv.loc[row_index, col + '_22'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='E3']
        self.x_cate_conv.loc[row_index, col + '_23'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='E4']
        self.x_cate_conv.loc[row_index, col + '_24'] = 1        
        row_index = self.x_cate.index[self.x_cate[col]=='E5']
        self.x_cate_conv.loc[row_index, col + '_25'] = 1     

        row_index = self.x_cate.index[self.x_cate[col]=='F1']
        self.x_cate_conv.loc[row_index, col + '_26'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='F2']
        self.x_cate_conv.loc[row_index, col + '_27'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='F3']
        self.x_cate_conv.loc[row_index, col + '_28'] = 1
        row_index = self.x_cate.index[self.x_cate[col]=='F4']
        self.x_cate_conv.loc[row_index, col + '_29'] = 1        
        
        # ALL ゼロがF5なので処理しない
        row_index = self.x_cate.index[self.x_cate[col]=='F5']
        
        self.x_cate_conv = self.x_cate_conv.drop(columns=col ,axis=1)
        
        
    def purposeConv(self):
        
        col = 'purpose'
        
        self.x_cate_conv[col + '_1'] = 0
        self.x_cate_conv[col + '_2'] = 0
        self.x_cate_conv[col + '_3'] = 0
        self.x_cate_conv[col + '_4'] = 0
        self.x_cate_conv[col + '_5'] = 0
        self.x_cate_conv[col + '_6'] = 0 
        self.x_cate_conv[col + '_7'] = 0
        self.x_cate_conv[col + '_8'] = 0
        
        row_index = self.x_cate.index[self.x_cate[col]=='car']
        self.x_cate_conv.loc[row_index, col + '_1'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]=='credit_card']
        self.x_cate_conv.loc[row_index, col + '_2'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]=='debt_consolidation']
        self.x_cate_conv.loc[row_index, col + '_3'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]=='home_improvement']
        self.x_cate_conv.loc[row_index, col + '_4'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]=='house']
        self.x_cate_conv.loc[row_index, col + '_5'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]=='major_purchase']
        self.x_cate_conv.loc[row_index, col + '_6'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]=='medical']
        self.x_cate_conv.loc[row_index, col + '_7'] = 1
        
        row_index = self.x_cate.index[self.x_cate[col]=='small_business']
        self.x_cate_conv.loc[row_index, col + '_8'] = 1
        
        # ALL ゼロがotherなので処理しない
        row_index = self.x_cate.index[self.x_cate[col]=='other']
        
        self.x_cate_conv = self.x_cate_conv.drop(columns=col ,axis=1)
        
    def appConv(self):
        
        col = 'application_type'
        
        self.x_cate_conv[col + '_1'] = 0

        row_index = self.x_cate.index[self.x_cate[col]=='Individual']
        self.x_cate_conv.loc[row_index, col + '_1'] = 1
        
        # ALL ゼロがJoint Appなので処理しない
        row_index = self.x_cate.index[self.x_cate[col]=='Joint App']
        
        self.x_cate_conv = self.x_cate_conv.drop(columns=col ,axis=1)
 
    def binSplit(self):
        contin_cols = self.x_cont_conv.columns.values

        for col in contin_cols:
            split_min = 0
            split_min2 = self.x_desc.loc['min',col]
            split1 = self.x_desc.loc['25%',col]
            split2 = self.x_desc.loc['50%',col]
            split3 = self.x_desc.loc['75%',col]
            std_m = self.x_desc.loc['std',col]/3
            std_p = self.x_desc.loc['std',col]*3
            split_max=  self.x_desc.loc['max',col]
            split_max2=  self.x_desc.loc['max',col]*10
            bins = [split_min,  split_min2, split1,  split2,  split3, std_m,  std_p,  split_max, split_max2]
            bins = np.sort(bins)
            self.x_cont_conv[col] = pd.cut(self.x_cont_conv[col], bins=bins, labels=False)
    
    def binSplit2(self):
        contin_cols = self.x_cont_conv.columns.values

        for col in contin_cols:
            
            bins = 30
            self.x_cont_conv[col] = pd.cut(self.x_cont_conv[col], bins=bins ,labels=False)
            
    def DataConv(self):

        self.termConv()
        self.gradeConv()
        self.purposeConv()
        #self.appConv()
        #self.binSplit2()
        self.dropCol()        
        self.NpToPy()
        

In [6]:
train_dataset = TrainDataset()

null check
id                   0
loan_amnt            0
term                 0
interest_rate        0
grade                0
employment_length    0
purpose              0
credit_score         0
application_type     0
loan_status          0
dtype: int64
-------------------------
fp_cnt= 159809
co_cnt= 33911
fp_cnt= 33911


In [7]:
train_dataset.x_cate_conv

Unnamed: 0,term_1,grade_01,grade_02,grade_03,grade_04,grade_05,grade_06,grade_07,grade_08,grade_09,...,grade_28,grade_29,purpose_1,purpose_2,purpose_3,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8
0,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242145,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
242146,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
242147,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
242148,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [8]:
train_dataset.x_desc

Unnamed: 0,loan_amnt,interest_rate,credit_score
count,242150.0,242150.0,242150.0
mean,1520.389009,13.801496,683.575024
std,830.250197,4.588924,29.554795
min,323.797279,5.704849,655.424269
25%,761.954545,10.876086,659.531106
50%,1212.680586,13.543833,678.672563
75%,2152.21333,17.172395,698.59196
max,3851.867974,27.980604,808.551641


In [9]:
train_dataset.X_train

array([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 14.55474123],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 13.88483838],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 10.62806214],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 10.90278019],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 12.81285601],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.        , 18.56445013]])

In [10]:
train_dataset.x_id

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4
...,...
242145,242145
242146,242146
242147,242147
242148,242148


# 評価データ

In [11]:
class TestDataset(TrainDataset):
    def __init__(self, x_desc):
        
        # CSVファイル読み込み（訓練データ）
        df = pd.read_csv('test.csv' )
        self.df_data = df.copy()
        # 不正データの補正 (レコード削除のパターンもあるため、ファイル読み込み直後に処理する)
        df = self.correction(df)
        
        # 目的変数の格納
        self.x_cate , self.x_cont, self.x_id = self.DataChange(df.copy())
        self.x_desc = x_desc
        
        self.x_cate_conv = self.x_cate.copy()
        self.x_cont_conv = self.x_cont.copy()
        
        # データの加工（標準化や対数化など）
        #self.std_scale = 0
        #self.max_scale = 0
        #self.dobule_scale = 0
        #self.polynomialFlg = 0
        #self.binSplitFlg = 0
        self.logFlg  = 0
        self.DataConv()
        
        #self.NpToPy()
        #self.num = len(self.x_cate)
        self.num = len(self.x_cont)
        self.coti_size = self.x_cont.shape[1]

    def __getitem__(self, index):
        return self.X_test[index]
    
    def __len__(self):
        return self.num
    
    def NpToPy(self):
        
        x_temp = pd.concat([self.x_cate_conv , self.x_cont_conv] , axis=1)
        
        #x_temp = self.x_cont.copy()
        x_temp = np.array(x_temp)
        
        # numpy の値を torch の値に変換する
        self.X_vat = x_temp



In [12]:
test_dataset = TestDataset(train_dataset.x_desc)

null check
id                   0
loan_amnt            0
term                 0
interest_rate        0
grade                0
employment_length    0
purpose              0
credit_score         0
application_type     0
dtype: int64
-------------------------


In [13]:
test_dataset.x_id

Unnamed: 0,id
0,242150
1,242151
2,242152
3,242153
4,242154
...,...
26895,269045
26896,269046
26897,269047
26898,269048


# モデル定義

In [14]:
model = LogisticRegression()
model.fit(train_dataset.X_train, train_dataset.Y_train)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

# 予測する

In [32]:
y_pred_train = model.predict(train_dataset.X_train)
#y_pred_train = model.predict_proba(train_dataset.X_train)
print(y_pred_train)
score_train = f1_score(train_dataset.Y_train, y_pred_train)

# f1_scoreの表示
print(score_train)



[0. 0. 0. ... 0. 0. 0.]
0.1521327256899995


In [33]:
# f1_scoreの表示
print(score_train)

0.1521327256899995


In [34]:
# 混同行列
confusion_matrix(train_dataset.Y_train, y_pred_train)

array([[157633,   2176],
       [ 30940,   2971]], dtype=int64)

In [37]:
print(classification_report(train_dataset.Y_train, y_pred_train))

              precision    recall  f1-score   support

         0.0       0.84      0.99      0.90    159809
         1.0       0.58      0.09      0.15     33911

    accuracy                           0.83    193720
   macro avg       0.71      0.54      0.53    193720
weighted avg       0.79      0.83      0.77    193720



In [38]:

y_pred_test = model.predict(train_dataset.X_test)
score_test = f1_score(train_dataset.Y_test, y_pred_test)

# f1_scoreの表示
print(score_test)

0.15386210471747017


In [39]:
# 混同行列
confusion_matrix(train_dataset.Y_test, y_pred_test)

array([[39527,   458],
       [ 7703,   742]], dtype=int64)

In [40]:
print(classification_report(train_dataset.Y_test, y_pred_test))

              precision    recall  f1-score   support

         0.0       0.84      0.99      0.91     39985
         1.0       0.62      0.09      0.15      8445

    accuracy                           0.83     48430
   macro avg       0.73      0.54      0.53     48430
weighted avg       0.80      0.83      0.78     48430



# 閾値変更

In [42]:
model2 = LogisticRegression()
model2.fit(train_dataset.X_train, train_dataset.Y_train)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

# 予測精度の確認

### 訓練データ

In [43]:
y_pred_train = model.predict_proba(train_dataset.X_train)

df_pred_train = pd.DataFrame(y_pred_train)
print(df_pred_train)


               0         1
0       0.848712  0.151288
1       0.837461  0.162539
2       0.886462  0.113538
3       0.893607  0.106393
4       0.877392  0.122608
...          ...       ...
193715  0.902385  0.097615
193716  0.831780  0.168220
193717  0.860828  0.139172
193718  0.822413  0.177587
193719  0.755887  0.244113

[193720 rows x 2 columns]


In [108]:
# 予測結果から閾値の算出
chargedoff_val = train_dataset.Y_train.sum()
chargedoff_val = int(chargedoff_val)
proba_desc = df_pred_train[1].sort_values(ascending = False)
threshold = proba_desc.iat[chargedoff_val]
print(threshold)


def classification(x):
    #print(len(x))
    
    if x.ndim == 1:
        if x >= threshold:
            return 1
        else:
            return 0
    else:
        tmp = np.zeros(len(x) ,dtype=np.int32)
        for i in range(len(x)):
            if x[i] >= threshold:
                tmp[i] = 1
            
        return tmp

        

0.4


In [109]:
pred_tmp = classification(np.array(df_pred_train[1]))
score_train = f1_score(train_dataset.Y_train, pred_tmp)
# f1_scoreの表示
print(score_train)


0.23973897389738974


In [110]:
# 混同行列
confusion_matrix(train_dataset.Y_train, pred_tmp)

array([[154607,   5202],
       [ 28584,   5327]], dtype=int64)

In [112]:
print(classification_report(train_dataset.Y_train, pred_tmp))

              precision    recall  f1-score   support

         0.0       0.84      0.97      0.90    159809
         1.0       0.51      0.16      0.24     33911

    accuracy                           0.83    193720
   macro avg       0.67      0.56      0.57    193720
weighted avg       0.78      0.83      0.79    193720



### 検証データ

In [113]:
y_pred_test = model.predict_proba(train_dataset.X_test)

df_pred_test = pd.DataFrame(y_pred_test)
print(df_pred_test)


              0         1
0      0.527592  0.472408
1      0.774070  0.225930
2      0.883242  0.116758
3      0.972063  0.027937
4      0.905930  0.094070
...         ...       ...
48425  0.840832  0.159168
48426  0.724808  0.275192
48427  0.895805  0.104195
48428  0.887001  0.112999
48429  0.794577  0.205423

[48430 rows x 2 columns]


In [114]:
pred_tmp = classification(np.array(df_pred_test[1]))
score_train = f1_score(train_dataset.Y_test,pred_tmp )
# f1_scoreの表示
print(score_train)


0.2408069692801467


In [115]:
# 混同行列
confusion_matrix(train_dataset.Y_test, pred_tmp)

array([[38838,  1147],
       [ 7132,  1313]], dtype=int64)

In [116]:
print(classification_report(train_dataset.Y_test, pred_tmp))

              precision    recall  f1-score   support

         0.0       0.84      0.97      0.90     39985
         1.0       0.53      0.16      0.24      8445

    accuracy                           0.83     48430
   macro avg       0.69      0.56      0.57     48430
weighted avg       0.79      0.83      0.79     48430



# CSV出力

In [105]:
y_pred_vat = model.predict_proba(test_dataset.X_vat)

df_pred_vat = pd.DataFrame(y_pred_vat)
print(df_pred_vat)
pred_tmp = classification(np.array(df_pred_vat[1]))

              0         1
0      0.875332  0.124668
1      0.740738  0.259262
2      0.964701  0.035299
3      0.969841  0.030159
4      0.903196  0.096804
...         ...       ...
26895  0.877215  0.122785
26896  0.973854  0.026146
26897  0.911261  0.088739
26898  0.912889  0.087111
26899  0.959944  0.040056

[26900 rows x 2 columns]


In [107]:
df_out = pd.DataFrame(pred_tmp, columns=['val'])
df_out = pd.concat([test_dataset.x_id['id'] ,df_out['val']] , axis=1)
df_out.to_csv('./submit.csv', encoding='utf_8_sig' , header=False ,index=False)
df_out

Unnamed: 0,id,val
0,242150,0
1,242151,1
2,242152,0
3,242153,0
4,242154,0
...,...,...
26895,269045,0
26896,269046,0
26897,269047,0
26898,269048,0
