In [2]:
import pandas as pd
import numpy as np
import collections
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from matplotlib.patches import ArrowStyle
from scipy import stats
from sklearn import metrics
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score, log_loss
from sklearn.metrics import confusion_matrix
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.feature_selection import VarianceThreshold
import csv
%matplotlib inline

In [3]:
df = pd.read_csv('./dat', encoding='utf-8', sep=',') # data read
df_proc = df.copy()

In [4]:
df_proc['x0_age'] = df_proc['x0_age'].fillna(df_proc['x0_age'].median())

In [5]:
df_proc['x10'] = df_proc['x10'].fillna(df_proc['x10'].median())

In [6]:
df_hist = df_proc.copy()

In [7]:
df_proc['x58'] = df_proc['x58'].fillna(0)
df_proc['x59'] = df_proc['x59'].fillna(0)
df_proc['x60'] = df_proc['x60'].fillna(0)
df_proc['x61'] = df_proc['x61'].fillna(0)
df_proc['x62'] = df_proc['x62'].fillna(-1)
df_proc['x63'] = df_proc['x63'].fillna(0)
df_proc['x64'] = df_proc['x64'].fillna(99999999)
df_proc['x65'] = df_proc['x65'].fillna(0)
df_proc['x66'] = df_proc['x66'].fillna(0)
df_proc['x67'] = df_proc['x67'].fillna(99)
df_proc['x68'] = df_proc['x68'].fillna(0)
df_proc['x69'] = df_proc['x69'].fillna(0)
df_proc['x70'] = df_proc['x70'].fillna(0)
df_proc['x71'] = df_proc['x71'].fillna(-1)

In [8]:
# one hot encoding
df_proc = pd.get_dummies(df_proc, columns=['x9'])

In [9]:
dfs = df_proc.copy()

In [10]:
drop_col = ['x14','x18','x19','x21','x22','x23','x27','x28','x29','x30','x31','x33','x34','x35','x36','x37','x38','x40','x49','x72']
df_spray = df_proc.drop(drop_col, axis=1)

In [11]:
# 相関があるが、同数の項目値が0⇆99と異なる。「x6」は99→0へ変換する。
df_spray['x6'] = df_spray['x6'].replace(99, 0)

In [12]:
# 相関があるが、同数の項目値が0⇆99と異なる。「x6」は99→0へ変換する。
df_spray['x67'] = df_spray['x67'].replace(99, 0)

In [13]:
# 相関があるが、同数の項目値が0⇆99999999へ変換すると異なる。「x63」は0→99999999へ変換する。
df_spray['x63'] = df_spray['x63'].replace(0, 99999999)

In [14]:
df_del = df_spray.copy()

In [15]:
df_del = df_del.drop(columns='x61')
df_del = df_del.drop(columns='x65')
df_del = df_del.drop(columns='x66')
df_del = df_del.drop(columns='x70')

In [16]:
df_label = df_del.copy()

In [17]:
# ランダム・フォレストによる相互作用評価
X = df_label.drop(columns='y')
y = df_label['y']

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=0)

classifier_model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier_model.fit(train_X, train_y)

pred_y = classifier_model.predict(test_X)
cm = confusion_matrix(test_y, pred_y)
print(cm)
print(accuracy_score(test_y, pred_y))

[[14929    61]
 [  628   132]]
0.9562539682539682


In [18]:
# Importtance
# 組み込み法（モデルベース特徴量選択：モデルが学習するタイミングで特徴量を評価する）
importance = classifier_model.feature_importances_
importances = pd.DataFrame(importance, index = X.columns, columns=['importance'])
pd.set_option('display.max_rows', 80)
importances.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
x1_term,0.143358
x10,0.098021
x0_age,0.095925
x5,0.075107
x3,0.05806
x57,0.049504
x6,0.039396
x4,0.038059
x56,0.037499
x2_sex,0.025759


In [27]:
df_zero = df_label.copy()

In [28]:
# 下記も削除対象とする
cols =['x46', 'x48', 'x50', 'x51'] # 'x47'残す
df_zero = df_zero.drop(columns=cols)

In [29]:
# ②他の特徴量と値が完全に一致している変数の片方を削除
train_X_T = train_X.T
duplicated_features = train_X_T[train_X_T.duplicated()].index.values
print(duplicated_features)
# 'x47' 'x48' 'x50' 'x51' ←1つ残す、ただし全て相互作用評価結果(Importance=0)

['x47' 'x48' 'x50' 'x51']


In [30]:
df_add = df_zero.copy()
df_add['x73'] = (df_add['x0_age'] + df_add['x10'])/2
df_add['x74'] = (df_add['x5'] + df_add['x6'])/2
df_add['x75'] = (df_add['x59'] + df_add['x67'])/2
df_add['x76'] = (df_add['x59'] + df_add['x68'])/2
df_add['x77'] = (df_add['x62'] + df_add['x71'])/2
df_add['x78'] = (df_add['x63'] + df_add['x64'])/2

In [31]:
# ランダム・フォレストによる相互作用評価
X_re = df_add.drop(columns='y')
y_re = df_add['y']

train_X_re, test_X_re, train_y_re, test_y_re = train_test_split(X_re, y_re, test_size=0.3, random_state=0)

classifier_model_re = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier_model_re.fit(train_X_re, train_y_re)

pred_y_re = classifier_model_re.predict(test_X_re)
cm_re = confusion_matrix(test_y_re, pred_y_re)
print(cm_re)
print(accuracy_score(test_y, pred_y))

[[14921    69]
 [  631   129]]
0.9562539682539682


In [32]:
# Importtance
importance_re = classifier_model_re.feature_importances_
importances_re = pd.DataFrame(importance_re, index = X_re.columns, columns=['importance'])
pd.set_option('display.max_rows', 50)
importances_re.sort_values(by='importance', ascending=False)

Unnamed: 0,importance
x1_term,0.118974
x73,0.080239
x10,0.077516
x0_age,0.073920
x5,0.059724
...,...
x62,0.000701
x71,0.000691
x67,0.000586
x68,0.000526


In [33]:
# Importanceからも削除
print(type(importances_re))
print(importances_re.shape)
importances_re = importances_re.drop('x75', axis=0)
importances_re = importances_re.drop('x76', axis=0)
importances_re = importances_re.drop('x77', axis=0)
importances_re = importances_re.drop('x78', axis=0)

<class 'pandas.core.frame.DataFrame'>
(59, 1)


In [34]:
df_select_del = df_add.copy()
del_col = ['x7', 'x8', 'x11', 'x15', 'x16', 'x24', 'x25', 'x26', 'x32', 'x39','x41', 'x44', 'x47', 'x54', 'x58', 'x59', 'x60', 'x62', 'x63', 'x64','x67', 'x68', 'x69', 'x71']
df_select_del = df_select_del.drop(columns=del_col)

In [35]:
df_light = df_select_del.copy()
df_light.shape

(52500, 36)

In [36]:
# デフォルト
# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'binary',
#     'num_leaves': 31,
#     'num_iterations': 100,
#     'learning_rate': 0.1,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.8,
#     'max_depth': ,
#     'min_data_in_leaf': 20,
# }

In [37]:
#     'boosting_type': 'gbdt', # アルゴリズム：勾配ブースティング
#     'objective': 'binary', # 目的 : 2値分類 
#     'metric': {'binary_error'}, # 評価指標 : 誤り率, binary_error（正答率）, binary_logloss(クロスエントロピー), auc
#     'num_leaves': 200, # Treeの複雑さ：（デフォルトは31）
#     'num_iterations':200, # 木の数：（デフォルトは100）
#     'learning_rate': 0.01, # 学習率：（デフォルト0.1の半分）
#     'feature_fraction': 0.9, # 各木を作成するときに使用可能な特徴量の割合：90%（デフォルト1.0）
#     'bagging_fraction': 0.8, # 使用するデータの割合：80%（デフォルト1.0）
#     'bagging_freq': 10, # 何回に一回baggingするか
#     'max_depth':10, # デフォルト：None
#     'colsample_bytree':1,
#     'n_estimators':50000, # ランダムサーチ(パラメータ範囲指定)
#     'min_data_in_leaf': 20, # 決定木のノード（葉）の最小データ数
#     'vervose': 0 # 途中経過出力：する

In [38]:
# # 自動チューニング
# import optuna.integration.lightgbm as lgb_tune

# df_train, df_val =train_test_split(df_light, test_size=0.2)
 
# col = "y"
# train_y = df_train[col]
# train_x = df_train.drop(col, axis=1)
 
# val_y = df_val[col]
# val_x = df_val.drop(col, axis=1)
 
# trains = lgb.Dataset(train_x, train_y)
# valids = lgb.Dataset(val_x, val_y)
 
# model_tune = lgb_tune.train(
#     params, trains, valid_sets=valids, 
#     verbose_eval=100, 
#     early_stopping_rounds=100,
# )
# print(model_tune.params)

# # 予測
# preds = model_tune.predict(train_x)
# # AUC
# fpr, tpr, thresholds = metrics.roc_curve(train_y, preds)
# auc = metrics.auc(fpr, tpr)
# auc

In [39]:
# ホールドアウト法→交差検証 (クロスバリデーション)に切り替える
kf = KFold(n_splits=5, shuffle=True, random_state=0)
predicted_df = pd.DataFrame({'index':0, 'pred':0}, index=[1])

val_indexs=[]
i=0
for train_index, val_index in kf.split(df_light.index):
    X_train = df_light.drop('y', axis=1).iloc[train_index]
    y_train = df_light['y'].iloc[train_index]
    X_test = df_light.drop('y', axis=1).iloc[val_index]
    y_test = df_light['y'].iloc[val_index]
    
# XGBoost Modeling
    clf = XGBClassifier(
            objective='binary:logistic',
            num_leaves=31,
            num_iterations=100,
            learning_rate=0.1,
            feature_fraction=1,
            subsample=1,
            max_depth=1,
            min_data_in_leaf=5
    )
    metLearn=CalibratedClassifierCV(clf)
    clf.fit(X_train, y_train)



Parameters: { feature_fraction, min_data_in_leaf, num_iterations, num_leaves } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { feature_fraction, min_data_in_leaf, num_iterations, num_leaves } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { feature_fraction, min_data_in_leaf, num_iterations, num_leaves } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find a

In [40]:
# 予測
Predictions = clf.predict_proba(X_test)

# AUC
fpr, tpr, thresholds = metrics.roc_curve(y_test, Predictions[:,1])
auc = metrics.auc(fpr, tpr)
auc

0.862829556425745

In [41]:
y_pred = np.where(Predictions[:, 1] < 0.5, 0, 1) # 閾値=0.5

In [42]:
print(accuracy_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(log_loss(y_test, y_pred))

0.9518095238095238
0.001984126984126984
0.25
0.003937007874015748
1.6644403099635339


In [43]:
# KS値
# K-S検定（コルモゴロフ-スミルノフ検定）
clf= RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
        max_depth=25, max_features='auto', max_leaf_nodes=None,
        min_impurity_decrease=0.0, min_impurity_split=None,
        min_samples_leaf=1, min_samples_split=15,
        min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=4,
        oob_score=False, random_state=0, verbose=0, warm_start=False)

clf.fit(X_train, y_train)
pre=clf.predict_proba(X_train)
pre_pd = pd.DataFrame(pre[:,0])
pre_pd['y'] = y_train

In [44]:
# import scipy.stats
# from sklearn.model_selection import RandomizedSearchCV
# LR_random = {LogisticRegression(): {"C": scipy.stats.uniform(0.00001, 1000),
#                                     "random_state": scipy.stats.randint(0, 100)}}
# #ランダムサーチ
# max_score = 0
# for model, param in LR_random.items():
#     clf =RandomizedSearchCV(model, param)
#     clf.fit(X_train_C, y_train)
#     perds_C = clf.predict_proba(X_test_C)
#     y_pred_C = np.where(perds_C[:, 1] < 0.7, 0, 1)
#     score = f1_score(y_test, y_pred_C, average="micro")

# # 最高スコア時の値を取得
#     if max_score < score:
#         max_score = score
#         best_model = model.__class__.__name__

# print("max_score:{}".format(max_score))
# print("best_param:{}".format(best_param))

In [45]:
# 変数のAUC
X_train_C = X_train.loc[:,['x0_age','x1_term','x9_C']].copy()
X_train_D = X_train.loc[:,['x0_age','x1_term','x9_D']].copy()
X_train_H = X_train.loc[:,['x0_age','x1_term','x9_H']].copy()
X_train_K = X_train.loc[:,['x0_age','x1_term','x9_K']].copy()
X_train_N = X_train.loc[:,['x0_age','x1_term','x9_N']].copy()
X_train_Q = X_train.loc[:,['x0_age','x1_term','x9_Q']].copy()
X_train_R = X_train.loc[:,['x0_age','x1_term','x9_R']].copy()
X_train_S = X_train.loc[:,['x0_age','x1_term','x9_S']].copy()
X_train_T = X_train.loc[:,['x0_age','x1_term','x9_T']].copy()

X_test_C = X_test.loc[:,['x0_age','x1_term','x9_C']].copy()
X_test_D = X_test.loc[:,['x0_age','x1_term','x9_D']].copy()
X_test_H = X_test.loc[:,['x0_age','x1_term','x9_H']].copy()
X_test_K = X_test.loc[:,['x0_age','x1_term','x9_K']].copy()
X_test_N = X_test.loc[:,['x0_age','x1_term','x9_N']].copy()
X_test_Q = X_test.loc[:,['x0_age','x1_term','x9_Q']].copy()
X_test_R = X_test.loc[:,['x0_age','x1_term','x9_R']].copy()
X_test_S = X_test.loc[:,['x0_age','x1_term','x9_S']].copy()
X_test_T = X_test.loc[:,['x0_age','x1_term','x9_T']].copy()

In [46]:
# ロジスティック回帰モデル
lr = LogisticRegression(C=961, random_state=32)

In [47]:
#ハイパーパラメータを調整しない場合
lr.fit(X_train_C, y_train)
y_pred_C = lr.predict_proba(X_test_C)

lr.fit(X_train_D, y_train)
y_pred_D = lr.predict_proba(X_test_D)

lr.fit(X_train_H, y_train)
y_pred_H = lr.predict_proba(X_test_H)

lr.fit(X_train_K, y_train)
y_pred_K = lr.predict_proba(X_test_K)

lr.fit(X_train_N, y_train)
y_pred_N = lr.predict_proba(X_test_N)

lr.fit(X_train_Q, y_train)
y_pred_Q = lr.predict_proba(X_test_Q)

lr.fit(X_train_R, y_train)
y_pred_R = lr.predict_proba(X_test_R)

lr.fit(X_train_S, y_train)
y_pred_S = lr.predict_proba(X_test_S)

lr.fit(X_train_T, y_train)
y_pred_T = lr.predict_proba(X_test_T)

In [48]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_C[:,1])
auc_C = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_D[:,1])
auc_D = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_H[:,1])
auc_H = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_K[:,1])
auc_K = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_N[:,1])
auc_N = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_Q[:,1])
auc_Q = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_R[:,1])
auc_R = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_S[:,1])
auc_S = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_T[:,1])
auc_T = metrics.auc(fpr, tpr)

In [49]:
print(auc_C)
print(auc_D)
print(auc_H)
print(auc_K)
print(auc_N)
print(auc_Q)
print(auc_R)
print(auc_S)
print(auc_T)

0.7343863140494293
0.7349830805338008
0.7334146952431767
0.7320327932760406
0.7344172788162884
0.7350843115023789
0.7347558070847386
0.7345919518601092
0.727334286889359


In [50]:
# 合成変数
X_train_3 = X_train.loc[:,['x0_age','x1_term','x73']].copy()
X_train_4 = X_train.loc[:,['x0_age','x1_term','x74']].copy()
X_train_5 = X_train.loc[:,['x0_age','x1_term','x75']].copy()
X_train_6 = X_train.loc[:,['x0_age','x1_term','x76']].copy()
X_train_7 = X_train.loc[:,['x0_age','x1_term','x77']].copy()
X_train_8 = X_train.loc[:,['x0_age','x1_term','x78']].copy()

In [51]:
X_test_3 = X_test.loc[:,['x0_age','x1_term','x73']].copy()
X_test_4 = X_test.loc[:,['x0_age','x1_term','x74']].copy()
X_test_5 = X_test.loc[:,['x0_age','x1_term','x75']].copy()
X_test_6 = X_test.loc[:,['x0_age','x1_term','x76']].copy()
X_test_7 = X_test.loc[:,['x0_age','x1_term','x77']].copy()
X_test_8 = X_test.loc[:,['x0_age','x1_term','x78']].copy()

In [52]:
# ロジスティック回帰モデル
lr = LogisticRegression(C=961, random_state=32)

In [53]:
lr.fit(X_train_3, y_train)
y_pred_3 = lr.predict_proba(X_test_3)

lr.fit(X_train_4, y_train)
y_pred_4 = lr.predict_proba(X_test_4)

lr.fit(X_train_5, y_train)
y_pred_5 = lr.predict_proba(X_test_5)

lr.fit(X_train_6, y_train)
y_pred_6 = lr.predict_proba(X_test_6)

lr.fit(X_train_7, y_train)
y_pred_7 = lr.predict_proba(X_test_7)

lr.fit(X_train_8, y_train)
y_pred_8 = lr.predict_proba(X_test_8)

In [54]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_3[:,1])
auc_3 = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_4[:,1])
auc_4 = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_5[:,1])
auc_5 = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_6[:,1])
auc_6 = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_7[:,1])
auc_7 = metrics.auc(fpr, tpr)

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_8[:,1])
auc_8 = metrics.auc(fpr, tpr)

In [55]:
print(auc_3)
print(auc_4)
print(auc_5)
print(auc_6)
print(auc_7)
print(auc_8)

0.733350483050363
0.7943484933656002
0.7350832197958548
0.7351677774284318
0.7345226781188666
0.7373817582588591


In [56]:
# 結果予測

In [57]:
df_test = pd.read_csv('./dat_test', encoding='utf-8', sep=',') # data read
df_test.shape

(21000, 73)

In [58]:
df_proc_test = df_test.copy()

In [59]:
df_proc_test['x0_age'] = df_proc_test['x0_age'].fillna(df_proc['x0_age'].median())

In [60]:
df_proc_test['x10'] = df_proc_test['x10'].fillna(df_proc['x10'].median())

In [61]:
df_proc_test['x58'] = df_proc_test['x58'].fillna(0)
df_proc_test['x59'] = df_proc_test['x59'].fillna(0)
df_proc_test['x60'] = df_proc_test['x60'].fillna(0)
df_proc_test['x61'] = df_proc_test['x61'].fillna(0)
df_proc_test['x62'] = df_proc_test['x62'].fillna(-1)
df_proc_test['x63'] = df_proc_test['x63'].fillna(0)
df_proc_test['x64'] = df_proc_test['x64'].fillna(99999999)
df_proc_test['x65'] = df_proc_test['x65'].fillna(0)
df_proc_test['x66'] = df_proc_test['x66'].fillna(0)
df_proc_test['x67'] = df_proc_test['x67'].fillna(99)
df_proc_test['x68'] = df_proc_test['x68'].fillna(0)
df_proc_test['x69'] = df_proc_test['x69'].fillna(0)
df_proc_test['x70'] = df_proc_test['x70'].fillna(0)
df_proc_test['x71'] = df_proc_test['x71'].fillna(-1)

In [62]:
# one hot encoding
df_proc_test = pd.get_dummies(df_proc_test, columns=['x9'])

In [63]:
drop_col = ['x14','x18','x19','x21','x22','x23','x27','x28','x29','x30','x31','x33','x34','x35','x36','x37','x38','x40','x49','x72']
df_proc_test = df_proc_test.drop(drop_col, axis=1)

In [64]:
df_proc_test['x6'] = df_proc_test['x6'].replace(99, 0)
df_proc_test['x67'] = df_proc_test['x67'].replace(99, 0)
df_proc_test['x63'] = df_proc_test['x63'].replace(0, 99999999)

In [65]:
df_proc_test = df_proc_test.drop(columns='x61')
df_proc_test = df_proc_test.drop(columns='x65')
df_proc_test = df_proc_test.drop(columns='x66')
df_proc_test = df_proc_test.drop(columns='x70')

In [66]:
# 下記も削除対象とする
cols =['x46', 'x48', 'x50', 'x51'] # 'x47'残す
df_proc_test = df_proc_test.drop(columns=cols)

In [67]:
df_proc_test['x73'] = (df_proc_test['x0_age'] + df_proc_test['x10'])/2
df_proc_test['x74'] = (df_proc_test['x5'] + df_proc_test['x6'])/2
df_proc_test['x75'] = (df_proc_test['x59'] + df_proc_test['x67'])/2
df_proc_test['x76'] = (df_proc_test['x59'] + df_proc_test['x68'])/2
df_proc_test['x77'] = (df_proc_test['x62'] + df_proc_test['x71'])/2
df_proc_test['x78'] = (df_proc_test['x63'] + df_proc_test['x64'])/2

In [68]:
del_col = ['x9_X','x7', 'x8', 'x11', 'x15', 'x16', 'x24', 'x25', 'x26', 'x32', 'x39','x41', 'x44', 'x47', 'x54', 'x58', 'x59', 'x60', 'x62', 'x63', 'x64','x67', 'x68', 'x69', 'x71']
df_proc_test = df_proc_test.drop(columns=del_col)

In [69]:
df_proc_test.shape

(21000, 35)

In [70]:
# 予測
Pred_test = clf.predict_proba(df_proc_test)

In [71]:
y_pred_test = np.where(Pred_test[:, 1] < 0.5, 0, 1) # 閾値=0.5
y_pred_test.shape

(21000,)

In [72]:
nonzero = np.count_nonzero(y_pred_test)
nonzero

182

In [73]:
with open('./test_result_2.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(y_pred_test)

In [74]:
with open('./test_result_2.csv') as f:
    print(f.read())

0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,