In [None]:
import sys
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import japanize_matplotlib #日本語読み込み

from pandas.plotting import parallel_coordinates
get_ipython().run_line_magic('matplotlib', 'inline')
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics  import log_loss

import gc  # used to release memory
import pickle  # used to save the model

In [None]:
# Let pandas render up to 200 columns when a frame is displayed.
pd.set_option('display.max_columns', 200)

# Load the input tables.
train_pitch_df = pd.read_csv("../Data/train_pitch_add_column1.csv")
train_player_df = pd.read_csv("../Data/train_player.csv")

print(train_pitch_df.columns)
train_pitch_df.head()

# Columns that are one-hot encoded below.
columns1 = [
    '球場名', '試合種別詳細', 'イニング', '表裏', '打者打席左右',
    '投手投球左右', '投手役割', '投手登板順', '打者打順', '打者守備位置',
    '1球前ステータス',
]

# Columns that would also need one-hot encoding but have too many levels;
# they are put on hold and dropped from the feature set below.
columns2 = [
    '日付', '時刻', '年度', '試合ID', '投手ID', '投手チームID', '打者ID',
    '打者チームID', 'プレイ前走者状況',
    '一塁走者ID', '二塁走者ID', '三塁走者ID', '捕手ID', '一塁手ID',
    '二塁手ID', '三塁手ID', '遊撃手ID',
    '左翼手ID', '中堅手ID', '右翼手ID', '成績対象投手ID', '成績対象打者ID',
    'ホームチームID', 'アウェイチームID', '球場ID',
]

# Columns that need no one-hot encoding.
columns3 = [
    '試合内連番', '試合内投球数', 'イニング内打席数', '打席内投球数',
    '投手試合内対戦打者数', '投手試合内投球数',
    '投手イニング内投球数', '打者試合内打席数', 'プレイ前ホームチーム得点数',
    'プレイ前アウェイチーム得点数',
    'プレイ前アウト数', 'プレイ前ボール数', 'プレイ前ストライク数',
    '自チーム得点数', '相手チーム得点数', '点差',
]

# Row key plus the two target columns; removed from the feature matrix.
columns4 = ['データ内連番', '球種', '投球位置区域']

key = ['データ内連番']
y1 = ['球種']  # target variable 1
y2 = ['投球位置区域']  # target variable 2

# One-hot encode the selected columns.
train_pitch_df = pd.get_dummies(train_pitch_df, columns=columns1)

# Drop the columns that are not used as features.
train_pitch_df2 = train_pitch_df.drop(columns2, axis=1)
X = train_pitch_df2.drop(columns4, axis=1)

# Target as the raw label column, and as a one-hot frame of its string form.
Y = train_pitch_df['球種']
Y2 = pd.get_dummies(train_pitch_df['球種'].astype(str))

X.head()

# Number of rows per target value.
Y.value_counts()

In [None]:
# Build a data set in which no target value dominates: keep at most the first
# 10000 rows of each of the target values 0..7.

Y_X = pd.concat([Y, X], axis=1)

# Collect the per-class slices and concatenate once, instead of growing a
# frame from an empty DataFrame on every iteration.
capped_parts = []
for i in range(8):
    # `tmp` stays a notebook-level name because the memory-release cell deletes it.
    tmp = Y_X[Y_X['球種'] == i].head(10000)
    print(len(tmp))
    capped_parts.append(tmp)
Y_X_10000 = pd.concat(capped_parts)
del capped_parts  # do not keep extra references to the slices alive

X2 = Y_X_10000.drop(columns=["球種"])
Y2 = Y_X_10000["球種"]

# Correlation matrix of the target and all features.
Y_X_corr = Y_X.corr()
print(type(Y_X_corr))

# Allow up to 200 rows to be displayed, then order the correlation matrix by
# each row's correlation with the target (highest first).
pd.set_option('display.max_rows', 200)
Y_X_corr_rank = Y_X_corr.sort_values('球種', ascending=False)
print("OK")

In [None]:
# Release memory: drop the large intermediates, then run the garbage collector.
del train_pitch_df, train_pitch_df2, train_player_df
del tmp, Y_X, Y_X_10000
gc.collect()

In [None]:
# Default: split the full data set 70/30. The seed makes the split (and every
# score computed from it) reproducible across kernel restarts.
train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.3, shuffle=True, random_state=7
)

# Alternative: split the class-capped data set instead.
#train_x, test_x, train_y, test_y = train_test_split(X2, Y2, test_size=0.3, shuffle=True, random_state=7)


In [None]:
########### XGBoost ###########
import xgboost as xgb
import GPy
import GPyOpt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# The bare string below is a no-op note left by the author. In English:
# X = features, Y = target variable; required packages are installed with
# `pip install xgboost` and `pip install japanize-matplotlib`.
'''
X：特徴量
Y：目的変数
必要なもの
pip install xgboost
pip install japanize-matplotlib
'''

# Settings
init_num = 50          # number of initial samples
max_iter = 200         # number of sampling iterations
path_model = "../Model/xgb_model.pickle" # where the model is saved

# Variables used inside the objective function
initial_design = 1     # counts the initial-sample evaluations
try_ = 0               # counts the sampling iterations
bounds_list = list()   # list that stores the evaluated parameters

#推定用関数
def f(x):
    """Objective function for the search: 5-fold CV log loss of an XGBoost model.

    Parameters
    ----------
    x : array-like
        One candidate point holding the hyper-parameters in the order of the
        module-level ``bounds`` list. It is flattened before use, so a
        ``(1, 10)`` row and a flat length-10 vector are both accepted.

    Returns
    -------
    float
        Mean multi-class log loss over the CV folds (lower is better).

    Raises
    ------
    ValueError
        If ``x`` does not contain exactly one value per entry of ``bounds``.

    Side effects
    ------------
    Prints progress, advances the module-level counters ``initial_design`` /
    ``try_`` and appends the evaluated point to ``bounds_list``.
    """
    # Module-level bookkeeping shared with the surrounding cells.
    global initial_design
    global try_
    global bounds_list

    start = time.time()

    # The point arrives as a 2-D (1, n) array; flatten it once so that every
    # parameter is read as a true scalar instead of a length-1 array.
    x = np.asarray(x, dtype=float).reshape(-1)
    if x.size != len(bounds):
        raise ValueError(
            f"expected {len(bounds)} parameter values, got {x.size}"
        )

    # NOTE(review): both `eta` and `learning_rate` are passed; XGBoost documents
    # `eta` as an alias of `learning_rate`, so one of the two is presumably
    # redundant — confirm which one takes effect.
    model = xgb.XGBClassifier(num_class=8,
                              eta=float(x[0]),
                              max_depth=int(x[1]),
                              min_child_weight=float(x[2]),
                              subsample=float(x[3]),
                              colsample_bytree=float(x[4]),
                              gamma=float(x[5]),
                              n_estimators=int(x[6]),
                              learning_rate=float(x[7]),
                              reg_lambda=float(x[8]),
                              reg_alpha=float(x[9]),
                              tree_method='gpu_hist',  # remove when running on CPU
                              objective='multi:softprob'
                              )

    # Show the evaluation counters (Inital_Design: initial samples, Try: search steps).
    print("---------------------------------")
    print("Inital_Design:{0} / {1}  Try:{2} / {3}\n".format(initial_design, init_num, try_, max_iter))

    # Advance the counters: initial samples first, search steps afterwards.
    if initial_design < init_num:
        initial_design += 1
    else:
        try_ += 1

    print("Next bounds is")

    # Show the parameters being evaluated.
    for bound, x_ in zip(bounds, x):
        print(f"{bound['name']:s} = {x_:.3f}  ", end="")

    # Keep the evaluated point as a plain list (cheap to append to).
    x_list = x.tolist()
    bounds_list.append(x_list)

    # Cross-validation. `shuffle=True` is required for `random_state` to be
    # accepted by KFold; without it current scikit-learn raises ValueError.
    kfold = KFold(n_splits=5, shuffle=True, random_state=7)
    results = cross_validate(model, train_x, train_y, scoring='neg_log_loss', cv=kfold)

    # Elapsed time for this evaluation.
    t = time.time() - start
    print("\n\ntime:", t)

    # The scorer returns negative log loss; flip the sign to get the log loss.
    score = results['test_score'].mean()*(-1)
    print(f"\nLogloss: {score:f}")

    return score

# Search space: every hyper-parameter is a continuous range, listed in the
# order in which the objective function reads them.
bounds = [
    {'name': name, 'type': 'continuous', 'domain': domain}
    for name, domain in (
        ('eta', (0.3, 0.4)),
        ('max_depth', (3, 15)),
        ('min_child_weight', (0, 2)),
        ('subsample', (0.8, 1)),
        ('colsample_bytree', (0.8, 1)),
        ('gamma', (0, 10)),
        ('n_estimators', (10, 200)),
        ('learning_rate', (0.3, 1)),
        ('reg_lambda', (0.8, 1)),
        ('reg_alpha', (0, 0.3)),
    )
]

start_all = time.time()
print("Bayesian Optimization")
myBopt = GPyOpt.methods.BayesianOptimization(
    f=f,
    domain=bounds,
    initial_design_numdata=init_num,
    verbosity=True,
)
myBopt.run_optimization(max_iter=max_iter)

# Report the total processing time.
total_time = time.time() - start_all
print("\nTotal-Time:", total_time)

print("\nEnd")

# Objective values recorded by the optimiser, one per evaluation.
result_z = myBopt.Y


In [None]:
# Visualise the search: the objective value per evaluation. The objective
# function returns the CV log loss, so that is what the y-axis shows.
plt.figure()
plt.plot(result_z)
plt.xlabel("epochs")
plt.ylabel("CV Logloss")
plt.show()

# One column per hyper-parameter, one row per evaluated point.
df = pd.DataFrame(bounds_list, columns=[bound["name"] for bound in bounds])

# Trace of every hyper-parameter over the evaluations.
# `DataFrame.items()` replaces `iteritems()`, which pandas 2.0 removed.
for name, bound_list in df.items():
    plt.figure()
    plt.plot(bound_list)
    plt.xlabel("epochs")
    plt.ylabel(name)
    plt.show()
    plt.close()  # dispose of the figure so the loop does not accumulate them
print(len(bounds_list))

# Show the best parameters found.
print("### best parameters ###")

for bound, x_opt in zip(bounds, myBopt.x_opt):
    print(f"{bound['name']:s} = {x_opt:.3f}")


In [None]:
# Retrain with the best parameters.
# The values are hard-coded (presumably copied from an earlier search run);
# uncomment the next line to use the result of the search above instead.
#x = myBopt.x_opt
x = [3.58159257e-01, 1.31312711e+01, 1.72685680e-01, 8.54364803e-01,
     8.01858020e-01, 1.45562823e+00, 1.88617583e+02, 1.12371349e-01,
     9.54153925e-01, 2.94973626e-01]


dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x, label=test_y)
# The last entry ('eval') is the one used for evaluation / early stopping.
evallist = [(dtrain, 'train'), (dtest, 'eval')]
num_round = 500

# 'tree_method' was previously misspelled, so the GPU setting was never applied.
# The unrecognised 'colsample' key is gone; column subsampling is controlled by
# 'colsample_bytree'.
# NOTE(review): 'eta' and 'learning_rate' are both set (XGBoost documents them
# as aliases), and xgb.train takes its round count from `num_round`, so
# 'n_estimators' is presumably ignored here — confirm and prune.
param = {'eta': x[0], 'max_depth': int(x[1]), 'min_child_weight': x[2], 'subsample': x[3],
         'colsample_bytree': x[4], 'gamma': x[5], 'n_estimators': int(x[6]),
         'learning_rate': x[7], 'reg_lambda': x[8], 'reg_alpha': x[9],
         'objective': 'multi:softprob', 'num_class': 8, 'eval_metric': 'mlogloss',
         'tree_method': 'gpu_hist'}  # remove 'tree_method' when running on CPU

bst = xgb.train(param, dtrain, num_round, evallist, early_stopping_rounds=10)


In [None]:
# Save the model. The context manager closes the file handle even when
# pickling fails, instead of leaving it open until garbage collection.
with open(path_model, "wb") as model_file:
    pickle.dump(bst, model_file)


In [None]:
# Show the results: predict class probabilities for the hold-out split and
# report the row counts and the log loss.
dtest = xgb.DMatrix(test_x, label=test_y)
pred = bst.predict(dtest)
pred_p = pd.DataFrame(data=pred)
print(len(pred_p), len(test_y), "Optimized XGBoost", sep="\n")
print(log_loss(test_y, pred_p))

# Visualise the feature importance on a tall single-axes figure.
fig, ax = plt.subplots(figsize=(8, 25))
xgb.plot_importance(bst, ax=ax)

In [None]:
# Retrain with cross-validation, using the parameter vector `x`.
num_round = 500
model = xgb.XGBClassifier(num_class=8, eta=x[0], max_depth=int(x[1]), min_child_weight=x[2],
                          subsample=x[3], colsample_bytree=x[4], gamma=x[5], n_estimators=int(x[6]),
                          learning_rate=x[7], reg_lambda=x[8], reg_alpha=x[9],
                          tree_method='gpu_hist',  # remove when running on CPU
                          objective='multi:softprob')

# Extra arguments forwarded to `model.fit` for every fold.
# NOTE(review): newer scikit-learn / XGBoost releases changed how these are
# passed (`params=` in cross_validate, early stopping set on the estimator) —
# confirm against the installed versions.
fit_params = {"early_stopping_rounds": 10,
              "eval_set": [[test_x, test_y]],
              'eval_metric': 'mlogloss'}

# CV. `shuffle=True` is required for `random_state` to be accepted by KFold;
# without it current scikit-learn raises ValueError.
kfold = KFold(n_splits=5, shuffle=True, random_state=7)
results = cross_validate(model, train_x, train_y, fit_params=fit_params, scoring='neg_log_loss', cv=kfold)

# The scorer returns negative log loss; flip the sign to get the log loss.
score = results['test_score'].mean()*(-1)
print("result:", results['test_score'])
print(f"Logloss: {score:f}")

In [None]:
# Memory check: print a two-column table with the `sys.getsizeof` value of
# every notebook-level name that does not start with an underscore.
# The helper names below start with "_" so they never show up in the table.
_ROW = "{}{: >25}{}{: >10}{}"
print(_ROW.format('|', 'Variable Name', '|', 'Memory', '|'))
print(" ------------------------------------ ")
for var_name in dir():
    if var_name.startswith("_"):
        continue
    print(_ROW.format('|', var_name, '|', sys.getsizeof(globals()[var_name]), '|'))
        