# import

In [159]:
import pandas as pd
import numpy as np
import pickle
import glob
import os
import lightgbm as lgb
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import cross_val_score
#Supress default INFO logging
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

# data load  and preprocessing

#### 수비와 타자를 이름으로 조합 시켜야함

In [14]:
defende_path = "./data/수비/"
batter_path = "./data/타자/"
file_list = ['2011.tsv','2012.tsv','2013.tsv','2014.tsv','2015.tsv','2016.tsv','2017.tsv','2018.tsv','2019.tsv']

In [54]:
def data_load(file_list, defende_path,batter_path):
    df = pd.DataFrame()
    for data_path in file_list:
        defender = pd.read_csv(defende_path + data_path,sep="\t")
        batter = pd.read_csv(batter_path + data_path,sep="\t")
        data = pd.merge(defender,batter,on=["선수명","팀명"])
        df = pd.concat([df,data])
    return df, list(defender.columns)

In [48]:
def del_noise_column(data, columns):
    noise_columns = ["순위_x","순위_y","G_x"] + columns
    for column in noise_columns:
        if column in data:
            del data[column]
    return data

#### defender_coumns 제거를 위해  수비수 포지션을 받고  포지션은 제거 리스트에서 제외

In [111]:
df, defender_coumns = data_load(file_list, defende_path, batter_path)
del defender_coumns[defender_coumns.index("POS")]
del defender_coumns[defender_coumns.index("선수명")]
del defender_coumns[defender_coumns.index("팀명")]

In [112]:
df = del_noise_column(df,list(defender_coumns))

> 선수명은 나중에 값 확인을 위해 따로 빼기  + label 추출

In [113]:
if "선수명" in df:
    df_name = df["선수명"]
    del df["선수명"]

label = None
if "POS" in df:
    label = df["POS"]
    del df["POS"]

> 팀명 원핫 인코딩

In [114]:
if "팀명" in df:
    one_hot_encoded = pd.get_dummies(df.팀명) 
    del df["팀명"]
    df = pd.concat([one_hot_encoded,df],axis=1)

# modelling

In [160]:
def divide_train_test(data,label):
    '''
    데이터를 트레이닝 테스트 데이터로 나눠서 반환
    책에서는 train_test_split random_state=11의 파라미터를 주나 코드에서는 제거
    :return:
    '''
    x_train, x_test, y_train, y_test = train_test_split(data,label,
                                                        test_size=0.2)
    return x_train, x_test, y_train, y_test

def find_model(model_name):
    clt = None
    if model_name == "LogisticRegression":
        clt = LogisticRegression()
    elif model_name == "DecisionTreeClassifier":
        clt = DecisionTreeClassifier()
    elif model_name == "svm":
        clt = svm.SVC(gamma=0.001)
    elif model_name == "RandomForestClassifier":
        clt = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    elif model_name == "AdaBoostClassifier":
        clt = AdaBoostClassifier()
    
    return clt

### sciket basic classification
- 지원 모델 <br>
```
LogisticRegression 
DecisionTreeClassifier 
svm 
RandomForestClassifier
AdaBoostClassifier 
```

In [154]:

def sciket_basic_model_train(data, label,model, debug = False):
    '''
    데이터 트레이닝 및 예측 진행
    :parameter model_save : 모델을 반환받기를 원할경우 True 입력
    :return:
    '''
    print("================================================================")
    print("{} train".format(model))
    # 데이터 트레이닝
    x_train, x_test, y_train, y_test = divide_train_test(data, label)
    clt = find_model(model)
    if clt == None:
        print("***** 모델 없음 ****")
        return
    clt.fit(x_train,y_train)
    # 예측
    pred = clt.predict(x_test)
    print("모델 정확도 : {0:.4f}".format(accuracy_score(y_test,pred)))
    if debug:
        print(list(y_test))
        print(list(pred))
        print(" -- cross_val_score 5 -- ")
        print(cross_val_score(clt, x_train,y_train, cv=5).mean())
    print("================================================================")

#### lgbm

In [68]:
def lgbm_model_train(data, label,debug = False):
    '''
    데이터 트레이닝 및 예측 진행
    :parameter model_save : 모델을 반환받기를 원할경우 True 입력
    :return:
    '''
    print("Light gbm train")
    # 데이터 트레이닝
    x_train, x_test, y_train, y_test = divide_train_test(data, label)
    # 학습 모델
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': {'multi_logloss'},
        'num_leaves': 63,
        'learning_rate': 0.1,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 0,
        'verbose': 0,
        'num_class': 3
    }
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_test = lgb.Dataset(x_test, y_test, reference=lgb_train)
    lgb_model = lgb.train(params, lgb_train, num_boost_round=20,
                          valid_sets=lgb_test, early_stopping_rounds=5)
    # 예측
    pred = lgb_model.predict(x_test)
    pred = [pr.argmax() for pr in pred]
    print("모델 정확도 : {0:.4f}".format(accuracy_score(y_test, pred)))
    print(list(y_test))
    print(list(pred))

# training

In [168]:
sciket_basic_model_train(df, label,"LogisticRegression")
sciket_basic_model_train(df, label, "svm")
sciket_basic_model_train(df, label, "DecisionTreeClassifier")
sciket_basic_model_train(df, label, "RandomForestClassifier")
sciket_basic_model_train(df, label, "AdaBoostClassifier")
# print(lgbm_model_train(data, label))

LogisticRegression train
모델 정확도 : 0.4039
svm train
모델 정확도 : 0.2118
DecisionTreeClassifier train
모델 정확도 : 0.2000

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



RandomForestClassifier train
모델 정확도 : 0.2863
AdaBoostClassifier train
모델 정확도 : 0.3451
