# EDA

In [206]:
import pandas as pd
import numpy as np
import pickle
import glob
import os
import lightgbm as lgb
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import cross_val_score

In [44]:
def position_to_number(data):
    column = "POS"
    data[column] = data["POS"].replace("1루수",1)
    data[column] = data["POS"].replace("2루수",2)
    data[column] = data["POS"].replace("3루수",3)
    data[column] = data["POS"].replace("유격수",4)
    return data


In [108]:
def del_noise_column(data):
    noise_columns = ["PB","SB","CS","CS%","순위","선수명","IP"]
    data_columns = data.columns
    for column in noise_columns:
        if column in data_columns:
            del data[column]
    label = data["POS"]
    if "POS" in data_columns:
        del data["POS"]
    return data, label

In [188]:
def one_hot_encoding(df,column):
    mlb = MultiLabelBinarizer()
    df = df.join(pd.DataFrame(mlb.fit_transform(df.pop(column)),
                          columns=mlb.classes_,
                          index=df.index))

In [194]:
def replace_zero_missing_value(df):
    missing_values = ["","-","_"]
    colmuns = df.columns
    for column in colmuns:
        for missing_value in missing_values:
            df = df.replace({column:missing_value},{column:0})
    return df

In [174]:
## cst 에서 tsv로 바꾸기 
# data_name = "2018.tsv"
# t = pd.read_csv(path + data_name)
# del t["Unnamed: 0"]
# t.to_csv(path + data_name,mode="w",sep="\t", index=False)

In [178]:
path = "./data/"
file_list = os.listdir(path)
df = pd.DataFrame()
for data_path in file_list:
    data = pd.read_csv(path + data_path,sep="\t")
    df = pd.concat([df,data])
# b_2011 = pd.read_csv("./data/2011.tsv",sep="\t")
df

Unnamed: 0,순위,선수명,팀명,POS,G,GS,IP,E,PKO,PO,A,DP,FPCT,PB,SB,CS,CS%
0,1,김상수,삼성,유격수,126,121,1059 1/3,22,0,205,356,74,0.962,0,0,0,-
1,2,정성훈,LG,3루수,125,123,1015 2/3,12,0,63,216,19,0.959,0,0,0,-
2,3,강정호,넥센,유격수,123,122,1059 1/3,13,0,186,365,81,0.977,0,0,0,-
3,4,문규현,롯데,유격수,122,104,892,16,0,184,354,89,0.971,0,0,0,-
4,5,김민우,넥센,3루수,120,109,963,13,0,77,192,23,0.954,0,0,0,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,145,박찬호,KIA,2루수,11,7,69 2/3,1,0,25,23,8,0.98,0,0,0,-
146,145,김성훈,삼성,유격수,11,3,50 1/3,1,0,14,13,5,0.964,0,0,0,-
147,145,김민수,롯데,3루수,11,10,89,1,0,9,13,0,0.957,0,0,0,-
148,145,신용수,롯데,유격수,11,2,36,4,0,4,9,1,0.765,0,0,0,-


# Preprocessing

In [220]:
def divide_train_test(data,label):
    '''
    데이터를 트레이닝 테스트 데이터로 나눠서 반환
    책에서는 train_test_split random_state=11의 파라미터를 주나 코드에서는 제거
    :return:
    '''
    x_train, x_test, y_train, y_test = train_test_split(data,label,
                                                        test_size=0.2)
    return x_train, x_test, y_train, y_test

def dt_model_train(data, label):
    '''
    데이터 트레이닝 및 예측 진행
    :parameter model_save : 모델을 반환받기를 원할경우 True 입력
    :return:
    '''
    print("DecisionTreeClassifier train")
    # 데이터 트레이닝
    x_train, x_test, y_train, y_test = divide_train_test(data, label)
    dt_clt = DecisionTreeClassifier()
    dt_clt.fit(x_train,y_train)
    # 예측
    print("================================================================")
    pred = dt_clt.predict(x_test)
    print("모델 정확도 : {0:.4f}".format(accuracy_score(y_test,pred)))
    print(list(y_test))
    print(list(pred))
    print(" -- cross_val_score 5 -- ")
    print(cross_val_score(dt_clt, x_train,y_train, cv=5).mean())
    print("================================================================")
    
def lr_model_train(data, label):
    '''
    데이터 트레이닝 및 예측 진행
    :parameter model_save : 모델을 반환받기를 원할경우 True 입력
    :return:
    '''
    print("LogisticRegression train")
    # 데이터 트레이닝
    x_train, x_test, y_train, y_test = divide_train_test(data, label)
    lr_clt = LogisticRegression()
    
    lr_clt.fit(x_train,y_train)
    # 예측
    pred = lr_clt.predict(x_test)
    print("================================================================")
    print("모델 정확도 : {0:.4f}".format(accuracy_score(y_test,pred)))
    print(list(y_test))
    print(list(pred))
    print(" -- cross_val_score 5 -- ")
    print(cross_val_score(lr_clt, x_train,y_train, cv=5).mean())
    print("================================================================")

def lgbm_model_train(data, label):
    '''
    데이터 트레이닝 및 예측 진행
    :parameter model_save : 모델을 반환받기를 원할경우 True 입력
    :return:
    '''
    print("Light gbm train")
    # 데이터 트레이닝
    x_train, x_test, y_train, y_test = divide_train_test(data, label)
    # 학습 모델
    params = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': {'multi_logloss'},
        'num_leaves': 63,
        'learning_rate': 0.1,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.9,
        'bagging_freq': 0,
        'verbose': 0,
        'num_class': 3
    }
    # create dataset for lightgbm
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_test = lgb.Dataset(x_test, y_test, reference=lgb_train)
    lgb_model = lgb.train(params, lgb_train, num_boost_round=20,
                          valid_sets=lgb_test, early_stopping_rounds=5)
    # 예측
    pred = lgb_model.predict(x_test)
    pred = [pr.argmax() for pr in pred]
    print("모델 정확도 : {0:.4f}".format(accuracy_score(y_test, pred)))
    print(list(y_test))
    print(list(pred))

In [126]:
label

0      4
1      3
2      4
3      4
4      3
      ..
145    3
146    4
147    4
148    1
149    2
Name: POS, Length: 150, dtype: int64

In [179]:
data = position_to_number(df)
data, label = del_noise_column(data)

In [195]:
data = replace_zero_missing_value(data)
data = one_hot_encoding(data,"팀명")

KeyError: '팀명'

In [223]:
dt_model_train(data, label)
lr_model_train(data, label)
# print(lgbm_model_train(data, label))

DecisionTreeClassifier train
모델 정확도 : 0.7519
[4, 4, 2, 4, 4, 3, 2, 3, 4, 2, 3, 2, 2, 4, 3, 1, 2, 1, 3, 2, 1, 3, 1, 1, 2, 1, 3, 1, 1, 4, 3, 3, 1, 2, 1, 3, 4, 1, 4, 2, 3, 2, 3, 1, 3, 1, 1, 2, 4, 2, 3, 3, 2, 4, 3, 1, 2, 4, 1, 4, 3, 4, 3, 1, 3, 1, 1, 3, 2, 2, 1, 4, 2, 3, 4, 1, 3, 3, 1, 2, 1, 4, 3, 3, 4, 1, 3, 1, 4, 3, 1, 3, 1, 2, 3, 2, 2, 3, 1, 2, 2, 4, 2, 4, 2, 1, 1, 4, 1, 2, 2, 4, 4, 3, 1, 3, 3, 2, 1, 2, 3, 4, 1, 2, 1, 4, 3, 1, 4, 2, 1, 3, 3, 3, 4, 4, 3, 2, 3, 1, 4, 1, 1, 3, 1, 1, 2, 2, 1, 1, 1, 4, 3, 1, 2, 4, 1, 4, 3, 4, 1, 4, 1, 4, 4, 2, 3, 4, 4, 2, 3, 4, 1, 2, 3, 3, 4, 2, 3, 2, 3, 2, 4, 1, 2, 2, 2, 3, 2, 4, 2, 1, 3, 1, 2, 3, 1, 1, 4, 3, 3, 1, 4, 2, 1, 1, 1, 1, 4, 3, 4, 2, 1, 1, 1, 1, 2, 4, 2, 4, 3, 4, 2, 1, 4, 1, 2, 2, 1, 3, 4, 1, 2, 2, 3, 4, 3, 1, 1, 2, 2, 1, 3, 3, 4, 3, 2, 2, 4, 4, 4, 4, 2, 2, 1, 3, 1, 1, 1, 2, 4, 1, 3, 2, 2, 2, 3, 4, 4, 4]
[4, 4, 2, 4, 4, 3, 3, 3, 4, 2, 3, 3, 2, 2, 3, 1, 2, 1, 3, 4, 1, 3, 1, 1, 3, 1, 2, 1, 1, 2, 3, 3, 1, 4, 1, 4, 2, 1, 4, 4, 3, 2, 3, 1, 3, 1, 1, 2,

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

0.8777777777777779


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [184]:
data

Unnamed: 0,G,GS,E,PKO,PO,A,DP,FPCT
0,126,121,22,0,205,356,74,0.962
1,125,123,12,0,63,216,19,0.959
2,123,122,13,0,186,365,81,0.977
3,122,104,16,0,184,354,89,0.971
4,120,109,13,0,77,192,23,0.954
...,...,...,...,...,...,...,...,...
145,11,7,1,0,25,23,8,0.98
146,11,3,1,0,14,13,5,0.964
147,11,10,1,0,9,13,0,0.957
148,11,2,4,0,4,9,1,0.765
