In [51]:
import pandas as pd
import numpy as np
import pickle
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import cross_val_score
# Suppress library logging: the root logger is raised to CRITICAL, so every
# record below CRITICAL (not only INFO) is silenced.
import logging
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
# Korean-capable fonts for plot labels, in priority order. The notebook used to
# assign 'NanumGothic' and then immediately overwrite it with 'Malgun Gothic',
# which made the first assignment dead code. A list keeps 'Malgun Gothic' as
# the first choice and lets matplotlib fall back to 'NanumGothic' when the
# former is not installed.
plt.rcParams['font.family'] = ['Malgun Gothic', 'NanumGothic']


# Data loading and preprocessing

In [146]:
# Directories holding one TSV per season for fielding and batting records.
defende_path = "./data/수비/"
batter_path = "./data/타자/"
# One file name per season, 2011 through 2019 inclusive.
file_list = [f"{season}.tsv" for season in range(2011, 2020)]

In [179]:
def data_load(file_list, defende_path,batter_path):
    """Load and merge the per-season fielding and batting TSV files.

    For every file name in ``file_list`` the fielding file
    (``defende_path + name``) and the batting file (``batter_path + name``)
    are read as tab-separated tables and inner-joined on the player name
    ("선수명") and team name ("팀명") columns. The per-season results are
    stacked row-wise; each season keeps its own row index, so index values
    repeat across seasons.

    Args:
        file_list: iterable of file names present in both directories.
        defende_path: path prefix of the fielding files (concatenated as-is,
            so it must end with a path separator).
        batter_path: path prefix of the batting files (same convention).

    Returns:
        A tuple ``(df, defender_columns)`` where ``df`` is the stacked merged
        frame and ``defender_columns`` is the list of column names of the
        last fielding file that was read. For an empty ``file_list`` this is
        an empty DataFrame and an empty list.
    """
    yearly_frames = []
    defender_columns = []
    for data_path in file_list:
        defender = pd.read_csv(defende_path + data_path, sep="\t")
        batter = pd.read_csv(batter_path + data_path, sep="\t")
        yearly_frames.append(pd.merge(defender, batter, on=["선수명", "팀명"]))
        defender_columns = list(defender.columns)

    # pd.concat rejects an empty list, and there is nothing to stack anyway.
    if not yearly_frames:
        return pd.DataFrame(), defender_columns

    # Concatenate once instead of growing a frame inside the loop: this avoids
    # repeated copying and the concat-with-an-empty-frame starting point.
    return pd.concat(yearly_frames), defender_columns

In [148]:
def del_noise_column(data, columns):
    """Remove ranking/games columns and the given extra columns, in place.

    The fixed names "순위_x", "순위_y" and "G_x" plus every name in
    ``columns`` are removed from ``data`` when present; names that are not
    in ``data`` are skipped silently.

    Args:
        data: DataFrame to strip; it is modified in place.
        columns: list of additional column names to remove.

    Returns:
        The same ``data`` object that was passed in.
    """
    candidates = ["순위_x", "순위_y", "G_x"] + columns
    # De-duplicate while keeping order, then keep only names that exist.
    present = [name for name in dict.fromkeys(candidates) if name in data]
    data.drop(columns=present, inplace=True)
    return data

In [149]:
def change_one_hot(df, column):
    """Replace ``column`` with its one-hot (dummy) encoding.

    The dummy columns produced by ``pd.get_dummies`` are placed in front of
    the remaining columns of ``df``.

    The input frame is left untouched. Previously the column was deleted
    from the caller's frame even though a different object was returned, so
    the caller's frame was silently altered as a side effect.

    Args:
        df: source DataFrame.
        column: name of the categorical column to encode.

    Returns:
        A new DataFrame: dummy columns first, then every other column of
        ``df`` in its original order.
    """
    one_hot_encoded = pd.get_dummies(df[column])
    remaining = df.drop(columns=column)
    return pd.concat([one_hot_encoded, remaining], axis=1)

In [150]:
def change_years(df,column,split_tag,index):
    """Keep one piece of a delimited string column, in place.

    Every value of ``df[column]`` is split on ``split_tag`` and replaced by
    the piece selected with ``index``. The result is still a string; no
    numeric conversion is done here.

    Args:
        df: DataFrame to modify in place.
        column: name of the string column to rewrite.
        split_tag: separator passed to ``str.split``.
        index: position (or slice) of the piece to keep.

    Returns:
        The same ``df`` object that was passed in.
    """
    pieces = df[column].str.split(split_tag)
    df[column] = pieces.str[index]
    return df

#### 수비 기록 컬럼(defender_coumns)을 제거하기 위해 컬럼 목록을 받아 오되, POS(포지션)·선수명·팀명은 제거 리스트에서 제외

In [200]:
# Load all seasons; the second value is the fielding-file column list, which
# is later used as the list of columns to drop.
df, defender_coumns = data_load(file_list, defende_path, batter_path)
# POS, the player name and the team name must survive, so take them out of
# the drop list.
defender_coumns.remove("POS")
defender_coumns.remove("선수명")
defender_coumns.remove("팀명")
df

Unnamed: 0,순위_x,선수명,팀명,POS,G_x,GS,IP,E,PKO,PO,...,AB,R,H,2B,3B,HR,TB,RBI,SAC,SF
0,1,김상수,삼성,유격수,126,121,1059 1/3,22,0,205,...,406,53,113,16,4,2,143,47,15,1
1,2,정성훈,LG,3루수,125,123,1015 2/3,12,0,63,...,422,61,123,18,1,10,173,57,11,7
2,3,강정호,넥센,유격수,123,122,1059 1/3,13,0,186,...,444,53,125,22,2,9,178,63,3,5
3,4,문규현,롯데,유격수,122,104,892,16,0,184,...,327,40,79,13,3,2,104,39,14,3
4,132,문규현,롯데,3루수,6,0,12,0,0,1,...,327,40,79,13,3,2,104,39,14,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,130,고명성,KT,유격수,13,1,35 2/3,1,0,3,...,16,4,2,0,0,0,2,0,0,0
141,141,최준우,SK,2루수,12,11,76,0,0,15,...,33,3,7,2,0,0,9,0,0,0
142,145,김민수,롯데,3루수,11,10,89,1,0,9,...,34,0,8,3,0,0,11,1,0,0
143,145,신용수,롯데,유격수,11,2,36,4,0,4,...,20,4,2,0,0,1,5,2,0,0


In [173]:
# Drop the ranking/games columns and the remaining fielding columns; a copy
# of the list is passed so defender_coumns itself is not shared with the call.
df = del_noise_column(df, columns=[*defender_coumns])

In [221]:
# Detach the position column from df; label is None when df has no POS.
# NOTE(review): re-running this cell after POS was already removed resets
# label to None — reload df first if the cell must be executed again.
label = df.pop("POS") if "POS" in df else None

# 테스트 추가 데이터 로드

In [204]:
# Extra inputs: the 2019 test TSV and the crawled KBO player CSV.
test_path, crawl_path = (
    "./data/test/2019.tsv",
    "./data/crawl/kbo/kbo.csv",
)

> 내야수만 남기고 나머지 정보 삭제

In [215]:
test_data = pd.read_csv(test_path, sep="\t")
# Keep only the rows whose position text contains "내야수" (infielder).
crawl_data = pd.read_csv(crawl_path).loc[
    lambda players: players["position"].str.contains("내야수")
]

In [228]:
# Attach the crawled profile to every stat row by player name (inner join).
# NOTE(review): the join key is the name only, so players sharing a name are
# matched to each other's profiles — confirm whether a stricter key exists.
data = pd.merge(df, crawl_data, left_on="선수명", right_on="name")
data = change_one_hot(data, "팀명")
data = change_one_hot(data, "position")
# Keep the player names aside before the identifier columns are removed.
palyer_name = data["선수명"]
data = data.drop(columns=["name", "선수명"])
# "순위_y" has already been removed from df when del_noise_column has run, so
# an unconditional delete raises KeyError in top-to-bottom execution; drop
# these two only if they are still present.
data = data.drop(columns=["순위_y", "G_y"], errors="ignore")
# Reduce the birth-date text to the part before "년 " (the year).
data = change_years(data, "berth", "년 ", 0)
data

Unnamed: 0,내야수(우투양타),내야수(우투우타),내야수(우투좌타),내야수(좌투좌타),KIA,KT,LG,NC,SK,넥센,...,TB,RBI,SAC,SF,number,berth,height,weight,payment,salary
0,0,1,0,0,0,0,0,0,0,0,...,143,47,15,1,7.0,1990,175.0,68.0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,143,47,15,1,7.0,1900,175.0,67.0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,143,47,15,1,7.0,1990,175.0,68.0,28000,25000
3,0,1,0,0,0,0,0,0,0,0,...,147,36,16,6,7.0,1990,175.0,68.0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,147,36,16,6,7.0,1900,175.0,67.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1325,0,1,0,0,0,1,0,0,0,0,...,2,0,0,0,3.0,1999,178.0,68.0,7000,2800
1326,0,0,1,0,0,0,0,0,1,0,...,9,0,0,0,68.0,1999,176.0,78.0,7000,2900
1327,0,1,0,0,0,0,0,0,0,0,...,11,1,0,0,30.0,1998,184.0,96.0,11000,2700
1328,0,1,0,0,0,0,0,0,0,0,...,11,1,0,0,96.0,1998,180.0,83.0,4000,2700


In [202]:
# Preview the first 20 rows.
df.head(20)

Unnamed: 0,순위_x,선수명,팀명,POS,G_x,GS,IP,E,PKO,PO,...,AB,R,H,2B,3B,HR,TB,RBI,SAC,SF
0,1,김상수,삼성,유격수,126,121,1059 1/3,22,0,205,...,406,53,113,16,4,2,143,47,15,1
1,2,정성훈,LG,3루수,125,123,1015 2/3,12,0,63,...,422,61,123,18,1,10,173,57,11,7
2,3,강정호,넥센,유격수,123,122,1059 1/3,13,0,186,...,444,53,125,22,2,9,178,63,3,5
3,4,문규현,롯데,유격수,122,104,892,16,0,184,...,327,40,79,13,3,2,104,39,14,3
4,132,문규현,롯데,3루수,6,0,12,0,0,1,...,327,40,79,13,3,2,104,39,14,3
5,5,김민우,넥센,3루수,120,109,963,13,0,77,...,466,66,115,22,0,6,155,31,17,2
6,95,김민우,넥센,2루수,14,12,88,1,0,33,...,466,66,115,22,0,6,155,31,17,2
7,6,조성환,롯데,2루수,117,114,967 1/3,9,0,244,...,407,45,99,19,0,6,136,36,10,2
8,6,김민성,넥센,2루수,117,96,835 2/3,8,0,245,...,314,37,74,10,1,4,98,23,22,2
9,116,김민성,넥센,유격수,9,1,26,0,0,6,...,314,37,74,10,1,4,98,23,22,2


In [192]:
# Left join: every row of df is kept, with profile columns attached where the
# player name matches.
# NOTE(review): game_data is not defined in any visible cell; this line only
# works if it is still present in the kernel — confirm where it is created.
df.merge(game_data, how="left", left_on="선수명", right_on="name")

Unnamed: 0,순위_x,선수명,팀명,POS,G_x,GS,IP,E,PKO,PO,...,SAC,SF,name,number,berth,position,height,weight,payment,salary
0,1,김상수,삼성,유격수,126,121,1059 1/3,22,0,205,...,15,1,김상수,7.0,1990년 03월 23일,내야수(우투우타),175.0,68.0,0.0,0.0
1,1,김상수,삼성,유격수,126,121,1059 1/3,22,0,205,...,15,1,김상수,7.0,1900년 01월 01일,내야수(우투우타),175.0,67.0,0.0,0.0
2,1,김상수,삼성,유격수,126,121,1059 1/3,22,0,205,...,15,1,김상수,24.0,1988년 01월 02일,투수(우투우타),180.0,88.0,13000.0,20000.0
3,1,김상수,삼성,유격수,126,121,1059 1/3,22,0,205,...,15,1,김상수,7.0,1990년 03월 23일,내야수(우투우타),175.0,68.0,28000.0,25000.0
4,2,정성훈,LG,3루수,125,123,1015 2/3,12,0,63,...,11,7,정성훈,121.0,1998년 05월 07일,내야수(우투우타),172.0,77.0,3000.0,2700.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1942,145,김민수,롯데,3루수,11,10,89,1,0,9,...,0,0,김민수,100.0,1996년 06월 11일,외야수(우투우타),181.0,93.0,3000.0,2700.0
1943,145,김민수,롯데,3루수,11,10,89,1,0,9,...,0,0,김민수,27.0,2000년 04월 05일,외야수(우투좌타),186.0,87.0,4000.0,2700.0
1944,145,김민수,롯데,3루수,11,10,89,1,0,9,...,0,0,김민수,57.0,1979년 08월 01일,투수(우투우타),182.0,84.0,3000.0,2000.0
1945,145,신용수,롯데,유격수,11,2,36,4,0,4,...,0,0,신용수,68.0,1996년 01월 05일,내야수(우투우타),178.0,78.0,3000.0,2700.0
