In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
import sklearn
import re
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier

import numpy as np
import pandas as pd

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

import pickle
import joblib

import sys
import xgboost as xgb

from tqdm import tqdm

from sklearn.model_selection import KFold   # K-fold CV    
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import warnings                             
warnings.filterwarnings("ignore")           # 경고 문구 미표시

In [2]:
def load_train():
    train = pd.read_csv('data/train.csv')
    labels = train.winner.values
    lbl_enc = preprocessing.LabelEncoder()
    labels = lbl_enc.fit_transform(labels)
#     train = train.drop('game_id', axis=1)
    return train.values, labels.astype('int32')

In [3]:
def load_test():
    test = pd.read_csv('data/test.csv')
    train = train.drop('game_id', axis=1)
    return test.values

In [4]:
def score(params):
    print("Training with params : ")
    print(params)
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_test, label=y_test)
    # watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    model = xgb.train(params, dtrain, num_round)
    predictions = model.predict(dvalid).reshape((X_test.shape[0], 9))
    score = log_loss(y_test, predictions)
    print("\tScore {0}\n\n".format(score))
    return {'loss': score, 'status': STATUS_OK}

In [5]:
def write_submission(preds, output):
    sample = pd.read_csv('data/sample_submission.csv')
    train = pd.read_csv('data/train.csv')
    labels = train.winner.values
    labels = np.unique(labels)
    preds = pd.DataFrame(
        preds, index=sample.game_id.values, columns=labels)
    preds = preds[list(sample.columns[1:])]
    preds.to_csv(output, index_label='game_id')

-----

# Feature - 자원소모량

In [6]:
data = pd.read_csv('data/train.csv')
data_ability = data[data['event'] == 'Ability']

In [7]:
data_TRB = data_ability[data_ability["event_contents"].str.contains("Train|Research|Build|Morph|Evolve|Upgrade")]

In [8]:
data_trainmorph = data_TRB[data_TRB["event_contents"].str.contains("Train|Morph")]
data_build = data_TRB[data_TRB["event_contents"].str.contains("Build")]
data_research = data_TRB[data_TRB["event_contents"].str.contains("Research|Evolve|Upgrade")]

In [9]:
data_train_T = data_trainmorph[data_trainmorph["species"] == "T"]["event_contents"].unique().tolist()
data_train_Z = data_trainmorph[data_trainmorph["species"] == "Z"]["event_contents"].unique().tolist()
data_train_P = data_trainmorph[data_trainmorph["species"] == "P"]
data_train_P = data_train_P[~data_train_P["event_contents"].str.contains(";")]
data_train_P = data_train_P["event_contents"].unique().tolist()

data_build_T = data_build[data_build["species"] == "T"]["event_contents"].unique().tolist()
data_build_T = [x for xs in data_build_T for x in xs.split(";")]
data_build_T = [i for i in data_build_T if "Build" in i]
data_build_T = list(set(data_build_T))

data_build_Z = data_build[data_build["species"] == "Z"]["event_contents"].unique().tolist()
data_build_Z = [x for xs in data_build_Z for x in xs.split(";")]
data_build_Z = [i for i in data_build_Z if "Build" in i]
data_build_Z = list(set(data_build_Z))

data_build_P = data_build[data_build["species"] == "P"]["event_contents"].unique().tolist()
data_build_P = [x for xs in data_build_P for x in xs.split(";")]
data_build_P = [i for i in data_build_P if "Build" in i]
data_build_P = list(set(data_build_P))

data_research_T = data_research[data_research["species"] == "T"]["event_contents"].unique().tolist()
data_research_Z = data_research[data_research["species"] == "Z"]["event_contents"].unique().tolist()
data_research_P = data_research[data_research["species"] == "P"]["event_contents"].unique().tolist()

In [10]:
data_train_T_re = [(50, 0), (50, 0), (100, 100), (50, 50), (150, 150), (100, 25), (150, 100), (100, 200), (150, 75), (150, 100), (150, 125), (400,300), (100, 100)]
data_train_Z_re = [(50, 0), (100, 0), (150, 0), (50, 0), (100, 50), (75, 25), (50, 50), (25, 25), (25, 75), (100, 100), (0, 0), (150, 100), (100, 75), (100, 150), (150, 150), (0, 0), (100, 150), (50, 100), (300, 200)]
data_train_P_re = [(50, 0), (100, 25), (100, 100), (250, 0), (150, 150), (300, 200), (25, 75), (125, 50), (150, 150), (250, 150), (275, 100), (100, 0), (150, 0), (50, 100), (350, 250), (50, 150), (250, 175), (125, 125), (15, 0)]

data_build_T_re = [(0,0), (50, 50), (0, 0), (50,50), (50,25), (0,0), (0,0), (50,25), (100,0), (150,100), (150,125), (50, 50), (100,0), (150,50), (50,25), (125,0), (0,0), (400,0), (75,25), (150, 0), (100,0), (100,100), (100,50), (125,100), (100,0), (300,200), (100,0), (100,0), (150, 150)]
data_build_Z_re = [(125,0), (200, 150), (0,0), (150, 0), (350,0), (150, 50), (150, 100), (250, 200), (75,0), (200, 200), (50, 50), (0,0), (250, 0), (150,100), (125,0), (200,0)]
data_build_P_re = [(150, 100), (150, 0), (150, 0), (300, 200), (150, 0), (150, 150), (400, 0), (100, 0), (150, 100), (150, 200), (150, 150), (150, 150), (100,0), (0,0), (150,0), (0,0)]

data_research_T_re = [(0,0), (100, 100), (100, 100), (100, 100), (100, 100), (100, 100), (100, 100), (100, 100), (50, 50), (100, 100), (0,0), (150, 150), (150, 150), (100, 100), (175, 175), (150, 150), (150, 150), (0, 0), (75, 75), (175, 175), (100, 100), (175, 175), (150, 150), (100, 100), (100, 100), (100, 100), (150, 150), (0, 0), (175, 175), (175, 175)]
data_research_Z_re = [(100, 100), (100, 100), (100, 100), (150, 150), (100, 100), (100, 100), (100, 100), (100, 100), (150, 100), (100, 100), (150, 150), (100, 100), (225, 225), (100, 100), (0, 0), (150, 150), (150, 150), (200, 150), (100, 100), (150, 150), (150, 150), (0, 0), (175, 175), (200, 200), (150, 150)]
data_research_P_re = [(50, 50), (100, 100), (100, 100), (100, 100), (100, 100), (100, 100), (175, 175), (100, 100), (100, 100), (150, 150), (150, 150), (150, 150), (100, 100), (200, 200), (225, 225), (100, 100), (150, 150), (150, 150), (0, 0), (100, 100), (150, 150), (0, 0), (200, 200)]

In [11]:
mylist1 = data_train_T + data_train_Z + data_train_P
mylist2 = data_train_T_re + data_train_Z_re + data_train_P_re
train_dict = dict(zip(mylist1, mylist2))

mylist1 = data_build_T + data_build_Z + data_build_P
mylist2 = data_build_T_re + data_build_Z_re + data_build_P_re
build_dict = dict(zip(mylist1, mylist2))

mylist1 = data_research_T + data_research_Z + data_research_P
mylist2 = data_research_T_re + data_research_Z_re + data_research_P_re
research_dict = dict(zip(mylist1, mylist2))

worker_dict = {'(1360) - TrainSCV' : (50,0), '(15E0) - TrainProbe': (50, 0), '(1820) - MorphDrone': (50, 0)}
train_dict = {k: v for k, v in train_dict.items() if k not in worker_dict}

train_dict['(1401) - BuildSiegeTank'] = (150, 125)
train_dict['(1418) - BuildWidowMine'] = (75, 25)
train_dict['(1405) - BuildHellion'] = (100, 0)
train_dict['(1404) - BuildThor'] = (300, 200)
train_dict['(1406) - BuildBattleHellion'] = (100, 0)
build_dict = {k: v for k, v in build_dict.items() if k not in train_dict}

### 자원소모량 feature 만들기

In [12]:
p0_worker_mineral = [0]*38872
p1_worker_mineral = [0]*38872

p0_train_mineral = [0]*38872
p0_train_gas = [0]*38872
p1_train_mineral = [0]*38872
p1_train_gas = [0]*38872

p0_build_mineral = [0]*38872
p0_build_gas = [0]*38872
p1_build_mineral = [0]*38872
p1_build_gas = [0]*38872

p0_research_mineral = [0]*38872
p0_research_gas = [0]*38872
p1_research_mineral = [0]*38872
p1_research_gas = [0]*38872

In [13]:
for idx, row in tqdm(data_TRB.iterrows()):
    game_id = row["game_id"]
    player = row["player"]
    action = row["event_contents"].split(";")
    for i in action:
        if i in train_dict.keys():
            spend = train_dict[i]
            if player == 0:
                p0_train_mineral[game_id] += spend[0]
                p0_train_gas[game_id] += spend[1]
            if player == 1:
                p1_train_mineral[game_id] += spend[0]
                p1_train_gas[game_id] += spend[1]
        elif i in build_dict.keys():
            spend = build_dict[i]
            if player == 0:
                p0_build_mineral[game_id] += spend[0]
                p0_build_gas[game_id] += spend[1]
            if player == 1:
                p1_build_mineral[game_id] += spend[0]
                p1_build_gas[game_id] += spend[1]
        elif i in research_dict.keys():
            spend = research_dict[i]
            if player == 0:
                p0_research_mineral[game_id] += spend[0]
                p0_research_gas[game_id] += spend[1]
            if player == 1:
                p1_research_mineral[game_id] += spend[0]
                p1_research_gas[game_id] += spend[1]
        elif i in worker_dict.keys():
            spend = worker_dict[i]
            if player == 0:
                p0_worker_mineral[game_id] += spend[0]
            if player == 1:
                p1_worker_mineral[game_id] += spend[0]

2487701it [03:28, 11908.51it/s]


# Feature - 좌표

In [14]:
time = list(data["time"].groupby(data["game_id"]).tail(1))
for i in range(len(time)):
    new_time = str(time[i])
    new_time = new_time.split(".")
    time[i] = int(new_time[0])*60 + int(new_time[1])
p0_start_location_x = [0]*38872
p0_start_location_y = [0]*38872
p1_start_location_x = [0]*38872
p1_start_location_y = [0]*38872

In [15]:
for idx, row in tqdm(data_TRB.iterrows()):
    game_id = row["game_id"]
    player = row["player"]
    action = row["event_contents"].split(";")
    for i in action:
        if "Location" in i:
            location = re.findall("[0-9]+.[0-9]+", i)
            if player == 0:
                if p0_start_location_x[game_id] == 0:
                    p0_start_location_x[game_id] = float(location[0])
                    p0_start_location_y[game_id] = float(location[1])
            elif player == 1:
                if p1_start_location_x[game_id] == 0:
                    p1_start_location_x[game_id] = float(location[0])
                    p1_start_location_y[game_id] = float(location[1])

2487701it [03:27, 11965.10it/s]


In [16]:
data_right = data[data['event'] == 'Right Click']
data_target = data_right[data_right["event_contents"].str.contains("Target")]

In [17]:
def init_list_of_objects(size):
    list_of_objects = list()
    for i in range(0,size):
        list_of_objects.append( list() )
    return list_of_objects

p0_attack_distance_0 = [0] * 38872
p0_attack_distance_1 = [0] * 38872
p1_attack_distance_0 = [0] * 38872
p1_attack_distance_1 = [0] * 38872

player0_attack_0 = init_list_of_objects(38872)
player0_attack_1 = init_list_of_objects(38872)
player1_attack_0 = init_list_of_objects(38872)
player1_attack_1 = init_list_of_objects(38872)

p0_click_count_0 = [0] * 38872
p0_click_count_1 = [0] * 38872
p1_click_count_0 = [0] * 38872
p1_click_count_1 = [0] * 38872

In [18]:
for idx, row in tqdm(data_target.iterrows()):
    game_id = row["game_id"]
    player = row["player"]
    action = row["event_contents"].split(";")
    tim = str(row["time"]).split(".")
    tim = int(tim[0])*60 + int(tim[1])
    end_time = time[game_id]
    
    location = re.findall("(\d{1,3}\.\d{1,3})", row["event_contents"])
    location = list(map(float, location))
    
    if location == [] : 
        continue
        
    if tim < end_time*0.8:
        if player == 0:
            x = float(p0_start_location_x[game_id]) - float(location[0])
            y = float(p0_start_location_y[game_id]) - float(location[1])
            player0_attack_0[game_id].append((x**2 + y**2)**0.5)

        elif player == 1:
            x = float(p1_start_location_x[game_id]) - float(location[0])
            y = float(p1_start_location_y[game_id]) - float(location[1])
            player1_attack_0[game_id].append((x**2 + y**2)**0.5)

    elif tim >= end_time*0.8:
        if player == 0:
            x = float(p0_start_location_x[game_id]) - float(location[0])
            y = float(p0_start_location_y[game_id]) - float(location[1])
            player0_attack_1[game_id].append((x**2 + y**2)**0.5)            
        elif player == 1:
            x = float(p1_start_location_x[game_id]) - float(location[0])
            y = float(p1_start_location_y[game_id]) - float(location[1])
            player1_attack_1[game_id].append((x**2 + y**2)**0.5)

2676714it [04:45, 9391.97it/s]


In [19]:
data_attack = data_ability[data_ability["event_contents"].str.contains("Attack")]

In [20]:
for idx, row in tqdm(data_attack.iterrows()):
    game_id = row["game_id"]
    player = row["player"]
    action = row["event_contents"].split(";")
    tim = str(row["time"]).split(".")
    tim = int(tim[0])*60 + int(tim[1])
    end_time = time[game_id]
    
    location = re.findall("(\d{1,3}\.\d{1,3})", row["event_contents"])
    location = list(map(float, location))
    
    if location == [] : 
        continue
        
    if tim < end_time*0.8:
        if player == 0:
            x = float(p0_start_location_x[game_id]) - float(location[0])
            y = float(p0_start_location_y[game_id]) - float(location[1])
            player0_attack_0[game_id].append((x**2 + y**2)**0.5)
            player0_attack_0[game_id].append((x**2 + y**2)**0.5)
            player0_attack_0[game_id].append((x**2 + y**2)**0.5)

        elif player == 1:
            x = float(p1_start_location_x[game_id]) - float(location[0])
            y = float(p1_start_location_y[game_id]) - float(location[1])
            player1_attack_0[game_id].append((x**2 + y**2)**0.5)
            player1_attack_0[game_id].append((x**2 + y**2)**0.5)
            player1_attack_0[game_id].append((x**2 + y**2)**0.5)

    elif tim >= end_time*0.8:
        if player == 0:
            x = float(p0_start_location_x[game_id]) - float(location[0])
            y = float(p0_start_location_y[game_id]) - float(location[1])
            player0_attack_1[game_id].append((x**2 + y**2)**0.5)  
            player0_attack_1[game_id].append((x**2 + y**2)**0.5) 
            player0_attack_1[game_id].append((x**2 + y**2)**0.5) 
            
        elif player == 1:
            x = float(p1_start_location_x[game_id]) - float(location[0])
            y = float(p1_start_location_y[game_id]) - float(location[1])
            player1_attack_1[game_id].append((x**2 + y**2)**0.5)
            player1_attack_1[game_id].append((x**2 + y**2)**0.5)
            player1_attack_1[game_id].append((x**2 + y**2)**0.5)

571111it [01:01, 9223.82it/s]


In [21]:
for i in tqdm(range(len(p0_attack_distance_0))):
    if len(player0_attack_0[i]) > 0 : p0_attack_distance_0[i] = sum(player0_attack_0[i]) / len(player0_attack_0[i])
    if len(player0_attack_1[i]) > 0 : p0_attack_distance_1[i] = sum(player0_attack_1[i]) / len(player0_attack_1[i])
    if len(player1_attack_0[i]) > 0 : p1_attack_distance_0[i] = sum(player1_attack_0[i]) / len(player1_attack_0[i])
    if len(player1_attack_1[i]) > 0 : p1_attack_distance_1[i] = sum(player1_attack_1[i]) / len(player1_attack_1[i])
    p0_click_count_0[i] = len(player0_attack_0[i])
    p0_click_count_1[i] = len(player0_attack_1[i])
    p1_click_count_0[i] = len(player1_attack_0[i])
    p1_click_count_1[i] = len(player1_attack_1[i])

100%|████████████████████████████████████████████████████████████████████████| 38872/38872 [00:00<00:00, 268013.61it/s]


# Feature - 베이스라인

In [22]:
def data_preparation(df, answer=False):
    game_ids = df['game_id'].unique()
    events = ['Ability', 'AddToControlGroup', 'Camera', 'ControlGroup', 'GetControlGroup', 'Right Click', 'Selection', 'SetControlGroup']
    unique_event_0, unique_event_1, delta_event = {}, {}, {}
    for event in events:
        unique_event_0['P0_' + event] = 0
        unique_event_1['P1_' + event] = 0
        delta_event['delta_' + event] = 0
        
    species = df.groupby(['game_id', 'player']).species.unique()        
    event_count = df.groupby(['game_id', 'player']).event.value_counts()
    if answer:
        winners = df.groupby(['game_id']).winner.max()
    
    x_data, y_data = [], []
    for game_id in tqdm(game_ids):
        df_event_count = event_count[game_id].unstack(level=-1)
        df = pd.DataFrame(species[game_id])
        df = pd.concat([df, df_event_count], axis=1)   
        df = df.fillna(0)
        df = df.drop(['species'], axis=1)

        df_P0_event = unique_event_0.copy()
        for column in df.columns:
            df_P0_event['P0_' + column] = df.loc[0][column]
        df_P0_event = pd.DataFrame(pd.Series(df_P0_event)).T

        df_P1_event = unique_event_1.copy()
        for column in df.columns:
            df_P1_event['P1_' + column] = df.loc[1][column]
        df_P1_event = pd.DataFrame(pd.Series(df_P1_event)).T
        
#         df_delta_event = delta_event.copy()
#         for column in df.columns:
#             df_delta_event['delta_' + column] = df_P0_event['P0_' + column][0] - df_P1_event['P1_' + column][0]
#         df_delta_event = pd.DataFrame(pd.Series(df_delta_event)).T

        out = pd.concat([df_P0_event, df_P1_event], axis=1)
#         out = pd.concat([df_P0_event, df_P1_event, df_delta_event], axis=1)
        out.index = [game_id]
        out.index.name = 'game_id'
        
        x_data.append(out)
        if answer:
            y_data.append(winners[game_id])  

    x_data = pd.concat(x_data)
    y_data = np.array(y_data)
    
    return x_data, y_data

In [23]:
x_train, _ = data_preparation(data, answer=True)

100%|███████████████████████████████████████████████████████████████████████████| 38872/38872 [03:02<00:00, 213.06it/s]


# 데이터프레임 만들기

In [24]:
game_id = list(range(38872))
time = list(data["time"].groupby(data["game_id"]).tail(1))
for i in range(len(time)):
    new_time = str(time[i])
    new_time = new_time.split(".")
    time[i] = int(new_time[0])*60 + int(new_time[1])
species_0 = list(data[data["player"] == 0]["species"].groupby(data["game_id"]).head(1))
species_1 = list(data[data["player"] == 1]["species"].groupby(data["game_id"]).head(1))
winner = list(data["winner"].groupby(data["game_id"]).head(1))
mydict = {'game_id' : game_id, 'time' : time, "species_0" : species_0, "species_1" : species_1, "winner" : winner}
X = pd.DataFrame(mydict)

In [25]:
y = X["winner"]
X = X.drop("winner", axis=1)
X = X.drop("game_id", axis=1)

In [26]:
X = pd.get_dummies(X, prefix = ["species_0", "species_1"])

In [27]:
X["p0_train_mineral"] = p0_train_mineral
X["p1_train_mineral"] = p1_train_mineral
X["p0_train_gas"] = p0_train_gas
X["p1_train_gas"] = p1_train_gas
X["p0_build_mineral"] = p0_build_mineral
X["p1_build_mineral"] = p1_build_mineral
X["p0_build_gas"] = p0_build_gas
X["p1_build_gas"] = p1_build_gas
X["p0_research_mineral"] = p0_research_mineral
X["p1_research_mineral"] = p1_research_mineral
X["p0_research_gas"] = p0_research_gas
X["p1_research_gas"] = p1_research_gas
X["p0_worker_mineral"] = p0_worker_mineral
X["p1_worker_mineral"] = p1_worker_mineral
X["p0_attack_distance_0"] = p0_attack_distance_0
X["p0_attack_distance_1"] = p0_attack_distance_1
X["p1_attack_distance_0"] = p1_attack_distance_0
X["p1_attack_distance_1"] = p1_attack_distance_1
X["p0_click_count_1"] = p0_click_count_1
X["p1_click_count_1"] = p1_click_count_1

# X["delta_train_mineral"] = X["p0_train_mineral"] - X["p1_train_mineral"]
# X["delta_train_gas"] = X["p0_train_gas"] - X["p1_train_gas"]
# X["delta_build_mineral"] = X["p0_build_mineral"] - X["p1_build_mineral"]
# X["delta_build_gas"] = X["p0_build_gas"] - X["p1_build_gas"]
# X["delta_research_mineral"] = X["p0_research_mineral"] - X["p1_research_mineral"]
# X["delta_research_gas"] = X["p0_research_gas"] - X["p1_research_gas"]
# X["delta_worker_mineral"] = X["p0_worker_mineral"] - X["p1_worker_mineral"]
# X["delta_attack_distance_0"] = X["p0_attack_distance_0"] - X["p1_attack_distance_0"] 
# X["delta_attack_distance_1"] = X["p0_attack_distance_1"] - X["p1_attack_distance_1"]
# X["delta_click_count_1"] = X["p0_click_count_1"] - X["p1_click_count_1"] 

X = pd.concat([X, x_train], axis=1)
y_forsub = y
X_forsub = X

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

In [29]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [30]:
# for data
column_list = list(X.columns) 
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(data = X, columns = column_list)
X["winner"] = y
X.to_csv("train.csv", index_label='game_id')

# 테스트 데이터에 적용하기

In [32]:
data_test = pd.read_csv('data/test.csv')

In [33]:
game_id = list(range(38872, 38872+16787))
time = list(data_test["time"].groupby(data_test["game_id"]).tail(1))
for i in range(len(time)):
    new_time = str(time[i])
    new_time = new_time.split(".")
    time[i] = int(new_time[0])*60 + int(new_time[1])
species_0 = list(data_test[data_test["player"] == 0]["species"].groupby(data_test["game_id"]).head(1))
species_1 = list(data_test[data_test["player"] == 1]["species"].groupby(data_test["game_id"]).head(1))
mydict = {'game_id' : game_id, 'time' : time, "species_0" : species_0, "species_1" : species_1}
test_X = pd.DataFrame(mydict)
test_X = pd.get_dummies(test_X, prefix = ["species_0", "species_1"])

In [34]:
data_ability = data_test[data_test['event'] == 'Ability']
data_TRB = data_ability[data_ability["event_contents"].str.contains("Train|Research|Build|Morph|Evolve|Upgrade")]

### 자원소모량 feature 만들기

In [35]:
p0_worker_mineral = [0]*16787
p1_worker_mineral = [0]*16787

p0_train_mineral = [0]*16787
p0_train_gas = [0]*16787
p1_train_mineral = [0]*16787
p1_train_gas = [0]*16787

p0_build_mineral = [0]*16787
p0_build_gas = [0]*16787
p1_build_mineral = [0]*16787
p1_build_gas = [0]*16787

p0_research_mineral = [0]*16787
p0_research_gas = [0]*16787
p1_research_mineral = [0]*16787
p1_research_gas = [0]*16787

In [36]:
for idx, row in tqdm(data_TRB.iterrows()):
    game_id = row["game_id"] - 38872
    player = row["player"]
    action = row["event_contents"].split(";")
    for i in action:
        if i in train_dict.keys():
            spend = train_dict[i]
            if player == 0:
                p0_train_mineral[game_id] += spend[0]
                p0_train_gas[game_id] += spend[1]
            if player == 1:
                p1_train_mineral[game_id] += spend[0]
                p1_train_gas[game_id] += spend[1]
        elif i in build_dict.keys():
            spend = build_dict[i]
            if player == 0:
                p0_build_mineral[game_id] += spend[0]
                p0_build_gas[game_id] += spend[1]
            if player == 1:
                p1_build_mineral[game_id] += spend[0]
                p1_build_gas[game_id] += spend[1]
        elif i in research_dict.keys():
            spend = research_dict[i]
            if player == 0:
                p0_research_mineral[game_id] += spend[0]
                p0_research_gas[game_id] += spend[1]
            if player == 1:
                p1_research_mineral[game_id] += spend[0]
                p1_research_gas[game_id] += spend[1]
        elif i in worker_dict.keys():
            spend = worker_dict[i]
            if player == 0:
                p0_worker_mineral[game_id] += spend[0]
            if player == 1:
                p1_worker_mineral[game_id] += spend[0]

1070841it [01:30, 11855.42it/s]


### 거리 feature 만들기

In [37]:
time = list(data_test["time"].groupby(data_test["game_id"]).tail(1))
for i in range(len(time)):
    new_time = str(time[i])
    new_time = new_time.split(".")
    time[i] = int(new_time[0])*60 + int(new_time[1])
p0_start_location_x = [0]*16787
p0_start_location_y = [0]*16787
p1_start_location_x = [0]*16787
p1_start_location_y = [0]*16787

In [38]:
for idx, row in tqdm(data_TRB.iterrows()):
    game_id = row["game_id"] - 38872
    player = row["player"]
    action = row["event_contents"].split(";")
    for i in action:
        if "Location" in i:
            location = re.findall("[0-9]+.[0-9]+", i)
            if player == 0:
                if p0_start_location_x[game_id] == 0:
                    p0_start_location_x[game_id] = float(location[0])
                    p0_start_location_y[game_id] = float(location[1])
            elif player == 1:
                if p1_start_location_x[game_id] == 0:
                    p1_start_location_x[game_id] = float(location[0])
                    p1_start_location_y[game_id] = float(location[1])

1070841it [01:32, 11581.22it/s]


In [39]:
data_right = data_test[data_test['event'] == 'Right Click']
data_target = data_right[data_right["event_contents"].str.contains("Target")]

In [40]:
player0_attack_0 = init_list_of_objects(16787)
player0_attack_1 = init_list_of_objects(16787)
player1_attack_0 = init_list_of_objects(16787)
player1_attack_1 = init_list_of_objects(16787)

p0_attack_distance_0 = [0] * 16787
p0_attack_distance_1 = [0] * 16787
p1_attack_distance_0 = [0] * 16787
p1_attack_distance_1 = [0] * 16787

p0_click_count_0 = [0] * 16787
p0_click_count_1 = [0] * 16787
p1_click_count_0 = [0] * 16787
p1_click_count_1 = [0] * 16787

In [41]:
for idx, row in tqdm(data_target.iterrows()):
    game_id = row["game_id"] - 38872
    player = row["player"]
    action = row["event_contents"].split(";")
    tim = str(row["time"]).split(".")
    tim = int(tim[0])*60 + int(tim[1])
    end_time = time[game_id]
    
    location = re.findall("(\d{1,3}\.\d{1,3})", row["event_contents"])
    location = list(map(float, location))
    
    if location == [] : 
        continue
        
    if tim < end_time*0.8:
        if player == 0:
            x = float(p0_start_location_x[game_id]) - float(location[0])
            y = float(p0_start_location_y[game_id]) - float(location[1])
            player0_attack_0[game_id].append((x**2 + y**2)**0.5)

        elif player == 1:
            x = float(p1_start_location_x[game_id]) - float(location[0])
            y = float(p1_start_location_y[game_id]) - float(location[1])
            player1_attack_0[game_id].append((x**2 + y**2)**0.5)

    elif tim >= end_time*0.8:
        if player == 0:
            x = float(p0_start_location_x[game_id]) - float(location[0])
            y = float(p0_start_location_y[game_id]) - float(location[1])
            player0_attack_1[game_id].append((x**2 + y**2)**0.5)            
        elif player == 1:
            x = float(p1_start_location_x[game_id]) - float(location[0])
            y = float(p1_start_location_y[game_id]) - float(location[1])
            player1_attack_1[game_id].append((x**2 + y**2)**0.5)

1138608it [02:02, 9281.39it/s]


In [42]:
data_attack = data_ability[data_ability["event_contents"].str.contains("Attack")]

In [43]:
for idx, row in tqdm(data_attack.iterrows()):
    game_id = row["game_id"] - 38872
    player = row["player"]
    action = row["event_contents"].split(";")
    tim = str(row["time"]).split(".")
    tim = int(tim[0])*60 + int(tim[1])
    end_time = time[game_id]
    
    location = re.findall("(\d{1,3}\.\d{1,3})", row["event_contents"])
    location = list(map(float, location))
    
    if location == [] : 
        continue
        
    if tim < end_time*0.8:
        if player == 0:
            x = float(p0_start_location_x[game_id]) - float(location[0])
            y = float(p0_start_location_y[game_id]) - float(location[1])
            player0_attack_0[game_id].append((x**2 + y**2)**0.5)
            player0_attack_0[game_id].append((x**2 + y**2)**0.5)
            player0_attack_0[game_id].append((x**2 + y**2)**0.5)

        elif player == 1:
            x = float(p1_start_location_x[game_id]) - float(location[0])
            y = float(p1_start_location_y[game_id]) - float(location[1])
            player1_attack_0[game_id].append((x**2 + y**2)**0.5)
            player1_attack_0[game_id].append((x**2 + y**2)**0.5)
            player1_attack_0[game_id].append((x**2 + y**2)**0.5)

    elif tim >= end_time*0.8:
        if player == 0:
            x = float(p0_start_location_x[game_id]) - float(location[0])
            y = float(p0_start_location_y[game_id]) - float(location[1])
            player0_attack_1[game_id].append((x**2 + y**2)**0.5)  
            player0_attack_1[game_id].append((x**2 + y**2)**0.5) 
            player0_attack_1[game_id].append((x**2 + y**2)**0.5) 
            
        elif player == 1:
            x = float(p1_start_location_x[game_id]) - float(location[0])
            y = float(p1_start_location_y[game_id]) - float(location[1])
            player1_attack_1[game_id].append((x**2 + y**2)**0.5)
            player1_attack_1[game_id].append((x**2 + y**2)**0.5)
            player1_attack_1[game_id].append((x**2 + y**2)**0.5)

242775it [00:27, 8861.18it/s]


In [44]:
for i in tqdm(range(len(p0_attack_distance_0))):
    if len(player0_attack_0[i]) > 0 : p0_attack_distance_0[i] = sum(player0_attack_0[i]) / len(player0_attack_0[i])
    if len(player0_attack_1[i]) > 0 : p0_attack_distance_1[i] = sum(player0_attack_1[i]) / len(player0_attack_1[i])
    if len(player1_attack_0[i]) > 0 : p1_attack_distance_0[i] = sum(player1_attack_0[i]) / len(player1_attack_0[i])
    if len(player1_attack_1[i]) > 0 : p1_attack_distance_1[i] = sum(player1_attack_1[i]) / len(player1_attack_1[i])
    p0_click_count_0[i] = len(player0_attack_0[i])
    p0_click_count_1[i] = len(player0_attack_1[i])
    p1_click_count_0[i] = len(player1_attack_0[i])
    p1_click_count_1[i] = len(player1_attack_1[i])

100%|████████████████████████████████████████████████████████████████████████| 16787/16787 [00:00<00:00, 239760.62it/s]


### Feature - 베이스라인

In [45]:
x_test, _ = data_preparation(data_test, answer=False)

100%|███████████████████████████████████████████████████████████████████████████| 16787/16787 [01:25<00:00, 195.91it/s]


### 데이터 열 추가

In [46]:
test_X["p0_train_mineral"] = p0_train_mineral
test_X["p1_train_mineral"] = p1_train_mineral
test_X["p0_train_gas"] = p0_train_gas
test_X["p1_train_gas"] = p1_train_gas
test_X["p0_build_mineral"] = p0_build_mineral
test_X["p1_build_mineral"] = p1_build_mineral
test_X["p0_build_gas"] = p0_build_gas
test_X["p1_build_gas"] = p1_build_gas
test_X["p0_research_mineral"] = p0_research_mineral
test_X["p1_research_mineral"] = p1_research_mineral
test_X["p0_research_gas"] = p0_research_gas
test_X["p1_research_gas"] = p1_research_gas
test_X["p0_worker_mineral"] = p0_worker_mineral
test_X["p1_worker_mineral"] = p1_worker_mineral
test_X["p0_attack_distance_0"] = p0_attack_distance_0
test_X["p0_attack_distance_1"] = p0_attack_distance_1
test_X["p1_attack_distance_0"] = p1_attack_distance_0
test_X["p1_attack_distance_1"] = p1_attack_distance_1
test_X["p0_click_count_1"] = p0_click_count_1
test_X["p1_click_count_1"] = p1_click_count_1
test_X = test_X.drop("game_id", axis=1)


# test_X["delta_train_mineral"] = test_X["p0_train_mineral"] - test_X["p1_train_mineral"]
# test_X["delta_train_gas"] = test_X["p0_train_gas"] - test_X["p1_train_gas"]
# test_X["delta_build_mineral"] = test_X["p0_build_mineral"] - test_X["p1_build_mineral"]
# test_X["delta_build_gas"] = test_X["p0_build_gas"] - test_X["p1_build_gas"]
# test_X["delta_research_mineral"] = test_X["p0_research_mineral"] - test_X["p1_research_mineral"]
# test_X["delta_research_gas"] = test_X["p0_research_gas"] - test_X["p1_research_gas"]
# test_X["delta_worker_mineral"] = test_X["p0_worker_mineral"] - test_X["p1_worker_mineral"]
# test_X["delta_attack_distance_0"] = test_X["p0_attack_distance_0"] - test_X["p1_attack_distance_0"] 
# test_X["delta_attack_distance_1"] = test_X["p0_attack_distance_1"] - test_X["p1_attack_distance_1"]
# test_X["delta_click_count_1"] = test_X["p0_click_count_1"] - test_X["p1_click_count_1"] 

In [47]:
x_test.reset_index(drop = True, inplace = True)
test_X.reset_index(drop = True, inplace = True)

In [48]:
test_X = pd.concat([test_X, x_test], axis=1)

In [49]:
# for data
column_list = list(test_X.columns)
test_X = scaler.transform(test_X) 
test_X = pd.DataFrame(data = test_X, columns = column_list)
test_X.to_csv("test.csv", index_label='game_id')

In [48]:
scaler = StandardScaler()
X_forsub = scaler.fit_transform(X_forsub)
test_X = scaler.transform(test_X)

In [49]:
import time
import random
import numpy as np  # 1.18.1
from numpy.random import shuffle
import pandas as pd  # 0.25.3
import torch  # 1.4.0
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data.sampler import Sampler, SequentialSampler
from torch.backends import cudnn

In [50]:
pd.DataFrame(X_forsub).to_csv("./data/X_forsub.csv", index=None)
pd.DataFrame(y_forsub).to_csv("./data/y_forsub.csv", index=None)
pd.DataFrame(test_X).to_csv("./data/test_X.csv", index=None)

# XGB

In [None]:
xg_clf = xgb.XGBClassifier(eval_metric = 'auc', objective = 'binary:logistic',colsample_bytree = 0.7, learning_rate = 0.01,
                max_depth = 6, alpha = 10, n_estimators = 2000, sub_sample=0.7)
xg_clf.fit(X_train,y_train)

In [None]:
preds = xg_clf.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
preds = xg_clf.predict_proba(X_test)
roc_auc_score(y_test, preds[:,1])

In [None]:
xg_clf2 = xgb.XGBClassifier(eval_metric = 'auc', objective = 'binary:logistic',colsample_bytree = 0.7, learning_rate = 0.01,
                max_depth = 7, alpha = 10, n_estimators = 2000, sub_sample=0.7)
xg_clf2.fit(X_train,y_train)

In [None]:
preds = xg_clf2.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
preds = xg_clf2.predict_proba(X_test)
roc_auc_score(y_test, preds[:,1])

# LGBM

In [None]:
lgbm_clf = lgb.LGBMClassifier(eval_metric = 'auc', objective = 'binary=logistic', colsample_bytree = 0.37, learning_rate = 0.01, n_estimators = 1000, num_leaves = 45, reg_alpha = 7.5, reg_lambda = 0.83, subsample = 0.79)
lgbm_clf.fit(X_train, y_train)

In [None]:
preds = lgbm_clf.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
preds = lgbm_clf.predict_proba(X_test)
roc_auc_score(y_test, preds[:,1])

In [None]:
lgbm_clf2 = lgb.LGBMClassifier(eval_metric = 'auc', objective = 'binary=logistic', colsample_bytree = 0.4, learning_rate = 0.01, n_estimators = 1000, num_leaves = 40, reg_alpha = 7, reg_lambda = 0.8, subsample = 0.75)
lgbm_clf2.fit(X_train, y_train)

In [None]:
preds = lgbm_clf2.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
preds = lgbm_clf2.predict_proba(X_test)
roc_auc_score(y_test, preds[:,1])

# SVC

In [None]:
# def svc_param_selection(X, y, nfolds):
#     Cs = [0.001, 0.01, 0.1, 1, 10]
#     gammas = [0.001, 0.01, 0.1, 1]
#     param_grid = {'C': Cs, 'gamma' : gammas}
#     grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=nfolds)
#     grid_search.fit(X, y)
#     grid_search.best_params_
#     return grid_search.best_params_

In [None]:
# svc_param_selection(X_forsub, y_forsub, 3)

In [None]:
svc = SVC(C = 1, gamma = 0.01, kernel = 'rbf', probability=True)
svc.fit(X_train, y_train)

In [None]:
preds = svc.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
preds = svc.predict_proba(X_test)
roc_auc_score(y_test, preds[:,1])

# Logistic

In [None]:
# logreg = GridSearchCV(cv=5,
#              estimator=LogisticRegression(C=1.0),
#              param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]})
# logreg.fit(X_train, y_train)
# print(logreg.best_params_, logreg.best_score_)

In [None]:
logreg = LogisticRegression(C = 0.001)
logreg.fit(X_train, y_train)

In [None]:
preds = logreg.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
preds = logreg.predict_proba(X_test)
roc_auc_score(y_test, preds[:,1])

# RF

In [None]:
RF = RandomForestClassifier(n_estimators = 1000)
RF.fit(X_train, y_train)

In [None]:
preds = RF.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
preds = RF.predict_proba(X_test)
roc_auc_score(y_test, preds[:,1])

# Catboost

In [None]:
# CB = CatBoostClassifier(task_type="GPU", verbose=False, eval_metric="AUC", border_count=254)
# params = {'depth':[6,8,10],
#           'iterations':[250,100,500,1000],
#           'l2_leaf_reg':[1,3,5,7,9]}
# grid_search_result = CB.grid_search(params, 
#                                        X=X_train, 
#                                        y=y_train, 
#                                        plot=True)

In [None]:
CB = CatBoostClassifier(task_type="GPU", verbose=False, eval_metric="AUC", border_count=254, depth=8, l2_leaf_reg=3, iterations=1000)
CB.fit(X_train,y_train)

In [None]:
preds = CB.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
preds = CB.predict_proba(X_test)
roc_auc_score(y_test, preds[:,1])

In [None]:
CB2 = CatBoostClassifier(task_type="GPU", verbose=False, eval_metric="AUC", border_count=254, depth=8, l2_leaf_reg=1, iterations=1000)
CB2.fit(X_train,y_train)

In [None]:
preds = CB2.predict(X_test)
roc_auc_score(y_test, preds)

In [None]:
preds = CB2.predict_proba(X_test)
roc_auc_score(y_test, preds[:,1])

----

# Ensemble

In [79]:
# Example of DCS techniques
from deslib.dcs import OLA
from deslib.dcs import MCB
# Example of DES techniques
from deslib.des import KNORAE
from deslib.des import DESP
from deslib.des import KNORAU
from deslib.des import METADES
from deslib.static import StackedClassifier
from sklearn.ensemble import VotingClassifier

In [80]:
pool_classifiers = [CB, CB2, xg_clf, xg_clf2, lgbm_clf, lgbm_clf2, svc, logreg, RF]
voting_classifiers = [("CB", CB),
                      ("CB2", CB2),
                      ("xgb", xg_clf),
                      ("xgb2", xg_clf2),
                      ("lgbm", lgbm_clf),
                      ("lgbm2", lgbm_clf2),
                      ("svc", svc),
                      ("logreg", logreg),
                      ("RF", RF)]
model_voting = VotingClassifier(estimators=voting_classifiers).fit(
    X_train, y_train)

# Initializing the techniques
knorau = KNORAU(pool_classifiers)
# kne = KNORAE(pool_classifiers)
# desp = DESP(pool_classifiers)
# metades = METADES(pool_classifiers, mode='hybrid')
# # DCS techniques
# ola = OLA(pool_classifiers)
# mcb = MCB(pool_classifiers)

stacked_lr = StackedClassifier(pool_classifiers, random_state=77)
stacked_dt = StackedClassifier(pool_classifiers,
                               random_state=77,
                               meta_classifier=DecisionTreeClassifier())

knorau.fit(X_train, y_train)
# kne.fit(X_train, y_train)
# desp.fit(X_train, y_train)
# metades.fit(X_train, y_train)
# ola.fit(X_train, y_train)
# mcb.fit(X_train, y_train)

stacked_lr.fit(X_train, y_train)
# stacked_dt.fit(X_train, y_train)

# # Fitting the DS techniques
# knorau.fit(X_dsel, y_dsel)
# kne.fit(X_dsel, y_dsel)
# desp.fit(X_dsel, y_dsel)
# metades.fit(X_dsel, y_dsel)
# ola.fit(X_dsel, y_dsel)
# mcb.fit(X_dsel, y_dsel)

# # Fitting the tacking models
# stacked_lr.fit(X_dsel, y_dsel)
# stacked_dt.fit(X_dsel, y_dsel)

print('Evaluating DS techniques:')
print('Classification accuracy of Majority voting the pool: ',
      model_voting.score(X_test, y_test))
print('Classification accuracy of KNORA-U: ', knorau.score(X_test, y_test))
preds = knorau.predict_proba(X_test)
print(roc_auc_score(y_test, preds[:,1]))
# print('Classification accuracy of KNORA-E: ', kne.score(X_test, y_test))
# print('Classification accuracy of DESP: ', desp.score(X_test, y_test))
# print('Classification accuracy of META-DES: ', metades.score(X_test, y_test))
# print('Classification accuracy of OLA: ', ola.score(X_test, y_test))
print('Classification accuracy Stacking LR', stacked_lr.score(X_test, y_test))
preds = stacked_lr.predict_proba(X_test)
print(roc_auc_score(y_test, preds[:,1]))
# print('Classification accuracy Stacking DT', stacked_dt.score(X_test, y_test))


Evaluating DS techniques:
Classification accuracy of Majority voting the pool:  0.6284244372990354
Classification accuracy of KNORA-U:  0.630096463022508
0.6800292417163468
Classification accuracy Stacking LR 0.6295819935691318
0.6803340855505478


------------------------

# 제출

In [82]:
xg_clf = xgb.XGBClassifier(eval_metric = 'auc', objective = 'binary:logistic',colsample_bytree = 0.7, learning_rate = 0.01,
                max_depth = 6, alpha = 10, n_estimators = 2000, sub_sample=0.7)
xg_clf.fit(X_forsub,y_forsub)

xg_clf2 = xgb.XGBClassifier(eval_metric = 'auc', objective = 'binary:logistic',colsample_bytree = 0.7, learning_rate = 0.01,
                max_depth = 7, alpha = 10, n_estimators = 2000, sub_sample=0.7)
xg_clf2.fit(X_forsub,y_forsub)

lgbm_clf = lgb.LGBMClassifier(eval_metric = 'auc', objective = 'binary=logistic', colsample_bytree = 0.37, learning_rate = 0.01, n_estimators = 1000, num_leaves = 45, reg_alpha = 7.5, reg_lambda = 0.83, subsample = 0.79)
lgbm_clf.fit(X_forsub, y_forsub)

lgbm_clf2 = lgb.LGBMClassifier(eval_metric = 'auc', objective = 'binary=logistic', colsample_bytree = 0.4, learning_rate = 0.01, n_estimators = 1000, num_leaves = 40, reg_alpha = 7, reg_lambda = 0.8, subsample = 0.75)
lgbm_clf2.fit(X_forsub, y_forsub)

CB = CatBoostClassifier(task_type="GPU", verbose=False, eval_metric="AUC", border_count=254, depth=8, l2_leaf_reg=3, iterations=1000)
CB.fit(X_forsub,y_forsub)

CB2 = CatBoostClassifier(task_type="GPU", verbose=False, eval_metric="AUC", border_count=254, depth=8, l2_leaf_reg=1, iterations=1000)
CB2.fit(X_forsub,y_forsub)

svc = SVC(C = 1, gamma = 0.01, kernel = 'rbf', probability=True)
svc.fit(X_forsub, y_forsub)

logreg = LogisticRegression(C = 0.001)
logreg.fit(X_forsub, y_forsub)

RF = RandomForestClassifier(n_estimators = 1000)
RF.fit(X_forsub, y_forsub)

pool_classifiers = [CB, CB2, xg_clf, xg_clf2, lgbm_clf, lgbm_clf2, svc, logreg, RF]

knorau = KNORAU(pool_classifiers)

knorau.fit(X_forsub, y_forsub)

result = knorau.predict_proba(test_X)[:,1]

In [83]:
game_id = list(range(38872, 38872+16787))
mydict = {'game_id' : game_id, "winner" : result}
sub = pd.DataFrame(mydict)
sub.to_csv("final_sub2.csv", index=False)