# Path

In [1]:
import os
import sys
from pprint import pprint

# Path of the script that started this kernel.
print(sys.argv[0])

# Show where the notebook is running from, in this order:
# current working directory (three equivalent spellings) and its parent.
for shown_path in (
        os.getcwd(),                 # current working directory
        os.path.abspath('.'),        # current working directory
        os.path.abspath('..'),       # parent of the current working directory
        os.path.abspath(os.curdir),  # current working directory
):
    print(shown_path)
# print(os.path.abspath('..')+'/src/sample_code_submission')
# os.chdir(os.path.abspath('..')+'/src/sample_code_submission')
# os.path.abspath('..')+'/src/sample_code_submission'

/Users/xijunli/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py
/Users/xijunli/Desktop/KDDCup2019/jupyter
/Users/xijunli/Desktop/KDDCup2019/jupyter
/Users/xijunli/Desktop/KDDCup2019
/Users/xijunli/Desktop/KDDCup2019/jupyter


# Data Input

In [65]:
# pylint: disable=wrong-import-order, wrong-import-position, import-error
# pylint: disable=missing-docstring
import base64
from datetime import datetime
import os
from os.path import join
import sys

#  os.system("pip3 install cryptography")


def mprint(msg):
    """Print ``msg`` as an INFO line prefixed with the current local time.

    The timestamp is formatted as ``MM-DD HH:MM:SS``. Returns ``None``.
    """
    now = datetime.now()
    cur_time = now.strftime('%m-%d %H:%M:%S')
    line = f"INFO  [{cur_time}] {msg}"
    print(line)


mprint("Import Model")

import json
import signal
import time
from contextlib import contextmanager
import numpy as np
import pandas as pd
import math

# Column-type tag -> dtype handed to pandas.read_csv by read_train/read_test.
# Every non-numeric tag (including 'time') is read as a plain string;
# only 'num' columns are read as 64-bit floats.
TYPE_MAP = dict.fromkeys(('time', 'cat', 'multi-cat'), str)
TYPE_MAP['num'] = np.float64


class TimeoutException(Exception):
    """Raised when a step guarded by ``Timer.time_limit`` exceeds the budget."""


class Timer:
    """Track a time budget shared by successive ``time_limit`` blocks.

    Attributes:
        total: budget in seconds passed to ``set`` (``None`` before ``set``).
        exec: seconds spent inside ``time_limit`` blocks since the last ``set``.
        remain: whole seconds left, i.e. ``ceil(total - exec)``.
        duration: seconds spent inside ``time_limit`` blocks over the lifetime
            of this object; unlike ``exec`` it is not reset by ``set``.
    """

    def __init__(self):
        self.duration = 0
        self.total = None
        self.remain = None
        self.exec = None

    def set(self, time_budget):
        """Start a new budget of ``time_budget`` seconds and reset ``exec``."""
        self.total = time_budget
        self.remain = time_budget
        self.exec = 0

    @contextmanager
    def time_limit(self, pname):
        """Run the ``with`` body under a SIGALRM alarm for the remaining budget.

        Args:
            pname: label of the guarded step, used only in the log line.

        Raises:
            TimeoutException: immediately if no budget is left, or from the
                alarm handler once the remaining budget elapses.
        """
        # signal.alarm() only takes whole seconds, so round a float budget up.
        alarm_secs = math.ceil(self.remain)
        # alarm(0) *cancels* the alarm instead of firing, which would let the
        # body run with no limit at all; an exhausted budget is a timeout.
        if alarm_secs <= 0:
            raise TimeoutException("Timed out!")

        def signal_handler(signum, frame):
            raise TimeoutException("Timed out!")

        previous_handler = signal.signal(signal.SIGALRM, signal_handler)
        signal.alarm(alarm_secs)
        start_time = time.time()
        succeeded = False
        try:
            yield
            succeeded = True
        finally:
            exec_time = time.time() - start_time
            signal.alarm(0)
            # Put back whatever handler was installed before this block;
            # None means it was not installed from Python and cannot be re-set.
            if previous_handler is not None:
                signal.signal(signal.SIGALRM, previous_handler)
            self.exec += exec_time
            self.duration += exec_time
            remain_time = math.ceil(self.total - self.exec)
            self.remain = remain_time

            # Only claim success when the body actually finished.
            outcome = 'success' if succeeded else 'failed'
            mprint(f'{pname} {outcome}, time spent so far {self.exec} sec')


def read_train(datapath, info):
    """Load every training table listed in ``info`` together with the labels.

    Args:
        datapath: dataset directory; files are read from its ``train`` folder.
        info: dict whose ``'tables'`` entry maps table name to a
            ``{column: type tag}`` dict (tags are the keys of ``TYPE_MAP``).

    Returns:
        ``(train_data, train_label)``: a dict of table name -> DataFrame, and
        the ``'label'`` column of ``main_train.solution``.
    """
    def to_datetime(millisecs):
        # NaN is passed through unchanged; anything else is treated as
        # milliseconds since the epoch and converted with fromtimestamp().
        as_float = float(millisecs)
        if np.isnan(as_float):
            return millisecs
        return datetime.fromtimestamp(as_float / 1000)

    train_dir = join(datapath, 'train')
    train_data = {}
    for table_name, columns in info['tables'].items():
        mprint(f'Table name: {table_name}')

        # Per-column dtypes, and the columns that must be parsed as dates.
        dtypes = {}
        time_columns = []
        for column, tag in columns.items():
            dtypes[column] = TYPE_MAP[tag]
            if tag == 'time':
                time_columns.append(column)

        # The main table is stored under a different file name.
        if table_name == 'main':
            file_name = 'main_train.data'
        else:
            file_name = f'{table_name}.data'

        train_data[table_name] = pd.read_csv(
            join(train_dir, file_name),
            sep='\t',
            dtype=dtypes,
            parse_dates=time_columns,
            date_parser=to_datetime)

    # get train label
    solution = pd.read_csv(join(train_dir, 'main_train.solution'))
    return train_data, solution['label']


def read_info(datapath):
    """Load ``train/info.json`` from ``datapath`` and return it as a dict.

    Logs a start message and the ``time_budget`` value found in the file.
    """
    mprint('Read info')
    info_path = join(datapath, 'train', 'info.json')
    with open(info_path, 'r') as info_fp:
        raw_text = info_fp.read()
    info = json.loads(raw_text)
    mprint(f'Time budget for this task is {info["time_budget"]} sec')
    return info


def read_test(datapath, info):
    """Load ``test/main_test.data`` using the main table's column types.

    Args:
        datapath: dataset directory containing the ``test`` folder.
        info: dict whose ``info['tables']['main']`` maps column name to a
            type tag (one of the keys of ``TYPE_MAP``).

    Returns:
        The test table as a DataFrame, with ``'time'`` columns parsed as dates.
    """
    def to_datetime(millisecs):
        # NaN is passed through unchanged; anything else is treated as
        # milliseconds since the epoch and converted with fromtimestamp().
        as_float = float(millisecs)
        if np.isnan(as_float):
            return millisecs
        return datetime.fromtimestamp(as_float / 1000)

    # get test data
    dtypes = {}
    time_columns = []
    for column, tag in info['tables']['main'].items():
        dtypes[column] = TYPE_MAP[tag]
        if tag == 'time':
            time_columns.append(column)

    return pd.read_csv(
        join(datapath, 'test', 'main_test.data'),
        sep='\t',
        dtype=dtypes,
        parse_dates=time_columns,
        date_parser=to_datetime)


def write_predict(output_dir, dataname, prediction):
    """Write ``prediction`` to ``<output_dir>/<dataname>.predict``.

    The file has a single column headed ``label`` and no index column.
    ``output_dir`` is created if it does not exist.

    Args:
        output_dir: directory that receives the prediction file.
        dataname: dataset name, used as the file stem.
        prediction: pandas Series of predicted values.
    """
    os.makedirs(output_dir, exist_ok=True)
    target = join(output_dir, f'{dataname}.predict')
    # rename() without inplace returns a renamed copy, so the caller's Series
    # keeps its own name instead of being silently changed to 'label'.
    prediction.rename('label').to_csv(target, index=False, header=True)


INFO  [04-25 20:33:04] Import Model


In [66]:
# Root of the starting kit. Set the KDD_STARTING_KIT_DIR environment variable
# to run on another machine; the fallback is the path used when this notebook
# was first written.
ROOT_DIR = os.path.abspath(os.environ.get(
    'KDD_STARTING_KIT_DIR',
    '/Users/xijunli/Desktop/KDDCup2019/starting_kit_0401/'))
DIRS = {
        'input': join(ROOT_DIR, 'sample_data'),
        'output': join(ROOT_DIR, 'sample_predictions'),
        'program': join(ROOT_DIR, 'ingestion_program'),
        'submission': join(ROOT_DIR, 'sample_code_submission')
}

# Each dataset is a sub-directory of the input folder. Keep directories only:
# a stray file such as .DS_Store would sort first and be picked as the dataset.
datanames = sorted(
    entry for entry in os.listdir(DIRS['input'])
    if os.path.isdir(join(DIRS['input'], entry)))
mprint(f'Datanames: {datanames}')
if not datanames:
    raise FileNotFoundError(
        f"no dataset directories found in {DIRS['input']}")
timer = Timer()
# Only the first dataset (in sorted order) is loaded in this notebook.
dataname = datanames[0]
predictions = {}
mprint(f'Read data: {dataname}')
datapath = join(DIRS['input'], dataname)
info = read_info(datapath)
timer.set(info['time_budget'])
train_data, train_label = read_train(datapath, info)

INFO  [04-25 20:33:04] Datanames: ['K', 'L']
INFO  [04-25 20:33:04] Read data: K
INFO  [04-25 20:33:04] Read info
INFO  [04-25 20:33:04] Time budget for this task is 6000 sec
INFO  [04-25 20:33:04] Table name: main
INFO  [04-25 20:33:04] Table name: table_1
INFO  [04-25 20:33:05] Table name: table_2
INFO  [04-25 20:33:05] Table name: table_3


# AutoML Model

## Construct Model

In [68]:
# Switch into the submission directory, resolved relative to the parent of the
# current working directory, then print where we ended up.
# NOTE(review): the target depends on the cwd, so re-running this cell resolves
# '..' from the new location — confirm this is intended.
os.chdir(os.path.abspath('..')+'/sample_code_submission/')
print(os.path.abspath('.'))

import copy
import numpy as np
import pandas as pd

# Project-local modules; presumably found via the directory entered above,
# which is why the chdir comes first — TODO confirm.
from automl import predict, train, validate
from CONSTANT import MAIN_TABLE_NAME
from merge import merge_table
from preprocess import clean_df, clean_tables, feature_engineer
from util import Config, log, show_dataframe, timeit

# Build the run configuration from the dataset info and take a deep copy of
# the loaded training tables to work on.
config = Config(info)
tables = copy.deepcopy(train_data)

/Users/xijunli/Desktop/KDDCup2019/src/sample_code_submission


## fit

### clean_tables

In [69]:
import os
import time
from collections import defaultdict, deque

import numpy as np
import pandas as pd

import CONSTANT
from util import Config, Timer, log, timeit

# Not referenced again in this cell.
NUM_OP = [np.std, np.mean]

# Run clean_df on every table; its return value is not used.
for tname, table in tables.items():
    log(f"cleaning table {tname}")
    clean_df(table)

----cleaning table main

----Start [clean_df]:
--------Start [fillna]:
--------End   [fillna]. Time elapsed: 0.01 sec.
----End   [clean_df]. Time elapsed: 0.01 sec.
----cleaning table table_1

----Start [clean_df]:
--------Start [fillna]:
--------End   [fillna]. Time elapsed: 0.02 sec.
----End   [clean_df]. Time elapsed: 0.02 sec.
----cleaning table table_2

----Start [clean_df]:
--------Start [fillna]:
--------End   [fillna]. Time elapsed: 0.01 sec.
----End   [clean_df]. Time elapsed: 0.01 sec.
----cleaning table table_3

----Start [clean_df]:
--------Start [fillna]:
--------End   [fillna]. Time elapsed: 0.00 sec.
----End   [clean_df]. Time elapsed: 0.00 sec.


### X = merge_table(Xs, self.config)

In [70]:
from collections import defaultdict, deque

import numpy as np
import pandas as pd

import CONSTANT
from util import Config, Timer, log, timeit
from merge import bfs, dfs
# def merge_table(tables, config):

# Adjacency list over tables: every relation is stored in both directions.
graph = defaultdict(list)
for rel in config['relations']:
    ta, tb = rel['table_A'], rel['table_B']
    # Forward edge keeps the relation type as declared.
    forward_edge = dict(to=tb, key=rel['key'], type=rel['type'])
    graph[ta].append(forward_edge)
    # Backward edge flips the '_'-separated parts of the type,
    # e.g. 'many_to_one' becomes 'one_to_many'.
    flipped_type = '_'.join(reversed(rel['type'].split('_')))
    backward_edge = dict(to=ta, key=rel['key'], type=flipped_type)
    graph[tb].append(backward_edge)

# Walk the graph from the main table, then join everything into X.
bfs(CONSTANT.MAIN_TABLE_NAME, graph, config['tables'])
X = dfs(CONSTANT.MAIN_TABLE_NAME, config, tables, graph)

----enter main
----enter table_1
----leave table_1
----join main <--many_to_one--nt table_1

----Start [join]:
----End   [join]. Time elapsed: 0.03 sec.
----enter table_2
----leave table_2
----join main <--many_to_one--nt table_2

----Start [join]:
----End   [join]. Time elapsed: 0.02 sec.
----enter table_3
----leave table_3
----join main <--many_to_one--nt table_3

----Start [join]:
----End   [join]. Time elapsed: 0.01 sec.
----leave main


### Feature Engineer

In [71]:
# Clean the merged table, then run feature engineering on a deep copy of it.
# The return values are discarded and X_fe is used by later cells, so both
# helpers presumably modify their argument in place — TODO confirm.
clean_df(X)
X_fe = copy.deepcopy(X)
feature_engineer(X_fe,config)


----Start [clean_df]:
--------Start [fillna]:
--------End   [fillna]. Time elapsed: 0.03 sec.
----End   [clean_df]. Time elapsed: 0.03 sec.

----Start [feature_engineer]:
--------Start [transform_categorical_hash]:
--------End   [transform_categorical_hash]. Time elapsed: 0.12 sec.

--------Start [transform_datetime]:
--------End   [transform_datetime]. Time elapsed: 0.01 sec.
----End   [feature_engineer]. Time elapsed: 0.13 sec.


In [None]:
# Toy frame: seven rows spread over three keys.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K1', 'K3', 'K1', 'K1', 'K3'],
    'A': [1, 2, 3, 4, 5, 6, 7],
    'B': [3, 5, 2, 5, 2, 3, 2],
})

# Rolling sum of A and B over a 3-row window within each key, then drop the
# key level (level 0) so the result is indexed by the original row labels.
per_key_windows = left.groupby(["key"]).rolling(3)
window_sums = per_key_windows.agg({"A": "sum", "B": "sum"})
left_rolling = window_sums.reset_index(0, drop=True)

left.loc[0]

### Train

In [64]:
from typing import Dict, List

import hyperopt
import lightgbm as lgb
import numpy as np
import pandas as pd
from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

from util import Config, log, timeit

In [73]:
def data_sample(X: pd.DataFrame, y: pd.Series, nrows: int=5000):
    """Return at most ``nrows`` rows of ``X`` with the matching entries of ``y``.

    When ``X`` has more than ``nrows`` rows, a sample drawn with
    ``random_state=1`` is returned and ``y`` is indexed by the sampled rows'
    index. Otherwise ``X`` and ``y`` are returned as they are.

    Returns:
        ``(X_sample, y_sample)``
    """
    # -> (pd.DataFrame, pd.Series):
    if len(X) <= nrows:
        return X, y

    sampled_rows = X.sample(nrows, random_state=1)
    return sampled_rows, y[sampled_rows.index]

def data_split(X: pd.DataFrame, y: pd.Series, test_size: float=0.2):
    """Split ``X`` and ``y`` with ``train_test_split`` using ``random_state=1``.

    Returns whatever ``train_test_split`` returns for ``(X, y)``; the caller
    unpacks it as ``X_train, X_val, y_train, y_val``.
    """
    #  -> (pd.DataFrame, pd.Series, pd.DataFrame, pd.Series):
    split_parts = train_test_split(
        X, y, random_state=1, test_size=test_size)
    return split_parts

# Fixed LightGBM settings shared by every hyperopt trial.
params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "seed": 1,
        "num_threads": 4
    }

# Tune on at most 30000 rows, split half/half into train and validation.
X_sample, y_sample = data_sample(X_fe, train_label, 30000)

X_train, X_val, y_train, y_val = data_split(X_sample, y_sample, test_size=0.5)
# Dedicated names for the LightGBM datasets: `train_data` already holds the
# dict of raw tables returned by read_train(), and overwriting it here would
# break re-running the earlier `tables = copy.deepcopy(train_data)` cell.
lgb_train_data = lgb.Dataset(X_train, label=y_train)
lgb_valid_data = lgb.Dataset(X_val, label=y_val)

# Search space explored by hyperopt.
space = {
        "learning_rate": hp.loguniform("learning_rate", np.log(0.01), np.log(0.5)),
        "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6]),
        "num_leaves": hp.choice("num_leaves", np.linspace(10, 200, 50, dtype=int)),
        "feature_fraction": hp.quniform("feature_fraction", 0.5, 1.0, 0.1),
        "bagging_fraction": hp.quniform("bagging_fraction", 0.5, 1.0, 0.1),
        "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 50, 10, dtype=int)),
        "reg_alpha": hp.uniform("reg_alpha", 0, 2),
        "reg_lambda": hp.uniform("reg_lambda", 0, 2),
        "min_child_weight": hp.uniform('min_child_weight', 0.5, 10),
}

def objective(hyperparams):
    """Train one LightGBM model with ``hyperparams`` and return its hyperopt result.

    Trains for up to 300 rounds with early stopping after 30 rounds, then
    reads the best validation score for the metric named in ``params``.
    """
    model = lgb.train({**params, **hyperparams}, lgb_train_data, 300,
                          lgb_valid_data, early_stopping_rounds=30, verbose_eval=0)

    score = model.best_score["valid_0"][params["metric"]]
    # hyperopt minimises the loss, while a higher AUC is better,
    # so the score is negated.
    return {'loss': -score, 'status': STATUS_OK}

trials = Trials()
best = hyperopt.fmin(fn=objective, space=space, trials=trials,
                         algo=tpe.suggest, max_evals=10, verbose=1,
                         rstate=np.random.RandomState(1))

# Map hyperopt's raw result (indices for hp.choice) back to parameter values.
hyperparams = space_eval(space, best)
log(f"auc = {-trials.best_trial['result']['loss']:0.4f} {hyperparams}")


# hyperparams = hyperopt_lightgbm(X_sample, y_sample, params, config)

# X_train, X_val, y_train, y_val = data_split(X, y, 0.1)
# train_data = lgb.Dataset(X_train, label=y_train)
# valid_data = lgb.Dataset(X_val, label=y_val)

# config["model"] = lgb.train({**params, **hyperparams},
#                                 train_data,
#                                 500,
#                                 valid_data,
#                                 early_stopping_rounds=30,
#                                 verbose_eval=100)

100%|██████████| 10/10 [00:01<00:00,  7.65it/s, best loss: -0.6957329351390847]
----auc = 0.6957 {'bagging_fraction': 0.8, 'bagging_freq': 22, 'feature_fraction': 0.5, 'learning_rate': 0.11643157545599682, 'max_depth': 6, 'min_child_weight': 9.195866326847561, 'num_leaves': 145, 'reg_alpha': 0.8629175589887947, 'reg_lambda': 1.935131592856683}


## Testing Featuretools

In [16]:
from featuretools.primitives import Day, Percentile, CumMean, CumSum
import featuretools as ft
import pandas as pd
import numpy as np
# Toy time series: 12 daily timesteps, cycling through three observation ids.
# pd.date_range replaces the pd.DatetimeIndex(start=..., freq=..., periods=...)
# constructor form, which was deprecated in pandas 0.24 and later removed.
timesteps = pd.DataFrame({'ts_id': range(12),
                          'timestamp': pd.date_range(start='1/1/2018', freq='1D', periods=12),
                          'attr1': np.random.random(12),
                          'obs_id': [1, 2, 3] * 4})
# print(timesteps)

# Register the frame as the "timesteps" entity, then split out an
# "observations" entity keyed by obs_id.
entityset = ft.EntitySet("timeseries")
entityset.entity_from_dataframe("timesteps",
                                timesteps,
                                index='ts_id',
                                time_index='timestamp')
entityset.normalize_entity(base_entity_id='timesteps',
                           new_entity_id='observations',
                           index='obs_id',
                           make_time_index=True)

entityset['observations']

# # per timestep
# cutoffs = timesteps[['ts_id', 'timestamp']]
# feature_matrix, feature_list = ft.dfs(entityset=entityset,
#                                       target_entity='timesteps',
#                                       cutoff_time=cutoffs,
#                                       trans_primitives=[Day, Percentile, CumMean, CumSum],
#                                       agg_primitives=[])

# entityset = ft.EntitySet("timeseries")
# entityset.entity_from_dataframe("timesteps",
#                                 timesteps,
#                                 index='ts_id',
#                                 time_index='timestamp')
# entityset.normalize_entity(base_entity_id='timesteps',
#                            new_entity_id='observations',
#                            index='obs_id',
#                            make_time_index=True)

# # per timestep
# cutoffs = timesteps[['ts_id', 'timestamp']]
# feature_matrix, feature_list = ft.dfs(entityset=entityset,
#                                       target_entity='timesteps',
#                                       cutoff_time=cutoffs,
#                                       trans_primitives=[Day, Percentile, CumMean, CumSum],
#                                       agg_primitives=[])

# feature_list

  


Entity: observations
  Variables:
    obs_id (dtype: index)
    first_timesteps_time (dtype: datetime_time_index)
  Shape:
    (Rows: 3, Columns: 2)