In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, GroupKFold

from ocha.dataset.cross_validator import CrossValidator
from ocha.config.version import Version
from ocha.config.config import GlobalConfig
from ocha.common.notification import Stdout
from ocha.common.logger import StdoutLogger, FileLogger

from module.context import Atmacup15Context
from module.preprocess import Atmacup15Preprocess
from module.metrics import RMSE
from module.lgb.experiment import Atmacup15Experiment, Atmacup15ExperimentConfig
from module.dataset import Atmacup15Dataset

# Config

In [2]:
# Global settings for this run; several of these values are forwarded to the
# experiment config built further down in the notebook.
# NOTE(review): the version number (1) and the file-logger name ("v1") are
# written out separately — presumably they are meant to stay in sync; confirm
# when bumping the version.
conf = GlobalConfig(
    version=Version(1),
    logger=StdoutLogger(),
    file_logger=FileLogger("v1"),
    notification=Stdout(),
    n_fold=5,  # number of folds handed to the experiment
    seed=1013,  # forwarded to the experiment config as its seed
    optimize=False,
    debug=False,  # forwarded to the experiment config as is_debug
    is_local=True,
    remake=False  # when True the notebook calls exp.remake() instead of exp.run()
)

Experiment [v1] Start.


# Load Data

In [3]:
# Read the three competition tables, in the same order as before.
train, test, anime = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../input/train.csv",
        "../../input/test.csv",
        "../../input/anime.csv",
    )
)

In [4]:
# Submission template shipped with the competition data.
sample_submission = pd.read_csv(
    filepath_or_buffer="../../input/sample_submission.csv",
)

In [5]:
# Peek at the first rows of the submission template.
sample_submission.head(n=5)

Unnamed: 0,score
0,6.478691
1,2.513589
2,2.212736
3,6.608664
4,6.339157


In [6]:
# Give test a constant placeholder score column (0), then stack it under train
# with a fresh 0..n-1 index so both can be preprocessed as one frame.
test["score"] = 0
train_test = pd.concat([train, test], axis=0, ignore_index=True)

In [7]:
# Attach the anime metadata to every (user, anime) row.
# validate="many_to_one" makes pandas raise MergeError if an anime_id appears
# more than once in `anime`. Without the check, such a duplicate would silently
# add rows and shift the train/test boundary that later cells take at len(train).
train_test_anime = train_test.merge(
    anime,
    on="anime_id",
    how="left",
    validate="many_to_one",
).reset_index(drop=True)

In [8]:
# Out-of-fold frame: the id columns and score of the training rows only
# (the first len(train) rows of the combined frame).
sample_oof = train_test_anime.iloc[: len(train)][
    ["user_id", "anime_id", "score"]
].copy()

In [9]:
# Peek at the first rows of the out-of-fold frame.
sample_oof.head(n=5)

Unnamed: 0,user_id,anime_id,score
0,0008e10fb39e55447333,0669cc0219d468761195,2
1,0008e10fb39e55447333,111adb8835b8a1a2cf54,10
2,0008e10fb39e55447333,1fc8683c393432a2f9c7,1
3,0008e10fb39e55447333,2290175205d55e81b197,8
4,0008e10fb39e55447333,28f173b60331d5cabb0d,9


# Preprocess

In [10]:
# Run the project preprocessing over the combined train+test+anime frame and
# show the first rows of the result.
# NOTE(review): preprocess() is called before get_procesed(); presumably it
# computes and stores the frame that get_procesed() returns — confirm in
# module.preprocess.
preprocess = Atmacup15Preprocess(source=train_test_anime)
preprocess.preprocess()
preprocess.get_procesed().head()

Unnamed: 0,user_id,anime_id,score,genres,japanese_name,type,episodes,aired,producers,licensors,...,user_ip_h_118,user_ip_h_119,user_ip_h_120,user_ip_h_121,user_ip_h_122,user_ip_h_123,user_ip_h_124,user_ip_h_125,user_ip_h_126,user_ip_h_127
0,0008e10fb39e55447333,0669cc0219d468761195,2,"Action, Adventure, Shounen",ジョジョの奇妙な冒険 黄金の風,TV,39,"Oct 6, 2018 to Jul 28, 2019","Warner Bros. Japan, KlockWorx, Medicos Enterta...",VIZ Media,...,0.0,0.022066,0.0,0.0,0.327652,0.0,0.0,0.0,0.0,0.121932
1,0008e10fb39e55447333,111adb8835b8a1a2cf54,10,"Action, Dementia, Demons, Horror, Supernatural",DEVILMAN crybaby,ONA,10,"Jan 5, 2018","Aniplex, Dynamic Planning, Netflix",Unknown,...,0.0,0.06966,0.0,0.0,0.114541,0.0,0.0,0.0,0.0,0.111935
2,0008e10fb39e55447333,1fc8683c393432a2f9c7,1,"Hentai, Yaoi",ぼくのぴこ,OVA,1,"Sep 7, 2006",Natural High,Unknown,...,0.0,0.0,0.007423,0.0,0.0,0.0,0.055869,0.0,0.0,0.0
3,0008e10fb39e55447333,2290175205d55e81b197,8,"Action, Slice of Life, Comedy, Supernatural",モブサイコ100,TV,12,"Jul 11, 2016 to Sep 27, 2016","Warner Bros. Japan, KlockWorx, BS Fuji, Hakuho...","Funimation, Crunchyroll",...,0.0,0.0212,0.0,0.0,0.254781,0.0,0.0,0.0,0.0,0.219887
4,0008e10fb39e55447333,28f173b60331d5cabb0d,9,"Comedy, Shounen, Sports",はじめの一歩 -Champion Road-,Special,1,"Apr 18, 2003",Unknown,"Discotek Media, Geneon Entertainment USA",...,0.0,0.0,0.0,0.0,0.0,0.129102,0.0,0.0,0.0,0.0


In [11]:
# Split the processed frame back into train and test by position: the combined
# frame was built with the train rows first, followed by the test rows.
# NOTE(review): this assumes preprocessing keeps the row order — confirm in
# module.preprocess.
train_processed = preprocess.get_procesed()[:len(train)].reset_index(drop=True)
test_processed = preprocess.get_procesed()[len(train):].reset_index(drop=True)

# The positional split is only valid if preprocessing kept the row count;
# fail loudly here instead of training on misaligned rows.
assert len(train_processed) == len(train), "processed frame has fewer rows than train"
assert len(test_processed) == len(test), "processed row count does not match len(train) + len(test)"

# Context

In [12]:
# Bundle the processed train/test frames together with sample_oof and
# sample_submission; the experiment created below receives this object as its
# context.
context = Atmacup15Context(train_processed, test_processed, sample_oof, sample_submission)

# Fold

In [13]:
class UnknownUserKFold:
    """K-fold splitter whose validation folds also contain fully held-out groups.

    Two splitters are combined: a shuffled ``KFold`` with ``n_splits_cv`` folds
    and a ``GroupKFold`` with ``n_splits_uu`` folds. For fold ``i`` the
    validation indices are the union of the ``i``-th ``KFold`` test indices and
    the ``i``-th ``GroupKFold`` test indices; those group rows are removed from
    the training indices. Only the first ``n_splits_cv`` of the ``n_splits_uu``
    group folds are used.

    Validation sets of different folds can overlap: a row may sit in one
    fold's ``KFold`` test part and in another fold's group part.

    Args:
        n_splits_cv: number of folds yielded (folds of the shuffled ``KFold``).
        n_splits_uu: number of ``GroupKFold`` folds; must be >= ``n_splits_cv``.
        random_state: seed for the shuffled ``KFold``. Defaults to 0, the value
            that was previously hard-coded.

    Raises:
        ValueError: if ``n_splits_uu`` is smaller than ``n_splits_cv``.
    """

    def __init__(self, n_splits_cv: int, n_splits_uu: int, random_state: int = 0):
        self.n_splits_cv = n_splits_cv
        self.n_splits_uu = n_splits_uu
        self.random_state = random_state
        self._check_n_splits()

    def _check_n_splits(self) -> None:
        """Ensure there is one group fold available for every yielded fold."""
        if self.n_splits_uu < self.n_splits_cv:
            raise ValueError(
                f"n_splits_uu ({self.n_splits_uu}) must be >= n_splits_cv "
                f"({self.n_splits_cv}): every fold needs its own group fold."
            )

    def get_n_splits(self, X=None, y=None, groups=None) -> int:
        """Return the number of folds that ``split`` yields."""
        return self.n_splits_cv

    def split(self, X, y=None, groups=None):
        """Yield ``(train_index, test_index)`` arrays for each fold.

        Args:
            X: data to split; only passed on to the underlying splitters.
            y: unused, accepted for interface compatibility.
            groups: group label per row, passed to ``GroupKFold``.

        Yields:
            Tuples of sorted, unique index arrays ``(train_index, test_index)``.
        """
        # Re-check in case the attributes were changed after construction;
        # zip() below would otherwise silently yield fewer folds.
        self._check_n_splits()
        splits_cv = KFold(
            n_splits=self.n_splits_cv, shuffle=True, random_state=self.random_state
        ).split(X)
        splits_uu = GroupKFold(n_splits=self.n_splits_uu).split(X, groups=groups)
        # zip stops after the n_splits_cv KFold folds, so surplus group folds
        # are simply never consumed.
        for (train_index, test_index), (_, uu_index) in zip(splits_cv, splits_uu):
            # Move the held-out groups' rows from training to validation.
            yield np.setdiff1d(train_index, uu_index), np.union1d(test_index, uu_index)

In [14]:
# Materialise the custom CV splits as one fold id per training row.
# The number of folds comes from conf.n_fold so that it always matches the
# folds the experiment is asked to run.
fold_df = pd.DataFrame({"fold": [-1] * len(train)})
cv = UnknownUserKFold(n_splits_cv=conf.n_fold, n_splits_uu=18)
for fold, (_, valid_idx) in enumerate(cv.split(train, groups=train["user_id"])):
    # NOTE(review): validation sets of different folds can overlap, so a later
    # fold overwrites the id written by an earlier one. Rows of a held-out user
    # may therefore end up labelled with another fold — confirm this is
    # acceptable for how CrossValidator builds its train/valid sets.
    fold_df.loc[valid_idx, "fold"] = fold

# Every training row must have been placed in some validation fold.
assert (fold_df["fold"] != -1).all(), "some training rows were not assigned to a fold"

fold_df.head()

Unnamed: 0,fold
0,1
1,2
2,4
3,1
4,2


In [15]:
# Wrap the per-row fold assignment; the experiment config below receives this
# object as its cross_validator.
cross_validator = CrossValidator(fold_df)

# Experiment

In [16]:
# Experiment settings, mostly forwarded from the global config.
exp_conf = Atmacup15ExperimentConfig(
    # identity of the run
    exp_name="lgb",
    version=conf.version.n,
    # validation set-up
    n_fold=conf.n_fold,
    seed=conf.seed,
    cross_validator=cross_validator,
    scoring=RMSE(),
    # logging / reporting
    std_logger=conf.logger,
    file_logger=conf.file_logger,
    notification=conf.notification,
    is_debug=conf.debug,
)

In [17]:
# Run every fold: 0 .. conf.n_fold - 1.
exp = Atmacup15Experiment(
    context=context,
    config=exp_conf,
    folds=list(range(conf.n_fold)),
)

In [18]:
# conf.remake selects exp.remake() over a normal exp.run().
result = exp.remake() if conf.remake else exp.run()

In [19]:
# Show the first rows of the submission frame produced by the experiment.
result.submission_df.head()

Unnamed: 0,score
0,7.999914
1,8.520075
2,6.399567
3,7.46351
4,4.118733
