In [1]:
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import feather
from mcs_kfold import MCSKFold
from sklearn.model_selection import GroupKFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

sys.path.append("../")
from utils import get_logger, log_evaluation, top2accuracy, eval_func, load_datasets, track_experiment

In [7]:
def load_datasets(feats, debug=False, n=1000, feature_dir="../data/features"):
    """Load per-feature feather files and join them column-wise.

    For every name in ``feats`` the files ``{feature_dir}/{feat}_train.feather``
    and ``{feature_dir}/{feat}_test.feather`` are read with
    ``feather.read_dataframe``; the train frames are concatenated along
    ``axis=1`` and likewise the test frames.

    Args:
        feats: Iterable of feature-set names (file name prefixes).
        debug: When true, keep only the first ``n`` rows of every frame.
        n: Row limit applied when ``debug`` is true.
        feature_dir: Directory holding the feather files. Defaults to the
            location that was previously hard-coded.

    Returns:
        Tuple ``(train, test)`` of concatenated DataFrames.

    Raises:
        ValueError: If ``feats`` is empty (previously surfaced as pandas'
            generic "No objects to concatenate" error).
    """
    # Materialise once so a generator is not exhausted by the train pass.
    feats = list(feats)
    if not feats:
        raise ValueError("feats must contain at least one feature name")

    def _read(feat, split):
        # Single place that knows the file naming scheme and the debug limit.
        frame = feather.read_dataframe(f"{feature_dir}/{feat}_{split}.feather")
        return frame.head(n) if debug else frame

    train = pd.concat([_read(feat, "train") for feat in feats], axis=1)
    test = pd.concat([_read(feat, "test") for feat in feats], axis=1)
    return train, test

In [24]:
class Experiment:
    """Cross-validated LightGBM regression that predicts ``Year_of_Release``.

    Rows of the concatenated raw train/test CSVs that have a
    ``Year_of_Release`` value are used for training; rows where it is missing
    are the prediction targets.
    """

    def __init__(self, exp_id=None):
        """Set up the feature list, CV strategy and LightGBM parameters.

        Args:
            exp_id: Optional experiment identifier. It is required by
                ``save`` (output file name) and ``track``; ``run`` itself
                does not need it, so ``Experiment()`` keeps working.
        """
        # Exclude features that use the year.
        self.features = [
            "Base_6",
            "BertPCA50",
            "Momentum",
            "Interaction",
            # "SimultaneousPlatformCount",
            "PublisherPCA",
            "DeveloperPCA",
            "Series",
            "NumSeries",
            "NumSeriesNormalized",
            # "SeriesLifespan"
        ]
        # One of "mcs", "group" or "stratified"; see fit_and_predict.
        self.cv = "stratified"
        self.params = {
            "objective": "root_mean_squared_error",
            "metric": "root_mean_squared_error",
            "learning_rate": 0.01,
            "num_leaves": 22,
            "min_data_in_leaf": 100,
            "max_depth": 5,
            "subsample_freq": 1,
            "subsample": 0.7,
            "reg_alpha": 0.0001,
            "reg_lambda": 0.0001,
            "colsample_bytree": 0.3,
            "early_stopping_rounds": 100,
            "n_estimators": 20000,
            "seed": 42
        }
        self.exp_id = exp_id
        # self.logger = get_logger(exp_id)
        # self.model_type = model_type

    def load_data(self):
        """Build the feature matrices and the target for year prediction.

        Reads the raw train/test CSVs, concatenates them, and splits the
        concatenated feature frame by whether ``Year_of_Release`` is present.

        Returns:
            Tuple ``(X_train, X_test, y_train)`` where ``X_train``/``y_train``
            are the rows with a known ``Year_of_Release`` and ``X_test`` the
            rows where it is null. ``y_train`` is a one-column DataFrame.

        Raises:
            ValueError: If the feature frames do not have the same number of
                rows as the concatenated raw data.
        """
        train = pd.read_csv("../data/raw/train_fixed.csv")
        test = pd.read_csv("../data/raw/test_fixed.csv")
        whole_df = pd.concat([train, test], ignore_index=True)
        test_idx = whole_df["Year_of_Release"].isnull()
        train_idx = ~test_idx

        X_train, X_test = load_datasets(self.features)
        X = pd.concat([X_train, X_test], ignore_index=True)
        # The boolean masks come from whole_df, so both frames must line up
        # row for row; fail early with a readable message if they do not.
        if len(X) != len(whole_df):
            raise ValueError(
                f"feature rows ({len(X)}) do not match raw rows ({len(whole_df)})"
            )

        # Remove columns that contain target information.
        del_cols = [col for col in X if "Year_of_Release" in col]
        X = X.drop(columns=del_cols)

        y = whole_df["Year_of_Release"].to_frame()
        # groups = train["Publisher"].to_frame()
        X_train, X_test = X.loc[train_idx], X.loc[test_idx]
        y_train = y.loc[train_idx]
        return X_train, X_test, y_train

    def fit_and_predict(self, X_train, X_test, y_train, groups=None):
        """Train one LightGBM model per fold and average test predictions.

        Args:
            X_train: Training features.
            X_test: Features of the rows to predict.
            y_train: DataFrame with a ``Year_of_Release`` column.
            groups: Group labels for ``self.cv == "group"``; ignored by the
                other strategies.

        Returns:
            Tuple ``(oof, predictions, cv_score_fold_mean)``: out-of-fold
            predictions for ``X_train``, fold-averaged predictions for
            ``X_test`` and the mean of the per-fold RMSE values.

        Raises:
            ValueError: If ``self.cv`` is not a supported strategy, or if
                ``"group"`` is selected without ``groups``.
        """
        supported = ("mcs", "group", "stratified")
        if self.cv not in supported:
            raise ValueError(
                f"Unknown cv strategy {self.cv!r}; expected one of {supported}"
            )
        if self.cv == "group" and groups is None:
            raise ValueError("cv='group' requires the `groups` argument")

        if self.cv == "mcs":
            folds = MCSKFold(n_splits=5, shuffle_mc=True, max_iter=100)
        elif self.cv == "group":
            folds = GroupKFold(n_splits=10)
        else:
            folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

        # Bound for every strategy; it used to exist only in the
        # "stratified" branch, so the other strategies hit a NameError.
        y_to_stratify = y_train["Year_of_Release"]
        if self.cv == "group":
            splits = folds.split(X_train, y_to_stratify, groups=groups)
        else:
            # NOTE(review): MCSKFold.split is called with the same positional
            # arguments as the sklearn splitter, as in the original loop —
            # confirm this matches the mcs_kfold API before using cv="mcs".
            splits = folds.split(X_train, y_to_stratify)

        oof = np.zeros(len(X_train))
        predictions = np.zeros(len(X_test))
        fold_importances = []
        fold_scores = []

        for fold, (train_idx, val_idx) in enumerate(splits):
            print("-" * 100)
            print(f"Fold {fold+1}")
            train_data = lgb.Dataset(X_train.iloc[train_idx], label=y_train.iloc[train_idx])
            val_data = lgb.Dataset(X_train.iloc[val_idx], label=y_train.iloc[val_idx])
            # NOTE(review): the `verbose_eval` / `early_stopping_rounds`
            # keyword arguments were removed in LightGBM 4.0 in favour of
            # callbacks; kept as-is for the version this notebook ran on.
            clf = lgb.train(self.params, train_data, valid_sets=[train_data, val_data], verbose_eval=1000, early_stopping_rounds=100)
            oof[val_idx] = clf.predict(X_train.iloc[val_idx].values, num_iteration=clf.best_iteration)
            fold_score = mean_squared_error(y_train.iloc[val_idx].values, oof[val_idx]) ** .5
            fold_scores.append(fold_score)

            # Collect per-fold importances and concatenate once after the
            # loop instead of growing a DataFrame on every iteration.
            fold_importances.append(pd.DataFrame({
                "feature": X_train.columns.values,
                "importance": clf.feature_importance(importance_type="gain"),
                "fold": fold + 1,
            }))

            predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

        feature_importance_df = pd.concat(fold_importances, axis=0)
        feature_importance_df = (
            feature_importance_df[["feature", "importance"]]
            .groupby("feature")
            .mean()
            .sort_values(by="importance", ascending=False)
            .head(50)
        )
        print("##### feature importance #####")
        print(feature_importance_df)
        cv_score_fold_mean = sum(fold_scores) / len(fold_scores)
        print(f"cv_score_fold_mean: {cv_score_fold_mean}")
        return oof, predictions, cv_score_fold_mean

    def save(self, predictions):
        """Write ``predictions`` into the sample submission and save it.

        Args:
            predictions: Values assigned to the ``Global_Sales`` column.

        Raises:
            ValueError: If the experiment was created without ``exp_id``.
        """
        if self.exp_id is None:
            raise ValueError("save() requires Experiment(exp_id=...)")
        # Paths are relative to the parent directory like every other path in
        # this notebook (they previously pointed at "./").
        spsbm = pd.read_csv("../data/raw/atmaCup8_sample-submission.csv")
        # NOTE(review): this experiment predicts Year_of_Release, yet the
        # values are stored under "Global_Sales" — confirm this is intended.
        spsbm["Global_Sales"] = predictions
        spsbm.to_csv(f"../submissions/{self.exp_id}_sub.csv", index=False)

    def track(self, cv_score):
        """Record the CV score for this experiment via ``track_experiment``.

        Raises:
            ValueError: If the experiment was created without ``exp_id``.
        """
        if self.exp_id is None:
            raise ValueError("track() requires Experiment(exp_id=...)")
        # track_experiment(self.exp_id, "model", self.model_type)
        # track_experiment(self.exp_id, "features", self.features)
        # track_experiment(self.exp_id, "cv", self.cv)
        # track_experiment(self.exp_id, "params", self.params)
        track_experiment(self.exp_id, "cv_score", cv_score)

    def run(self):
        """Load the data, run cross-validation and return its results.

        Returns:
            Tuple ``(oof, predictions, cv_score)`` as produced by
            ``fit_and_predict``.
        """
        X_train, X_test, y_train = self.load_data()
        oof, predictions, cv_score = self.fit_and_predict(X_train, X_test, y_train)
        # predictions = self.correct_predictions(predictions)
        # self.save(predictions)
        # self.track(cv_score)
        return oof, predictions, cv_score

In [25]:
# Run the experiment end to end: load the data, cross-validate LightGBM, and
# keep the out-of-fold predictions, the fold-averaged test predictions and the
# mean per-fold RMSE.
oof, predictions, cv_score = Experiment().run()

----------------------------------------------------------------------------------------------------
Fold 1
Training until validation scores don't improve for 100 rounds.
[1000]	training's rmse: 2.3783	valid_1's rmse: 2.50614
[2000]	training's rmse: 2.04113	valid_1's rmse: 2.28476
[3000]	training's rmse: 1.83944	valid_1's rmse: 2.18879
[4000]	training's rmse: 1.68824	valid_1's rmse: 2.13422
[5000]	training's rmse: 1.562	valid_1's rmse: 2.0944
[6000]	training's rmse: 1.4525	valid_1's rmse: 2.06687
[7000]	training's rmse: 1.35703	valid_1's rmse: 2.04533
[8000]	training's rmse: 1.27175	valid_1's rmse: 2.03076
[9000]	training's rmse: 1.19604	valid_1's rmse: 2.01921
[10000]	training's rmse: 1.12689	valid_1's rmse: 2.00963
[11000]	training's rmse: 1.06453	valid_1's rmse: 2.00052
Early stopping, best iteration is:
[11572]	training's rmse: 1.0308	valid_1's rmse: 1.99712
----------------------------------------------------------------------------------------------------
Fold 2
Training until va

In [26]:
# Display the out-of-fold predictions for the training rows.
oof

array([2010.05376527, 2011.45146244, 2008.37425534, ..., 2011.22987683,
       2010.72228428, 2008.84634593])

In [28]:
# Display the first ten fold-averaged predictions for the rows with a missing year.
predictions[:10]

array([2008.03342002, 2008.70685979, 2008.62713822, 1987.88037035,
       2009.77774943, 2009.50881256, 2010.53197836, 2010.14579503,
       2009.21320358, 2009.99365054])

In [29]:
# Rebuild the same concatenated raw frame that Experiment.load_data uses, so
# the masks below select the same rows the model was trained / predicted on.
train = pd.read_csv("../data/raw/train_fixed.csv")
test = pd.read_csv("../data/raw/test_fixed.csv")
whole_df = pd.concat([train, test], ignore_index=True)
# Rows without a release year are the ones to be imputed; the rest were used for training.
test_idx = whole_df["Year_of_Release"].isnull()
train_idx = ~test_idx

In [31]:
# Snap the continuous model output to whole years.
rounded_pred = np.around(predictions, decimals=0)

In [32]:
# Fill the rows whose Year_of_Release is missing with the rounded predictions.
# NOTE(review): assumes `predictions` is ordered like the `test_idx` rows of
# whole_df (Experiment.load_data builds X_test with the same null mask) — confirm.
whole_df.loc[test_idx, "Year_of_Release"] = rounded_pred

In [33]:
# Split the imputed frame back into the original train / test row ranges.
train_year_predicted = whole_df.iloc[:len(train)]
test_year_predicted = whole_df.iloc[len(train):]

In [34]:
# Write both splits, now with Year_of_Release filled in, as CSV files.
train_year_predicted.to_csv("../data/raw/train_year_predicted.csv", index=False)
test_year_predicted.to_csv("../data/raw/test_year_predicted.csv", index=False)