In [41]:
import os
import random
import sys
import uuid
from pathlib import Path

import numpy as np
import pandas as pd

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 50)

Int64Index([   547,    548,    549,    550,    551,    552,    553,    554,
               555,    556,
            ...
            117666, 117667, 117668, 117669, 117670, 117671, 117672, 117673,
            117674, 117675],
           dtype='int64', length=26754)


Int64Index([   547,    548,    549,    550,    551,    552,    553,    554,
               555,    556,
            ...
            117666, 117667, 117668, 117669, 117670, 117671, 117672, 117673,
            117674, 117675],
           dtype='int64', length=26759)

In [20]:
from hydra import compose, initialize

with initialize(config_path="../yamls", version_base=None):
    config = compose(config_name="config.yaml")

In [66]:
train_df = pd.read_csv(Path(config.input_path) / "train.csv")
test_df = pd.read_csv(Path(config.input_path) / "test.csv")

sample_submission_df = pd.read_csv(Path(config.input_path) / "sample_submission.csv")
anime_df = pd.read_csv(Path(config.input_path) / "anime.csv")

# 整形
anime_df["genres"] = anime_df["genres"].str.replace(" ", "")

In [67]:
use_cols = []

In [68]:
# Merge the train data with the anime meta data
all_df = pd.concat([train_df, test_df])
all_df = all_df.merge(anime_df, on="anime_id", how="left")

In [69]:
# Encode the categorical variables
cat_cols = ["user_id", "anime_id", "type", "source", "rating"]
les = []
for col in cat_cols:
    le = LabelEncoder()
    le.fit(all_df[col].fillna(""))
    all_df[col] = le.transform(all_df[col].fillna(""))
    les.append(le)

In [70]:
# 'aired' 列を放送開始日と放送終了日に分割
date_df = pd.DataFrame()
date_df[["start_date", "end_date"]] = all_df["aired"].str.split(" to ", expand=True)


# 年だけを取得するための関数定義
def get_year(date_str):
    try:
        return pd.to_datetime(date_str).year
    except ValueError:
        return None


# 開始日と終了日を年に変換
all_df["start_year"] = date_df["start_date"].apply(get_year)

In [71]:
# ジャンルを作る
stacked_genres = anime_df["genres"].str.split(",").apply(pd.Series).stack().reset_index(drop=True)
unique_genres = stacked_genres.unique()
unique_genres

array(['Comedy', 'Sci-Fi', 'Seinen', 'SliceofLife', 'Space', 'Adventure',
       'Mystery', 'Historical', 'Supernatural', 'Fantasy', 'Ecchi',
       'School', 'Harem', 'Romance', 'Shounen', 'Action', 'Magic',
       'Sports', 'SuperPower', 'Drama', 'Thriller', 'Music', 'Shoujo',
       'Demons', 'Mecha', 'Game', 'Josei', 'Cars', 'Psychological',
       'Parody', 'Samurai', 'Military', 'ShoujoAi', 'Kids', 'MartialArts',
       'Horror', 'Dementia', 'Vampire', 'ShounenAi', 'Hentai', 'Yaoi',
       'Police'], dtype=object)

In [72]:
for genre in unique_genres:
    all_df[genre] = 0

# Set 1 if the anime includes the genre
all_df["genres_num"] = 0
for genre in unique_genres:
    all_df.loc[all_df["genres"].str.contains(genre), genre] = 1
    all_df["genres_num"] += all_df[genre]

In [73]:
all_df.head()

Unnamed: 0,user_id,anime_id,score,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch,start_year,Comedy,Sci-Fi,Seinen,SliceofLife,...,SuperPower,Drama,Thriller,Music,Shoujo,Demons,Mecha,Game,Josei,Cars,Psychological,Parody,Samurai,Military,ShoujoAi,Kids,MartialArts,Horror,Dementia,Vampire,ShounenAi,Hentai,Yaoi,Police,genres_num
0,0,47,2.0,"Action,Adventure,Shounen",ジョジョの奇妙な冒険 黄金の風,5,39,"Oct 6, 2018 to Jul 28, 2019","Warner Bros. Japan, KlockWorx, Medicos Enterta...",VIZ Media,David Production,6,23 min. per ep.,3,542642,64809,383733,10625,5735,77740,2018,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
1,0,119,10.0,"Action,Dementia,Demons,Horror,Supernatural",DEVILMAN crybaby,2,10,"Jan 5, 2018","Aniplex, Dynamic Planning, Netflix",Unknown,Science SARU,6,25 min. per ep.,4,650309,29665,477257,13336,18054,111997,2018,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,5
2,0,229,1.0,"Hentai,Yaoi",ぼくのぴこ,3,1,"Sep 7, 2006",Natural High,Unknown,"Sugar Boy, Blue Cat",9,34 min.,5,137560,5153,113190,758,9431,9028,2006,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,2
3,0,257,8.0,"Action,SliceofLife,Comedy,Supernatural",モブサイコ100,5,12,"Jul 11, 2016 to Sep 27, 2016","Warner Bros. Japan, KlockWorx, BS Fuji, Hakuho...","Funimation, Crunchyroll",Bones,12,24 min. per ep.,2,1255830,68041,942402,26125,19213,200049,2016,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4
4,0,309,9.0,"Comedy,Shounen,Sports",はじめの一歩 -Champion Road-,4,1,"Apr 18, 2003",Unknown,"Discotek Media, Geneon Entertainment USA",Madhouse,6,1 hr. 30 min.,2,97346,1565,82189,502,379,12711,2003,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3


In [74]:
# 数値カラムについて、平均と分散をユーザーごとに計算
numeric_cols = list(set(all_df.select_dtypes(include="number").columns.tolist()) - set(cat_cols))
numeric_cols.remove("score")


# Compute the mean and variance for each user for the numeric features

# Compute the mean and variance for each user for the numeric features
user_stats = all_df.groupby("user_id")[numeric_cols].agg(["mean", "var"])
user_stats.columns = ["_".join(col).strip() for col in user_stats.columns.values]
user_stats.reset_index(inplace=True)

all_df = pd.merge(all_df, user_stats, on="user_id", how="left")

In [79]:
all_df.head()

Unnamed: 0,user_id,anime_id,score,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch,start_year,Comedy,Sci-Fi,Seinen,SliceofLife,...,Police_mean,Police_var,ShoujoAi_mean,ShoujoAi_var,SuperPower_mean,SuperPower_var,Music_mean,Music_var,Dementia_mean,Dementia_var,dropped_mean,dropped_var,completed_mean,completed_var,Space_mean,Space_var,Hentai_mean,Hentai_var,Josei_mean,Josei_var,Samurai_mean,Samurai_var,Yaoi_mean,Yaoi_var,genres_score
0,0,47,2.0,"Action,Adventure,Shounen",ジョジョの奇妙な冒険 黄金の風,5,39,"Oct 6, 2018 to Jul 28, 2019","Warner Bros. Japan, KlockWorx, Medicos Enterta...",VIZ Media,David Production,6,23 min. per ep.,3,542642,64809,383733,10625,5735,77740,2018,0,0,0,0,...,0.014706,0.014706,0.0,0.0,0.102941,0.093723,0.014706,0.014706,0.029412,0.028973,12435.602941,346054700.0,501709.294118,306205000000.0,0.014706,0.014706,0.014706,0.014706,0.0,0.0,0.161765,0.137621,0.014706,0.014706,0.553922
1,0,119,10.0,"Action,Dementia,Demons,Horror,Supernatural",DEVILMAN crybaby,2,10,"Jan 5, 2018","Aniplex, Dynamic Planning, Netflix",Unknown,Science SARU,6,25 min. per ep.,4,650309,29665,477257,13336,18054,111997,2018,0,0,0,0,...,0.014706,0.014706,0.0,0.0,0.102941,0.093723,0.014706,0.014706,0.029412,0.028973,12435.602941,346054700.0,501709.294118,306205000000.0,0.014706,0.014706,0.014706,0.014706,0.0,0.0,0.161765,0.137621,0.014706,0.014706,0.252941
2,0,229,1.0,"Hentai,Yaoi",ぼくのぴこ,3,1,"Sep 7, 2006",Natural High,Unknown,"Sugar Boy, Blue Cat",9,34 min.,5,137560,5153,113190,758,9431,9028,2006,0,0,0,0,...,0.014706,0.014706,0.0,0.0,0.102941,0.093723,0.014706,0.014706,0.029412,0.028973,12435.602941,346054700.0,501709.294118,306205000000.0,0.014706,0.014706,0.014706,0.014706,0.0,0.0,0.161765,0.137621,0.014706,0.014706,0.014706
3,0,257,8.0,"Action,SliceofLife,Comedy,Supernatural",モブサイコ100,5,12,"Jul 11, 2016 to Sep 27, 2016","Warner Bros. Japan, KlockWorx, BS Fuji, Hakuho...","Funimation, Crunchyroll",Bones,12,24 min. per ep.,2,1255830,68041,942402,26125,19213,200049,2016,1,0,0,1,...,0.014706,0.014706,0.0,0.0,0.102941,0.093723,0.014706,0.014706,0.029412,0.028973,12435.602941,346054700.0,501709.294118,306205000000.0,0.014706,0.014706,0.014706,0.014706,0.0,0.0,0.161765,0.137621,0.014706,0.014706,0.386029
4,0,309,9.0,"Comedy,Shounen,Sports",はじめの一歩 -Champion Road-,4,1,"Apr 18, 2003",Unknown,"Discotek Media, Geneon Entertainment USA",Madhouse,6,1 hr. 30 min.,2,97346,1565,82189,502,379,12711,2003,1,0,0,0,...,0.014706,0.014706,0.0,0.0,0.102941,0.093723,0.014706,0.014706,0.029412,0.028973,12435.602941,346054700.0,501709.294118,306205000000.0,0.014706,0.014706,0.014706,0.014706,0.0,0.0,0.161765,0.137621,0.014706,0.014706,0.372549


In [78]:
# ジャンルに一致する作品を見ているほどスコアが高くなる特徴量
all_df["genres_score"] = 0.0
for genre in unique_genres:
    all_df["genres_score"] += all_df[genre] * (1 / all_df["genres_num"]) * all_df[f"{genre}_mean"]

In [81]:
# 見た作品の年代の特徴量
all_df["diff_year"] = all_df["start_year"] - all_df["start_year_mean"]

In [93]:
unused_cols = [
    "score",
    "episodes",
    "duration",
    "genres",
    "japanese_name",
    "aired",
    "producers",
    "licensors",
    "studios",
    "user_id",
    "anime_id",
]
use_cols = list(set(all_df.columns) - set(unused_cols))

In [94]:
train_merged = all_df[: len(train_df)].reset_index(drop=True)
test_merged = all_df[len(train_df) :].reset_index(drop=True)

In [105]:
import pickle

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Initialize the StratifiedKFold object
skf = StratifiedKFold(n_splits=5)

# Training and evaluation with LightGBM
scores_lgb = []
models_lgb = []


for fold, (train_index, val_index) in enumerate(skf.split(train_merged, train_merged["score"])):
    print(f"Training for fold {fold}...")

    # Prepare the train and validation data
    train_data = train_merged.loc[train_index]
    val_data = train_merged.loc[val_index]

    # Define the features and the target
    target = "score"

    # Prepare the LightGBM datasets
    lgb_train = lgb.Dataset(train_data[use_cols], train_data[target])
    lgb_val = lgb.Dataset(val_data[use_cols], val_data[target])

    # Define the parameters
    params = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": 0.05,
        "num_leaves": 64,
        "min_child_samples": 160,
        "max_depth": -1,
        "subsample_freq": 0,
        "bagging_seed": 0,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "reg_alpha": 0.1,
        "reg_lambda": 0.1,
        "device": "gpu",  # Use GPU
    }

    # Train the model
    callbacks = [lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(period=100)]
    model_lgb = lgb.train(
        params, lgb_train, num_boost_round=1000, valid_sets=[lgb_train, lgb_val], callbacks=callbacks
    )

    # Save the model
    with open(f"model_lgb_{fold}.pkl", "wb") as f:
        pickle.dump(model_lgb, f)

    # Predict the validation data
    val_pred_lgb = model_lgb.predict(val_data[use_cols], num_iteration=model_lgb.best_iteration)

    # Evaluate the model
    score_lgb = np.sqrt(mean_squared_error(val_data[target], val_pred_lgb))
    scores_lgb.append(score_lgb)

    print(f"RMSE for fold {fold}: {score_lgb}")

Training for fold 0...
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.19643	valid_1's rmse: 1.40065
[200]	training's rmse: 1.12791	valid_1's rmse: 1.39782
Early stopping, best iteration is:
[181]	training's rmse: 1.1364	valid_1's rmse: 1.39773
RMSE for fold 0: 1.3977348309586035
Training for fold 1...
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 1.20248	valid_1's rmse: 1.40535
[200]	training's rmse: 1.13423	valid_1's rmse: 1.39475
[300]	training's rmse: 1.09815	valid_1's rmse: 1.38994
[400]	training's rmse: 1.06989	valid_1's rmse: 1.38827
[500]	training's rmse: 1.04455	valid_1's rmse: 1.38747
[600]	training's rmse: 1.02141	valid_1's rmse: 1.38587
[700]	training's rmse: 1.00065	valid_1's rmse: 1.38577
[800]	training's rmse: 0.981754	valid_1's rmse: 1.38552
Early stopping, best iteration is:
[776]	training's rmse: 0.986063	valid_1's rmse: 1.38523
RMSE for fold 1: 1.3852342505593167
Training for fold 2...
Trai

In [106]:
# Calculate the average score
average_score_lgb = np.mean(scores_lgb)

print(f"Average RMSE: {average_score_lgb}")

# Predict the test data and create the submission file
sample_submission_df["score"] = 0

for fold in range(5):
    with open(f"model_lgb_{fold}.pkl", "rb") as f:
        model_lgb = pickle.load(f)
    test_pred_lgb = model_lgb.predict(test_merged[use_cols], num_iteration=model_lgb.best_iteration)
    sample_submission_df["score"] += test_pred_lgb / 5

sample_submission_df.to_csv("submission.csv", index=False)

Average RMSE: 1.3861729696029266


In [None]:
# 存在しないユーザーのスコアは1.0になってしまっているので大幅に悪化
# 悪化しないようにlgbで埋める

In [108]:
no_train_user_index = test_df[~test_df["user_id"].isin(train_df["user_id"])].index
no_train_user_index

Int64Index([   547,    548,    549,    550,    551,    552,    553,    554,
               555,    556,
            ...
            117666, 117667, 117668, 117669, 117670, 117671, 117672, 117673,
            117674, 117675],
           dtype='int64', length=26754)

In [110]:
lgb_sub_df = pd.read_csv("submission.csv")
sub_df = pd.read_csv("../output/sub_000_baseline_07cbe2aa.csv")

sub_df.iloc[no_train_user_index] = lgb_sub_df.iloc[no_train_user_index]

In [111]:
sub_df.to_csv("sub_000_baseline_07cbe2aa_padded.csv", index=False)

In [113]:
total_len = len(sub_df)
no_train_len = len(no_train_user_index)
train_len = total_len - no_train_len

In [115]:
1.3861729696029266 * no_train_len / total_len + 1.163 * train_len / total_len

1.2137390600356632