<a href="https://colab.research.google.com/github/tak34/atmacup-15/blob/main/try3_2_anime2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 概要


- ディスカッションで上がっていたanime2vecを使う。
- 前処理として、アニメの作品名から原作が同一と思われるものをグループ化し、そのグループ化したものでanime2vecを行う。

In [None]:
!pip install Levenshtein



In [None]:
import random
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, StratifiedKFold
from gensim.models import word2vec

import time
from contextlib import contextmanager
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import os
from glob import glob

from tqdm.notebook import tqdm
from itertools import combinations
from scipy.cluster.hierarchy import DisjointSet
import Levenshtein

In [None]:
# When True, the save cells below write their tables to disk as pickles.
SAVE = True
# Directory that receives the word2vec feature pickles (created on demand by the save cells).
SAVE_DIR = "/content/drive/MyDrive/Kaggle/atmacup#15/proc/try3"

In [None]:
# Rewrite these paths to match your own environment.
# INPUT_DIR: directory that holds the distributed competition data
# OUTPUT_DIR: directory for files created by this notebook
#             (NOTE: OUTPUT_DIR is never defined here; only the commented-out line below mentions it)
INPUT_DIR = "/content/drive/MyDrive/Kaggle/atmacup#15/raw"

# os.makedirs(OUTPUT_DIR, exist_ok=True)

# INPUT_DIR contains the data distributed for atmacup15.
# NOTE: this listing is neither assigned nor the last expression of the cell, so it is not displayed.
glob(os.path.join(INPUT_DIR, "*"))

def read_csv(name: str, **kwargs) -> pd.DataFrame:
    """Read ``<INPUT_DIR>/<name>.csv`` into a DataFrame.

    ``name`` is the file name without the ``.csv`` extension. Any extra
    keyword arguments are forwarded unchanged to ``pandas.read_csv``.
    """
    file_name = name + ".csv"
    csv_path = os.path.join(INPUT_DIR, file_name)
    frame = pd.read_csv(csv_path, **kwargs)
    return frame

# Load anime.csv, train.csv and test.csv from INPUT_DIR.
anime_df = read_csv("anime")
train_df = read_csv("train")
test_df = read_csv("test")

# Utility Functions

In [None]:
# Seed handed to seed_everything() and to Word2Vec's "seed" parameter.
SEED = 0

@contextmanager
def timer(name):
    """Context manager that reports how long the wrapped block took.

    When the ``with`` body finishes normally, prints ``[<name>] done in <N> s``
    where ``N`` is the elapsed wall-clock time formatted with no decimals.
    Nothing is printed if the body raises, because the report is not placed
    in a ``finally`` clause.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f'[{name}] done in {elapsed:.0f} s')

def seed_everything(seed: int):
    """Seed the global ``random`` and NumPy generators with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    NOTE(review): setting PYTHONHASHSEED from inside a running interpreter
    presumably only affects processes started afterwards - confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


# Seed the global RNGs once, before any later cell draws random numbers.
seed_everything(SEED)

# 前処理

アニメの作品名から、原作が同じと思われるものをグループ化する。ディスカッションに上がっていた以下の手法を流用する。

<https://www.guruguru.science/competitions/21/discussions/04b127cc-c527-4f9d-9057-109ea54a05eb/>

In [None]:
def get_original_work_name(df, threshold=0.3, max_chars=15, noise_word="劇場版"):
    """Group anime whose titles look alike and label every group with one title.

    Each title in ``df.japanese_name`` is normalised once (first occurrence of
    ``noise_word`` removed, then cut to ``max_chars`` characters). Every pair
    of rows is then compared with the mean of ``1 - Levenshtein.ratio`` and
    ``1 - Levenshtein.jaro_winkler``; pairs whose distance is below
    ``threshold`` are merged in a disjoint set, so grouping is transitive.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``japanese_name`` column. The frame is modified in
        place: an ``original_work_name`` column is added (or overwritten).
    threshold : float, default 0.3
        Two titles are put in the same group when their distance is strictly
        smaller than this value.
    max_chars : int, default 15
        Number of leading characters of the normalised title that are compared.
    noise_word : str, default "劇場版"
        Marker removed before comparing. Many titles carry it, which would
        otherwise add noise to the similarity.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Rows whose title is not a string (for example NaN) get a fixed distance of
    0.5 to every other row and keep their own value as label.
    """
    titles = df.japanese_name.tolist()
    n_titles = len(titles)

    # Normalise every title once instead of once per pair inside the O(n^2) loop.
    # The marker is removed wherever it occurs: slicing off the first three
    # characters is only correct when the title actually starts with it.
    # isinstance() is used because an identity test against np.nan misses
    # other NaN objects and None.
    compare_keys = [
        title.replace(noise_word, "", 1)[:max_chars] if isinstance(title, str) else None
        for title in titles
    ]

    groups = DisjointSet(list(range(n_titles)))
    n_pairs = n_titles * (n_titles - 1) // 2
    # combinations() has no length, so tell tqdm the total to get a real progress bar.
    for i, j in tqdm(combinations(range(n_titles), 2), total=n_pairs):
        key_i, key_j = compare_keys[i], compare_keys[j]
        if key_i is None or key_j is None:
            # Missing title: neutral distance for both measures.
            distance = 0.5
        else:
            lv_dist = 1 - Levenshtein.ratio(key_i, key_j)
            jw_dist = 1 - Levenshtein.jaro_winkler(key_i, key_j)
            distance = (lv_dist + jw_dist) / 2

        if distance < threshold:
            groups.merge(i, j)

    # Every member of a group receives the raw title of one member as its label.
    labels = [None] * n_titles
    for subset in groups.subsets():
        representative = titles[next(iter(subset))]
        for member in subset:
            labels[member] = representative
    df["original_work_name"] = labels

    return df


# NOTE: get_original_work_name adds the column to anime_df in place and returns that same object.
processed_anime_df = get_original_work_name(anime_df)
print(f"raw - japanese_name nunique: {anime_df.japanese_name.nunique()}")
# The number printed next is the count of distinct original_work_name labels (the label text says japanese_name).
print(f"processed - japanese_name nunique: {processed_anime_df.original_work_name.nunique()}")
display(processed_anime_df.head(4))

0it [00:00, ?it/s]

raw - japanese_name nunique: 1931
processed - japanese_name nunique: 1321


Unnamed: 0,anime_id,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch,original_work_name
0,000ba7f7e34e107e7544,"Comedy, Sci-Fi, Seinen, Slice of Life, Space",宇宙兄弟,TV,99,"Apr 1, 2012 to Mar 22, 2014","Aniplex, Dentsu, YTV, Trinity Sound",Sentai Filmworks,A-1 Pictures,Manga,24 min. per ep.,PG-13 - Teens 13 or older,150428,16552,37234,13009,6948,76685,宇宙兄弟
1,00427279d72064e7fb69,"Adventure, Slice of Life, Mystery, Historical,...",蟲師,TV,26,"Oct 23, 2005 to Jun 19, 2006","Avex Entertainment, Marvelous, SKY Perfect Wel...",Funimation,Artland,Manga,25 min. per ep.,PG-13 - Teens 13 or older,620736,55482,235371,42786,20017,267080,蟲師
2,00444b67aaabdf740a68,"Adventure, Slice of Life, Mystery, Historical,...",蟲師 続章,TV,10,"Apr 5, 2014 to Jun 21, 2014","Aniplex, Kodansha, Delfi Sound",Aniplex of America,Artland,Manga,24 min. per ep.,PG-13 - Teens 13 or older,226522,12585,113559,6095,2606,91677,蟲師
3,00839a3507ab168abe75,"Comedy, Ecchi, Fantasy, School",星刻の竜騎士,TV,12,"Apr 5, 2014 to Jun 21, 2014","Media Factory, AT-X, Sony Music Communications...",Funimation,C-Station,Light novel,24 min. per ep.,R+ - Mild Nudity,170220,8723,118202,3753,8034,31508,星刻の竜騎士


In [None]:
# Lookup table: anime_id -> grouped original-work label.
anime2gensaku = processed_anime_df[["anime_id", "original_work_name"]]

# Drop a previously merged copy of the column first. Without this, running the
# cell a second time yields original_work_name_x / original_work_name_y and the
# later cells that read "original_work_name" fail.
train_df = train_df.drop(columns="original_work_name", errors="ignore").merge(
    anime2gensaku,
    on="anime_id",
    how="left",
)
test_df = test_df.drop(columns="original_work_name", errors="ignore").merge(
    anime2gensaku,
    on="anime_id",
    how="left",
)

train_df.head()

Unnamed: 0,user_id,anime_id,score,original_work_name
0,0008e10fb39e55447333,0669cc0219d468761195,2,ジョジョの奇妙な冒険 ダイヤモンドは砕けない
1,0008e10fb39e55447333,111adb8835b8a1a2cf54,10,DEVILMAN crybaby
2,0008e10fb39e55447333,1fc8683c393432a2f9c7,1,ぼくのぴこ
3,0008e10fb39e55447333,2290175205d55e81b197,8,モブサイコ100 II
4,0008e10fb39e55447333,28f173b60331d5cabb0d,9,はじめの一歩 Rising


In [None]:
# Saved because it is needed to join data at validation time.
if SAVE:
    anime2gensaku_path = "/content/drive/MyDrive/Kaggle/atmacup#15/proc/try3-2_anime2gensaku.pkl"
    # to_pickle does not create missing parent directories, so make sure it exists.
    os.makedirs(os.path.dirname(anime2gensaku_path), exist_ok=True)
    anime2gensaku.to_pickle(anime2gensaku_path)

# word2vec

## with score

In [None]:
def add_w2v_features(train_df, consider_score=True, vector_size=64):
    """Train word2vec on per-user title lists and return user/item embeddings.

    Each user becomes one "sentence" whose tokens are the user's
    ``original_work_name`` values. A shuffled copy of every sentence is added
    to the corpus before training.

    Parameters
    ----------
    train_df : pd.DataFrame
        Needs ``user_id`` and ``original_work_name`` columns, plus ``score``
        when ``consider_score`` is True.
    consider_score : bool, default True
        If True, each title is repeated ``score`` times in the user's sentence
        (the original note states ratings are 1-10, so a title rated 5 appears
        five times). If False, each row contributes its title once.
    vector_size : int, default 64
        Dimensionality of the word2vec vectors and number of factor columns.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        ``user_factors_df`` with columns ``user_id, user_factor_0..`` (mean of
        the vectors of the titles in the user's rows) and ``item_factors_df``
        with columns ``original_work_name, item_factor_0..``.

    Notes
    -----
    Shuffling draws from the global ``random`` module, so the result depends
    on how many random numbers were consumed before the call.
    """
    work_names = train_df["original_work_name"].unique().tolist()

    # One pass over the users builds both the plain title lists (used for the
    # user vectors) and the training sentences.
    user_titles = {}
    title_sentence_list = []
    for user_id, user_df in train_df.groupby("user_id"):
        titles = user_df["original_work_name"].tolist()
        user_titles[user_id] = titles
        if consider_score:
            scores = user_df["score"].tolist()
            sentence = [
                title
                for title, score in zip(titles, scores)
                for _ in range(int(score))
            ]
        else:
            sentence = list(titles)
        title_sentence_list.append(sentence)

    # Shuffled copy of every user's sentence.
    shuffled_sentence_list = [
        random.sample(sentence, len(sentence)) for sentence in title_sentence_list
    ]

    # Train on the original and the shuffled sentences together.
    train_sentence_list = title_sentence_list + shuffled_sentence_list

    w2v_params = {
        "vector_size": vector_size,
        "seed": SEED,
        "min_count": 1,
        "workers": 1,
    }
    model = word2vec.Word2Vec(train_sentence_list, **w2v_params)

    # User vector = mean of the vectors of the titles in the user's rows.
    user_factors = {
        user_id: np.mean([model.wv[title] for title in titles], axis=0)
        for user_id, titles in user_titles.items()
    }
    # Item vector = the word2vec vector of the title itself.
    item_factors = {name: model.wv[name] for name in work_names}

    def _to_frame(factors, id_column, prefix):
        # One row per key; first column is the key, the rest are the factors.
        frame = pd.DataFrame(factors).T.reset_index()
        frame.columns = [id_column] + [f"{prefix}_{i}" for i in range(vector_size)]
        return frame

    user_factors_df = _to_frame(user_factors, "user_id", "user_factor")
    item_factors_df = _to_frame(item_factors, "original_work_name", "item_factor")

    return user_factors_df, item_factors_df

In [None]:
# Score-weighted embeddings: trained on train_df, whose score column the function reads.
df_w2v_user_score, df_w2v_anime_score = add_w2v_features(train_df, consider_score=True)

In [None]:
# Preview the first two rows of the per-user embedding table.
df_w2v_user_score.head(2)

Unnamed: 0,user_id,user_factor_0,user_factor_1,user_factor_2,user_factor_3,user_factor_4,user_factor_5,user_factor_6,user_factor_7,user_factor_8,...,user_factor_54,user_factor_55,user_factor_56,user_factor_57,user_factor_58,user_factor_59,user_factor_60,user_factor_61,user_factor_62,user_factor_63
0,0008e10fb39e55447333,0.05666,0.713694,-0.637701,0.244706,-0.476037,0.459011,0.127433,-0.334726,-0.815264,...,0.678634,1.454493,0.522011,0.172755,0.104297,-0.013445,0.156669,0.330454,0.394639,0.141609
1,001a7aed2546342e2602,0.185698,-0.150132,0.246368,-0.116231,0.038822,-0.028468,0.037244,-0.070718,-0.259685,...,0.199198,0.262733,-0.023031,-0.186884,0.025904,-0.016114,0.118161,-0.120758,0.264627,-0.17544


In [None]:
# Preview the first two rows of the per-title embedding table.
df_w2v_anime_score.head(2)

Unnamed: 0,original_work_name,item_factor_0,item_factor_1,item_factor_2,item_factor_3,item_factor_4,item_factor_5,item_factor_6,item_factor_7,item_factor_8,...,item_factor_54,item_factor_55,item_factor_56,item_factor_57,item_factor_58,item_factor_59,item_factor_60,item_factor_61,item_factor_62,item_factor_63
0,ジョジョの奇妙な冒険 ダイヤモンドは砕けない,-0.646368,3.233414,-2.535191,0.583938,2.409044,-0.187623,-1.502004,-1.298688,-1.57604,...,0.451706,3.027462,0.517448,-0.180926,1.857439,0.490624,1.11493,1.67354,0.107289,0.914681
1,DEVILMAN crybaby,0.833303,0.138963,-0.344531,1.346287,-1.129407,0.94417,0.168053,-0.760003,-1.003956,...,-1.326058,1.431511,1.512789,2.306756,-0.083732,-0.72948,1.449708,-0.019035,1.222205,0.479021


In [None]:
# Persist the score-weighted user and title embeddings under SAVE_DIR.
if SAVE:
    os.makedirs(SAVE_DIR, exist_ok=True)
    df_w2v_user_score.to_pickle(SAVE_DIR + "/try3-2_df_w2v_user_score_mod.pkl")
    df_w2v_anime_score.to_pickle(SAVE_DIR + "/try3-2_df_w2v_anime_score_mod.pkl")

## without score

In [None]:
def add_w2v_features_withoutScore(train_df):
    """Score-free variant of ``add_w2v_features`` with distinct column names.

    Delegates to ``add_w2v_features(train_df, consider_score=False)`` instead
    of repeating the same pipeline, then renames the factor columns so they do
    not collide with the score-weighted features.

    Parameters
    ----------
    train_df : pd.DataFrame
        Needs ``user_id`` and ``original_work_name`` columns. No ``score``
        column is required.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        ``user_factors_df`` with columns
        ``user_id, user_factor_withoutScore_0..`` and ``item_factors_df`` with
        columns ``original_work_name, item_factor_withoutScore_0..``.
    """
    user_factors_df, item_factors_df = add_w2v_features(train_df, consider_score=False)

    # The first column is the key; every remaining column is one factor.
    n_user_factors = user_factors_df.shape[1] - 1
    n_item_factors = item_factors_df.shape[1] - 1
    user_factors_df.columns = ["user_id"] + [
        f"user_factor_withoutScore_{i}" for i in range(n_user_factors)
    ]
    item_factors_df.columns = ["original_work_name"] + [
        f"item_factor_withoutScore_{i}" for i in range(n_item_factors)
    ]

    return user_factors_df, item_factors_df

In [None]:
# Stack train and test interactions; the score-free embedding is trained on both.
# NOTE(review): test_df is concatenated with all of its columns, so any column it has beyond
# user_id / original_work_name is NaN for the train rows. The function only reads those two columns.
train_test = pd.concat((train_df[["user_id", "original_work_name"]], test_df)).reset_index(drop=True)
df_w2v_user_withoutScore, df_w2v_anime_withoutScore = add_w2v_features_withoutScore(train_test)

In [None]:
# Preview the first two rows of the score-free per-user embedding table.
df_w2v_user_withoutScore.head(2)

Unnamed: 0,user_id,user_factor_withoutScore_0,user_factor_withoutScore_1,user_factor_withoutScore_2,user_factor_withoutScore_3,user_factor_withoutScore_4,user_factor_withoutScore_5,user_factor_withoutScore_6,user_factor_withoutScore_7,user_factor_withoutScore_8,...,user_factor_withoutScore_54,user_factor_withoutScore_55,user_factor_withoutScore_56,user_factor_withoutScore_57,user_factor_withoutScore_58,user_factor_withoutScore_59,user_factor_withoutScore_60,user_factor_withoutScore_61,user_factor_withoutScore_62,user_factor_withoutScore_63
0,0008e10fb39e55447333,-0.121361,0.079879,0.313816,-0.397184,0.015738,-0.503213,-0.146642,0.089845,0.192626,...,-0.122901,0.236327,-0.290799,0.207217,-0.053797,0.170556,-0.14713,0.421226,0.095676,0.644251
1,001a7aed2546342e2602,0.271871,0.025126,-0.015849,-0.128581,0.091824,0.094635,-0.382575,0.207033,-0.088976,...,-0.16033,0.03721,-0.067841,-0.072007,-0.089713,0.061912,-0.033862,-0.21084,-0.351439,0.211457


In [None]:
# Preview the first two rows of the score-free per-title embedding table.
df_w2v_anime_withoutScore.head(2)

Unnamed: 0,original_work_name,item_factor_withoutScore_0,item_factor_withoutScore_1,item_factor_withoutScore_2,item_factor_withoutScore_3,item_factor_withoutScore_4,item_factor_withoutScore_5,item_factor_withoutScore_6,item_factor_withoutScore_7,item_factor_withoutScore_8,...,item_factor_withoutScore_54,item_factor_withoutScore_55,item_factor_withoutScore_56,item_factor_withoutScore_57,item_factor_withoutScore_58,item_factor_withoutScore_59,item_factor_withoutScore_60,item_factor_withoutScore_61,item_factor_withoutScore_62,item_factor_withoutScore_63
0,ジョジョの奇妙な冒険 ダイヤモンドは砕けない,-0.631267,0.380893,1.181644,-1.534547,-0.74271,-1.357404,-0.03112,0.210793,0.808889,...,0.390713,0.483954,0.598414,0.196638,-0.205169,-0.263725,-0.915036,-0.019858,-0.138486,1.550796
1,DEVILMAN crybaby,0.197928,-0.047567,0.93819,-0.81662,-0.427981,-0.175476,-0.096989,0.627747,1.132305,...,0.012483,-0.522943,-0.645696,-0.062002,-0.164873,-1.305316,0.238633,0.209453,-0.72844,0.134086


In [None]:
# Persist the score-free user and title embeddings under SAVE_DIR.
if SAVE:
    os.makedirs(SAVE_DIR, exist_ok=True)
    df_w2v_user_withoutScore.to_pickle(SAVE_DIR + "/try3-2_df_w2v_user_withoutScore_mod.pkl")
    df_w2v_anime_withoutScore.to_pickle(SAVE_DIR + "/try3-2_df_w2v_anime_withoutScore_mod.pkl")