In [60]:
import os
import pickle
import random
import sys
import uuid
from pathlib import Path

import implicit
import lightgbm as lgb
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

# Set the maximum number of columns pandas displays (50 here).
pd.set_option("display.max_columns", 50)

# Put the parent directory on the import path so the `utils` imports below resolve.
sys.path.append(os.pardir)
from hydra import compose, initialize

from utils import load_datasets
from utils.embedding import TextEmbedder

# Load the Hydra configuration; it provides `input_path`, the directory of the CSV files.
with initialize(config_path="../yamls", version_base=None):
    config = compose(config_name="config.yaml")

input_dir = Path(config.input_path)

# Raw competition tables.
train_df = pd.read_csv(input_dir / "train.csv")
test_df = pd.read_csv(input_dir / "test.csv")
sample_submission_df = pd.read_csv(input_dir / "sample_submission.csv")
anime_df = pd.read_csv(input_dir / "anime.csv")

# Formatting: strip the spaces out of the genre strings.
anime_df["genres"] = anime_df["genres"].str.replace(" ", "")

# Stack the train and test rows, then attach the anime metadata to every row.
all_df = pd.concat([train_df, test_df]).merge(anime_df, on="anime_id", how="left")


import igraph as ig
import pandas as pd

In [23]:
# Work on a copy of just the (user, anime) interaction pairs.
df = all_df[["user_id", "anime_id"]].copy()

# Encode both id columns as consecutive integers; `user_idx` / `anime_idx`
# keep the original ids in code order so labels can be mapped back later.
user_codes, user_idx = pd.factorize(df["user_id"])
anime_codes, anime_idx = pd.factorize(df["anime_id"])
df["user_label"] = user_codes
# Shift the anime labels past the user labels so the two ranges never collide.
df["anime_label"] = anime_codes + len(user_idx)

# Sorted list of every distinct vertex label (users first, then anime).
users = df["user_label"].unique().tolist()
animes = df["anime_label"].unique().tolist()
vertices = sorted(users + animes)

# One edge per interaction row: (user label, anime label).
edges = list(zip(df["user_label"].tolist(), df["anime_label"].tolist()))

Index(['0008e10fb39e55447333', '001a7aed2546342e2602', '003d4b0257cc7849ffe1',
       '0054e700b5be6e074fb7', '0059344eed7e8ca0b6c5', '005b886c0776f8a47ea7',
       '0064eae414e82b36d66a', '006612dafd9e4a003d16', '00b1b1b700264f72afe8',
       '00dabe82bace31f2e2f1',
       ...
       'f4399d422be3dfb41a8a', 'f514c83864fc936de148', 'f8a63ab1cdc4f87436eb',
       'f9562e4d1c6ff34e3ee4', 'f9b77d44eba4fc7e2c04', 'fa11453a6cca09c82953',
       'fa532dafc50ad8439e1d', 'fcf79144bf18fdb90aa5', 'fd64597be5e54f4ac9d3',
       'ffe85a36cd20500faa58'],
      dtype='object', length=1998)

In [24]:
# Build the undirected user-anime graph; the integer labels in `vertices`
# are stored as the "name" vertex attribute.
g = ig.Graph(
    edges=edges,
    directed=False,
    vertex_attrs={"name": vertices},
)

In [28]:
%%time

# PageRank
pageranks = g.pagerank()
print("PageRank:", len(pageranks))

# Betweenness Centrality
betweenness = g.betweenness()
print("betweenness:", len(betweenness))

# Eigenvector Centrality
eigenvector_centrality = g.eigenvector_centrality()
print("eigenvector_centrality:", len(eigenvector_centrality))

# HITS (Hub and Authority Scores)
hub_score = g.hub_score()
print("hub_score:", len(hub_score))


authority_score = g.authority_score()
print("authority_score:", len(authority_score))

constraint = g.constraint()
print("constraint:", len(constraint))

# Degree
degree = g.degree()
print("degree:", len(degree))

coreness = g.coreness()
print("coreness:", len(coreness))

eccentricity = g.eccentricity()
print("eccentricity:", len(eccentricity))

harmonic_centrality = g.harmonic_centrality()
print("harmonic_centrality:", len(harmonic_centrality))

eigenvector_centrality: 3954
CPU times: user 31.5 ms, sys: 1.82 ms, total: 33.3 ms
Wall time: 38.8 ms


In [66]:
# Gather every per-vertex metric into one frame, one column per metric.
# NOTE(review): the later user/anime split slices this frame by position, so it
# relies on each metric list being ordered by vertex index — confirm with igraph docs.
node_metrics = {
    "degree": degree,
    "pageranks": pageranks,
    "betweenness": betweenness,
    "eigenvector_centrality": eigenvector_centrality,
    "hub_score": hub_score,
    "authority_score": authority_score,
    "constraint": constraint,
    "coreness": coreness,
    "eccentricity": eccentricity,
    "harmonic_centrality": harmonic_centrality,
}
node_df = pd.DataFrame(node_metrics)
node_df.head()

Unnamed: 0,degree,pageranks,betweenness,eigenvector_centrality,hub_score,authority_score,constraint,coreness,eccentricity,harmonic_centrality
0,68,0.000153,578.147468,0.085765,0.033557,0.14843,0.014706,58,5.0,0.42068
1,282,0.000524,9472.177793,0.278212,0.108854,0.481491,0.003546,104,4.0,0.462307
2,59,0.000141,418.359773,0.054543,0.021341,0.094395,0.016949,53,5.0,0.416093
3,11,5.5e-05,7.72155,0.021713,0.008496,0.037578,0.090909,11,5.0,0.392664
4,17,6.6e-05,22.385993,0.017529,0.006859,0.030338,0.058824,17,5.0,0.372047


In [70]:
# Names of the graph-metric columns; reused below to build the prefixed feature names.
node_cols = node_df.columns

In [72]:
# The first len(user_idx) rows of node_df belong to user vertices, the rest to
# anime vertices (anime labels were offset by len(user_idx) when they were built).
n_users = len(user_idx)

# NOTE(review): this rebinds `anime_df`, which previously held the anime.csv
# metadata; the metadata frame is no longer reachable under that name afterwards.
user_df = node_df.iloc[:n_users].copy().add_prefix("user_")
anime_df = node_df.iloc[n_users:].copy().reset_index(drop=True).add_prefix("anime_")

# Attach the original ids positionally so the metrics can be joined back onto df.
user_df["user_id"] = user_idx
anime_df["anime_id"] = anime_idx

In [74]:
# Join the per-user and per-anime graph metrics onto every interaction row.
# Columns left over from a previous run of this cell (and the ratio columns
# derived from them) are dropped first; otherwise re-running the cell would
# produce `_x` / `_y` suffixed duplicates and break the cells that follow.
_merged_cols = [col for col in user_df.columns if col != "user_id"]
_merged_cols += [col for col in anime_df.columns if col != "anime_id"]
_stale_cols = [col for col in df.columns if col in _merged_cols or col.endswith("_diff")]
df = (
    df.drop(columns=_stale_cols)
    # validate= guards against duplicated ids on the metric side multiplying rows.
    .merge(user_df, on="user_id", how="left", validate="many_to_one")
    .merge(anime_df, on="anime_id", how="left", validate="many_to_one")
)

In [75]:
# Build the feature list: for each graph metric keep the user value, the anime
# value, and their ratio. The ratio column keeps its historical `_diff` suffix
# even though it is user / anime, not a subtraction.
use_cols = []
for col in node_cols:
    ratio = df[f"user_{col}"] / df[f"anime_{col}"]
    # A zero anime metric makes the division yield +/-inf; store NaN instead so
    # "undefined ratio" is represented uniformly as a missing value.
    df[f"{col}_diff"] = ratio.replace([np.inf, -np.inf], np.nan)
    use_cols.extend([f"{col}_diff", f"user_{col}", f"anime_{col}"])

In [76]:
# Preview the first rows of the assembled graph features.
df[use_cols].head()

Unnamed: 0,degree_diff,user_degree,anime_degree,pageranks_diff,user_pageranks,anime_pageranks,betweenness_diff,user_betweenness,anime_betweenness,eigenvector_centrality_diff,user_eigenvector_centrality,anime_eigenvector_centrality,hub_score_diff,user_hub_score,anime_hub_score,authority_score_diff,user_authority_score,anime_authority_score,constraint_diff,user_constraint,anime_constraint,coreness_diff,user_coreness,anime_coreness,eccentricity_diff,user_eccentricity,anime_eccentricity,harmonic_centrality_diff,user_harmonic_centrality,anime_harmonic_centrality
0,0.338308,68,201,0.412474,0.000153,0.000372,0.222554,578.147468,2597.788504,0.399847,0.085765,0.214494,0.156446,0.033557,0.214494,1.021934,0.14843,0.145244,2.955882,0.014706,0.004975,0.557692,58,104,1.25,5.0,4.0,0.937798,0.42068,0.448583
1,0.246377,68,276,0.309112,0.000153,0.000496,0.11284,578.147468,5123.586137,0.300179,0.085765,0.285711,0.11745,0.033557,0.285711,0.767203,0.14843,0.193468,4.058824,0.014706,0.003623,0.557692,58,104,1.25,5.0,4.0,0.911081,0.42068,0.461738
2,0.647619,68,105,0.655606,0.000153,0.000234,0.250428,578.147468,2308.635695,0.904545,0.085765,0.094815,0.353916,0.033557,0.094815,2.31185,0.14843,0.064204,1.544118,0.014706,0.009524,0.753247,58,77,1.25,5.0,4.0,0.979214,0.42068,0.42961
3,0.131528,68,517,0.171283,0.000153,0.000896,0.031161,578.147468,18553.410902,0.153414,0.085765,0.559041,0.060025,0.033557,0.559041,0.392097,0.14843,0.378553,7.602941,0.014706,0.001934,0.557692,58,104,1.25,5.0,4.0,0.836634,0.42068,0.502825
4,0.985507,68,69,1.005671,0.000153,0.000153,1.681995,578.147468,343.727253,1.306292,0.085765,0.065655,0.511106,0.033557,0.065655,3.338642,0.14843,0.044458,1.014706,0.014706,0.014493,0.920635,58,63,1.25,5.0,4.0,1.006019,0.42068,0.418163
