In [52]:
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

Collecting git+https://github.com/pyg-team/pytorch_geometric.git
  Cloning https://github.com/pyg-team/pytorch_geometric.git to /tmp/pip-req-build-s47rh0ep
  Running command git clone --filter=blob:none --quiet https://github.com/pyg-team/pytorch_geometric.git /tmp/pip-req-build-s47rh0ep
  Resolved https://github.com/pyg-team/pytorch_geometric.git to commit 34890a592baa02ad250fe34fa66bbd53ced17801
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25ldone
[?25h  Created wheel for torch_geometric: filename=torch_geometric-2.4.0-py3-none-any.whl size=971850 sha256=5a2999279c9eeba07e6e9501be7cecd1244da99bec8e3b7b128c6894e2e21bee
  Stored in directory: /tmp/pip-ephem-wheel-cache-zady3vhj/wheels/d3/78/eb/9e26525b948d19533f1688fb6c209cec8a0ba793d39b49ae8f
Successf

In [65]:
import os
import pickle
import random
import shutil
import sys
import uuid
from pathlib import Path

import hydra
import implicit
import lightgbm as lgb
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import wandb
from hydra import compose, initialize
from omegaconf import DictConfig, OmegaConf
from scipy.sparse import csr_matrix, random
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

# 最大表示列数の指定（ここでは50列を指定）
pd.set_option("display.max_columns", 50)

sys.path.append(os.pardir)
from utils import evaluate_score, load_datasets, load_sample_sub, load_target
from utils.embedding import TextEmbedder


# 同様のランダムシード設定関数
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


with initialize(config_path="../yamls", version_base=None):
    config = compose(config_name="config.yaml")

In [2]:
import torch_geometric

torch_geometric.__version__

'2.4.0'

In [3]:
train_df = pd.read_csv(Path(config.input_path) / "train.csv")
test_df = pd.read_csv(Path(config.input_path) / "test.csv")

all_df = pd.concat([train_df[["user_id", "anime_id"]], test_df[["user_id", "anime_id"]]]).reset_index(drop=True)
all_df["user_label"], user_idx = pd.factorize(all_df["user_id"])
all_df["anime_label"], anime_idx = pd.factorize(all_df["anime_id"])
all_df["is_train"] = True
all_df.loc[len(train_df) :, "is_train"] = False

In [4]:
edge_index = torch.stack(
    [torch.tensor(all_df["user_label"].values), torch.tensor(all_df["anime_label"].values)], dim=0
)
assert edge_index.shape == (2, len(all_df))
print(edge_index[:, :10])

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])


In [5]:
X_train_all, X_test = load_datasets(config.gcn.feats)

In [6]:
print(X_train_all.shape)

(136401, 519)


In [52]:
from cuml.preprocessing import StandardScaler

features_df = pd.concat([X_train_all, X_test]).reset_index(drop=True)
features_df["user_label"] = all_df["user_label"]
user_features = features_df.groupby("user_label").mean().to_numpy()
scaler = StandardScaler()
scaled_user_features = np.nan_to_num(scaler.fit_transform(user_features), 0)  # nan をzero埋め


features_df = pd.concat([X_train_all, X_test]).reset_index(drop=True)
features_df["anime_label"] = all_df["anime_label"]
anime_features = features_df.groupby("anime_label").mean().to_numpy()
scaler = StandardScaler()
scaled_anime_features = np.nan_to_num(scaler.fit_transform(anime_features), 0)

In [54]:
np.isnan(scaled_user_features).sum()

0

In [55]:
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData

# Create the heterogeneous graph data object:
data = HeteroData()


# Add the user nodes:
data["user"].x = torch.from_numpy(scaled_user_features).to(torch.float)

# Add the movie nodes:
data["anime"].x = torch.from_numpy(scaled_anime_features).to(torch.float)

# Add the rating edges:
data["user", "score", "anime"].edge_index = edge_index

# テストの部分は np.nan　で埋めておく
all_df["score"] = np.nan
all_df.loc[: len(train_df), "score"] = train_df["score"]
score = torch.from_numpy(all_df["score"].values).to(torch.float)
data["user", "score", "anime"].edge_label = score

data = T.ToUndirected()(data)

del data["anime", "rev_score", "user"].edge_label

In [56]:
data

HeteroData(
  user={ x=[1998, 519] },
  anime={ x=[1956, 519] },
  (user, score, anime)={
    edge_index=[2, 254077],
    edge_label=[254077],
  },
  (anime, rev_score, user)={ edge_index=[2, 254077] }
)

In [57]:
data.metadata()

(['user', 'anime'],
 [('user', 'score', 'anime'), ('anime', 'rev_score', 'user')])

In [58]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Model(hidden_channels=config.gcn.hidden_channels, num_layers=config.gcn.num_layers).to(device)

In [59]:
def k_fold(num_fold, edge_index):
    skf = StratifiedKFold(num_fold, shuffle=True, random_state=12345)
    train_len = all_df["is_train"].sum()
    train_indices, val_indices, test_indices = [], [], []
    for _, idx in skf.split(torch.zeros(train_len), all_df.iloc[:train_len]["user_id"]):
        val_indices.append(torch.from_numpy(idx).to(torch.long))
        test_indices.append(torch.tensor(range(train_len, len(all_df))).to(torch.long))

    for i in range(num_fold):
        train_mask = torch.ones(train_len, dtype=torch.bool)
        train_mask[val_indices[i]] = 0
        train_indices.append(train_mask.nonzero(as_tuple=False).view(-1))

    return train_indices, val_indices, test_indices

In [60]:
output_path = Path(f".")
os.makedirs(output_path, exist_ok=True)

In [67]:
import torch.nn.functional as F

data = data.to(device)

oof_pred = np.zeros(len(train_df))
test_preds = []

for fold, (train_idx, val_idx, test_idx) in enumerate(zip(*k_fold(config.train.num_folds, all_df))):
    optimizer = torch.optim.Adam(model.parameters(), lr=config.train.lr)
    best_val_loss = float("inf")
    early_stopping_counter = 0

    for epoch in tqdm(range(config.gcn.num_epochs if config.debug is False else 6), desc=f"Fold-{fold+1}"):
        # train
        model.train()
        optimizer.zero_grad()

        pred = model(
            data.x_dict,
            data.edge_index_dict,
            data["user", "anime"].edge_index[:, train_idx],
        )
        target = data["user", "anime"].edge_label[train_idx]
        loss = F.mse_loss(pred, target).sqrt()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # validation
        with torch.no_grad():
            pred = model(
                data.x_dict,
                data.edge_index_dict,
                data["user", "anime"].edge_index[:, val_idx],
            )
            target = data["user", "anime"].edge_label[val_idx]
            val_loss = F.mse_loss(pred, target).sqrt()

        # wandb.log({"epoch": epoch, f"loss/train/fold-{fold}": loss.item(), f"loss/valid/fold-{fold}": val_loss.item()})
        if epoch % config.train.early_stopping == 0:
            tqdm.write(f"Epoch: {epoch}, Loss: {loss.item()}, Val Loss: {val_loss.item()}")

        # early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), output_path / f"model_best_{fold}.pt")
            early_stopping_counter = 0
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= config.train.early_stopping:
                model.load_state_dict(torch.load(output_path / f"model_best_{fold}.pt"))
                break

    # testing
    with torch.no_grad():
        oof_pred[val_idx.cpu().detach().numpy()] = (
            model(
                data.x_dict,
                data.edge_index_dict,
                data["user", "anime"].edge_index[:, val_idx],
            )
            .cpu()
            .detach()
            .numpy()
        )
        test_pred = (
            model(
                data.x_dict,
                data.edge_index_dict,
                data["user", "anime"].edge_index[:, test_idx],
            )
            .cpu()
            .detach()
            .numpy()
        )
        test_preds.append(test_pred)



Fold-1:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 0, Loss: 1.133927822113037, Val Loss: 5.854353427886963
Epoch: 20, Loss: 1.4798604249954224, Val Loss: 1.3904088735580444
Epoch: 40, Loss: 1.2671153545379639, Val Loss: 1.27997887134552
Epoch: 60, Loss: 1.2213834524154663, Val Loss: 1.2445217370986938
Epoch: 80, Loss: 1.194065809249878, Val Loss: 1.2212308645248413
Epoch: 100, Loss: 1.1778390407562256, Val Loss: 1.2109402418136597
Epoch: 120, Loss: 1.1679613590240479, Val Loss: 1.2047228813171387
Epoch: 140, Loss: 1.160181999206543, Val Loss: 1.1991915702819824
Epoch: 160, Loss: 1.1547129154205322, Val Loss: 1.196568250656128
Epoch: 180, Loss: 1.150547981262207, Val Loss: 1.1943985223770142
Epoch: 200, Loss: 1.1478009223937988, Val Loss: 1.1923835277557373
Epoch: 220, Loss: 1.1478805541992188, Val Loss: 1.1952391862869263
Epoch: 240, Loss: 1.1430586576461792, Val Loss: 1.1924833059310913
Epoch: 260, Loss: 1.1420588493347168, Val Loss: 1.1891766786575317
Epoch: 280, Loss: 1.144120693206787, Val Loss: 1.1925688982009888


Fold-2:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 0, Loss: 1.1521456241607666, Val Loss: 3.89753794670105
Epoch: 20, Loss: 1.3169132471084595, Val Loss: 1.2840243577957153
Epoch: 40, Loss: 1.2092763185501099, Val Loss: 1.204483151435852
Epoch: 60, Loss: 1.167975902557373, Val Loss: 1.1736878156661987
Epoch: 80, Loss: 1.154362440109253, Val Loss: 1.165607213973999
Epoch: 100, Loss: 1.1480064392089844, Val Loss: 1.1631919145584106
Epoch: 120, Loss: 1.146430492401123, Val Loss: 1.162269949913025
Epoch: 140, Loss: 1.1465705633163452, Val Loss: 1.1622613668441772
Epoch: 160, Loss: 1.141716480255127, Val Loss: 1.1624072790145874


Fold-3:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 0, Loss: 1.1461641788482666, Val Loss: 5.119396686553955
Epoch: 20, Loss: 1.3336976766586304, Val Loss: 1.3535492420196533
Epoch: 40, Loss: 1.2021257877349854, Val Loss: 1.2157375812530518
Epoch: 60, Loss: 1.1639727354049683, Val Loss: 1.185653567314148
Epoch: 80, Loss: 1.15152108669281, Val Loss: 1.1779953241348267
Epoch: 100, Loss: 1.1455453634262085, Val Loss: 1.1760491132736206
Epoch: 120, Loss: 1.141641616821289, Val Loss: 1.1754652261734009
Epoch: 140, Loss: 1.1386972665786743, Val Loss: 1.1758793592453003


Fold-4:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 0, Loss: 1.1494193077087402, Val Loss: 2.873525381088257
Epoch: 20, Loss: 1.2010442018508911, Val Loss: 1.2054545879364014
Epoch: 40, Loss: 1.1559453010559082, Val Loss: 1.1673192977905273
Epoch: 60, Loss: 1.1423505544662476, Val Loss: 1.1643275022506714
Epoch: 80, Loss: 1.137851357460022, Val Loss: 1.1647635698318481


Fold-5:   0%|          | 0/1000 [00:00<?, ?it/s]

Epoch: 0, Loss: 1.148833155632019, Val Loss: 3.493307590484619
Epoch: 20, Loss: 1.3104339838027954, Val Loss: 1.240037202835083
Epoch: 40, Loss: 1.1991591453552246, Val Loss: 1.2028836011886597
Epoch: 60, Loss: 1.1729265451431274, Val Loss: 1.1754369735717773
Epoch: 80, Loss: 1.1630562543869019, Val Loss: 1.1700106859207153
Epoch: 100, Loss: 1.156399130821228, Val Loss: 1.1664741039276123
Epoch: 120, Loss: 1.149576187133789, Val Loss: 1.1639355421066284
Epoch: 140, Loss: 1.144642949104309, Val Loss: 1.162463903427124
Epoch: 160, Loss: 1.1416550874710083, Val Loss: 1.1619398593902588
Epoch: 180, Loss: 1.139359712600708, Val Loss: 1.1617732048034668
Epoch: 200, Loss: 1.1373257637023926, Val Loss: 1.161936640739441


In [70]:
# calculate mean of predictions across all folds
mean_test_preds = np.mean(test_preds, axis=0)
# clip
oof_pred = np.clip(oof_pred, 1.0, 10.0)
mean_test_preds = np.clip(mean_test_preds, 1.0, 10.0)
# calculate RMSE for oof predictions
oof_rmse = mean_squared_error(train_df["score"], oof_pred, squared=False)
# wandb.log({"oof_rmse": oof_rmse})
print({"oof_rmse": oof_rmse})
# wandb.finish()

sample_submission_df["score"] = mean_test_preds

sample_submission_df.to_csv(output_path / "sub.csv", index=False)

oof_df = pd.DataFrame({"score": oof_pred})
oof_df.to_csv(output_path / "oof.csv", index=False)

if config.debug:
    shutil.rmtree(output_path)

{'oof_rmse': 1.1691676942799851}


NameError: name 'sub' is not defined

In [72]:
len(mean_test_preds)

117676