In [1]:
import pandas as pd
import pickle
import os
import torch
from torch import nn
import rtdl
from typing import List, Dict, Any
from tqdm import tqdm
from transformers import get_scheduler

from sentence_transformers import SentenceTransformer, losses
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader

In [2]:
# --- Device selection ---
# Use CUDA when PyTorch reports a usable GPU; otherwise train on the CPU.
if not torch.cuda.is_available():
    print("GPU is not available. Training on CPU.")
    device = torch.device("cpu")
else:
    print("GPU is available!")
    print("Device Name:", torch.cuda.get_device_name(0))
    device = torch.device("cuda")

GPU is available!
Device Name: NVIDIA GeForce GTX 1050 with Max-Q Design


In [3]:
# --- Constants and data loading ---
# Load the raw InputExample objects (which contain dicts).
# TMP_DATA_DIR: directory holding the intermediate artifacts read by this notebook.
TMP_DATA_DIR = 'tmp_data/'
# OUTPUT_PATH: directory the trained weights are written to at the end of training.
OUTPUT_PATH = 'tmp_data/03_ftt_instacart_recommender'
train_examples_path = os.path.join(TMP_DATA_DIR, '02_train_examples.pkl')
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load pickles written by a trusted earlier step of this pipeline.
with open(train_examples_path, 'rb') as f:
    train_examples = pickle.load(f)

In [4]:
# --- Model definition and instantiation ---
class TabularFTTransformer(nn.Module):
    """Wrap a tabular backbone so it exposes the tokenize/forward/dimension
    methods that this notebook's SentenceTransformer pipeline calls.

    "Sentences" here are feature dicts: ``tokenize`` turns a list of dicts
    into numerical/categorical tensors, and ``forward`` feeds them to the
    backbone and returns its output under the ``'sentence_embedding'`` key.

    Args:
        ft_transformer_model: Backbone called as ``backbone(x_num, x_cat)``.
        numerical_feature_names: Dict keys read, in order, into ``x_num``.
        categorical_feature_names: Dict keys read, in order, into ``x_cat``.
    """

    def __init__(self, ft_transformer_model: nn.Module, numerical_feature_names: List[str], categorical_feature_names: List[str]):
        super().__init__()
        self.ft_transformer = ft_transformer_model
        self.numerical_feature_names = numerical_feature_names
        self.categorical_feature_names = categorical_feature_names

    def tokenize(self, item_feature_dicts: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """Convert feature dicts into CPU tensors.

        Missing numerical keys default to ``0.0`` and missing categorical keys
        to ``0``.

        Returns:
            ``{'x_num': float32 (n_items, n_numerical),
               'x_cat': int64 (n_items, n_categorical)}``.
            The shapes hold for an empty input list as well.
        """
        n_items = len(item_feature_dicts)
        num_rows = [
            [item.get(feat, 0.0) for feat in self.numerical_feature_names]
            for item in item_feature_dicts
        ]
        cat_rows = [
            [int(item.get(feat, 0)) for feat in self.categorical_feature_names]
            for item in item_feature_dicts
        ]
        # The explicit reshape keeps the tensors 2-D when the batch is empty;
        # torch.tensor([]) alone would have shape (0,).
        x_num = torch.tensor(num_rows, dtype=torch.float32).reshape(n_items, len(self.numerical_feature_names))
        x_cat = torch.tensor(cat_rows, dtype=torch.long).reshape(n_items, len(self.categorical_feature_names))
        return {'x_num': x_num, 'x_cat': x_cat}

    def forward(self, features: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """Run the backbone on ``features['x_num']`` / ``features['x_cat']``.

        Returns:
            ``{'sentence_embedding': <backbone output>}``.
        """
        embeddings = self.ft_transformer(features['x_num'], features['x_cat'])
        return {'sentence_embedding': embeddings}

    def get_sentence_embedding_dimension(self) -> int:
        """Return the size of the embedding produced by the backbone.

        Uses the backbone's ``d_out`` attribute when it exists. Otherwise the
        size is taken from the last ``nn.Linear`` registered in the backbone.

        Raises:
            AttributeError: If neither source of the dimension is available.
        """
        d_out = getattr(self.ft_transformer, 'd_out', None)
        if d_out is not None:
            return int(d_out)
        # NOTE(review): heuristic. Assumes the output head is the last Linear
        # layer in registration order; confirm for backbones other than
        # rtdl.FTTransformer.
        linear_layers = [m for m in self.ft_transformer.modules() if isinstance(m, nn.Linear)]
        if not linear_layers:
            raise AttributeError(
                "Cannot determine embedding dimension: backbone has no 'd_out' attribute and no nn.Linear layers"
            )
        return linear_layers[-1].out_features

In [5]:
# (Instantiate the model and move it to the selected device)
item_profiles_df = pd.read_csv(os.path.join(TMP_DATA_DIR, '02_item_profiles_scaled.csv'))

numerical_feature_names = [
    'reorder_rate',
    'avg_add_to_cart_order',
    'total_orders',
    'unique_users',
    'avg_days_since_prior_order',
]
categorical_feature_names = ['aisle_id', 'department_id']

# One cardinality per categorical column, taken as (largest id in the column) + 1.
# Assumes ids are non-negative integers usable directly as embedding indices; TODO confirm.
cat_cardinalities = [int(item_profiles_df[name].max()) + 1 for name in categorical_feature_names]

D_OUT = 64
core_ft_transformer = rtdl.FTTransformer.make_default(
    n_num_features=len(numerical_feature_names),
    cat_cardinalities=cat_cardinalities,
    last_layer_query_idx=[-1],
    d_out=D_OUT,
)
ftt_wrapper = TabularFTTransformer(
    ft_transformer_model=core_ft_transformer,
    numerical_feature_names=numerical_feature_names,
    categorical_feature_names=categorical_feature_names,
)
model = SentenceTransformer(modules=[ftt_wrapper])
# Kept as the final expression so the cell still displays the module tree.
model.to(device)

SentenceTransformer(
  (0): TabularFTTransformer(
    (ft_transformer): FTTransformer(
      (feature_tokenizer): FeatureTokenizer(
        (num_tokenizer): NumericalFeatureTokenizer()
        (cat_tokenizer): CategoricalFeatureTokenizer(
          (embeddings): Embedding(157, 192)
        )
      )
      (cls_token): CLSToken()
      (transformer): Transformer(
        (blocks): ModuleList(
          (0): ModuleDict(
            (attention): MultiheadAttention(
              (W_q): Linear(in_features=192, out_features=192, bias=True)
              (W_k): Linear(in_features=192, out_features=192, bias=True)
              (W_v): Linear(in_features=192, out_features=192, bias=True)
              (W_out): Linear(in_features=192, out_features=192, bias=True)
              (dropout): Dropout(p=0.2, inplace=False)
            )
            (ffn): FFN(
              (linear_first): Linear(in_features=192, out_features=512, bias=True)
              (activation): ReGLU()
              (dropout)

In [6]:
# --- Data loader and loss function setup ---
def smart_collate_fn(batch):
    """Collate (anchor, positive) examples into tensor features and labels.

    Each element of ``batch`` must expose ``texts``, where ``texts[0]`` is the
    anchor feature dict and ``texts[1]`` is the positive feature dict.

    Returns:
        ``(features, labels)``: ``features`` is
        ``[anchor_features, positive_features]`` as produced by
        ``model[0].tokenize``; ``labels`` is a zero tensor with one entry per
        example.
    """
    tokenizer_module = model[0]
    anchors, positives = [], []
    for example in batch:
        anchors.append(example.texts[0])
        positives.append(example.texts[1])
    # ★ Tensor creation happens here inside collate_fn, on the CPU.
    features = [tokenizer_module.tokenize(anchors), tokenizer_module.tokenize(positives)]
    labels = torch.zeros(len(batch))
    return features, labels

In [7]:
# Shuffled batches of 1024 examples, collated on the CPU by smart_collate_fn in
# two worker processes and returned in pinned host memory.
train_dataloader = DataLoader(
    dataset=train_examples,
    batch_size=1024,
    shuffle=True,
    num_workers=2,
    collate_fn=smart_collate_fn,
    pin_memory=True,
)
# Loss module wrapping the model; moved to the same device as the model.
train_loss = losses.MultipleNegativesRankingLoss(model=model)
# Kept as the final expression so the cell still displays the module tree.
train_loss.to(device)

MultipleNegativesRankingLoss(
  (model): SentenceTransformer(
    (0): TabularFTTransformer(
      (ft_transformer): FTTransformer(
        (feature_tokenizer): FeatureTokenizer(
          (num_tokenizer): NumericalFeatureTokenizer()
          (cat_tokenizer): CategoricalFeatureTokenizer(
            (embeddings): Embedding(157, 192)
          )
        )
        (cls_token): CLSToken()
        (transformer): Transformer(
          (blocks): ModuleList(
            (0): ModuleDict(
              (attention): MultiheadAttention(
                (W_q): Linear(in_features=192, out_features=192, bias=True)
                (W_k): Linear(in_features=192, out_features=192, bias=True)
                (W_v): Linear(in_features=192, out_features=192, bias=True)
                (W_out): Linear(in_features=192, out_features=192, bias=True)
                (dropout): Dropout(p=0.2, inplace=False)
              )
              (ffn): FFN(
                (linear_first): Linear(in_features=192, out_f

In [8]:
# Model training
epochs = 1
# NOTE(review): the final loss logged for this run (6.52) is close to
# ln(1024) ≈ 6.93, which is what a 1024-way softmax over identical scores
# gives. lr=2e-5 for one epoch may be too small for this model; confirm with a
# learning-rate sweep before relying on the embeddings.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * epochs
# Linear schedule: warm up over the first 10% of steps, then decay.
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=int(total_steps * 0.1),
    num_training_steps=total_steps
)

for epoch in range(epochs):
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{epochs}", leave=True)

    for batch in progress_bar:
        features, labels = batch
        # ★ Move each CPU batch to the training device. The loader returns
        # pinned memory, so non_blocking=True lets the copy run asynchronously
        # with respect to the host; on CPU it is a no-op.
        features_on_device = [
            {key: val.to(device, non_blocking=True) for key, val in feature.items()}
            for feature in features
        ]
        labels_on_device = labels.to(device, non_blocking=True)

        loss_value = train_loss(features_on_device, labels_on_device)

        optimizer.zero_grad()
        loss_value.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'loss': loss_value.item()})

# Leave training mode so dropout is disabled for any inference that follows.
model.eval()

os.makedirs(OUTPUT_PATH, exist_ok=True)
torch.save(model.state_dict(), os.path.join(OUTPUT_PATH, "pytorch_model.bin"))

Epoch 1/1: 100%|██████████| 4883/4883 [14:48<00:00,  5.50it/s, loss=6.52]
