In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import time

import pandas as pd
from collections import defaultdict
import gc

import torch.nn.functional as F
from torch.utils.data import Dataset, default_collate, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer, TrainerCallback

from torch.utils.data import Dataset, DataLoader, default_collate
from utils import concat_topic_texts, concat_content_texts
import torch
from dataset import init_tokenizer
import gc
from model import Model
from tqdm import tqdm
import numpy as np
import torch.nn as nn

from utils import clean_text, f2_score, get_pos_score
from model import Model
from pathlib import Path
from tqdm import tqdm
from typing import Dict
import numpy as np
import os
from model import Model
import torch.nn as nn
from transformers import HfArgumentParser, TrainingArguments
from data_args import DataArguments
from model_args import ModelArguments
import cupy as cp
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
from pandarallel import pandarallel
from numpy import dot
from numpy.linalg import norm
from sklearn.model_selection import train_test_split
import xgboost
from xgboost import XGBClassifier

import warnings

# NOTE(review): a blanket "ignore" also hides deprecation warnings raised by
# pandas / numpy / xgboost; narrow this filter if API changes need to surface.
warnings.filterwarnings('ignore')

# Set before any tokenizer is created so the setting is in place for the whole run.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Run-wide settings. Defined before use so the worker count is written only once.
NUM_WORKERS = 30   # worker processes for pandarallel and for the DataLoaders
BATCH_SIZE = 128   # batch size passed to extract_embs
TEST_MODE = False  # not referenced in the visible cells
pandarallel.initialize(nb_workers=NUM_WORKERS)
from optimize_f2 import optimize_cosine_thresh, optimize_xgb_thresh

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
INFO: Pandarallel will run on 30 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Every input CSV lives in the same folder; define it once and use it for all reads.
data_folder = Path("./data")

# Fold assignment: keep one row per distinct (topic_id, fold) pair.
data_df = pd.read_csv(data_folder / "supervised_correlations.csv")
data_df = data_df[["topic_id", "fold"]].drop_duplicates().reset_index(drop=True)

content_df = pd.read_csv(data_folder / "content.csv")
topic_df = pd.read_csv(data_folder / "topics.csv")
subs_df = pd.read_csv(data_folder / "sample_submission.csv")
corrs_df = pd.read_csv(data_folder / "correlations.csv")

In [4]:
# parser = HfArgumentParser((TrainingArguments, DataArguments, ModelArguments))
# training_args, data_args, model_args = parser.parse_args_into_dataclasses()
# training_args: TrainingArguments
# data_args: DataArguments
# model_args: ModelArguments

# Replace missing text with empty strings. The result is assigned back rather
# than calling fillna(inplace=True) on a column selection: the in-place form
# acts on a temporary under pandas copy-on-write and leaves the frame unchanged.
for col in ("title", "description"):
    topic_df[col] = topic_df[col].fillna("")
for col in ("title", "description", "text"):
    content_df[col] = content_df[col].fillna("")

# clean text
print("Cleaning text data for topics")
topic_df["title"] = topic_df["title"].parallel_apply(clean_text)
topic_df["description"] = topic_df["description"].parallel_apply(clean_text)
print("Cleaning text data for content")
content_df["title"] = content_df["title"].parallel_apply(clean_text)
content_df["description"] = content_df["description"].parallel_apply(clean_text)

# parent and children information
# parents[topic_id]  -> list holding the topic's parent id (when that parent is a known topic)
# children[topic_id] -> list of ids of the topics whose parent is topic_id
parents = defaultdict(list)
children = defaultdict(list)
topic_title_dict = {}
all_topic_ids = set(topic_df.id.values)
for _idx, row in tqdm(topic_df.iterrows(), total=len(topic_df)):
    # Only link to parents that exist as topics; a missing parent never matches the set.
    if row["parent"] in all_topic_ids:
        parents[row["id"]].append(row["parent"])
        children[row["parent"]].append(row["id"])
    topic_title_dict[row["id"]] = row["title"]

# Order both frames by character length of title + description.
# NOTE(review): presumably so that consecutive (unshuffled) batches hold texts of
# similar length and collate_fn can trim more padding -- confirm.
topic_df["length"] = topic_df.apply(
    lambda x: len(x["title"] + x["description"]), axis=1
)
topic_df = topic_df.sort_values("length").reset_index(drop=True)
content_df["length"] = content_df.apply(
    lambda x: len(x["title"] + x["description"]), axis=1
)
content_df = content_df.sort_values("length").reset_index(drop=True)

topic_ids = topic_df["id"].values
content_ids = content_df["id"].values

Cleaning text data for topics
Cleaning text data for content


76972it [00:04, 15708.62it/s]


In [5]:
def collate_fn(inputs):
    """Collate tokenized samples and cut every field to the longest real sequence.

    The samples are first merged with ``default_collate``. The largest per-row
    sum of ``attention_mask`` is then used as the number of columns to keep,
    and every field of the batch is sliced to that width.

    NOTE(review): this assumes padding sits on the right of each sequence, so
    that dropping trailing columns only removes padding -- confirm.
    """
    batch = default_collate(inputs)
    keep = int(batch["attention_mask"].sum(axis=1).max())
    for field in list(batch.keys()):
        batch[field] = batch[field][:, :keep]
    return batch


class TopicDataset(Dataset):
    """Dataset yielding one tokenized topic per index.

    Each item is the tokenizer output for the text built by
    ``concat_topic_texts`` from the row and the module-level
    ``topic_title_dict`` / ``parents`` / ``children`` lookups, padded or
    truncated to ``max_len`` and with every field converted to a
    ``torch.long`` tensor.
    """

    def __init__(self, df, tokenizer, max_len=512):
        self.df, self.tokenizer, self.max_len = df, tokenizer, max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        topic_row = self.df.iloc[idx]
        topic_text = concat_topic_texts(
            topic_row, topic_title_dict, parents, children, self.tokenizer.sep_token
        )
        # Fixed-length encoding; collate_fn later trims the surplus padding per batch.
        encoded = self.tokenizer.encode_plus(
            topic_text,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors=None,
        )
        # Convert in place so the container type returned by the tokenizer is kept.
        for field, values in list(encoded.items()):
            encoded[field] = torch.tensor(values, dtype=torch.long)
        return encoded


class ContentDataset(Dataset):
    """Dataset yielding one tokenized content item per index.

    Each item is the tokenizer output for the text built by
    ``concat_content_texts`` from the row, padded or truncated to ``max_len``
    and with every field converted to a ``torch.long`` tensor.
    """

    def __init__(self, df, tokenizer, max_len=512):
        self.df, self.tokenizer, self.max_len = df, tokenizer, max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        content_row = self.df.iloc[idx]
        content_text = concat_content_texts(content_row)
        # Fixed-length encoding; collate_fn later trims the surplus padding per batch.
        encoded = self.tokenizer.encode_plus(
            content_text,
            add_special_tokens=True,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors=None,
        )
        # Convert in place so the container type returned by the tokenizer is kept.
        for field, values in list(encoded.items()):
            encoded[field] = torch.tensor(values, dtype=torch.long)
        return encoded


def _make_loader(dataset, num_workers, batch_size):
    """Build the fixed-order DataLoader used for embedding extraction."""
    return DataLoader(
        dataset,
        num_workers=num_workers,
        batch_size=batch_size,
        shuffle=False,  # output rows must stay aligned with the input frame
        collate_fn=collate_fn,
    )


def _embed_batches(model, dataloader):
    """Run ``model.feature`` on every batch and return the stacked numpy outputs.

    Called from ``extract_embs`` so it runs inside that function's
    ``no_grad`` / autocast context.
    """
    chunks = []
    for inputs in tqdm(dataloader):
        inputs = {k: v.cuda(non_blocking=True) for k, v in inputs.items()}
        out = model.feature(inputs)
        chunks.append(out.cpu().numpy())
    return np.concatenate(chunks)


@torch.no_grad()
@torch.cuda.amp.autocast(True)
def extract_embs(
    tokenizer_name,
    weights_path,
    contents_df,
    topics_df,
    num_workers=2,
    batch_size=64,
    max_len=128,
):
    """Embed all topics and contents with one fine-tuned retriever.

    Args:
        tokenizer_name: name passed to ``init_tokenizer`` and (twice) to ``Model``.
        weights_path: path of the state dict loaded into the model.
        contents_df: content rows to embed; may be empty.
        topics_df: topic rows to embed.
        num_workers: DataLoader worker processes.
        batch_size: DataLoader batch size.
        max_len: tokenizer max length for both datasets (default 128, the
            value that was previously hard-coded).

    Returns:
        ``(contents_embs, topics_embs)``. Each is a numpy array with one row
        per input row, in input order. When ``contents_df`` is empty,
        ``contents_embs`` is an empty list (unchanged legacy behaviour).
    """
    tokenizer = init_tokenizer(tokenizer_name)

    model = Model(tokenizer_name, tokenizer_name, "siamese", True)
    model = model.cuda()
    # Load the checkpoint on the CPU: load_state_dict copies the values into the
    # model's own (GPU) parameters, so a second on-device copy is not needed.
    state_dict = torch.load(weights_path, map_location="cpu")
    if isinstance(model, nn.DataParallel):
        model.module.load_state_dict(state_dict)
    else:
        model.load_state_dict(state_dict)
    print("Loaded ", weights_path)
    del state_dict
    gc.collect()
    model.eval()

    topics_dataset = TopicDataset(topics_df, tokenizer, max_len=max_len)
    topics_dataloader = _make_loader(topics_dataset, num_workers, batch_size)
    topics_embs = _embed_batches(model, topics_dataloader)
    del topics_dataset, topics_dataloader
    gc.collect()

    contents_embs = []
    if len(contents_df) > 0:
        contents_dataset = ContentDataset(contents_df, tokenizer, max_len=max_len)
        contents_dataloader = _make_loader(contents_dataset, num_workers, batch_size)
        contents_embs = _embed_batches(model, contents_dataloader)
        del contents_dataset, contents_dataloader
        gc.collect()

    # Release the model before the next retriever is loaded.
    del model, tokenizer
    gc.collect()
    torch.cuda.empty_cache()

    return contents_embs, topics_embs

In [29]:
# Keyword arguments unpacked into the XGBClassifier constructor.
xgb_cfg = dict(
    n_estimators=1000,
    learning_rate=1e-2,
    objective="binary:logistic",
    nthread=os.cpu_count(),
    tree_method="gpu_hist",
    gpu_id=0,
)
# Extra keyword arguments unpacked into XGBClassifier.fit.
fit_params = dict(verbose=False, eval_metric="auc")

In [40]:
# Retrievers to ensemble. `tokenizer_names` and `weights_folders` are paired
# positionally (they are zipped per fold), so keep them in the same order.
tokenizer_names = [
    "sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/paraphrase-xlm-r-multilingual-v1",
    "sentence-transformers/all-roberta-large-v1",
    # "sentence-transformers/distiluse-base-multilingual-cased",
]
weights_folders = [
    "./mpnet-finetune",
    "./minilm",
    "./xlm-r",
    # Trailing comma matters: without it, uncommenting the next entry would
    # silently concatenate the two string literals into one path.
    "./roberta-large",
    # "./distiluse",
]
if len(tokenizer_names) != len(weights_folders):
    # zip() would otherwise silently drop the unmatched entries.
    raise ValueError(
        f"tokenizer_names ({len(tokenizer_names)}) and weights_folders "
        f"({len(weights_folders)}) must have the same length"
    )

# Prefix of the per-fold XGBoost checkpoint files. The multi-model names are
# keyed on the NUMBER of models only and assume the order listed above.
if len(tokenizer_names) == 1:
    xgb_path = "xgb" + tokenizer_names[0].split("/")[-1]
elif len(tokenizer_names) == 2:
    xgb_path = "xgb-mpnet-minilm"
elif len(tokenizer_names) == 3:
    xgb_path = "xgb-mpnet-minilm-xlmr"
elif len(tokenizer_names) == 4:
    xgb_path = "xgb-mpnet-minilm-xlmr-robertalarge"
else:
    # Fail here instead of leaving xgb_path undefined (or stale from an earlier run).
    raise ValueError(
        f"No xgb_path defined for an ensemble of {len(tokenizer_names)} models"
    )

topk = 50  # neighbours retrieved per topic
folds = [0, 1, 2, 3, 4, 7]

In [41]:
def get_xgb_features(row, dims):
    """Build the re-ranker feature row for one (topic, content) pair.

    The embeddings are looked up in the module-level ``topic_id_to_emb`` and
    ``content_id_to_emb`` dicts. ``dims`` lists the width of each consecutive
    segment of the embedding; one dot product is computed per segment.

    Returns:
        Array of shape ``(1, len(topic_emb) + len(content_emb) + len(dims) + 1)``
        laid out as ``[topic_emb, content_emb, per-segment dot products, target]``.
        The last column is ``row["target"]`` (the label), which callers slice off.
    """
    t_vec = topic_id_to_emb[row.topic_id]
    c_vec = content_id_to_emb[row.content_id]
    # Segment boundaries: bounds[i]..bounds[i + 1] covers the i-th segment.
    bounds = np.cumsum([0] + list(dims))
    tail = [dot(t_vec[lo:hi], c_vec[lo:hi]) for lo, hi in zip(bounds[:-1], bounds[1:])]
    tail.append(row["target"])
    return np.concatenate([t_vec, c_vec, np.array(tail)])[None, :]

In [42]:
# !rm xgb*

In [43]:
# Out-of-fold evaluation. For every fold:
#   1. embed the fold's validation topics and all contents with each retriever
#      (or load the cached embeddings) and concatenate them along the feature axis,
#   2. retrieve the top-k nearest contents per topic with a cosine KNN,
#   3. label the retrieved (topic, content) pairs against the ground truth,
#   4. train (or load) an XGBoost re-ranker on 80% of the fold's topics and keep
#      its scores for the remaining 20%.
oof_distances = []
oof_indices = []
oof_topic_ids = []
xgb_preds = []

for fold in folds:
    print("=" * 100)
    print("FOLD ", fold)

    val_topic_ids = list(data_df[data_df["fold"] == fold].topic_id)
    val_topic_df = topic_df[topic_df.id.isin(val_topic_ids)].reset_index(drop=True)
    gt = corrs_df[corrs_df.topic_id.isin(val_topic_ids)].sort_values("topic_id")
    content_ids = content_df.id.values
    
    topic_embs = []
    content_embs = []
    dims = []  # embedding width of each retriever, in ensemble order
    
    for tokenizer_name, weights_folder in zip(tokenizer_names, weights_folders):
        weights_path = weights_folder + f"-fold{fold}/pytorch_model.bin"
        contents_path = weights_folder + f"-fold{fold}/content-embs.pth"
        topics_path = weights_folder + f"-fold{fold}/topic-embs.pth"
        
        if os.path.exists(contents_path) and os.path.exists(topics_path):
            # The caches map id -> embedding. Rebuild the matrices by id so the
            # rows line up with the current content_ids / val_topic_df order
            # instead of relying on the dict's insertion order; a missing id
            # raises KeyError rather than silently misaligning rows.
            # np.stack also needs a real sequence, not a dict view.
            contents_dict = torch.load(contents_path)
            model_content_embs = np.stack(
                [contents_dict[content_id] for content_id in content_ids], 0
            )
            print("Loaded ", contents_path)
            topics_dict = torch.load(topics_path)
            model_topic_embs = np.stack(
                [topics_dict[topic_id] for topic_id in val_topic_df.id.values], 0
            )
            print("Loaded ", topics_path)
        else:
            model_content_embs, model_topic_embs = extract_embs(
                tokenizer_name, weights_path, content_df, val_topic_df, NUM_WORKERS, BATCH_SIZE
            )
            torch.save(
                {k: v for k, v in zip(content_ids, model_content_embs)}, contents_path
            )
            torch.save(
                {k: v for k, v in zip(val_topic_df.id.values, model_topic_embs)}, topics_path
            )
            
        topic_embs.append(model_topic_embs)
        content_embs.append(model_content_embs)
        dims.append(model_content_embs.shape[1])
        del model_content_embs, model_topic_embs
        gc.collect()

    topic_embs = np.concatenate(topic_embs, 1)
    content_embs = np.concatenate(content_embs, 1)
    # Module-level lookups read by get_xgb_features.
    topic_id_to_emb = {k: v for k, v in zip(val_topic_df.id.values, topic_embs)}
    content_id_to_emb = {k: v for k, v in zip(content_ids, content_embs)}

    # Transfer predictions to gpu
    topic_embs_gpu = cp.array(topic_embs)
    content_embs_gpu = cp.array(content_embs)
    print("Training KNN model ...")
    neighbors_model = NearestNeighbors(n_neighbors=topk, metric="cosine")
    neighbors_model.fit(content_embs_gpu)
    distances, indices = neighbors_model.kneighbors(
        topic_embs_gpu, return_distance=True
    )

    del content_embs, topic_embs, topic_embs_gpu, content_embs_gpu, neighbors_model
    gc.collect()
    torch.cuda.empty_cache()

    # Bring the KNN results back to host memory.
    indices = indices.get()
    distances = distances.get()
    predictions = []
    for k in range(len(indices)):
        pred = indices[k]
        pred = content_ids[pred]
        assert pred.shape[0] == topk
        predictions.append(" ".join(pred))
    knn_preds = pd.DataFrame(
        {
            "topic_id": val_topic_df.id.values,
            "content_ids": predictions,
        }
    ).sort_values("topic_id")
    recall = get_pos_score(gt["content_ids"], knn_preds["content_ids"], topk)
    print(f"Recall: {recall}")

    # Label every retrieved (topic, content) pair: 1 if it is in the ground truth.
    pairs = set()
    gt_dict = gt.set_index("topic_id")["content_ids"].apply(lambda x: set(x.split(" "))).to_dict()
    for i, row in tqdm(knn_preds.iterrows()):
        topic_id = row.topic_id
        for content_id in row.content_ids.split(" "):
            if content_id in gt_dict[topic_id]:
                pairs.add((topic_id, content_id, 1))
            else:
                pairs.add((topic_id, content_id, 0))
    
    val_df = pd.DataFrame({
        "topic_id": [pair[0] for pair in pairs],
        "content_id": [pair[1] for pair in pairs],
        "target": [pair[2] for pair in pairs],
    })
    val_df = val_df.sort_values("topic_id").reset_index(drop=True)
    
    # split val_topic_ids: 80% of the fold's topics train the re-ranker, 20% are held out
    train_ids, val_ids = train_test_split(val_topic_ids, test_size=0.2, random_state=42)
    train_idxs = val_df[val_df.topic_id.isin(train_ids)].index.values
    val_idxs = val_df[val_df.topic_id.isin(val_ids)].index.values    

    # xgboost
    # The last column of `features` is the target appended by get_xgb_features.
    features = np.concatenate(val_df.parallel_apply(lambda x: get_xgb_features(x, dims), axis=1).values)
    xgb_ckpt_path = f"{xgb_path}-fold{fold}"        
    xgb = XGBClassifier(**xgb_cfg)
    if os.path.exists(xgb_ckpt_path):
        xgb.load_model(xgb_ckpt_path)
    else:
        x_train, y_train = features[train_idxs][:, :-1], features[train_idxs][:, -1]
        x_valid, y_valid = features[val_idxs][:, :-1], features[val_idxs][:, -1]    
        start = time.time()
        xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], **fit_params)
        print(f"xgboost training time: {time.time() - start} seconds.")
        xgb.save_model(xgb_ckpt_path)
    start = time.time()
    # NOTE(review): ntree_limit / best_ntree_limit are deprecated in newer XGBoost
    # releases (iteration_range replaces them) -- confirm against the installed version.
    val_df["ensemble_score"] = xgb.predict_proba(
        features[:, :-1], ntree_limit=xgb.best_ntree_limit)[:, 1]
    print(f"xgboost inference time: {time.time() - start} seconds.")

    val_df = val_df.sort_values("ensemble_score").reset_index(drop=True)
    # Keep only the held-out topics so the re-ranker is scored on unseen pairs.
    xgb_preds.append(val_df[val_df.topic_id.isin(val_ids)])
        
    oof_indices.append(indices)
    oof_distances.append(distances)
    oof_topic_ids.append(val_topic_df.id.values)

oof_distances = np.concatenate(oof_distances, 0)
oof_indices = np.concatenate(oof_indices, 0)
oof_topic_ids = np.concatenate(oof_topic_ids, 0)
# `axis` is passed by keyword: pandas 2 no longer accepts it positionally.
xgb_preds = pd.concat(xgb_preds, axis=0)
val_topic_ids = xgb_preds["topic_id"].unique()
val_gt = corrs_df[corrs_df.topic_id.isin(val_topic_ids)].sort_values("topic_id")

FOLD  0
Loaded  ./mpnet-finetune-fold0/content-embs.pth
Loaded  ./mpnet-finetune-fold0/topic-embs.pth
Loaded  ./minilm-fold0/content-embs.pth
Loaded  ./minilm-fold0/topic-embs.pth
Loaded  ./xlm-r-fold0/content-embs.pth
Loaded  ./xlm-r-fold0/topic-embs.pth
Loaded  ./roberta-large-fold0/content-embs.pth
Loaded  ./roberta-large-fold0/topic-embs.pth
Training KNN model ...
Recall: 0.93572


6152it [00:00, 11390.19it/s]


xgboost inference time: 1.0979394912719727 seconds.
FOLD  1
Loaded  ./mpnet-finetune-fold1/content-embs.pth
Loaded  ./mpnet-finetune-fold1/topic-embs.pth
Loaded  ./minilm-fold1/content-embs.pth
Loaded  ./minilm-fold1/topic-embs.pth
Loaded  ./xlm-r-fold1/content-embs.pth
Loaded  ./xlm-r-fold1/topic-embs.pth
Loaded  ./roberta-large-fold1/content-embs.pth
Loaded  ./roberta-large-fold1/topic-embs.pth
Training KNN model ...
Recall: 0.93233


6152it [00:00, 11336.64it/s]


xgboost inference time: 1.1615498065948486 seconds.
FOLD  2
Loaded  ./mpnet-finetune-fold2/content-embs.pth
Loaded  ./mpnet-finetune-fold2/topic-embs.pth
Loaded  ./minilm-fold2/content-embs.pth
Loaded  ./minilm-fold2/topic-embs.pth
Loaded  ./xlm-r-fold2/content-embs.pth
Loaded  ./xlm-r-fold2/topic-embs.pth
Loaded  ./roberta-large-fold2/content-embs.pth
Loaded  ./roberta-large-fold2/topic-embs.pth
Training KNN model ...
Recall: 0.93519


6152it [00:00, 11478.15it/s]


xgboost inference time: 1.1272263526916504 seconds.
FOLD  3
Loaded  ./mpnet-finetune-fold3/content-embs.pth
Loaded  ./mpnet-finetune-fold3/topic-embs.pth
Loaded  ./minilm-fold3/content-embs.pth
Loaded  ./minilm-fold3/topic-embs.pth
Loaded  ./xlm-r-fold3/content-embs.pth
Loaded  ./xlm-r-fold3/topic-embs.pth
Loaded  ./roberta-large-fold3/content-embs.pth
Loaded  ./roberta-large-fold3/topic-embs.pth
Training KNN model ...
Recall: 0.93215


6152it [00:00, 11302.28it/s]


xgboost inference time: 1.0869686603546143 seconds.
FOLD  4
Loaded  ./mpnet-finetune-fold4/content-embs.pth
Loaded  ./mpnet-finetune-fold4/topic-embs.pth
Loaded  ./minilm-fold4/content-embs.pth
Loaded  ./minilm-fold4/topic-embs.pth
Loaded  ./xlm-r-fold4/content-embs.pth
Loaded  ./xlm-r-fold4/topic-embs.pth
Loaded  ./roberta-large-fold4/content-embs.pth
Loaded  ./roberta-large-fold4/topic-embs.pth
Training KNN model ...
Recall: 0.93028


6152it [00:00, 11042.50it/s]


xgboost inference time: 1.1136953830718994 seconds.
FOLD  7
Loaded  ./mpnet-finetune-fold7/content-embs.pth
Loaded  ./mpnet-finetune-fold7/topic-embs.pth
Loaded  ./minilm-fold7/content-embs.pth
Loaded  ./minilm-fold7/topic-embs.pth
Loaded  ./xlm-r-fold7/content-embs.pth
Loaded  ./xlm-r-fold7/topic-embs.pth
Loaded  ./roberta-large-fold7/content-embs.pth
Loaded  ./roberta-large-fold7/topic-embs.pth
Training KNN model ...
Recall: 0.93122


6152it [00:00, 11482.73it/s]


xgboost inference time: 1.274050235748291 seconds.


In [44]:
# Re-ranker: search the probability threshold grid with optimize_xgb_thresh
# on the held-out topics, then report the F2 of the returned predictions.
xgb_thresh, best_xgb_preds = optimize_xgb_thresh(
    xgb_preds, val_gt, np.arange(0.05, 0.2, 0.01)
)
xgb_score = f2_score(val_gt["content_ids"], best_xgb_preds["content_ids"])
print(f"Xgb threshold: {xgb_thresh} - Xgb score: {xgb_score}")

# Retrieval-only baseline: the threshold is searched over all out-of-fold
# topics, but the reported F2 is restricted to the same held-out topics as
# the re-ranker so the two scores are comparable.
oof_gt = corrs_df[corrs_df.topic_id.isin(oof_topic_ids)].sort_values("topic_id")
cosine_thresh, cosine_preds = optimize_cosine_thresh(
    oof_distances,
    oof_indices,
    oof_topic_ids,
    oof_gt,
    np.arange(0.2, 0.35, 0.01),
    content_ids,
)
cosine_val_preds = cosine_preds[cosine_preds.topic_id.isin(val_topic_ids)].sort_values("topic_id")
cosine_score = f2_score(val_gt["content_ids"], cosine_val_preds["content_ids"])
print(f"Cosine threshold: {cosine_thresh} - Cosine score: {cosine_score}")

100%|██████████| 16/16 [00:01<00:00, 12.03it/s]


[0.7082, 0.718, 0.723, 0.7276, 0.7308, 0.7325, 0.7336, 0.7334, 0.732, 0.7314, 0.7312, 0.7307, 0.7294, 0.7287, 0.7276, 0.7267]
Xgb threshold: 0.11000000000000001 - Xgb score: 0.7336


100%|██████████| 15/15 [00:07<00:00,  2.06it/s]

Cosine threshold: 0.3100000000000001 - Cosine score: 0.7217





In [39]:
!ls

LICENSE			     roberta-large-fold0
README.md		     roberta-large-fold1
Untitled.ipynb		     roberta-large-fold2
__pycache__		     roberta-large-fold3
csv_samplers.py		     roberta-large-fold4
data			     roberta-large-fold5
data_args.py		     roberta-large-fold6
dataset.py		     roberta-large-fold7
dataset_callback.py	     roberta.zip
distiluse		     samplers.py
distiluse-fold0		     tokenizer.py
distiluse-fold1		     train.py
distiluse-fold2		     train_reranker.py
distiluse-fold3		     utils.py
distiluse-fold4		     xgb-mpnet-minilm-distiluse-xlmr-fold0
distiluse-fold5		     xgb-mpnet-minilm-distiluse-xlmr-fold1
engine.py		     xgb-mpnet-minilm-distiluse-xlmr-fold2
ensembling-retrievers.ipynb  xgb-mpnet-minilm-xlmr-fold0
evaluate.ipynb		     xgb-mpnet-minilm-xlmr-fold1
inference.py		     xgb-mpnet-minilm-xlmr-fold2
losses.py		     xgb-mpnet-minilm-xlmr-fold3
minilm-fold0		     xgb-mpnet-minilm-xlmr-fold4
minilm-fold1		     xgb-mpnet-minilm-xlmr-fold7
minilm-fold2		     xgb-mpnet-mini