In [None]:
# # !pip install annoy
# import os
# # print(os.environ["LD_LIBRARY_PATH"])
# os.environ["LD_LIBRARY_PATH"] = "/opt/conda/lib/python3.8/site-packages/torch/lib:/usr/local/cuda-11.3/lib64"
%env TOKENIZERS_PARALLELISM=false

In [None]:
import os
# Display the dynamic-linker search path for debugging CUDA/torch library issues.
# `.get` is used so the cell shows nothing (None) instead of raising KeyError
# when the variable is not set in this environment.
os.environ.get("LD_LIBRARY_PATH")

In [None]:
# import os
# os.environ["LD_LIBRARY_PATH"] = "/opt/conda/lib/:/usr/local/cuda-11.3/lib64"

In [None]:
# clean text
# from textblob import TextBlob
import re
import string


def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The rules are applied in order: the two irregular forms first, then the
    generic suffix rules. Only the straight apostrophe (') is recognised.
    """
    expansion_rules = (
        # Specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # General
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in expansion_rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

def remove_punctuations(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

def clean_number(text):
    """Normalise how numbers are written inside ``text``.

    Steps, applied in order:
      1. Insert a space between a digit run and a directly following letter
         ("5kg" -> "5 kg").
      2. Re-attach ordinal suffixes that step 1 split off ("1 st " -> "1st ").
         NOTE(review): this rule needs a trailing space, so an ordinal at the
         very end of the string stays split ("3rd" -> "3 rd"). Left as-is
         because changing it would alter text fed to an already-trained model.
      3. Drop every comma that sits between two digits
         ("1,000,000" -> "1000000").

    Returns the rewritten string.
    """
    # Raw replacement strings: a non-raw '\g<1>' is an invalid escape sequence.
    text = re.sub(r'(\d+)([a-zA-Z])', r'\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', r'\g<1>\g<2> ', text)
    # Lookarounds instead of consuming the digits, so consecutive separators in
    # the same number are all removed in a single pass.
    text = re.sub(r'(?<=\d),(?=\d)', '', text)
    return text

def clean_whitespace(text):
    """Trim ``text`` and collapse each run of whitespace into one space."""
    # str.split() with no argument drops leading/trailing whitespace and
    # splits on whitespace runs, so re-joining with " " normalises the string.
    return " ".join(text.split())

def clean_repeat_words(text):
    """Collapse one doubled character per regex match in ``text``.

    Despite the name, the pattern works on characters, not words: inside each
    run of word characters it finds a character immediately followed by itself
    and keeps a single copy ("hello" -> "helo").
    """
    doubled_char = re.compile(r"(\w*)(\w)\2(\w*)")
    return doubled_char.sub(r"\1\2\3", text)

def clean_text(text):
    """Run the full text-cleaning pipeline on ``text`` and return a string.

    The input is coerced with ``str`` first, then passed through, in order:
    contraction expansion, punctuation removal, number normalisation and
    whitespace normalisation. (TextBlob spell correction is disabled.)
    """
    cleaned = str(text)
    pipeline = (decontracted, remove_punctuations, clean_number, clean_whitespace)
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned

In [None]:
import torch
import pandas as pd
from dataset import AutoTokenizer, LANGUAGE_TOKENS, CATEGORY_TOKENS, LEVEL_TOKENS, KIND_TOKENS, OTHER_TOKENS, RELATION_TOKENS
from model import Model

from torch.utils.data import DataLoader, Dataset, default_collate

In [None]:
from pathlib import Path


TEST_MODE = False

# --------------------- VALIDATION SET --------------------------
from tqdm import tqdm

data_folder = Path("./data")

if TEST_MODE:
    # No fold information is needed when predicting for the submission topics.
    val_topic_ids = set()
else:
    data_df = pd.read_csv("./data/supervised_correlations.csv")
    fold = 0
    # Topics held out for validation in the selected fold.
    val_topic_ids = set(data_df.loc[data_df["fold"] == fold, "topics_ids"])
# del data_df


def _read_sorted_by_title_length(csv_path):
    """Read a CSV, replace NaN with '' and order rows by ascending title length.

    The index is reset to 0..n-1 after sorting; later cells rely on that when
    they look rows up by integer position.
    """
    frame = pd.read_csv(csv_path).fillna('')
    frame['title_len'] = frame.title.str.len()
    return (
        frame.sort_values(by='title_len', axis=0)
        .reset_index(drop=True)
        .drop(columns=['title_len'])
    )


# TODO: we have to process the test set ourselves
contents_df = _read_sorted_by_title_length(data_folder/'content.csv')
topics_df = _read_sorted_by_title_length(data_folder/'topics.csv')
subs_df = pd.read_csv(data_folder/'sample_submission.csv')
corrs_df = pd.read_csv(data_folder/'correlations.csv')


# Normalise the free-text columns (content "text" is intentionally left raw).
topics_df["title"] = topics_df["title"].apply(clean_text)
topics_df["description"] = topics_df["description"].apply(clean_text)

contents_df["title"] = contents_df["title"].apply(clean_text)
contents_df["description"] = contents_df["description"].apply(clean_text)
# contents_df["text"] = contents_df["text"].apply(clean_text)

In [None]:
# val_topic_ids = set(val_topic_ids)

In [None]:
# supervised_correlations = pd.read_csv("data/supervised_correlations.csv")

In [None]:
# supervised_correlations[(supervised_correlations["topics_ids"].isin(val_topic_ids))]

In [None]:
# supervised_correlations[(supervised_correlations["topics_ids"].isin(val_topic_ids)) & (supervised_correlations["target"] == 1)]

In [None]:
# Tokenizer used only to obtain the separator token for the text builders
# below; the datasets create their own tokenizer via init_tokenizer.
tokenizer_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Separator placed between concatenated parent / child titles.
sep_token = tokenizer.sep_token

In [None]:
from tqdm import tqdm
from collections import defaultdict


# Aliases: both names refer to the same DataFrame objects.
topic_df = topics_df
content_df = contents_df

# parent and children information
parents = defaultdict(list)    # topic id -> [parent id] (only if the parent exists)
children = defaultdict(list)   # topic id -> ids of its direct children
topic_title_dict = {}          # topic id -> cleaned title

all_topic_ids = set(topic_df.id.values)
for _, row in tqdm(topic_df.iterrows()):
    current_id = row["id"]
    parent_of_current = row["parent"]
    if parent_of_current in all_topic_ids:
        parents[current_id].append(parent_of_current)
        children[parent_of_current].append(current_id)

    topic_title_dict[current_id] = row["title"]

# get concatenated texts
max_successor = 10  # maximum number of ancestors walked up the tree
topic_dict = {}
for _, row in tqdm(topic_df.iterrows()):
    current_id = row["id"]

    # Categorical prefix tokens followed by the tagged title / description.
    text = "".join([
        "<|topic|>",
        f"<|lang_{row['language']}|>",
        f"<|category_{row['category']}|>",
        f"<|level_{row['level']}|>",
        "<s_title>",
        row["title"],
        "</s_title>",
        "<s_description>",
        row["description"],
        "</s_description>",
    ])

    # Ancestor titles, nearest first; each title is followed by sep_token.
    ancestor_titles = []
    ancestor_id = parents.get(current_id, [None])[0]
    while ancestor_id and len(ancestor_titles) < max_successor:
        ancestor_titles.append(topic_title_dict[ancestor_id])
        ancestor_id = parents.get(ancestor_id, [None])[0]

    context_text = (
        "<s_parent>"
        + "".join(title + sep_token for title in ancestor_titles)
        + "</s_parent>"
    )

    # Direct children titles, separated (not terminated) by sep_token.
    child_ids = children.get(current_id)
    if child_ids:
        children_text = (
            "<s_children>"
            + sep_token.join(topic_title_dict[child_id] for child_id in child_ids)
            + "</s_children>"
        )
    else:
        children_text = ""

    topic_dict[current_id] = text + context_text + children_text

content_dict = {}
for _, row in tqdm(content_df.iterrows()):
    content_dict[row["id"]] = "".join([
        "<|content|>",
        f"<|lang_{row['language']}|>",
        f"<|kind_{row['kind']}|>",
        "<s_title>",
        row["title"],
        "</s_title>",
        "<s_description>",
        row["description"],
        "</s_description>",
        "<s_text>",
        str(row["text"]),
        "</s_text>",
    ])

In [None]:
topics_df # [topics_df.id == "t_b68dd8c98746"]

In [None]:
# Attach the concatenated model inputs to their rows. Mapping by id (instead
# of assigning dict.values() positionally) stays correct even if the dict
# order and the row order ever diverge.
# The column is added in place on purpose: `topic_df` is an alias of this same
# frame and a later cell reads `topic_df.topic_text` for *all* topics.
topics_df["topic_text"] = topics_df["id"].map(topic_dict)
contents_df["content_text"] = contents_df["id"].map(content_dict)

# Keep only the topics that are scored in this run.
if TEST_MODE:
    topics_df = topics_df[topics_df.id.isin(subs_df.topic_id)]
else: # VAL_MODE
    topics_df = topics_df[topics_df.id.isin(val_topic_ids)]


In [None]:
# Count the selected topics whose text contains the "<s_parent>" tag.
# NOTE(review): the text builder above adds "<s_parent>" unconditionally, so
# this presumably always equals len(topics_df) — confirm whether the intent
# was to count topics with a non-empty parent context.
len([item for item in topics_df.topic_text.values if "<s_parent>" in item])

In [None]:
from dataset import init_tokenizer

class InferenceDataset(Dataset):
    """Map-style dataset that tokenizes one raw text per item for inference.

    Args:
        texts: indexable collection of strings.
        tokenizer_name: name passed to ``init_tokenizer``.
        max_len: every item is truncated / padded to exactly this many tokens.
    """

    def __init__(self, texts, tokenizer_name='xlm-roberta-base', max_len=512):
        self.max_len = max_len
        self.tokenizer = init_tokenizer(tokenizer_name)
        self.texts = texts

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize ``texts[idx]`` and return the encoding with long tensors."""
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors=None,
        )
        # Convert in place so the tokenizer's own container type is returned.
        for field in list(encoded.keys()):
            encoded[field] = torch.tensor(encoded[field], dtype=torch.long)

        return encoded
    
def collate_fn(inputs):
    """Collate tokenized items and trim the batch to its longest sequence.

    Items are stacked with ``default_collate``; every field is then cut along
    dimension 1 to the largest attention-mask sum in the batch, which drops
    padding columns shared by all rows (this presumes right-side padding).
    """
    batch = default_collate(inputs)
    longest = int(batch["attention_mask"].sum(axis=1).max())
    for field in list(batch.keys()):
        batch[field] = batch[field][:, :longest]

    return batch

In [None]:
# Dataset / loader over the selected (validation or test) topics only.
# shuffle=False keeps batch order equal to topics_df row order, which the
# later id <-> embedding pairing depends on.
topic_dataset = InferenceDataset(texts=list(topics_df.topic_text.values), tokenizer_name='sentence-transformers/all-MiniLM-L6-v2', max_len=128)
topic_dataloader = DataLoader(topic_dataset, num_workers=16, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [None]:

model = Model(tokenizer_name="sentence-transformers/all-MiniLM-L6-v2", model_name="sentence-transformers/all-MiniLM-L6-v2", objective="siamese", is_sentence_transformers=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
# Inference only: switch off dropout / batch-norm updates so the embeddings
# are deterministic.
model.eval()


weights_path = "./data/siamese_model_fold0_0.82832.pth"

# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
state_dict = torch.load(weights_path, map_location=device)
model.load_state_dict(state_dict)

In [None]:
import torch.nn.functional as F

# Embed the selected topics; one numpy vector per topic, in loader order.
topic_embs = []

# no_grad: no autograd graph is built during inference, which saves memory.
with torch.no_grad():
    for inputs in tqdm(topic_dataloader):
        for k in list(inputs.keys()):
            inputs[k] = inputs[k].to(device)
        out = model.feature(inputs)
        topic_embs.extend(out.cpu().detach().numpy())

In [None]:
# Dataset / loader over every content item, in contents_df row order.
content_dataset = InferenceDataset(texts=list(contents_df.content_text.values), tokenizer_name='sentence-transformers/all-MiniLM-L6-v2', max_len=128)
content_dataloader = DataLoader(content_dataset, num_workers=16, batch_size=64, shuffle=False, collate_fn=collate_fn)

# # 
# del contents_df["text"]
# del contents_df

# import gc
# gc.collect()

# One numpy vector per content item; position i corresponds to contents_df row i.
content_embs = []

# no_grad: no autograd graph is built during inference, which saves memory.
with torch.no_grad():
    for inputs in tqdm(content_dataloader):
        for k in list(inputs.keys()):
            inputs[k] = inputs[k].to(device)
        out = model.feature(inputs)
        content_embs.extend(out.cpu().detach().numpy())

In [None]:
# # load from saved files
# torch.save(topic_embs, "./data/topic_embs.pt")
# torch.save(content_embs, "./data/content_embs.pt")

# # # topic_embs = torch.load("./data/topic_embs.pt")
# # # content_embs = torch.load("./data/content_embs.pt")

In [None]:
# Release memory
import gc

# The model itself is kept: a later cell calls model.feature again to embed
# all topics.
# del model
# NOTE: `del` raises NameError if this cell is re-run without re-running the
# model-loading cell first.
del state_dict
# Return cached, unused GPU blocks to the driver, then collect Python garbage.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# !pip install fuzzywuzzy annoy

### KNN

In [None]:
# topics = topics_df[topics_df.has_content==True][['id', 'title', 'language']].reset_index(drop=True)

# `topics` and `test` are both aliases of the filtered topics_df.
topics = topics_df

test = topics
# Content-side lookup arrays, aligned with contents_df row order (and hence
# with content_embs).
all_content_ids = contents_df.id.to_numpy()
all_content_titles = contents_df.title.to_numpy()
all_content_language = contents_df.language.to_numpy()
# Topic-side lookup lists, aligned with topics_df row order (and hence with
# topic_embs).
all_test_ids = list(topics.id)
all_test_title = list(topics.title)
all_test_language = list(test.language)

In [None]:
# !pip install cupy-cuda11x
# !pip install cuml-cu11 --extra-index-url=https://pypi.ngc.nvidia.com


In [None]:
import cupy as cp
from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors

In [None]:
# Transfer predictions to gpu
topic_embs_gpu = cp.array(topic_embs)
content_embs_gpu = cp.array(content_embs)

# Release memory
torch.cuda.empty_cache()
# gc.collect()

# KNN model
print(' ')
print('Training KNN model...')
neighbors_model = NearestNeighbors(n_neighbors = 50, metric = 'cosine')
neighbors_model.fit(content_embs_gpu)

In [None]:
# Query the 50 nearest contents for every selected topic.
distances, indices = neighbors_model.kneighbors(topic_embs_gpu, return_distance = True)

# Copy the neighbour indices to host memory once, instead of one transfer per
# topic row.
indices_host = cp.asnumpy(indices)
# Neighbour indices are positions in content_embs, which follows contents_df
# row order, so resolve them positionally rather than through index labels.
content_id_lookup = contents_df["id"].to_numpy()

# One space-separated string of content ids per topic, nearest first.
predictions = [
    ' '.join(content_id_lookup[neighbour_positions])
    for neighbour_positions in tqdm(indices_host)
]

In [None]:
import numpy as np

# Flatten the (topic x neighbour) KNN result into three aligned lists, one
# entry per (topic, content) candidate pair, in row-major order.
indices_host = cp.asnumpy(indices)      # single device -> host copy
distances_host = cp.asnumpy(distances)
neighbours_per_topic = indices_host.shape[1]

# Neighbour indices are positions in content_embs / contents_df row order.
content_id_lookup = contents_df["id"].to_numpy()

pair_topics = np.repeat(np.asarray(all_test_ids, dtype=object), neighbours_per_topic).tolist()
pair_contents = content_id_lookup[indices_host.ravel()].tolist()
# list() keeps the numpy scalar type of each distance.
pair_distances = list(distances_host.ravel())

In [None]:
# Long-format candidate table: one row per (topic, retrieved content) pair
# with its cosine distance. Note the column is named "content_ids" although
# each row holds a single content id.
pair_preds = pd.DataFrame({
    "topic_id": pair_topics,
    "content_ids": pair_contents,
    "distance": pair_distances
})

In [None]:
import numpy as np
def get_pos_score(y_true, y_pred, top_k):
    """Mean recall of the first ``top_k`` predicted ids, rounded to 5 places.

    Args:
        y_true: pandas Series of space-separated ground-truth ids.
        y_pred: pandas Series of space-separated predicted ids, best first.
        top_k: number of leading predictions considered per row.

    Rows are paired positionally; each row contributes
    ``|truth ∩ top_k predictions| / |truth|``.
    """
    truth_sets = [set(ids.split()) for ids in y_true]
    predicted_sets = [set(ids.split()[:top_k]) for ids in y_pred]
    recalls = np.array([
        len(truth & predicted) / len(truth)
        for truth, predicted in zip(truth_sets, predicted_sets)
    ])
    return round(np.mean(recalls), 5)

In [None]:
# Wide-format KNN result: one row per topic with its space-separated
# neighbour content ids, ordered by topic_id.
knn_preds = pd.DataFrame({
    'topic_id': all_test_ids,
    'content_ids': predictions # [" ".join(p) for p in predictions]
}).sort_values("topic_id")

In [None]:
gt = corrs_df[corrs_df.topic_id.isin(val_topic_ids)].sort_values("topic_id")

# Pair ground truth and predictions by topic_id instead of relying on two
# independently sorted frames lining up row by row. A ground-truth topic with
# no prediction gets an empty prediction string (recall 0).
recall_eval_df = gt.merge(knn_preds, on="topic_id", how="left", suffixes=("_true", "_pred"))
recall_eval_df["content_ids_pred"] = recall_eval_df["content_ids_pred"].fillna("")

# Upper bound on recall when keeping the first k retrieved neighbours.
# Values of k above the number of neighbours returned by the KNN query cannot
# change the score.
for k in [5, 10, 20, 50, 100, 200, 500, 1000, 1500, 2000]:
    print("top_k =", k, "max_score =", get_pos_score(recall_eval_df["content_ids_true"], recall_eval_df["content_ids_pred"], k))


In [None]:
preds = knn_preds

In [None]:
preds

#### get neighbors using annoy

In [None]:
# ### from fuzzywuzzy import fuzz, process

# from annoy import AnnoyIndex


# content_forest = AnnoyIndex(content_embs[0].shape[0], metric='angular')
# for i, item in tqdm(enumerate(content_embs), total=len(content_embs)):
#     content_forest.add_item(i, item)
# content_forest.build(200)

In [None]:
# indexes_dict = {}
# fuzzy_dict = {}
# classification_dict = {}

In [None]:
# # TODO: find by distance instead

# nearest_content_count = 2000
# fuzzy_filter = 80
# THRESHOLD = 0
# # for fuzzy_filter in range(5, 50, 5):
# #     for t in range(1, 10, 2):
# #         THRESHOLD = t / 100
# preds = []
# for i, t_e in tqdm(enumerate(topic_embs), total=len(topic_embs), desc=f'Getting Preds'):
#     if i in indexes_dict:
#         indexes, distances = indexes_dict[i]
#     else:
#         indexes, distances = content_forest.get_nns_by_vector(
#             # F.normalize(torch.from_numpy(t_e), p=2, dim=0),
#             t_e,
#             nearest_content_count,
#             include_distances=True
#         )
#         # indexes = [i for i, d in zip(indexes, distances) if d < 10]
#         indexes_dict[i] = indexes, distances

#     topic_id = all_test_ids[i]
#     topic_text = all_test_title[i]
#     topic_lang = all_test_language[i]

#     # for idx in indexes:
#     #     if topic_lang != all_content_language[idx]:
#     #         indexes.remove(idx)
    
#     # filtered_indexes = []
#     # for idx in indexes:
#     #     if topic_lang != all_content_language[idx]:
#     #         continue
#     #     if (i, idx) in fuzzy_dict:
#     #         fuzzy_value = fuzzy_dict[(i, idx)]
#     #     else:
#     #         fuzzy_value = fuzz.token_set_ratio(all_content_titles[idx], topic_text)
#     #         fuzzy_dict[(i, idx)] = fuzzy_value
        
#     #     if fuzzy_value > fuzzy_filter:
#     #         filtered_indexes.append(idx)
        
#     #     if (i, idx) in classification_dict:
#     #         score = classification_dict[(i, idx)]
#     #     else:
#     #         topic_features = torch.from_numpy(t_e).to(device)
#     #         content_features = torch.from_numpy(content_embs[idx]).to(device)
#     #         score = torch.sigmoid(model.fc(torch.cat([topic_features, content_features, topic_features - content_features], -1))).item()
#     #         classification_dict[(i, idx)] = score
#     #     if score < THRESHOLD and idx in filtered_indexes:
#     #         filtered_indexes.remove(idx)
#     # ind2dis = {ind: d for ind, d in zip(indexes, distances)}
#     # if len(filtered_indexes) == 0:
#     #     indexes = filtered_indexes[:8] # list(set(filtered_indexes + indexes[:8-len(filtered_indexes)]))
#     # else:
#     #     indexes = filtered_indexes[:8]
#     content_ids = all_content_ids[indexes]
#     preds.append({
#         'topic_id': topic_id,
#         'content_ids': ' '.join(content_ids),
#         # 'distances': ' '.join([str(ind2dis[ind]) for ind in indexes]),
#     })
# preds = pd.DataFrame.from_records(preds)

# preds.to_csv('submission.csv', index=False)

# if not TEST_MODE:
#     from engine import f2_score
#     gt = corrs_df[corrs_df.topic_id.isin(val_topic_ids)].sort_values("topic_id")    
#     preds = preds.sort_values("topic_id")    
#     print("fuzzy_filter", fuzzy_filter, "THRESHOLD:", THRESHOLD, "f2_score", f2_score(gt["content_ids"], preds["content_ids"]))

In [None]:
# for k in [10, 20, 50, 100, 200, 500, 1000, 1500, 2000]:
#     print("top_k =", k, "max_score =", get_pos_score(gt["content_ids"], preds["content_ids"], k))

### filter by using cross-encoder

In [None]:
# cross_encoder_model = Model(
#     tokenizer_name="sentence-transformers/all-MiniLM-L6-v2",
#     model_name="sentence-transformers/all-MiniLM-L6-v2",
#     objective="classification",
#     is_sentence_transformers=True
# )
# device = "cuda" if torch.cuda.is_available() else "cpu"
# cross_encoder_model = cross_encoder_model.to(device)

# weights_path = "./data/classification_model.bin"

# state_dict = torch.load(weights_path)
# cross_encoder_model.load_state_dict(state_dict)
# cross_encoder_model.eval()
# print()

In [None]:
# from dataset import init_tokenizer

# class CrossEncoderDataset(Dataset):
#     def __init__(self, df, tokenizer_name='sentence-transformers/all-MiniLM-L6-v2', max_len=128):
#         self.df = df
#         self.topic_texts = []
#         self.content_texts = []
#         for i, row in tqdm(df.iterrows()):
#             if row["content_ids"]:
#                 for content_id in row["content_ids"].split(" "):
#                     self.topic_texts.append(topic_dict[row["topic_id"]])
#                     self.content_texts.append(content_dict[content_id])
                    
#         self.tokenizer = init_tokenizer(tokenizer_name)
#         self.max_len = max_len
        
#     def __len__(self):
#         return len(self.topic_texts)

#     def __getitem__(self, idx):
#         topic_text = self.topic_texts[idx]
#         content_text = self.content_texts[idx]
        
#         # topic
#         topic_inputs = self.tokenizer.encode_plus(
#             topic_text, 
#             return_tensors = None, 
#             add_special_tokens = True, 
#             max_length = self.max_len,
#             padding='max_length',
#             truncation = True
#         )
#         for k, v in topic_inputs.items():
#             topic_inputs[k] = torch.tensor(v, dtype = torch.long)
            
#         # content
#         content_inputs = self.tokenizer.encode_plus(
#             content_text, 
#             return_tensors = None, 
#             add_special_tokens = True, 
#             max_length = self.max_len,
#             padding='max_length',
#             truncation = True
#         )
#         for k, v in content_inputs.items():
#             content_inputs[k] = torch.tensor(v, dtype = torch.long)

#         combined_inputs = self.tokenizer.encode_plus(
#             topic_text,
#             content_text,
#             return_tensors = None, 
#             add_special_tokens = True, 
#             max_length = self.max_len,
#             padding='max_length',
#             truncation = True
#         )
#         for k, v in combined_inputs.items():
#             combined_inputs[k] = torch.tensor(v, dtype = torch.long)
            
#         return topic_inputs, content_inputs, combined_inputs, 0


# def cross_encoder_collate_fn(batch):
#     batch = default_collate(batch)

#     topic_inputs, content_inputs, combined_inputs, labels = batch
#     mask_len = int(topic_inputs["attention_mask"].sum(axis=1).max())
#     for k, v in topic_inputs.items():
#         topic_inputs[k] = topic_inputs[k][:,:mask_len]

#     mask_len = int(content_inputs["attention_mask"].sum(axis=1).max())
#     for k, v in content_inputs.items():
#         content_inputs[k] = content_inputs[k][:,:mask_len]

#     mask_len = int(combined_inputs["attention_mask"].sum(axis=1).max())
#     for k, v in combined_inputs.items():
#         combined_inputs[k] = combined_inputs[k][:,:mask_len]


#     return {
#         "topic_inputs": topic_inputs,
#         "content_inputs": content_inputs,
#         "combined_inputs": combined_inputs,
#         "labels": labels
#     }

In [None]:
# test with validation dataset

In [None]:
# val_df = data_df[data_df["fold"] == fold]
# val_df["topic_id"] = val_df["topics_ids"]
# val_df

##### re-calculate on validation set

In [None]:
# from sklearn.metrics import f1_score, accuracy_score, recall_score

# ceds = CrossEncoderDataset(val_df, tokenizer_name="sentence-transformers/all-MiniLM-L6-v2", max_len=256)

# ce_dataloader = DataLoader(ceds, batch_size=64, num_workers=16, shuffle=False, collate_fn=cross_encoder_collate_fn)

# res = []

# for inputs in tqdm(ce_dataloader):
#     for k, v in inputs.items():
#         inputs[k] = inputs[k].to(device)
#     out = cross_encoder_model(**inputs)
#     out = torch.sigmoid(out)
#     res.extend(out.cpu().detach().numpy())
#     # break

# f1_score(list(val_df.target.values), [int(r[0] > 0.5) for r in res])

In [None]:
# new_df = val_df.groupby("topic_id", group_keys=False).agg({"content_ids": " ".join})
# new_df["topic_id"] = new_df.index
# new_df

##### calculate on predictions

In [None]:
# ceds = CrossEncoderDataset(pair_preds, tokenizer_name="sentence-transformers/all-MiniLM-L6-v2", max_len=256)

# ce_dataloader = DataLoader(ceds, batch_size=32, num_workers=64, shuffle=False, collate_fn=cross_encoder_collate_fn)

# res = []

# for inputs in tqdm(ce_dataloader):
#     for k, v in inputs.items():
#         inputs[k] = inputs[k].to(device)
#     out = cross_encoder_model(**inputs)
#     out = torch.sigmoid(out)
#     res.extend(out.cpu().detach().numpy())
#     # break

In [None]:
# pred_topics = []
# pred_contents = []

# for i, row in tqdm(preds.iterrows()):
#     if row["content_ids"]:
#         for content_id in row["content_ids"].split(" "):
#             pred_topics.append(row["topic_id"])
#             pred_contents.append(content_id)

In [None]:
# # for topic_id, content_id, score in zip(pred_topics, pred_contents, res):
# new_pred_df = pd.DataFrame({
#     "topic_id": pred_topics,
#     "content_id": pred_contents,
#     "score": [r[0] for r in res]
# })

# # new_pred_df


In [None]:
# pair_preds["id"] = (pair_preds["topic_id"] + pair_preds["content_ids"]).astype(str) 
# new_pred_df["id"] = (new_pred_df["topic_id"] + new_pred_df["content_id"]).astype(str) 

In [None]:
# new_pred_df

In [None]:
# pair_preds

In [None]:
# new_pred_df = new_pred_df.sort_values("id")

In [None]:
# pair_preds = pair_preds.sort_values("id")

In [None]:
# merge_preds = pair_preds.merge(new_pred_df, on="id")
# merge_preds

In [None]:

# # original 0.005, 3
# merge_preds["ensemble_score"] = merge_preds["score"]**0.005 * (1 - merge_preds["distance"])**3
# merge_preds

In [None]:
# merge_preds["topic_id"] = merge_preds["topic_id_x"]

In [None]:
# topic_id_to_language = {}
# for i, row in tqdm(topic_df.iterrows()):
#     topic_id_to_language[row["id"]] = row["language"]

# content_id_to_language = {}
# for i, row in tqdm(content_df.iterrows()):
#     content_id_to_language[row["id"]] = row["language"]

In [None]:
# drop_indices = []

# # for i, row in tqdm(merge_preds.iterrows()):
# #     if topic_id_to_language[row["topic_id"]] != content_id_to_language[row["content_id"]]:
# #         drop_indices.append(i)

In [None]:
# len(drop_indices)

In [None]:
# outputs = merge_preds.drop(index=drop_indices)
# from utils import f2_score

# for i in range(260, 280):
#     threshold = i / 1000
#     thresholded_outputs = outputs[(outputs["ensemble_score"] >= threshold)].groupby('topic_id').agg({'content_id': " ".join})
#     thresholded_outputs["topic_id"] = thresholded_outputs.index
#     # TODO: we need to merge with those topics doesn't have any contents
#     no_content_topics = list(set(new_pred_df.topic_id.values).difference(set(thresholded_outputs.topic_id.values)))

#     no_content_topics_contents = []
#     for topic_id in no_content_topics:
#         top_contents = outputs[outputs.topic_id == topic_id].sort_values("ensemble_score").content_ids.values[-5:]
#         # top_contents = merge_preds[merge_preds.topic_id == topic_id].sort_values("score").content_ids.values[-5:]
#         # top_contents = merge_preds[merge_preds.topic_id == topic_id].sort_values("distance").content_ids.values[:5]
#         no_content_topics_contents.append(" ".join(top_contents))
        
        
#         # no_content_topics_contents.append(" ")

#     no_content_df = pd.DataFrame({
#         "topic_id": no_content_topics,
#         "content_id": no_content_topics_contents
#     })

#     final_predictions = pd.concat([thresholded_outputs, no_content_df])
#     final_predictions.sort_values("topic_id")

#     gt = corrs_df[corrs_df.topic_id.isin(val_topic_ids)].sort_values("topic_id")    
#     thresholded_score = f2_score(gt["content_ids"], final_predictions.sort_values("topic_id")["content_id"])
#     print("threshold =", threshold, "f2_score =", thresholded_score)

In [None]:
# cls_final_predictions = final_predictions

In [None]:
# final_predictions[final_predictions.topic_id.isin(no_content_topics)]

In [None]:
# gt[gt.topic_id.isin(no_content_topics)]

### XGBOOST

In [None]:
# merge_preds

In [None]:
# gt_topic_ids = []
# gt_content_ids = []
# gt_merge_ids = []

# for i, row in gt.iterrows():
#     topic_id = row.topic_id
#     for content_id in row.content_ids.split(" "):
#         gt_topic_ids.append(topic_id)
#         gt_content_ids.append(content_id)
#         gt_merge_ids.append(topic_id + content_id)

In [None]:
# gt_pair_df = pd.DataFrame({
#     "topic_id": gt_topic_ids,
#     "content_id": gt_content_ids,
#     "id": gt_merge_ids
# })

In [None]:
# merge_preds["target"] = 0

In [None]:
# # merge_preds[]["target"] = 1
# merge_preds.loc[merge_preds.id.isin(gt_pair_df.id.values), "target"] = 1

In [None]:
# np.unique(merge_preds.target.values, return_counts=True)

In [None]:
# merge_preds

In [None]:
# recreate training and validation set for xgboost

In [None]:
# train_df = pd.read_csv("./data/new_train_supervised_df.csv")
# val_df = pd.read_csv("./data/new_val_supervised_df.csv")


In [None]:
# train_df

In [None]:
# Rebuild the topic dataset / loader over *all* topics: `topic_df` is the
# unfiltered frame, whereas `topics_df` was narrowed to the scored topics.
# This reuses (overwrites) the topic_dataset / topic_dataloader names.
topic_dataset = InferenceDataset(texts=list(topic_df.topic_text.values), tokenizer_name='sentence-transformers/all-MiniLM-L6-v2', max_len=128)
topic_dataloader = DataLoader(topic_dataset, num_workers=16, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [None]:
import torch.nn.functional as F

# Embed every topic; one numpy vector per topic, in topic_df row order.
all_topic_embs = []

# no_grad: no autograd graph is built during inference, which saves memory.
with torch.no_grad():
    for inputs in tqdm(topic_dataloader):
        for k in list(inputs.keys()):
            inputs[k] = inputs[k].to(device)
        out = model.feature(inputs)
        all_topic_embs.extend(out.cpu().detach().numpy())

In [None]:
# id -> embedding lookups. Built with dict(zip(...)) so that no loop variable
# named `id` is left behind to shadow the builtin in the kernel namespace.
# Both zips rely on the embedding lists following the frames' row order.
topic_id_to_emb = dict(zip(topic_df.id.values, all_topic_embs))

content_id_to_emb = dict(zip(contents_df.id.values, content_embs))

In [None]:
from numpy import dot
from numpy.linalg import norm

# train_distances = []
# for i, row in tqdm(train_df.iterrows()):
#     a = topic_id_to_emb[row.topics_ids]
#     b = content_id_to_emb[row.content_ids]
#     train_distances.append(
#         1 - dot(a, b)/(norm(a)*norm(b))
#     )
# train_df["distance"] = train_distances
# train_df

In [None]:
# Labelled candidate pairs as (topic_id, content_id, target) tuples.
# Only pairs retrieved by the KNN stage are kept: every retrieved pair starts
# as a negative and is flipped to positive when it appears in the ground truth.
# Ground-truth pairs that were not retrieved are not added.
pairs = set()

for _, pred_row in preds.iterrows():
    pairs.update(
        (pred_row.topic_id, retrieved_id, 0)
        for retrieved_id in pred_row.content_ids.split(" ")
    )

ground_truth_rows = corrs_df[corrs_df.topic_id.isin(preds.topic_id.values)]
for _, truth_row in ground_truth_rows.iterrows():
    for true_id in truth_row.content_ids.split(" "):
        as_negative = (truth_row.topic_id, true_id, 0)
        if as_negative in pairs:
            pairs.discard(as_negative)
            pairs.add((truth_row.topic_id, true_id, 1))
        


In [None]:
# Iterating a set of string tuples yields a hash-seed dependent order, which
# would make the row order (and every later split) differ between kernel
# restarts. Sorting fixes the order.
ordered_pairs = sorted(pairs)

topic_ids = [pair[0] for pair in ordered_pairs]
content_ids = [pair[1] for pair in ordered_pairs]
targets = [pair[2] for pair in ordered_pairs]

# One row per candidate (topic, content) pair with its binary label.
val_df = pd.DataFrame({
    "topics_ids": topic_ids,
    "content_ids": content_ids,
    "target": targets
})

In [None]:
def _cosine_distance(a, b):
    """1 - cosine similarity of two 1-D vectors."""
    return 1 - dot(a, b)/(norm(a)*norm(b))


# Cosine distance between the topic and content embedding of every pair.
val_distances = [
    _cosine_distance(topic_id_to_emb[row.topics_ids], content_id_to_emb[row.content_ids])
    for _, row in tqdm(val_df.iterrows())
]

val_df["distance"] = val_distances
val_df

In [None]:
# x_train = []
# for i, row in tqdm(train_df.iterrows()):
#     feature = np.concatenate([
#         topic_id_to_emb[row.topics_ids],
#         content_id_to_emb[row.content_ids],
#         # [row.score],
#         [row.distance]
#     ])
    
#     x_train.append(feature)
    
# y_train = train_df.target.values.astype(np.uint8)

In [None]:
# x_valid = []
# for i, row in tqdm(val_df.iterrows()):
#     feature = np.concatenate([
#         topic_id_to_emb[row.topics_ids],
#         content_id_to_emb[row.content_ids],
#         # [row.score],
#         [row.distance]
#     ])
    
#     x_valid.append(feature)
    
# y_valid = val_df.target.values.astype(np.uint8)

In [None]:
# import numpy as np

# def unit_vector(vector):
#     """ Returns the unit vector of the vector.  """
#     return vector / np.linalg.norm(vector)

# def angle_between(v1, v2):
#     """ Returns the angle in radians between vectors 'v1' and 'v2'::

#             >>> angle_between((1, 0, 0), (0, 1, 0))
#             1.5707963267948966
#             >>> angle_between((1, 0, 0), (1, 0, 0))
#             0.0
#             >>> angle_between((1, 0, 0), (-1, 0, 0))
#             3.141592653589793
#     """
#     v1_u = unit_vector(v1)
#     v2_u = unit_vector(v2)
#     return np.arccos(np.clip(np.dot(v1_u, v2_u), -1.0, 1.0))

# def euclidian_distance(a, b):
#     return np.linalg.norm(a-b)


In [None]:
def _pair_features(row):
    """Feature vector for one pair: [topic embedding | content embedding | distance]."""
    return np.concatenate([
        topic_id_to_emb[row.topics_ids],
        content_id_to_emb[row.content_ids],
        # [angle_between(topic_id_to_emb[row.topics_ids], content_id_to_emb[row.content_ids])],
        [row.distance]
    ])


# X stays a Python list of 1-D arrays (one per val_df row); y holds the labels.
X = [_pair_features(row) for _, row in tqdm(val_df.iterrows())]
y = val_df.target.values.astype(np.uint8)

In [None]:
from sklearn.model_selection import train_test_split

# X = []
# for i, row in tqdm(merge_preds.iterrows()):
#     feature = np.concatenate([
#         topic_id_to_emb[row.topic_id],
#         content_id_to_emb[row.content_id],
#         # [row.score],
#         [angle_between(topic_id_to_emb[row.topic_id], content_id_to_emb[row.content_id])],
#         # [euclidian_distance(topic_id_to_emb[row.topic_id], content_id_to_emb[row.content_id])],
#         # [manhattan(topic_id_to_emb[row.topic_id], content_id_to_emb[row.content_id])],
#         [row.distance]
#     ])
    
#     X.append(feature)

# # X = np.stack([merge_preds.distance.values, merge_preds.score.values]).transpose()
# y = merge_preds.target.values.astype(np.uint8)

# Fixed random_state so the split (and every score derived from it) is
# reproducible across runs.
# NOTE(review): the split is over individual pairs, so pairs of the same topic
# can land on both sides — consider a group-wise split by topic.
x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=.2, random_state=42)

In [None]:
x_train = np.array(x_train)

# Each feature row is [topic embedding | content embedding | distance], so the
# embedding width is derived from the row length instead of hard-coding 384.
emb_dim = (x_train.shape[1] - 1) // 2

# Augmentation: add a copy of every row with the two embeddings swapped.
# Concatenating the slices in their original order only reproduced x_train,
# i.e. it duplicated the rows without adding any new information.
# NOTE(review): swapping is the presumed intent — confirm.
x_train_swapped = np.concatenate([
    x_train[:, emb_dim:2 * emb_dim],
    x_train[:, :emb_dim],
    x_train[:, 2 * emb_dim:],
], 1)
x_train = np.concatenate([x_train, x_train_swapped], 0)
# Labels are unchanged by the swap, so they are simply repeated.
y_train = np.concatenate([y_train, y_train], 0)

# x_valid = np.array(x_valid)
# x_valid = np.concatenate([
#     x_valid,
#     np.concatenate([x_valid[:, :384], x_valid[:, 384:768], x_valid[:, 768:]], 1)
# ], 0)
# y_valid = np.concatenate([y_valid, y_valid], 0)


In [None]:
from xgboost import XGBClassifier

from sklearn.metrics import f1_score
def xgb_metric(preds, dmatrix):
    """Custom XGBoost eval metric: F1 of the predictions thresholded at 0.5.

    Returns a ``(name, value)`` tuple with the true labels read from
    ``dmatrix``.
    """
    true_labels = dmatrix.get_label()
    hard_predictions = preds >= 0.5
    return "f1", f1_score(true_labels, hard_predictions)


# Booster hyper-parameters: 2000 trees with a small learning rate, binary
# logistic objective, one thread per CPU core.
xgb_cfg = {
        "n_estimators": 2000,
        "learning_rate": 1e-2,
        # "subsample": 0.6,
        # "colsample_bytree": 0.8,
        "objective": "binary:logistic",
        "nthread": os.cpu_count(),
        # "scale_pos_weight": (feats_df["cancer"] == 0).sum() / (feats_df["cancer"] == 1).sum(),
    }
# Arguments forwarded to fit(): per-round logging and the custom F1 metric.
# NOTE(review): passing eval_metric to fit() and the (preds, dmatrix) metric
# signature look tied to an older xgboost release — confirm the pinned version.
fit_params = {
        "verbose": True,
        "eval_metric": xgb_metric,
    }

xgb = XGBClassifier(**xgb_cfg)

# x_valid / y_valid come from train_test_split on the same pair table.
xgb.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], **fit_params)

# The file name (including the "xbg" spelling) is reused by the load cell below.
ckpt_path = "xbg_augment_2000.pth"
xgb.save_model(ckpt_path)
# Positive-class probability for the held-out pairs.
# NOTE(review): ntree_limit / best_ntree_limit are presumably unavailable in
# recent xgboost releases — verify against the installed version.
fold_preds = xgb.predict_proba(x_valid, ntree_limit=xgb.best_ntree_limit)[:, 1]

In [None]:
xgb.load_model(ckpt_path)
# xgb.predict_proba(X, ntree_limit=xgb.best_ntree_limit)[:, 1]

In [None]:
# Duplicate the id column under the name used by the grouping code below.
val_df["topic_id"] =val_df["topics_ids"]  
# Positive-class probability for every candidate pair.
# NOTE(review): X contains the rows the classifier was trained on (x_train is
# a split of X), so scores computed from this column are optimistic.
val_df["ensemble_score"] = xgb.predict_proba(X, ntree_limit=xgb.best_ntree_limit)[:, 1]


In [None]:
xgb.predict_proba(X, ntree_limit=xgb.best_ntree_limit)

In [None]:

# outputs = merge_preds.drop(index=drop_indices)
from utils import f2_score

# Inputs that do not depend on the threshold are computed once, outside the loop.
gt = corrs_df[corrs_df.topic_id.isin(val_topic_ids)].sort_values("topic_id")
all_scored_topics = set(val_df.topic_id.values)

# Sweep thresholds 0.110 .. 0.129 in steps of 0.001 and print the F2 score of each.
# After the loop, final_predictions holds the frame built for the LAST threshold
# tried (not the best one); later cells read it.
for i in range(110, 130):
    threshold = i / 1000

    # One row per topic that keeps at least one candidate at this threshold;
    # the kept content ids are joined with single spaces.
    thresholded_outputs = (
        val_df[val_df["ensemble_score"] >= threshold]
        .groupby("topic_id")
        .agg({"content_ids": " ".join})
    )
    thresholded_outputs["topic_id"] = thresholded_outputs.index

    # Topics that have no content above the threshold still need a prediction:
    # fall back to their 5 highest-scoring candidates (ascending score order).
    no_content_topics = list(
        all_scored_topics.difference(set(thresholded_outputs.topic_id.values))
    )
    # Group the relevant rows once instead of re-filtering val_df per topic.
    fallback_rows = val_df[val_df.topic_id.isin(no_content_topics)]
    fallback_by_topic = {
        tid: " ".join(rows.sort_values("ensemble_score").content_ids.values[-5:])
        for tid, rows in fallback_rows.groupby("topic_id")
    }
    # A topic with no groupable rows gets an empty prediction string, which is
    # what joining an empty selection produced before.
    no_content_topics_contents = [
        fallback_by_topic.get(tid, "") for tid in no_content_topics
    ]
    # top_contents = merge_preds[merge_preds.topic_id == topic_id].sort_values("score").content_ids.values[-5:]
    # top_contents = merge_preds[merge_preds.topic_id == topic_id].sort_values("distance").content_ids.values[:5]
    # no_content_topics_contents.append(" ")

    no_content_df = pd.DataFrame({
        "topic_id": no_content_topics,
        "content_ids": no_content_topics_contents
    })

    # sort_values returns a new frame, so the result has to be assigned
    # (the earlier bare call was a no-op).
    final_predictions = pd.concat([thresholded_outputs, no_content_df]).sort_values("topic_id")

    thresholded_score = f2_score(gt["content_ids"], final_predictions["content_ids"])
    print("threshold =", threshold, "f2_score =", thresholded_score)

#### Try ensemble classifier

In [None]:
# topic_id -> predicted content-id string, one lookup table per model.
# Later rows overwrite earlier ones when a topic_id repeats.
cls_dict = dict(
    zip(cls_final_predictions["topic_id"], cls_final_predictions["content_id"])
)

xgb_dict = dict(
    zip(final_predictions["topic_id"], final_predictions["content_ids"])
)


In [None]:
# Parallel lists in cls_dict iteration order: the topic, the classifier's
# prediction string and the XGBoost prediction string for the same topic.
topic_ids = list(cls_dict.keys())
content_ids_1 = list(cls_dict.values())
# A topic missing from xgb_dict raises KeyError here.
content_ids_2 = [xgb_dict[topic] for topic in topic_ids]

In [None]:
# One row per topic with both models' prediction strings side by side.
merged = pd.DataFrame(
    dict(
        topic_id=topic_ids,
        content_ids_1=content_ids_1,
        content_ids_2=content_ids_2,
    )
)

In [None]:
merged

In [None]:
# Per topic, take the union of the ids predicted by both models and join them
# back into one space-separated string.
#
# - ``str.split()`` without an argument drops the empty token that
#   ``"".split(" ")`` would yield, so an empty prediction string no longer
#   injects a bogus "" id into the union.
# - ``dict.fromkeys`` de-duplicates while keeping first-seen order, making the
#   output deterministic across runs (set iteration order is not).
# - The former ``len(new_ids) == 0`` fallback is dropped: the union is empty
#   only when both inputs are empty, in which case "" is the right answer.
new_content_ids = [
    " ".join(dict.fromkeys(ids_1.split() + ids_2.split()))
    for ids_1, ids_2 in zip(merged["content_ids_1"], merged["content_ids_2"])
]

In [None]:
# Attach the unioned id strings as a new column (mutates merged in place),
# then display the frame.
merged["new_content_ids"] = new_content_ids
merged

In [None]:
f2_score(gt["content_ids"], merged.sort_values("topic_id")["new_content_ids"])

In [None]:
gt

In [None]:
merged.sort_values("topic_id")["new_content_ids"]

In [None]:
f2_score(gt["content_ids"], final_predictions.sort_values("topic_id")["content_ids"])

In [None]:
f2_score(gt["content_ids"], cls_final_predictions.sort_values("topic_id")["content_id"])

In [None]:
final_predictions = final_predictions.sort_values("topic_id")

# Error analysis for a single topic. The loop starts at position 11 and stops
# after its first iteration (trailing ``break``); remove the break to print
# every topic from that position on. topic_id, row_predictions, tp, fp and fn
# stay defined afterwards and are read by the following cells.
#
# NOTE(review): rows of gt and final_predictions are paired by position after
# both were sorted on topic_id — this assumes both frames cover the same topics.
for index in range(11, len(gt)):
    topic_id = topic_df[topic_df.id == gt.topic_id.values[index]].id.values[0]
    # final_predictions keeps its ids in the "content_ids" column; the previous
    # "content_id" lookup raised KeyError.
    row_predictions = final_predictions["content_ids"].iloc[index].split(" ")
    row_gt = gt["content_ids"].iloc[index].split(" ")

    print("topic_id", topic_id)
    print("predictions:", row_predictions)
    print("gt:", row_gt)

    predicted_set = set(row_predictions)
    gt_set = set(row_gt)
    tp = predicted_set.intersection(gt_set)
    fp = predicted_set.difference(gt_set)
    fn = gt_set.difference(predicted_set)

    print("true positive:", tp)
    print("false positive:", fp)
    print("false negative:", fn)

    # F2 restricted to this one row (one-element positional slices).
    print(f2_score(
        gt["content_ids"].iloc[index:index + 1],
        final_predictions["content_ids"].iloc[index:index + 1],
    ))
    break

In [None]:
# Rows of merge_preds for the inspected topic whose content_id is among the
# topic's predicted ids.
print("ALL")
merge_preds[(merge_preds.topic_id == topic_id) & (merge_preds.content_id.isin(row_predictions))]

In [None]:
# Rows of merge_preds for the inspected topic whose content_id is in tp
# (predicted and present in the ground truth).
print("True Positive")
merge_preds[(merge_preds.topic_id == topic_id) & (merge_preds.content_id.isin(tp))]

In [None]:
# Rows of merge_preds for the inspected topic whose content_id is in fn
# (in the ground truth but not predicted).
print("False Negative")
merge_preds[(merge_preds.topic_id == topic_id) & (merge_preds.content_id.isin(fn))]

In [None]:
# Rows of merge_preds for the inspected topic whose content_id is in fp
# (predicted but absent from the ground truth).
print("False Positive")
merge_preds[(merge_preds.topic_id == topic_id) & (merge_preds.content_id.isin(fp))]

In [None]:
topic_df[topic_df.id == topic_id]

In [None]:
content_df[content_df.id.isin(fp)]

In [None]:
# from torch.nn.functional import cosine_similarity

In [None]:
# # gt sims
# list_all_content_ids = list(all_content_ids)

# all_sims = []
# for i, row in tqdm(gt.iterrows()):
#     sims = []

#     t_e = topic_embs[all_test_ids.index(row["topic_id"])]
#     for content_id in row["content_ids"].split(" "):
#         c_e = content_embs[list_all_content_ids.index(content_id)]
#         sims.append(cosine_similarity(torch.from_numpy(t_e), torch.from_numpy(c_e), 0))
    
#     all_sims.append(" ".join([str(s.item())[:5] for s in sims]))

In [None]:
# all_sims

In [None]:
# # prediction sims

# list_all_content_ids = list(all_content_ids)

# all_preds_sims = []
# for i, row in tqdm(preds.iterrows()):
#     sims = []

#     t_e = topic_embs[all_test_ids.index(row["topic_id"])]
#     if not row["content_ids"]:
#         continue
#     for content_id in row["content_ids"].split(" "):
#         c_e = content_embs[list_all_content_ids.index(content_id)]
#         sims.append(cosine_similarity(torch.from_numpy(t_e), torch.from_numpy(c_e), 0))
    
#     all_preds_sims.append(" ".join([str(s.item())[:5] for s in sims]))

In [None]:
# all_preds_sims

In [None]:
# import numpy as np

# sample_preds = pd.DataFrame({
#     "content_ids": [
#         "a b c"
#     ]
# })

# sample_gts = pd.DataFrame({
#     "content_ids": [
#         "a d e f g h"
#     ]
# })
# f2_score(
#     sample_gts["content_ids"],
#     sample_preds["content_ids"],
# )