In [14]:
"""
model: 模型官方名字
model_path: kaggle上的模型数据集
"""
stage1_ensemble ={
    'ex109':
        {
            'model' : 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',  # 
            'model_path': 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2',  
            'weight': '../outputs/ex109/',       # ###### 这里使用
            'max_len' : 128,
            'explain': ''
        },
    }



In [33]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
# import cupy as cp
# from cuml.metrics import pairwise_distances
# from cuml.neighbors import NearestNeighbors
%env TOKENIZERS_PARALLELISM=false
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu');

env: TOKENIZERS_PARALLELISM=false


In [39]:
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Notebook-wide settings for stage-1 candidate retrieval."""

    # ---- logging / data loading ----
    print_freq = 3000
    num_workers = 4
    s1_batch_size = 128
    s2_batch_size = 64

    # ---- reproducibility ----
    seed = 42

    # ---- retrieval and cross-validation ----
    top_k = 100          # candidates retrieved per topic by each fold model
    final_top_k = 50
    threshold = 0.01
    n_fold = 4
    trn_fold = [*range(n_fold)]   # folds whose checkpoints are loaded

    # ---- model / paths ----
    dropout = 0.1
    data_dir = 'data/raw/'

    # ---- runtime switches ----
    apex = True                   # enables autocast during embedding extraction
    gradient_checkpoint = False


In [5]:
def get_uns_data(data_dir, tokenizer=None):
    """Load the topic, content and correlation tables used for stage-1 retrieval.

    Args:
        data_dir: Directory containing ``content.csv``, ``topics.csv`` and
            ``correlations.csv``. A trailing slash is optional.
        tokenizer: Unused; kept so existing callers keep working.

    Returns:
        Tuple ``(topic_df, content_df, exploded)``:
        * ``topic_df``   - columns ``id, title, description, language`` (those present),
          with missing titles/descriptions replaced by placeholder strings.
        * ``content_df`` - columns ``id, title, description, text, language`` (those present),
          placeholders filled in and ``text`` cut to its first 300 characters.
        * ``exploded``   - ``correlations.csv`` with an extra ``content_id`` column holding
          one content id per row (the space-separated ``content_ids`` split and exploded).
    """
    content_df = pd.read_csv(os.path.join(data_dir, "content.csv"))
    topic_df = pd.read_csv(os.path.join(data_dir, "topics.csv"))
    corr_df = pd.read_csv(os.path.join(data_dir, "correlations.csv"))

    # Fill in blanks. Assign the result back instead of calling fillna(inplace=True) on a
    # column selection: that form is chained assignment, which warns on recent pandas and
    # leaves the frame untouched under Copy-on-Write.
    topic_df["title"] = topic_df["title"].fillna("No topic title")
    topic_df["description"] = topic_df["description"].fillna("No topic description")
    content_df["title"] = content_df["title"].fillna("No content title")
    content_df["description"] = content_df["description"].fillna("No content description")
    # Limit the amount of content text kept per row.
    content_df["text"] = content_df["text"].fillna("No content text").str[:300]

    # Keep only the columns the retrieval stage reads.
    topic_keep = ("id", "title", "description", "language")
    content_keep = ("id", "title", "description", "text", "language")
    topic_df = topic_df.drop(columns=[col for col in topic_df.columns if col not in topic_keep])
    content_df = content_df.drop(columns=[col for col in content_df.columns if col not in content_keep])

    # One row per (topic, ground-truth content id) pair.
    corr_df["content_id"] = corr_df["content_ids"].str.split()
    exploded = corr_df.explode("content_id")
    return topic_df, content_df, exploded

In [6]:
# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_uns_input( cfg, max_len, title, description, text=None):
    """Tokenize one topic/content row for the retrieval encoder.

    Only ``title`` is encoded. ``description`` and ``text`` are accepted so the call
    signature stays stable, but they are not part of the model input (the variants that
    concatenated them were disabled in the original code).

    Args:
        cfg: Config object exposing the tokenizer as ``cfg.uns_tokenizer``.
        max_len: Maximum number of tokens; longer inputs are truncated.
        title: Title string to encode.
        description: Ignored.
        text: Ignored.

    Returns:
        The tokenizer output without padding and with ``token_type_ids`` removed if the
        tokenizer produced them.
    """
    inputs = cfg.uns_tokenizer(
        title,
        truncation=True,
        max_length=max_len,
        padding=False,
        add_special_tokens = True,
    )

    # Remove token_type_ids. They will just cause errors.
    if "token_type_ids" in inputs:
        del inputs["token_type_ids"]

    return inputs

# =========================================================================================
# Unsupervised dataset
# =========================================================================================
class uns_dataset(Dataset):
    """Dataset that tokenizes one topic or content row per item.

    Args:
        df: Frame with ``title`` and ``description`` columns (plus ``text`` when
            ``isContent`` is true).
        cfg: Config object forwarded to ``prepare_uns_input``.
        max_len: Maximum token length forwarded to ``prepare_uns_input``.
        isContent: Whether ``df`` holds content rows, which also carry ``text``.
    """

    def __init__(self, df, cfg, max_len, isContent=False):
        self.cfg, self.max_len = cfg, max_len
        self.title = df['title'].values
        self.description = df['description'].values
        self.isContent = isContent
        # The text column only exists for content rows.
        if isContent:
            self.text = df['text'].values

    def __len__(self):
        # One item per row.
        return len(self.title)

    def __getitem__(self, index):
        body = self.text[index] if self.isContent else None
        return prepare_uns_input(
            self.cfg,
            self.max_len,
            self.title[index],
            self.description[index],
            body,
        )


        
class Collate:
    """Pad a batch of tokenized samples to the longest sample in that batch.

    The padding side and the padding token id are taken from the tokenizer.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        longest = max(len(sample["input_ids"]) for sample in batch)
        pad_on_right = self.tokenizer.padding_side == "right"

        def pad(sequence, fill_value):
            # Extend one sequence to the batch length on the tokenizer's padding side.
            filler = (longest - len(sequence)) * [fill_value]
            return sequence + filler if pad_on_right else filler + sequence

        input_ids = [pad(sample["input_ids"], self.tokenizer.pad_token_id) for sample in batch]
        attention_mask = [pad(sample["attention_mask"], 0) for sample in batch]

        # Convert the padded lists to tensors.
        return {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
        }

In [7]:
# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average the token embeddings of each sequence, ignoring masked-out positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(1)
        # Clamp the count so a fully masked sequence divides by 1e-9 instead of 0.
        token_count = mask.sum(1).clamp(min=1e-9)
        return token_sum / token_count
# =========================================================================================
# Unsupervised model
# =========================================================================================
class uns_model(nn.Module):
    """Transformer backbone followed by mean pooling; ``forward`` returns the embeddings.

    Args:
        model: Model name or path given to ``AutoConfig`` / ``AutoModel``.
        cfg: Config object providing ``uns_tokenizer`` and ``dropout``.
    """

    def __init__(self, model, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(model)
        self.backbone = AutoModel.from_pretrained(model, config=self.config)
        # Match the embedding matrix to the tokenizer's vocabulary size.
        self.backbone.resize_token_embeddings(len(cfg.uns_tokenizer))

        # The dropout layers and the linear head are not used by feature()/forward().
        # NOTE(review): presumably kept so the module layout matches the saved
        # checkpoints - confirm before removing.
        self.dropout = nn.Dropout(self.cfg.dropout)
        for position, rate in enumerate((0.1, 0.2, 0.3, 0.4, 0.5), start=1):
            setattr(self, f"dropout{position}", nn.Dropout(rate))
        self.pool = MeanPooling()
        self.head = nn.Linear(self.config.hidden_size, 1)

    def feature(self, inputs):
        """Encode ``inputs`` and mean-pool the last hidden state with the attention mask."""
        hidden = self.backbone(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the pooled embedding for each input sequence."""
        return self.feature(inputs)

In [8]:
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model):
    """Run ``model`` over every batch of ``loader`` and return the stacked outputs.

    The model is switched to eval mode, batches are moved to the global ``device``, and the
    forward pass runs without gradients under autocast (enabled by ``CFG.apex``).

    Args:
        loader: Iterable yielding dict batches of tensors.
        model: Module called with the batch dict.

    Returns:
        The per-batch outputs concatenated along dimension 0.
    """
    model.eval()
    chunks = []
    for batch in tqdm(loader):
        for name in batch:
            batch[name] = batch[name].to(device)
        with torch.no_grad(), torch.cuda.amp.autocast(enabled=CFG.apex):
            chunks.append(model(batch))

    return torch.cat(chunks, dim=0)

In [9]:
import datasets
import heapq
from typing import Callable


def cos_sim(a, b):
    # From https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/util.py#L31
    """
    Cosine similarity between every row of ``a`` and every row of ``b``.

    Non-tensor inputs are converted with ``torch.tensor`` and 1-D inputs are treated as a
    single row.
    :return: Matrix with res[i][j] = cos_sim(a[i], b[j])
    """
    def as_matrix(value):
        # Coerce to a tensor and promote a single vector to a one-row matrix.
        if not isinstance(value, torch.Tensor):
            value = torch.tensor(value)
        return value.unsqueeze(0) if len(value.shape) == 1 else value

    a, b = as_matrix(a), as_matrix(b)
    unit_a = torch.nn.functional.normalize(a, p=2, dim=1)
    unit_b = torch.nn.functional.normalize(b, p=2, dim=1)
    return torch.mm(unit_a, unit_b.transpose(0, 1))


# From: https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/util.py#L204
def semantic_search(
    query_embeddings: torch.Tensor,
    corpus_embeddings: torch.Tensor,
    query_chunk_size: int = 100,
    corpus_chunk_size: int = 500000,
    top_k: int = 100,
    score_function: Callable[[torch.Tensor, torch.Tensor], torch.Tensor] = cos_sim,
):
    """Find, for every query embedding, the ``top_k`` best-scoring corpus embeddings.

    Scores are computed block by block (``query_chunk_size`` x ``corpus_chunk_size``) and a
    per-query min-heap keeps the best ``top_k`` hits seen so far.

    Args:
        query_embeddings: Tensor, numpy array or list of tensors; a 1-D input is treated as
            a single query.
        corpus_embeddings: Tensor, numpy array or list of tensors.
        query_chunk_size: Number of queries scored at once.
        corpus_chunk_size: Number of corpus entries scored at once.
        top_k: Maximum number of hits kept per query.
        score_function: Callable returning a [queries, corpus] score matrix.

    Returns:
        One list per query, each holding dicts ``{"corpus_id": int, "score": float}`` sorted
        by score in descending order.
    """
    if isinstance(query_embeddings, (np.ndarray, np.generic)):
        query_embeddings = torch.from_numpy(query_embeddings)
    elif isinstance(query_embeddings, list):
        query_embeddings = torch.stack(query_embeddings)

    if len(query_embeddings.shape) == 1:
        query_embeddings = query_embeddings.unsqueeze(0)

    if isinstance(corpus_embeddings, (np.ndarray, np.generic)):
        corpus_embeddings = torch.from_numpy(corpus_embeddings)
    elif isinstance(corpus_embeddings, list):
        corpus_embeddings = torch.stack(corpus_embeddings)

    # Check that corpus and queries are on the same device
    if corpus_embeddings.device != query_embeddings.device:
        query_embeddings = query_embeddings.to(corpus_embeddings.device)

    queries_result_list = [[] for _ in range(len(query_embeddings))]  # one empty list per query (topic); each ends up holding that query's top-k candidates

    # Score query_chunk_size query (topic) embeddings against corpus_chunk_size corpus (content) embeddings at a time
    for query_start_idx in range(0, len(query_embeddings), query_chunk_size):  
        # Iterate over chunks of the corpus
        for corpus_start_idx in range(0, len(corpus_embeddings), corpus_chunk_size):
            # Compute cosine similarities: res[i][j] = cos_sim(a[i], b[j]), one row per query in the chunk
            # and one column per corpus entry in the chunk, i.e. each query scored against every entry
            cos_scores = score_function(
                query_embeddings[query_start_idx : query_start_idx + query_chunk_size],
                corpus_embeddings[
                    corpus_start_idx : corpus_start_idx + corpus_chunk_size
                ],
            )

            # Get top-k scores; values and indices both have one row per query in the chunk
            cos_scores_top_k_values, cos_scores_top_k_idx = torch.topk(
                cos_scores,
                min(top_k, len(cos_scores[0])),
                dim=1, # rank along the corpus dimension, i.e. top-k within each row
                largest=True, # keep the highest scores
                sorted=False, # hits come back unordered; the final ordering is done at the end
            )
            cos_scores_top_k_values = cos_scores_top_k_values.cpu().tolist()
            cos_scores_top_k_idx = cos_scores_top_k_idx.cpu().tolist()

            # The score matrix is computed in chunks, so add the chunk offsets to recover the global indices
            for query_itr in range(len(cos_scores)): # len(cos_scores) = number of queries in this chunk
                # iterate over this query's (index, score) pairs
                for sub_corpus_id, score in zip(cos_scores_top_k_idx[query_itr], cos_scores_top_k_values[query_itr]):
                    corpus_id = corpus_start_idx + sub_corpus_id
                    query_id = query_start_idx + query_itr
                    if len(queries_result_list[query_id]) < top_k:
                        # heapq.heappush(heap, item) adds item while keeping the heap invariant.
                        # Each query's candidate list is treated as a heap of (score, global corpus index) tuples.
                        heapq.heappush(
                            queries_result_list[query_id], (score, corpus_id)
                        )  # heapq orders by the first element of the tuple (the score)
                    else:
                        # Push the item, then pop the smallest one: once top_k hits are held, the lowest-scoring candidate is evicted
                        heapq.heappushpop(
                            queries_result_list[query_id], (score, corpus_id)
                        )

    # change the data format and sort
    for query_id in range(len(queries_result_list)):  # every query (topic)
        for doc_itr in range(len(queries_result_list[query_id])): # the candidates kept for this query
            score, corpus_id = queries_result_list[query_id][doc_itr]
            queries_result_list[query_id][doc_itr] = {
                "corpus_id": corpus_id,
                "score": score,
            }
        queries_result_list[query_id] = sorted(
            queries_result_list[query_id], key=lambda x: x["score"], reverse=True 
        )

    gc.collect()
    
    return queries_result_list

In [27]:
def get_candidates(topic_df, content_df, model, max_len, cfg):
    """Retrieve the ``cfg.top_k`` most similar content ids for every topic.

    Topics and contents are embedded with ``model`` and matched with ``semantic_search``.

    Args:
        topic_df: Topic frame with ``title`` and ``description`` columns. It is modified in
            place: a ``predictions`` column is added or overwritten.
        content_df: Content frame with ``content_id``, ``title``, ``description`` and
            ``text`` columns.
        model: Embedding model passed to ``get_embeddings``.
        max_len: Maximum token length used by the datasets.
        cfg: Config providing ``uns_tokenizer``, ``s1_batch_size``, ``num_workers`` and
            ``top_k``. All settings are read from this argument rather than from the
            global ``CFG``.

    Returns:
        ``topic_df`` with a ``predictions`` column holding, per topic, the list of retrieved
        content ids ordered from most to least similar.
    """
    t_dataset = uns_dataset(topic_df, cfg, max_len)
    c_dataset = uns_dataset(content_df, cfg, max_len, isContent=True)

    loader_params = {
        'batch_size': cfg.s1_batch_size,
        'shuffle': False,  # keep row order so embeddings line up with the frames
        'collate_fn': Collate(cfg.uns_tokenizer),
        'num_workers': cfg.num_workers,
        'pin_memory': True,
        'drop_last': False,
    }
    t_loader = DataLoader(t_dataset, **loader_params)
    c_loader = DataLoader(c_dataset, **loader_params)

    t_embeddings = get_embeddings(t_loader, model)
    c_embeddings = get_embeddings(c_loader, model)
    # get_embeddings already returns tensors; wrapping them in torch.tensor() again would
    # only copy them and emit a warning.
    search_results = semantic_search(t_embeddings, c_embeddings, top_k=cfg.top_k)
    # Drop the embedding matrices before building the Python-side result.
    del t_embeddings, c_embeddings

    # Map corpus row positions back to content ids.
    content_ids = content_df.content_id.values
    topic_df['predictions'] = [
        [content_ids[hit["corpus_id"]] for hit in hits] for hits in search_results
    ]

    del search_results
    gc.collect()

    return topic_df

In [28]:
# ======= data prepare =====
topic_df, content_df, explode = get_uns_data(CFG.data_dir)
content_df = content_df.rename(columns={"id": "content_id"})

display(topic_df.head())
display(content_df.head())

Unnamed: 0,id,title,description,language
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",bg
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,No topic description,en
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,pt
3,t_00069b63a70a,Transcripts,No topic description,en
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,bg


Unnamed: 0,content_id,title,description,text,language
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",No content text,es
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,No content text,it
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,No content text,es
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,\nNado de aproximação\nSaber nadar nas ondas ...,pt
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,Estándares Comunes del Estado de Nueva York\n\...,es


In [13]:
# Inspect the topic table (id, title, description, language).
topic_df

Unnamed: 0,id,title,description,language
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",bg
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,No topic description,en
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,pt
3,t_00069b63a70a,Transcripts,No topic description,en
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,bg
...,...,...,...,...
76967,t_fffb0bf2801d,4.3 Graph of functions,No topic description,en
76968,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,sw
76969,t_fffe14f1be1e,Lección 7,No topic description,es
76970,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,ar


In [12]:
# Inspect the exploded correlations: one row per (topic_id, content_id) pair.
explode

Unnamed: 0,topic_id,content_ids,content_id
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...,c_1108dd0c7a5d
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...,c_376c5a8eb028
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...,c_5bc0e1e2cba0
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...,c_76231f9d0b5e
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...,c_639ea2ef9c95
...,...,...,...
61513,t_fff9e5407d13,c_026db653a269 c_0fb048a6412c c_20de77522603 c...,c_d64037a72376
61514,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5,c_46f852a49c08
61514,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5,c_6659207b25d5
61515,t_fffe14f1be1e,c_cece166bad6a,c_cece166bad6a


In [35]:
# Select the GPU used for inference.
# CUDA_VISIBLE_DEVICES="2" (set in the imports cell) exposes the chosen physical GPU as
# logical index 0 when it takes effect, so a hard-coded index 2 is out of range on a fresh
# kernel. Use index 2 only when at least three devices are visible, otherwise index 0.
if torch.cuda.is_available():
    torch.cuda.set_device(2 if torch.cuda.device_count() > 2 else 0)

In [40]:
from collections import Counter

# Pool stage-1 candidates: every experiment/fold checkpoint contributes its retrieved
# content ids per topic, and the per-fold lists are concatenated in topic_df['content_ids'].
is_first_fold = True  # explicit flag instead of mutating the loop counter
for ex_name, ex in stage1_ensemble.items():
    print(f"{' '*3}======  Current ex is {ex_name}  ======{' '*3}")
    # The datasets, collate function and model all read the tokenizer from CFG.
    CFG.uns_tokenizer = AutoTokenizer.from_pretrained(ex['model_path'])

    for fold in CFG.trn_fold:
        print(f"{' '*5}=== {ex_name} , Fold {fold} ==={' '*5}")
        # Checkpoints are named "<last component of the model name>_fold<k>.pth".
        checkpoint_path = os.path.join(
            ex['weight'], f"{ex['model'].split('/')[-1]}_fold{fold}.pth"
        )
        # map_location=device keeps loading consistent with where the model is placed
        # (and also works on a CPU-only machine).
        state = torch.load(checkpoint_path, map_location=device)
        model = uns_model(ex['model_path'], CFG)
        model.to(device)
        model.load_state_dict(state['model'])
        topic_df = get_candidates(topic_df, content_df, model, ex['max_len'], CFG)

        if is_first_fold:
            # Start a fresh pool so re-running the cell does not accumulate stale candidates.
            topic_df['content_ids'] = topic_df['predictions']
            is_first_fold = False
        else:
            # Element-wise list concatenation: append this fold's candidates per topic.
            topic_df['content_ids'] += topic_df['predictions']

        # Release the model and checkpoint before loading the next fold.
        del model, state
        gc.collect()
        torch.cuda.empty_cache()

    torch.cuda.empty_cache()

    print(f"\n{'='*8} !!!!! {ex_name} is complete !!!!! {'='*8}\n")

# The per-fold column is only a scratch buffer; the pooled candidates live in content_ids.
topic_df.drop(['predictions'], axis = 1, inplace = True)

     === ex109 , Fold 0 ===     


  0%|          | 0/602 [00:00<?, ?it/s]

  0%|          | 0/1204 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fad4651ddc0>
Traceback (most recent call last):
Exception ignored in:   File "/home/user/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__
<function _MultiProcessingDataLoaderIter.__del__ at 0x7fad4651ddc0>    
self._shutdown_workers()Traceback (most recent call last):
  File "/home/user/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1358, in __del__

      File "/home/user/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
self._shutdown_workers()    
  File "/home/user/anaconda3/envs/torch/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1341, in _shutdown_workers
if w.is_alive():    
if w.is_alive():  File "/home/user/anaconda3/envs/torch/lib/python3.9/multiprocessing/process.py", line 160, in is_alive

      File "/home/user/anaconda3/

0
     === ex109 , Fold 1 ===     


  0%|          | 0/602 [00:00<?, ?it/s]

  0%|          | 0/1204 [00:00<?, ?it/s]

1
     === ex109 , Fold 2 ===     


  0%|          | 0/602 [00:00<?, ?it/s]

  0%|          | 0/1204 [00:00<?, ?it/s]

1
     === ex109 , Fold 3 ===     


  0%|          | 0/602 [00:00<?, ?it/s]

  0%|          | 0/1204 [00:00<?, ?it/s]

1




In [41]:
# Inspect the topics with their pooled candidate lists in `content_ids`.
topic_df

Unnamed: 0,id,title,description,language,content_ids
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",bg,"[c_5bc0e1e2cba0, c_8fae27abecfb, c_0feaaa5dc39..."
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,No topic description,en,"[c_68c9cfd45cd3, c_16cc04ed8b08, c_a7f0809d2c9..."
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,pt,"[c_ac1672cdcd2c, c_89ce9367be10, c_14bf71640ec..."
3,t_00069b63a70a,Transcripts,No topic description,en,"[c_5449e7f6288b, c_e63f7989f1be, c_193d3b6d679..."
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,bg,"[c_62191f77b582, c_41ad3d9bd68a, c_5c737630789..."
...,...,...,...,...,...
76967,t_fffb0bf2801d,4.3 Graph of functions,No topic description,en,"[c_150630496d06, c_33978d3e638a, c_713222b0adb..."
76968,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,sw,"[c_0509c1059da0, c_2c08a5d04b47, c_9c78cef31a1..."
76969,t_fffe14f1be1e,Lección 7,No topic description,es,"[c_ecf695c4922d, c_7e3827563ffd, c_8eb3de9697e..."
76970,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,ar,"[c_5ae12ead0daf, c_1f8f32581e0f, c_3a29fd0bf7c..."


In [48]:
# Vote across the pooled fold candidates: for each topic keep the CFG.final_top_k content ids
# that occur most often in its candidate list (ties keep first-seen order).
content_ids = topic_df.content_ids.to_list()
final_contents = [
    [content_id for content_id, _ in Counter(candidates).most_common(CFG.final_top_k)]
    for candidates in content_ids
]

In [51]:
# Store the voted candidate list of every topic next to the raw pooled list.
topic_df['pred_content_ids'] = final_contents
display(topic_df.head())

Unnamed: 0,id,title,description,language,content_ids,pred_content_ids
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",bg,"[c_5bc0e1e2cba0, c_8fae27abecfb, c_0feaaa5dc39...","[c_5bc0e1e2cba0, c_8fae27abecfb, c_0feaaa5dc39..."
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,No topic description,en,"[c_68c9cfd45cd3, c_16cc04ed8b08, c_a7f0809d2c9...","[c_68c9cfd45cd3, c_16cc04ed8b08, c_a7f0809d2c9..."
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,pt,"[c_ac1672cdcd2c, c_89ce9367be10, c_14bf71640ec...","[c_ac1672cdcd2c, c_89ce9367be10, c_14bf71640ec..."
3,t_00069b63a70a,Transcripts,No topic description,en,"[c_5449e7f6288b, c_e63f7989f1be, c_193d3b6d679...","[c_5449e7f6288b, c_193d3b6d6794, c_f9b3324fbfa..."
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,bg,"[c_62191f77b582, c_41ad3d9bd68a, c_5c737630789...","[c_62191f77b582, c_41ad3d9bd68a, c_5c737630789..."


In [54]:
# Ground truth: one row per topic with the list of its correlated content ids.
# Read from CFG.data_dir so the path stays in sync with the other inputs.
corr_df = pd.read_csv(os.path.join(CFG.data_dir, "correlations.csv"))
corr_df["content_ids"] = corr_df["content_ids"].str.split()

In [55]:
# Inspect the ground-truth table (topic_id, list of content_ids).
corr_df

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,"[c_1108dd0c7a5d, c_376c5a8eb028, c_5bc0e1e2cba..."
1,t_00068291e9a4,"[c_639ea2ef9c95, c_89ce9367be10, c_ac1672cdcd2..."
2,t_00069b63a70a,[c_11a1dc0bfb99]
3,t_0006d41a73a8,"[c_0c6473c3480d, c_1c57a1316568, c_5e375cf14c4..."
4,t_0008768bdee6,"[c_34e1424229b4, c_7d1a964d66d5, c_aab93ee667f4]"
...,...,...
61512,t_fff830472691,"[c_61fb63326e5d, c_8f224e321c87]"
61513,t_fff9e5407d13,"[c_026db653a269, c_0fb048a6412c, c_20de7752260..."
61514,t_fffbe1d5d43c,"[c_46f852a49c08, c_6659207b25d5]"
61515,t_fffe14f1be1e,[c_cece166bad6a]


In [56]:
# The pooled candidate lists are no longer needed once pred_content_ids exists.
# errors='ignore' keeps the cell re-runnable after the column has already been removed.
topic_df = topic_df.drop(columns=['content_ids'], errors='ignore')
topic_df

Unnamed: 0,id,title,description,language,pred_content_ids
0,t_00004da3a1b2,Откриването на резисторите,"Изследване на материали, които предизвикват на...",bg,"[c_5bc0e1e2cba0, c_8fae27abecfb, c_0feaaa5dc39..."
1,t_000095e03056,Unit 3.3 Enlargements and Similarities,No topic description,en,"[c_68c9cfd45cd3, c_16cc04ed8b08, c_a7f0809d2c9..."
2,t_00068291e9a4,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,pt,"[c_ac1672cdcd2c, c_89ce9367be10, c_14bf71640ec..."
3,t_00069b63a70a,Transcripts,No topic description,en,"[c_5449e7f6288b, c_193d3b6d6794, c_f9b3324fbfa..."
4,t_0006d41a73a8,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,bg,"[c_62191f77b582, c_41ad3d9bd68a, c_5c737630789..."
...,...,...,...,...,...
76967,t_fffb0bf2801d,4.3 Graph of functions,No topic description,en,"[c_150630496d06, c_33978d3e638a, c_c379a1e2a9a..."
76968,t_fffbe1d5d43c,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,sw,"[c_0509c1059da0, c_2c08a5d04b47, c_9c78cef31a1..."
76969,t_fffe14f1be1e,Lección 7,No topic description,es,"[c_017a33a15d64, c_0892b5c1148c, c_02afd53135f..."
76970,t_fffe811a6da9,تحديد العلاقة بين الإحداثيّات القطبية والإحداث...,5b9e5ca86571f90499ea987f,ar,"[c_5ae12ead0daf, c_1f8f32581e0f, c_3a29fd0bf7c..."


In [57]:
# Inner join on the topic id: keeps only topics that have a ground-truth row in corr_df and
# adds their true `content_ids` next to `pred_content_ids`.
final = topic_df.merge(corr_df, how = 'inner', left_on = 'id', right_on = 'topic_id')

In [59]:
# Prefix the topic columns and drop `id`, which duplicates `topic_id` after the merge.
# errors='ignore' keeps the cell re-runnable once `id` has already been removed.
final = final.rename(
    columns={
        "title": "topic_title",
        "description": "topic_description",
        "language": "topic_language"
    }
).drop(columns=['id'], errors='ignore')
final

Unnamed: 0,topic_title,topic_description,topic_language,pred_content_ids,topic_id,content_ids
0,Откриването на резисторите,"Изследване на материали, които предизвикват на...",bg,"[c_5bc0e1e2cba0, c_8fae27abecfb, c_0feaaa5dc39...",t_00004da3a1b2,"[c_1108dd0c7a5d, c_376c5a8eb028, c_5bc0e1e2cba..."
1,Entradas e saídas de uma função,Entenda um pouco mais sobre funções.,pt,"[c_ac1672cdcd2c, c_89ce9367be10, c_14bf71640ec...",t_00068291e9a4,"[c_639ea2ef9c95, c_89ce9367be10, c_ac1672cdcd2..."
2,Transcripts,No topic description,en,"[c_5449e7f6288b, c_193d3b6d6794, c_f9b3324fbfa...",t_00069b63a70a,[c_11a1dc0bfb99]
3,Графики на експоненциални функции (Алгебра 2 н...,Научи повече за графиките на сложните показате...,bg,"[c_62191f77b582, c_41ad3d9bd68a, c_5c737630789...",t_0006d41a73a8,"[c_0c6473c3480d, c_1c57a1316568, c_5e375cf14c4..."
4,100 સુધીનો સરવાળો,37 અને 49 જેવી બે-અંકની સંખ્યાઓ ઉમેરતા શીખો.,gu,"[c_179302899cfc, c_aab93ee667f4, c_1b34550c407...",t_0008768bdee6,"[c_34e1424229b4, c_7d1a964d66d5, c_aab93ee667f4]"
...,...,...,...,...,...,...
61512,Scalar Projections,No topic description,en,"[c_61fb63326e5d, c_8f224e321c87, c_8d10d18beb9...",t_fff830472691,"[c_61fb63326e5d, c_8f224e321c87]"
61513,NA_U06 - El periódico,No topic description,es,"[c_0fb048a6412c, c_5a80e03b571a, c_d1635b5d709...",t_fff9e5407d13,"[c_026db653a269, c_0fb048a6412c, c_20de7752260..."
61514,Inscribed shapes problem solving,Use properties of inscribed angles to prove pr...,sw,"[c_0509c1059da0, c_2c08a5d04b47, c_9c78cef31a1...",t_fffbe1d5d43c,"[c_46f852a49c08, c_6659207b25d5]"
61515,Lección 7,No topic description,es,"[c_017a33a15d64, c_0892b5c1148c, c_02afd53135f...",t_fffe14f1be1e,[c_cece166bad6a]


In [61]:
# Prefix the content columns with "content_" so they cannot clash with the topic columns.
content_column_names = {
    column: f"content_{column}" for column in ("title", "description", "text", "language")
}
content_df = content_df.rename(columns=content_column_names)
content_df

Unnamed: 0,content_id,content_title,content_description,content_text,content_language
0,c_00002381196d,"Sumar números de varios dígitos: 48,029+233,930","Suma 48,029+233,930 mediante el algoritmo está...",No content text,es
1,c_000087304a9e,Trovare i fattori di un numero,Sal trova i fattori di 120.\n\n,No content text,it
2,c_0000ad142ddb,Sumar curvas de demanda,Cómo añadir curvas de demanda\n\n,No content text,es
3,c_0000c03adc8d,Nado de aproximação,Neste vídeo você vai aprender o nado de aproxi...,\nNado de aproximação\nSaber nadar nas ondas ...,pt
4,c_00016694ea2a,geometry-m3-topic-a-overview.pdf,geometry-m3-topic-a-overview.pdf,Estándares Comunes del Estado de Nueva York\n\...,es
...,...,...,...,...,...
154042,c_fffcbdd4de8b,2. 12: Diffusion,No content description,What will eventually happen to these dyes?\n\n...,en
154043,c_fffe15a2d069,Sommare facendo gruppi da 10,Sal somma 5+68 spezzando il 5 in un 2 e un 3.\n\n,No content text,it
154044,c_fffed7b0d13a,Introdução à subtração,Sal fala sobre o que significa subtrair. Os ex...,No content text,pt
154045,c_ffff04ba7ac7,SA of a Cone,No content description,No content text,en


In [64]:
import ast
import swifter
def post_process(oof, content_df, top_k):
    """Build the stage-2 table: one row per (topic, candidate content) with a binary label.

    Args:
        oof: Frame with a ``pred_content_ids`` column (list of candidate ids per topic) and
            a ``content_ids`` column (list of ground-truth ids per topic). It is not modified.
        content_df: Content frame with ``content_id``, ``content_title``,
            ``content_description``, ``content_text`` and ``content_language`` columns.
        top_k: Number of candidates kept per topic.

    Returns:
        A frame with one row per kept candidate, a ``target`` column (1 when the candidate is
        in the topic's ground truth, else 0) and the content columns joined in. Candidates
        without a matching ``content_id`` are dropped by the inner join.
    """
    print(f'Top k : {top_k}')
    # Keep only the top k candidates per topic. assign() returns a new frame, so the caller's
    # DataFrame is not truncated as a side effect.
    oof = oof.assign(pred_content_ids=[row[:top_k] for row in oof['pred_content_ids']])
    oof = oof.explode("pred_content_ids").reset_index(drop=True)
    # Plain membership test per row; no row-wise apply needed.
    oof['target'] = [
        int(candidate in truth)
        for candidate, truth in zip(oof['pred_content_ids'], oof['content_ids'])
    ]
    print("Label counts")
    print(f'{oof.target.value_counts()}')
    oof = oof.merge(content_df[['content_id', 'content_title', 'content_description', 'content_text','content_language']], how = 'inner', left_on = 'pred_content_ids', right_on = 'content_id')
    return oof 


In [65]:
# Keep CFG.final_top_k candidates per topic - the same cut-off used when voting.
stage2_data = post_process(final, content_df, CFG.final_top_k)

Top k : 50


Pandas Apply:   0%|          | 0/3075850 [00:00<?, ?it/s]

Label counts
0    2843556
1     232294
Name: target, dtype: int64


In [67]:
# Create the output directory first so the write does not fail on a fresh checkout.
stage2_path = '../data/stage2/stage2.csv'
os.makedirs(os.path.dirname(stage2_path), exist_ok=True)
stage2_data.to_csv(stage2_path)