# 配置 

In [1]:
# True: predict on a local validation sample and load the ground-truth correlations for scoring.
# False: predict on the topics listed in the competition's sample_submission.csv.
Local_Test = False

In [None]:
# Directory the competition CSVs (content.csv, topics.csv, sample_submission.csv, correlations.csv) are read from.
DATA_DIR = '../input/learning-equality-curriculum-recommendations'

class REC_CFG_1:
    """Settings for the retrieval (recall) stage."""
    # Number of retrieved candidates per topic that are forwarded to the ranking stage.
    n_recalls = 50
    # Number of candidates requested from the top-k similarity search and stored in `recall_ids`.
    n_recalls_full = 200
    # NOTE: the get_text_* helpers apply this as a *character* cap on the assembled text.
    max_seq_len = 256
    # Directory that holds config.pth, the weight file and the tokenizer sub-directory.
    model_path = '/kaggle/input/output-simcse-model-37-first-epo100'
    # Checkpoint file; recall_func loads its 'model' entry as the state dict.
    model_weight = 'sentence-transformers-paraphrase-multilingual-mpnet-base-v2_fold0_best.pth'
    # Tokenizer sub-directory under model_path; a falsy value makes recall_func load the tokenizer from model_path.
    token_dir = 'tokenizer'
    # NOTE(review): recall_func does not pass this on, so the visible retrieval code never reads it.
    batch_size = 32
    
class CLF_CFG_1:
    """Settings for ranking checkpoint #1 (presumably consumed by predict_func — TODO confirm)."""
    # NOTE(review): predict_func also reads token_path, model_class_path, sep_rep, topic_col and
    # content_col from its CFG; none of them are defined here — confirm they are attached before use.
    # Same checkpoint directory as CLF_CFG_2, but with a longer sequence length.
    max_seq_len = 300
    model_path = '/kaggle/input/lecr-mdeberta-cv6339'
    batch_size = 32
    
class CLF_CFG_2:
    """Settings for ranking checkpoint #2 (presumably consumed by predict_func — TODO confirm)."""
    # NOTE(review): predict_func also reads token_path, model_class_path, sep_rep, topic_col and
    # content_col from its CFG; none of them are defined here — confirm they are attached before use.
    # Same checkpoint directory as CLF_CFG_1, but with a shorter sequence length.
    max_seq_len = 256
    model_path = '/kaggle/input/lecr-mdeberta-cv6339'
    batch_size = 32
    
class CLF_CFG_3:
    """Settings for ranking checkpoint #3 (presumably consumed by predict_func — TODO confirm)."""
    # NOTE(review): predict_func also reads token_path, model_class_path, sep_rep, topic_col and
    # content_col from its CFG; none of them are defined here — confirm they are attached before use.
    max_seq_len = 256
    model_path = '/kaggle/input/zy-1-150-model-636/output_model_class_simcsemodel_lr_warmup_150'
    batch_size = 32
    
class CLF_CFG_4:
    """Settings for ranking checkpoint #4 (presumably consumed by predict_func — TODO confirm)."""
    # NOTE(review): predict_func also reads token_path, model_class_path, sep_rep, topic_col and
    # content_col from its CFG; none of them are defined here — confirm they are attached before use.
    max_seq_len = 256
    model_path = '/kaggle/input/lecr-mdeberta-clf-model-exp05-f18263'
    batch_size = 32
    
class CLF_CFG_5:
    """Settings for ranking checkpoint #5 (presumably consumed by predict_func — TODO confirm)."""
    # NOTE(review): predict_func also reads token_path, model_class_path, sep_rep, topic_col and
    # content_col from its CFG; none of them are defined here — confirm they are attached before use.
    max_seq_len = 256
    model_path = '/kaggle/input/focalloss655/636focal_loss'
    batch_size = 32
    
class CLF_CFG_6:
    """Settings for ranking checkpoint #6 (presumably consumed by predict_func — TODO confirm)."""
    # NOTE(review): predict_func also reads token_path, model_class_path, sep_rep, topic_col and
    # content_col from its CFG; none of them are defined here — confirm they are attached before use.
    max_seq_len = 256
    model_path = '/kaggle/input/lecr-xlm-base-model-f18071'
    # The only config in this cell with a batch size other than 32.
    batch_size = 48

In [3]:
import warnings
warnings.simplefilter('ignore')

import os
import re
import gc
import sys
import time
import math
import random
import pickle
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)
from tqdm.auto import tqdm

sys.path.append('/kaggle/input/sentence-transformers')
from sentence_transformers import SentenceTransformer
from sentence_transformers import util

import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# PART1: 召回

## 加载数据

In [4]:
# Raw competition tables.
df_content = pd.read_csv(f'{DATA_DIR}/content.csv')
df_topic = pd.read_csv(f'{DATA_DIR}/topics.csv')

# Topics to predict: a local validation sample, or the competition's sample submission.
if Local_Test:
    df_test = pd.read_csv(f'/kaggle/input/lecr-valid-samples-1000/valid_samples_1000.csv')
else:     
    df_test = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# parent / children desc
# After the rename below, 'id' holds the parent id and 'child' holds the topic's own id,
# so the left merge on 'id' attaches to every topic the ids of the topics it is the parent of.
df_tmp = df_topic[['id', 'parent']].reset_index(drop=True).copy()
df_tmp.columns = ['child', 'id']
df_topic = pd.merge(df_topic, df_tmp, on='id', how='left')
# Collapse the one-row-per-child frame back to one row per topic with a list of child ids.
# A topic without children ends up with a list holding a single NaN, which is filtered out below.
df_children = df_topic.groupby('id')['child'].agg(list).reset_index(name='children')
df_topic = df_topic.merge(df_children, on='id', how='left').drop_duplicates('id').reset_index(drop=True)
df_topic.drop('child', axis=1, inplace=True)
# topic id -> description lookup (missing descriptions become '').
desc_dict = df_topic[['id', 'description']].fillna('').set_index('id').to_dict()['description']
parent_texts = []
children_texts = []
for _, row in tqdm(df_topic.iterrows(), total=len(df_topic)):
    desc = row['description']
    parent = row['parent']
    children = row['children']
    # Root topics (NaN parent) get an empty parent description.
    p_text = ''
    if not pd.isna(parent):
        p_text = desc_dict[parent]
    parent_texts.append(p_text)
    # Space-joined descriptions of all children, skipping the NaN placeholder.
    children_texts.append(' '.join([desc_dict[child] for child in children if not pd.isna(child)]))
df_topic['parent_description'] = parent_texts
df_topic['children_description'] = children_texts
del desc_dict, parent_texts, children_texts; gc.collect()

# Keep only the topics that have to be submitted.
df_topic = df_topic[df_topic['id'].isin(df_test['topic_id'].values.tolist())].reset_index(drop=True)

print(df_content.shape, df_topic.shape, df_test.shape)

  0%|          | 0/76972 [00:00<?, ?it/s]

(154047, 8) (5, 12) (5, 2)


## 文本处理

In [5]:
# topic context
# Second read of the same CSVs, this time indexed by their first column, so the
# Topic / ContentItem helpers below can look rows up with .loc[<id>].
topics_df = pd.read_csv(f'{DATA_DIR}/topics.csv',index_col=0).fillna({"title": "", "description": ""})
content_df = pd.read_csv(f'{DATA_DIR}/content.csv',index_col=0).fillna("")

class Topic:
    """Read-only view of one topic, backed by the module-level ``topics_df``.

    ``topics_df`` is looked up with ``.loc[self.id]``, so it must be indexed by
    topic id. Any attribute that is not defined on the class (``title``,
    ``has_content``, ...) is resolved as a column of that frame.
    """

    def __init__(self, topic_id):
        self.id = topic_id

    @property
    def parent(self):
        """Parent ``Topic``, or ``None`` when the parent id is NaN (root topic)."""
        parent_id = topics_df.loc[self.id].parent
        if pd.isna(parent_id):
            return None
        return Topic(parent_id)

    @property
    def ancestors(self):
        """Ancestors ordered from the direct parent up to the root."""
        chain = []
        node = self.parent
        while node is not None:
            chain.append(node)
            node = node.parent
        return chain

    @property
    def siblings(self):
        """Other children of this topic's parent; ``[]`` for a root topic."""
        # Resolve the parent once: every access is a DataFrame lookup.
        parent = self.parent
        if parent is None:
            return []
        return [topic for topic in parent.children if topic != self]

    @property
    def content(self):
        """Content items correlated with this topic.

        NOTE(review): relies on a module-level ``correlations_df`` (indexed by
        topic id, with a space-separated ``content_ids`` column) that must be
        defined before this property is used.
        """
        if self.id in correlations_df.index:
            return [ContentItem(content_id) for content_id in correlations_df.loc[self.id].content_ids.split()]
        else:
            return tuple([]) if self.has_content else []

    def get_breadcrumbs(self, separator=" >> ", include_self=True, include_root=True):
        """Return the titles from the root down to this topic joined by ``separator``.

        ``include_self`` adds this topic's own title at the end; ``include_root``
        set to ``False`` drops the top-most ancestor.
        """
        ancestors = self.ancestors
        if include_self:
            ancestors = [self] + ancestors
        if not include_root:
            ancestors = ancestors[:-1]
        return separator.join(reversed([a.title for a in ancestors]))

    @property
    def children(self):
        """Topics whose ``parent`` column equals this topic's id."""
        return [Topic(child_id) for child_id in topics_df[topics_df.parent == self.id].index]

    def subtree_markdown(self, depth=0):
        """Render this topic, its descendants and their content as a nested markdown list."""
        markdown = "  " * depth + "- " + self.title + "\n"
        for child in self.children:
            markdown += child.subtree_markdown(depth=depth + 1)
        for content in self.content:
            markdown += ("  " * (depth + 1) + "- " + "[" + content.kind.title() + "] " + content.title) + "\n"
        return markdown

    def __eq__(self, other):
        if not isinstance(other, Topic):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash on the same key.
        return hash(self.id)

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails. Unknown names must raise
        # AttributeError (not KeyError) so getattr(obj, name, default), hasattr(),
        # copy and pickle behave. 'id' is guarded so an uninitialised instance
        # cannot recurse through self.id.
        if name == "id" or name not in topics_df.columns:
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")
        return topics_df.loc[self.id][name]

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<Topic(id={self.id}, title=\"{self.title}\")>"


class ContentItem:
    """Read-only view of one content row, backed by the module-level ``content_df``.

    ``content_df`` is looked up with ``.loc[self.id]``, so it must be indexed by
    content id. Attributes not defined on the class (``title``, ``kind``, ...)
    are resolved as columns of that frame.
    """

    def __init__(self, content_id):
        self.id = content_id

    @property
    def topics(self):
        """Topics whose ``content_ids`` string contains this item's id.

        NOTE(review): relies on a module-level ``correlations_df`` that must be
        defined before use. ``str.contains`` treats the id as a regex pattern and
        matches substrings — presumably safe for the id format, confirm.
        """
        return [Topic(topic_id) for topic_id in
                topics_df.loc[correlations_df[correlations_df.content_ids.str.contains(self.id)].index].index]

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails. Unknown names must raise
        # AttributeError (not KeyError) so getattr(obj, name, default), hasattr(),
        # copy and pickle behave. 'id' is guarded so an uninitialised instance
        # cannot recurse through self.id.
        if name == "id" or name not in content_df.columns:
            raise AttributeError(f"{type(self).__name__!r} object has no attribute {name!r}")
        return content_df.loc[self.id][name]

    def __str__(self):
        return self.title

    def __repr__(self):
        return f"<ContentItem(id={self.id}, title=\"{self.title}\")>"

    def __eq__(self, other):
        if not isinstance(other, ContentItem):
            return False
        return self.id == other.id

    def __hash__(self):
        # Defining __eq__ alone would make instances unhashable; hash on the same key.
        return hash(self.id)

    def get_all_breadcrumbs(self, separator=" >> ", include_root=True):
        """Return one breadcrumb string per topic this item belongs to.

        Each string is the topic's breadcrumb followed by this item's title; when
        the topic breadcrumb is empty, the title alone is used.
        """
        breadcrumbs = []
        for topic in self.topics:
            new_breadcrumb = topic.get_breadcrumbs(separator=separator, include_root=include_root)
            if new_breadcrumb:
                new_breadcrumb = new_breadcrumb + separator + self.title
            else:
                new_breadcrumb = self.title
            breadcrumbs.append(new_breadcrumb)
        return breadcrumbs


def get_context(topic_id):
    """Return the full breadcrumb string (root through the topic itself) for ``topic_id``."""
    return Topic(topic_id).get_breadcrumbs()


# Breadcrumb path (root title >> ... >> own title) for every topic to predict.
df_topic['context'] = df_topic['id'].apply(get_context)

# The lookup frames are released here; Topic / ContentItem lookups stop working after this point.
del content_df, topics_df
gc.collect()

# Replace the remaining NaNs with '' before the text columns are concatenated below.
df_content.fillna('', inplace=True)
df_topic.fillna('', inplace=True)


def get_text_content(row):
    """Join a content row's title, description and text with '[SEP]'.

    The result is cut to the first ``REC_CFG_1.max_seq_len`` characters.
    """
    fields = [row['title'], row['description'], row['text']]
    joined = '[SEP]'.join(fields)
    return joined[:REC_CFG_1.max_seq_len]


def get_text_topic(row):
    """Join a topic row's title, description, context and parent/children descriptions with '[SEP]'.

    The result is cut to the first ``REC_CFG_1.max_seq_len`` characters.
    """
    fields = [
        row['title'],
        row['description'],
        row['context'],
        row['parent_description'],
        row['children_description'],
    ]
    joined = '[SEP]'.join(fields)
    return joined[:REC_CFG_1.max_seq_len]


def get_text_content_all(row):
    """Join a content row's title, kind, language, description and text with ' [SEP] '.

    The result is cut to the first ``REC_CFG_1.max_seq_len`` characters.
    """
    fields = [
        row['title'],
        row['kind'],
        row['language'],
        row['description'],
        row['text'],
    ]
    joined = ' [SEP] '.join(fields)
    return joined[:REC_CFG_1.max_seq_len]


def get_text_topic_all(row):
    """Join all textual and categorical topic fields with ' [SEP] '.

    ``level`` and ``language`` are converted with ``str()`` first. The result is
    cut to the first ``REC_CFG_1.max_seq_len`` characters.
    """
    fields = [
        row['title'],
        row['channel'],
        row['category'],
        str(row['level']),
        str(row['language']),
        row['description'],
        row['context'],
        row['parent_description'],
        row['children_description'],
    ]
    joined = ' [SEP] '.join(fields)
    return joined[:REC_CFG_1.max_seq_len]


# Short text variant (title / description / ...), one string per row.
df_content['text2'] = df_content.apply(get_text_content, axis=1)
df_topic['text2'] = df_topic.apply(get_text_topic, axis=1)


# Long text variant that also carries the categorical fields.
df_content['text2_all'] = df_content.apply(get_text_content_all, axis=1)
df_topic['text2_all'] = df_topic.apply(get_text_topic_all, axis=1)

In [6]:
##### Validation-set labels #####
# Only in local mode: load the ground-truth topic -> content correlations and keep the test topics.
if Local_Test:
    df_target = pd.read_csv(f'{DATA_DIR}/correlations.csv')
    df_target = df_target[df_target['topic_id'].isin(df_test['topic_id'].values.tolist())].reset_index(drop=True)
    print(df_target.shape)

## 召回

In [7]:
# Split the content table by language.

languages = df_content['language'].unique().tolist()

# language code -> content rows of that language, re-indexed from 0 so that positions in
# the per-language embedding matrix line up with row labels.
content_dict = {}
for lang in tqdm(languages):
    content_dict[lang] = df_content[df_content['language'] == lang].reset_index(drop=True)

  0%|          | 0/27 [00:00<?, ?it/s]

In [8]:
# 加载召回模型

class MeanPooling(nn.Module):
    """Average the token vectors of each sequence, counting only positions where the mask is non-zero."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked-out tokens contribute zeros.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        summed = (last_hidden_state * mask).sum(dim=1)
        # Clamp the denominator so an all-zero mask cannot divide by zero.
        counts = mask.sum(dim=1).clamp(min=1e-9)
        return summed / counts

    
class WeightedLayerPooling(nn.Module):
    """Weighted average over the hidden-state layers from ``layer_start`` onwards.

    When ``layer_weights`` is not supplied, one learnable weight per pooled layer
    is created, initialised to 1.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights=None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_pooled = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        # all_hidden_states is indexed with four axes; the first one enumerates layers.
        pooled_layers = all_hidden_states[self.layer_start:, :, :, :]
        weights = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
        weights = weights.expand(pooled_layers.size())
        weighted_sum = (weights * pooled_layers).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()

    
class CustomModel(nn.Module):
    """Transformer backbone followed by mean pooling, used as a sentence encoder.

    Args:
        cfg: object exposing ``cfg.model`` (model name or path). Only read when
            ``config_path`` is ``None`` or ``pretrained`` is true.
        config_path: path to a ``torch.save``-d model config; when ``None`` the
            config is fetched with ``AutoConfig.from_pretrained(cfg.model)``.
        pretrained: load pretrained backbone weights instead of building the
            backbone from the config alone.

    ``forward`` returns the mean-pooled last hidden state. The ``fc`` head is
    created but not applied in ``forward`` (presumably kept so checkpoints that
    contain ``fc`` weights still load — TODO confirm).
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # The original referenced a global LOGGER that this notebook never defines,
            # which made this branch fail with NameError; use the stdlib logger instead.
            import logging
            logging.getLogger(__name__).info(self.config)
        else:
            # NOTE(security): torch.load unpickles the file; only load configs from trusted sources.
            self.config = torch.load(config_path)

        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        self.pool = MeanPooling()
        self.fc_dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Initialise ``module`` in place using ``config.initializer_range`` as the normal std."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Encode a tokenised batch (mapping with at least ``attention_mask``) into pooled features."""
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

def get_model_feature(model, texts, tokenizer, batch_size=32):
    """Encode ``texts`` with ``model`` and return all features as one tensor.

    Args:
        model: callable taking a tokenised batch and returning a feature tensor.
        texts: the texts to encode, in output order.
        tokenizer: tokenizer used both per item and for padding each batch to
            its longest sequence.
        batch_size: number of texts per forward pass (default 32, the value that
            used to be hard-coded).

    Returns:
        The per-batch outputs concatenated along dim 0. They are left on
        whatever device the model produced them on.
    """
    feature_outs_all = []
    test_dataset = TestDataset(texts, tokenizer)
    test_loader = DataLoader(test_dataset,
                             batch_size=batch_size,
                             shuffle=False,
                             collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, padding='longest'),
                             num_workers=0, pin_memory=True, drop_last=False)

    for inputs in test_loader:
        # Move the batch to the compute device in place, keeping the batch container type.
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            feature_outs = model(inputs)
            feature_outs_all.append(feature_outs)

    feature_outs_all_final = torch.cat(feature_outs_all, dim=0)
    print(feature_outs_all_final.shape)

    return feature_outs_all_final


# test dataset
class TestDataset(Dataset):
    """Tokenises one text per item for inference.

    No padding or truncation is requested here; every tokenizer output field is
    converted to a ``torch.long`` tensor.
    """

    def __init__(self, texts, tokenizer):
        # Materialise as a list so that __getitem__ is positional. Indexing a pandas
        # Series with an int is label-based and breaks for a non-default index.
        self.texts = list(texts)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = self.tokenizer(self.texts[item],
                               add_special_tokens=True,
                               return_offsets_mapping=False)

        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

In [9]:
# recall function

def recall_func(CFG):
    """Embed every per-language content corpus and the test topics with the retrieval model.

    Reads ``CFG.model_path``, ``CFG.model_weight`` and ``CFG.token_dir``; uses the
    module-level ``languages``, ``content_dict`` and ``df_topic``.

    Returns:
        ``(corpus_embeddings_dict, topic_embedding_list)``: a dict mapping each
        language to its content embeddings, and the topic embeddings.
    """
    # The tokenizer lives in a sub-directory when token_dir is set, else next to the model.
    tokenizer_dir = f'{CFG.model_path}/{CFG.token_dir}' if CFG.token_dir else CFG.model_path
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

    model = CustomModel(cfg=None, config_path=CFG.model_path + '/config.pth', pretrained=False)
    checkpoint = torch.load(f'{CFG.model_path}/{CFG.model_weight}', map_location=device)
    model.load_state_dict(checkpoint['model'])
    model.eval()
    model = model.to(device)

    corpus_embeddings_dict = {
        lang: get_model_feature(model, content_dict[lang]['text2_all'], tokenizer)
        for lang in tqdm(languages)
    }
    topic_embedding_list = get_model_feature(model, df_topic['text2_all'], tokenizer)

    del model
    gc.collect()
    return corpus_embeddings_dict, topic_embedding_list

In [10]:
%%time

# Embed all content (per language) and the test topics with the retrieval model.
corpus_embeddings_dict1, topic_embedding_list1 = recall_func(REC_CFG_1)

  0%|          | 0/27 [00:00<?, ?it/s]

torch.Size([30844, 768])
torch.Size([1300, 768])
torch.Size([10435, 768])
torch.Size([65939, 768])
torch.Size([999, 768])
torch.Size([6050, 768])
torch.Size([3677, 768])
torch.Size([1447, 768])
torch.Size([4042, 768])
torch.Size([7418, 768])
torch.Size([2513, 768])
torch.Size([641, 768])
torch.Size([3849, 768])
torch.Size([10682, 768])
torch.Size([505, 768])
torch.Size([319, 768])
torch.Size([216, 768])
torch.Size([516, 768])
torch.Size([326, 768])
torch.Size([188, 768])
torch.Size([501, 768])
torch.Size([495, 768])
torch.Size([206, 768])
torch.Size([184, 768])
torch.Size([225, 768])
torch.Size([285, 768])
torch.Size([245, 768])
torch.Size([5, 768])
CPU times: user 8min 19s, sys: 2.79 s, total: 8min 22s
Wall time: 8min 35s


In [11]:
# For every test topic, retrieve the most similar content items of the same language.
pred_final = []
# Enumerate positions instead of using the index label, so the embedding row always
# matches the topic row even if df_topic does not carry a 0..n-1 index.
for idx, (_, row) in enumerate(tqdm(df_topic.iterrows(), total=len(df_topic))):
    lang = row['language']
    if lang in corpus_embeddings_dict1:
        corpus_embeddings1 = corpus_embeddings_dict1[lang]
        content_df = content_dict[lang]
    else:
        # No content in the topic's language: fall back to the English corpus.
        corpus_embeddings1 = corpus_embeddings_dict1['en']
        content_df = content_dict['en']
    # score1
    query_embedding1 = topic_embedding_list1[idx, :]
    cos_scores1 = util.cos_sim(query_embedding1, corpus_embeddings1)[0]
    # Weighted fusion (a single retrieval model here, weight 1.0).
    cos_scores = 1.0*cos_scores1

    # Keep the top n_recalls_full candidates. torch.topk raises when k exceeds the
    # number of items, and several languages have fewer than 200 items, so clamp k.
    n_keep = min(REC_CFG_1.n_recalls_full, cos_scores.shape[0])
    top_results = torch.topk(cos_scores, k=n_keep)
    indics = top_results[1].cpu().numpy()
    # Positional lookup: topk indices are positions in the per-language corpus.
    pid = content_df['id'].iloc[indics]
    pred_final.append(' '.join(pid))

df_topic['recall_ids'] = pred_final

  0%|          | 0/5 [00:00<?, ?it/s]

## 召回评估 (线上去掉)

In [12]:
# Offline evaluation of the retrieval stage; only runs with local validation labels.
if Local_Test:
    df_metric = pd.merge(df_topic, df_target, left_on='id', right_on='topic_id', how='left')

    def get_pos_score(y_true, y_pred, top_n):
        """Mean share of each topic's true content ids found among its first ``top_n`` recalled ids.

        ``y_true`` and ``y_pred`` are Series of space-separated id strings.
        """
        y_true = y_true.apply(lambda x: set(x.split()))
        y_pred = y_pred.apply(lambda x: set(x.split()[:top_n]))
        int_true = np.array([len(x[0] & x[1]) / len(x[0]) for x in zip(y_true, y_pred)])
        return round(np.mean(int_true), 5)

    for top_n in (50, 70, 100, 200):
        pos_score = get_pos_score(df_metric['content_ids'], df_metric['recall_ids'], top_n)
        print(f'max positive score top {top_n} is {pos_score}')

    # From here on both columns hold lists of ids instead of space-separated strings.
    df_metric['content_ids'] = df_metric['content_ids'].astype(str).apply(lambda x: x.split())
    df_metric['recall_ids'] = df_metric['recall_ids'].astype(str).apply(lambda x: x.split())

    N_RECALLS = [50, 100, 200]
    N_TOP_F2 = [5]
    for n_top in N_TOP_F2:
        # Reset per cutoff; a shared list would mix the scores of different cutoffs.
        f2_scores = []
        for _, row in df_metric.iterrows():
            true_ids = set(row['content_ids'])
            pred_ids = set(row['recall_ids'][:n_top])
            tp = len(true_ids.intersection(pred_ids))
            fp = len(pred_ids - true_ids)
            fn = len(true_ids - pred_ids)
            if pred_ids:
                # F2 = 5*tp / (5*tp + 4*fn + fp), written with the factor 5 divided out.
                f2 = tp / (tp + 0.2 * fp + 0.8 * fn)
            else:
                f2 = 0
            f2_scores.append(f2)
        print(f'Average F2@{n_top}:', np.mean(f2_scores))
    for n_recall in N_RECALLS:
        total = 0
        correct = 0
        for _, row in df_metric.iterrows():
            y_trues = row['content_ids']
            # Set membership instead of scanning the list for every true id.
            y_preds = set(row['recall_ids'][:n_recall])
            for y_true in y_trues:
                total += 1
                if y_true in y_preds:
                    correct += 1
        print(f'hitrate@{n_recall}:', correct/total)

## 清理内存

In [13]:
# Drop the retrieval artefacts before the ranking stage starts.
del corpus_embeddings_dict1
del topic_embedding_list1
# NOTE: corpus_embeddings1 is only assigned inside the retrieval loop, so this line
# raises NameError if that loop ran zero iterations.
del corpus_embeddings1
del content_dict
del pred_final

for _ in range(5):
    gc.collect()

# PART2: 排序

## 文本预处理

In [14]:
# Give the text columns stage-specific names so topic and content columns do not collide in the merge.
df_topic.rename(columns={'text2': 'topic_text', 'text2_all': 'topic_text_all'}, inplace=True)
df_content.rename(columns={'text2': 'content_text', 'id': 'content_id', 'text2_all': 'content_text_all'}, inplace=True)

# save for 2nd round reranking
df_topic_copy = df_topic.copy()

# Keep the first n_recalls candidates per topic and expand to one row per (topic, content) pair.
df_topic['content_id'] = df_topic['recall_ids'].astype(str).apply(lambda x: x.split()[:REC_CFG_1.n_recalls])
df_topic = df_topic.explode("content_id")
df_topic = pd.merge(df_topic, df_content, on='content_id', how='left')

# Explicit copy: new columns are added below, and assigning onto a slice of df_topic
# is chained assignment (SettingWithCopyWarning, silenced by the global warning filter).
test_data = df_topic[['id', 'content_id', 'topic_text', 'content_text','topic_text_all','content_text_all']].copy()
test_data['text_all'] = test_data['topic_text'] + '[SEP]' + test_data['content_text']
test_data['text_all_all'] = test_data['topic_text_all'] + '[SEP]' + test_data['content_text_all']
# Sort by character length of the pair text, shortest first.
test_data['text_len'] = test_data['text_all'].apply(len)
test_data = test_data.sort_values(['text_len']).reset_index(drop=True)

display(test_data)

Unnamed: 0,id,content_id,topic_text,content_text,topic_text_all,content_text_all,text_all,text_all_all,text_len
0,t_4054df11a74e,c_efef77a7364b,Flow Charts: Logical Thinking?[SEP]This lesson is focused on flow charts. It supports young people who are starting to program and can be used in other subjects and at different levels of study. T...,Guiding Questions for Better Projects[SEP][SEP],Flow Charts: Logical Thinking? [SEP] 6e3ba4 [SEP] source [SEP] 2 [SEP] en [SEP] This lesson is focused on flow charts. It supports young people who are starting to program and can be used in other...,Guiding Questions for Better Projects [SEP] video [SEP] en [SEP] [SEP],Flow Charts: Logical Thinking?[SEP]This lesson is focused on flow charts. It supports young people who are starting to program and can be used in other subjects and at different levels of study. T...,Flow Charts: Logical Thinking? [SEP] 6e3ba4 [SEP] source [SEP] 2 [SEP] en [SEP] This lesson is focused on flow charts. It supports young people who are starting to program and can be used in other...,308
1,t_00068291e9a4,c_da82c952d90b,Entradas e saídas de uma função[SEP]Entenda um pouco mais sobre funções.[SEP]Khan Academy (Português (Brasil)) >> Matemática por ano (Alinhada à BNCC) >> 9º Ano >> Álgebra: funções >> Entradas e s...,"Notação básica de conjunto[SEP]A união, complemento e interseção de conjuntos.[SEP]",Entradas e saídas de uma função [SEP] 8e286a [SEP] source [SEP] 4 [SEP] pt [SEP] Entenda um pouco mais sobre funções. [SEP] Khan Academy (Português (Brasil)) >> Matemática por ano (Alinhada à BNCC...,"Notação básica de conjunto [SEP] exercise [SEP] pt [SEP] A união, complemento e interseção de conjuntos. [SEP]",Entradas e saídas de uma função[SEP]Entenda um pouco mais sobre funções.[SEP]Khan Academy (Português (Brasil)) >> Matemática por ano (Alinhada à BNCC) >> 9º Ano >> Álgebra: funções >> Entradas e s...,Entradas e saídas de uma função [SEP] 8e286a [SEP] source [SEP] 4 [SEP] pt [SEP] Entenda um pouco mais sobre funções. [SEP] Khan Academy (Português (Brasil)) >> Matemática por ano (Alinhada à BNCC...,344
2,t_00004da3a1b2,c_f817eafce04c,"Откриването на резисторите[SEP]Изследване на материали, които предизвикват намаление в отклонението, когато се свържат последователно с нашия измервателен уред. [SEP]Khan Academy (български език) ...",Електроскоп с топче от стиропор[SEP]Примитивно устройство за измерване на електростатична сила\n\n[SEP],"Откриването на резисторите [SEP] 000cf7 [SEP] source [SEP] 4 [SEP] bg [SEP] Изследване на материали, които предизвикват намаление в отклонението, когато се свържат последователно с нашия измервате...",Електроскоп с топче от стиропор [SEP] video [SEP] bg [SEP] Примитивно устройство за измерване на електростатична сила\n\n [SEP],"Откриването на резисторите[SEP]Изследване на материали, които предизвикват намаление в отклонението, когато се свържат последователно с нашия измервателен уред. [SEP]Khan Academy (български език) ...","Откриването на резисторите [SEP] 000cf7 [SEP] source [SEP] 4 [SEP] bg [SEP] Изследване на материали, които предизвикват намаление в отклонението, когато се свържат последователно с нашия измервате...",362
3,t_00068291e9a4,c_035baf9425e0,Entradas e saídas de uma função[SEP]Entenda um pouco mais sobre funções.[SEP]Khan Academy (Português (Brasil)) >> Matemática por ano (Alinhada à BNCC) >> 9º Ano >> Álgebra: funções >> Entradas e s...,"Cálculo de funções[SEP]Dada a fórmula da função, calcule funções para valores de entrada específicos.[SEP]",Entradas e saídas de uma função [SEP] 8e286a [SEP] source [SEP] 4 [SEP] pt [SEP] Entenda um pouco mais sobre funções. [SEP] Khan Academy (Português (Brasil)) >> Matemática por ano (Alinhada à BNCC...,"Cálculo de funções [SEP] exercise [SEP] pt [SEP] Dada a fórmula da função, calcule funções para valores de entrada específicos. [SEP]",Entradas e saídas de uma função[SEP]Entenda um pouco mais sobre funções.[SEP]Khan Academy (Português (Brasil)) >> Matemática por ano (Alinhada à BNCC) >> 9º Ano >> Álgebra: funções >> Entradas e s...,Entradas e saídas de uma função [SEP] 8e286a [SEP] source [SEP] 4 [SEP] pt [SEP] Entenda um pouco mais sobre funções. [SEP] Khan Academy (Português (Brasil)) >> Matemática por ano (Alinhada à BNCC...,367
4,t_0006d41a73a8,c_cde9544b589e,Графики на експоненциални функции (Алгебра 2 ниво)[SEP]Научи повече за графиките на сложните показателни функции от вида y=a*b^(x+c)+d.[SEP]Khan Academy (български език) >> Математика >> Алгебра (...,Чертане на експоненциално нарастване и намаляване[SEP]Начертай показателни функции от основния вид f(x)=a⋅rˣ.[SEP],Графики на експоненциални функции (Алгебра 2 ниво) [SEP] 000cf7 [SEP] source [SEP] 4 [SEP] bg [SEP] Научи повече за графиките на сложните показателни функции от вида y=a*b^(x+c)+d. [SEP] Khan Acad...,Чертане на експоненциално нарастване и намаляване [SEP] exercise [SEP] bg [SEP] Начертай показателни функции от основния вид f(x)=a⋅rˣ. [SEP],Графики на експоненциални функции (Алгебра 2 ниво)[SEP]Научи повече за графиките на сложните показателни функции от вида y=a*b^(x+c)+d.[SEP]Khan Academy (български език) >> Математика >> Алгебра (...,Графики на експоненциални функции (Алгебра 2 ниво) [SEP] 000cf7 [SEP] source [SEP] 4 [SEP] bg [SEP] Научи повече за графиките на сложните показателни функции от вида y=a*b^(x+c)+d. [SEP] Khan Acad...,375
...,...,...,...,...,...,...,...,...,...
245,t_00069b63a70a,c_99ed78127f4a,Transcripts[SEP][SEP]MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts[SEP]This lesson is focused on flow charts. It supports young people who are starting to program an...,The Art of Making La..: Written Transcript of this video lesson[SEP]Written Transcript of this video lesson[SEP]BLOSSOMS\t\n VIDEO\t\n LESSON\t\n TRANSCRIPT\t\n \t\n \n\t\n \nThe\t\n Art\t\...,Transcripts [SEP] 6e3ba4 [SEP] source [SEP] 3 [SEP] en [SEP] [SEP] MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts [SEP] This lesson is focused on flow charts. It sup...,The Art of Making La..: Written Transcript of this video lesson [SEP] document [SEP] en [SEP] Written Transcript of this video lesson [SEP] BLOSSOMS\t\n VIDEO\t\n LESSON\t\n TRANSCRIPT\t\n \t\...,Transcripts[SEP][SEP]MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts[SEP]This lesson is focused on flow charts. It supports young people who are starting to program an...,Transcripts [SEP] 6e3ba4 [SEP] source [SEP] 3 [SEP] en [SEP] [SEP] MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts [SEP] This lesson is focused on flow charts. It sup...,517
246,t_00069b63a70a,c_72eca0a54cb3,Transcripts[SEP][SEP]MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts[SEP]This lesson is focused on flow charts. It supports young people who are starting to program an...,Connections in the P..: Written Transcript of this video lesson in Arabic[SEP]Written Transcript of this video lesson in Arabic[SEP]‫�ب�س�م �ا�ل�ل�ه �ا�ل�ر�ح�م�ن �ا�ل�ر�ح�ي�م‬\n\n‫‪�.‬ج�ا�م�ع�ة �ا...,Transcripts [SEP] 6e3ba4 [SEP] source [SEP] 3 [SEP] en [SEP] [SEP] MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts [SEP] This lesson is focused on flow charts. It sup...,Connections in the P..: Written Transcript of this video lesson in Arabic [SEP] document [SEP] en [SEP] Written Transcript of this video lesson in Arabic [SEP] ‫�ب�س�م �ا�ل�ل�ه �ا�ل�ر�ح�م�ن �ا�ل�ر...,Transcripts[SEP][SEP]MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts[SEP]This lesson is focused on flow charts. It supports young people who are starting to program an...,Transcripts [SEP] 6e3ba4 [SEP] source [SEP] 3 [SEP] en [SEP] [SEP] MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts [SEP] This lesson is focused on flow charts. It sup...,517
247,t_00069b63a70a,c_ba9d887a1218,Transcripts[SEP][SEP]MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts[SEP]This lesson is focused on flow charts. It supports young people who are starting to program an...,English: From Psychology to Logic: Learning Computer Programming in the Kitchen[SEP]This lesson focuses on the biggest problem faced by any young programmer - i.e. the LOGIC BUILDING required whil...,Transcripts [SEP] 6e3ba4 [SEP] source [SEP] 3 [SEP] en [SEP] [SEP] MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts [SEP] This lesson is focused on flow charts. It sup...,English: From Psychology to Logic: Learning Computer Programming in the Kitchen [SEP] video [SEP] en [SEP] This lesson focuses on the biggest problem faced by any young programmer - i.e. the LOGIC...,Transcripts[SEP][SEP]MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts[SEP]This lesson is focused on flow charts. It supports young people who are starting to program an...,Transcripts [SEP] 6e3ba4 [SEP] source [SEP] 3 [SEP] en [SEP] [SEP] MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts [SEP] This lesson is focused on flow charts. It sup...,517
248,t_00069b63a70a,c_af71916db76f,Transcripts[SEP][SEP]MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts[SEP]This lesson is focused on flow charts. It supports young people who are starting to program an...,Connections in the P..: Written Transcript of this video lesson in English[SEP]Written Transcript of this video lesson in English[SEP]Salam. My name is Mohammad Zuheir Abu-Sbeih from King Fahd Uni...,Transcripts [SEP] 6e3ba4 [SEP] source [SEP] 3 [SEP] en [SEP] [SEP] MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts [SEP] This lesson is focused on flow charts. It sup...,Connections in the P..: Written Transcript of this video lesson in English [SEP] document [SEP] en [SEP] Written Transcript of this video lesson in English [SEP] Salam. My name is Mohammad Zuheir ...,Transcripts[SEP][SEP]MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts[SEP]This lesson is focused on flow charts. It supports young people who are starting to program an...,Transcripts [SEP] 6e3ba4 [SEP] source [SEP] 3 [SEP] en [SEP] [SEP] MIT Blossoms >> Engineering >> Flow Charts: Logical Thinking? >> Transcripts [SEP] This lesson is focused on flow charts. It sup...,517


## 封装预测函数

In [15]:
def predict_func(CFG, test_data):
    """Score every topic/content pair in ``test_data`` with a fine-tuned ranker.

    The model is a transformer backbone followed by mask-aware mean pooling and
    a single-logit linear head; the sigmoid of that logit is stored per row.

    Args:
        CFG: config object/class providing ``model_class_path``,
            ``model_finetuned_weight``, ``token_path``, ``sep_rep``,
            ``max_seq_len``, ``batch_size``, ``num_workers``, ``topic_col``,
            ``content_col`` and ``prob_colname``.  ``CFG.tokenizer`` is set as
            a side effect.
        test_data: DataFrame holding the ``CFG.topic_col`` and
            ``CFG.content_col`` text columns.

    Returns:
        The same ``test_data`` object, with the predictions written in place
        to the ``CFG.prob_colname`` column.
    """
    # Tokenizer: a truthy ``token_path`` means it lives in a sub-folder of the
    # checkpoint directory.  A string names that sub-folder; any other truthy
    # value keeps the historical 'tokenizer' folder name.
    token_path = getattr(CFG, 'token_path', None)
    if token_path:
        token_subdir = token_path if isinstance(token_path, str) else 'tokenizer'
        CFG.tokenizer = AutoTokenizer.from_pretrained(f'{CFG.model_class_path}/{token_subdir}')
    else:
        CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_class_path)

    # Input preprocessing
    def prepare_input(cfg, text_topic, text_content):
        """Tokenize one ``topic [SEP] content`` pair, padded to ``max_seq_len``."""
        # '[SEP]' markers already inside the texts are replaced so that the only
        # literal '[SEP]' left is the one joining topic and content.
        text_topic = text_topic.replace('[SEP]', cfg.sep_rep)
        text_content = text_content.replace('[SEP]', cfg.sep_rep)
        text = text_topic + '[SEP]' + text_content
        inputs = cfg.tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
            max_length=cfg.max_seq_len,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    # Dataset
    class TestDataset(Dataset):
        """Tokenizes rows lazily, one pair per ``__getitem__`` call."""

        def __init__(self, cfg, df):
            self.cfg = cfg
            self.text_topic = df[cfg.topic_col].values
            self.text_content = df[cfg.content_col].values

        def __len__(self):
            return len(self.text_topic)

        def __getitem__(self, item):
            return prepare_input(self.cfg, self.text_topic[item], self.text_content[item])

    def collate(inputs):
        """Trim every tensor of the batch to its longest attended length."""
        # NOTE(review): slicing from the left edge assumes the tokenizer pads
        # on the right - confirm `padding_side` for every checkpoint used.
        mask_len = int(inputs["attention_mask"].sum(axis=1).max())
        for k, v in inputs.items():
            inputs[k] = inputs[k][:, :mask_len]
        return inputs

    class MeanPooling(nn.Module):
        """Average the token embeddings, weighted by the attention mask."""

        def __init__(self):
            super(MeanPooling, self).__init__()

        def forward(self, last_hidden_state, attention_mask):
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            # Guard against division by zero for a fully masked row.
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            return mean_embeddings

    # Model
    class CustomModel(nn.Module):
        """Backbone + mean pooling + dropout + one-logit linear head."""

        def __init__(self, cfg, config_path=None, pretrained=False):
            super().__init__()
            self.cfg = cfg
            if config_path is None:
                self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
                self.config.hidden_dropout = 0.
                self.config.hidden_dropout_prob = 0.
                self.config.attention_dropout = 0.
                self.config.attention_probs_dropout_prob = 0.
            else:
                # SECURITY: torch.load unpickles the file, which can execute
                # arbitrary code - only point this at trusted checkpoints.
                self.config = torch.load(config_path)

            if pretrained:
                self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
            else:
                self.model = AutoModel.from_config(self.config)

            self.pool = MeanPooling()
            self.fc_dropout = nn.Dropout(0.1)
            self.fc = nn.Linear(self.config.hidden_size, 1)
            self._init_weights(self.fc)

        def _init_weights(self, module):
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)

        def forward(self, inputs):
            outputs_topic = self.model(**inputs)
            last_hidden_states_topic = outputs_topic[0]
            feature = self.pool(last_hidden_states_topic, inputs['attention_mask'])
            logits = self.fc(self.fc_dropout(feature))
            return logits

    # Dataloader (order must be preserved: predictions are assigned row by row)
    test_dataset = TestDataset(CFG, test_data)
    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # Build the architecture from the saved config, then load fine-tuned weights
    model = CustomModel(cfg=CFG, config_path=CFG.model_class_path+'/config.pth', pretrained=False)
    state = torch.load(f'{CFG.model_class_path}/{CFG.model_finetuned_weight}',
                       map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()
    model = model.to(device)

    # Batch prediction
    props = []
    with torch.no_grad():
        for inputs in tqdm(test_loader, total=len(test_loader)):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            logits = model(inputs)
            # reshape(-1) flattens (batch, 1) and also copes with a batch of one
            props.append(logits.sigmoid().to('cpu').numpy().reshape(-1))

    # np.concatenate rejects an empty list, so handle an empty test_data explicitly
    if props:
        prop_all = np.concatenate(props, axis=0)
    else:
        prop_all = np.empty(0, dtype=np.float32)

    test_data[CFG.prob_colname] = prop_all

    # Release memory; several rankers are run back to back in this notebook
    del model
    del state
    del prop_all
    del props
    gc.collect()
    if torch.cuda.is_available():
        # Return cached blocks to the driver so the next model starts clean
        torch.cuda.empty_cache()

    return test_data

In [16]:
def predict_func2(CFG, test_data):
    """Score topic/content pairs with a two-class, attention-pooled ranker.

    The backbone named by ``CFG.model_name`` is instantiated, the fine-tuned
    weights ``<CFG.model_class_path>/best_model.bin`` are loaded on top of it,
    and the softmax probability of class 1 is stored for every row.

    Args:
        CFG: config object/class providing ``model_name``, ``model_class_path``,
            ``max_seq_len``, ``batch_size``, ``num_workers`` and
            ``prob_colname``.  ``CFG.tokenizer`` and ``CFG.device`` are set as
            side effects.
        test_data: DataFrame holding ``topic_text`` and ``content_text`` columns.

    Returns:
        The same ``test_data`` object, with the predictions written in place
        to the ``CFG.prob_colname`` column.
    """
    CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)
    CFG.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def prepare_input(cfg, text_topic, text_content):
        """Tokenize one ``topic [SEP] content`` pair, padded to ``max_seq_len``."""
        # '[SEP]' markers inside the texts become spaces; a single literal
        # '[SEP]' then joins topic and content.
        text_topic = text_topic.replace('[SEP]', ' ')
        text_content = text_content.replace('[SEP]', ' ')
        text = text_topic + '[SEP]' + text_content
        inputs = cfg.tokenizer(text,
                               add_special_tokens=True,
                               max_length=cfg.max_seq_len,
                               truncation=True,
                               padding='max_length',
                               return_offsets_mapping=False)
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    def collate(inputs):
        """Trim every tensor of the batch to its longest attended length."""
        # NOTE(review): slicing from the left edge assumes the tokenizer pads
        # on the right - confirm `padding_side` for every backbone used.
        mask_len = int(inputs["attention_mask"].sum(axis=1).max())
        for k, v in inputs.items():
            inputs[k] = inputs[k][:, :mask_len]
        return inputs

    class TestDataset(Dataset):
        """Tokenizes rows lazily, one pair per ``__getitem__`` call."""

        def __init__(self, cfg, df):
            self.cfg = cfg
            self.text_topic = df['topic_text'].values
            self.text_content = df['content_text'].values

        def __len__(self):
            return len(self.text_topic)

        def __getitem__(self, item):
            return prepare_input(self.cfg, self.text_topic[item], self.text_content[item])

    class Model(nn.Module):
        """Backbone + learned attention pooling + two-class linear head.

        Attribute names (`bert`, `classifier`, `attention`) must stay as they
        are: they are the key prefixes expected by ``load_state_dict`` below.
        """

        def __init__(self, cfg):
            super().__init__()

            self.config = AutoConfig.from_pretrained(cfg.model_name, output_hidden_states=True)
            self.bert = AutoModel.from_pretrained(cfg.model_name, config=self.config)

            self.num_labels = 2
            self.loss_fct = nn.CrossEntropyLoss()

            self.dropout_fc = nn.Dropout(0.1)
            self.classifier = nn.Linear(self.config.hidden_size, self.num_labels)
            self._init_weights(self.classifier)
            self.attention = nn.Sequential(
                nn.Linear(self.config.hidden_size, 512),
                nn.Tanh(),
                nn.Linear(512, 1),
                nn.Softmax(dim=1)
            )
            # nn.Sequential matches none of the isinstance branches, so this
            # call leaves the attention layers at their default initialization.
            self._init_weights(self.attention)

        def _init_weights(self, module):
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)

        def feature(self, inputs):
            outputs = self.bert(**inputs)
            last_hidden_states = outputs[0]
            # Softmax over dim=1 turns the per-token scores into pooling weights
            weights = self.attention(last_hidden_states)
            feature = torch.sum(weights * last_hidden_states, dim=1)
            return feature, last_hidden_states

        def forward(self, inputs, labels=None):
            feature, last_hidden_states = self.feature(inputs)
            logits = self.classifier(self.dropout_fc(feature))

            outputs = (logits,) + (last_hidden_states,)

            if labels is not None:
                loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
                outputs = (loss,) + outputs

            return outputs

    # Dataloader (order must be preserved: predictions are assigned row by row)
    test_dataloader = DataLoader(dataset=TestDataset(CFG, test_data),
                                 batch_size=CFG.batch_size,
                                 shuffle=False, num_workers=CFG.num_workers,
                                 pin_memory=True, drop_last=False)

    # Instantiate the model and load the fine-tuned weights
    best_model = Model(CFG)
    state = torch.load(CFG.model_class_path + '/best_model.bin', map_location=torch.device('cpu'))
    best_model.load_state_dict(state)
    best_model.to(CFG.device)
    best_model.eval()

    # Batch prediction
    batch_probs = []
    with torch.no_grad():
        for inputs in tqdm(test_dataloader, total=len(test_dataloader)):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(CFG.device)
            logits = best_model(inputs)[0]
            batch_probs.append(torch.softmax(logits, -1).cpu().numpy())

    # Stacking an empty list raises, so handle an empty test_data explicitly
    if batch_probs:
        prop_all = np.concatenate(batch_probs, axis=0)
    else:
        prop_all = np.empty((0, best_model.num_labels), dtype=np.float32)

    # Column 1 is the probability of the second of the two classes
    test_data[CFG.prob_colname] = prop_all[:, 1]

    # Release memory; several rankers are run back to back in this notebook
    del best_model
    del state
    del batch_probs
    del prop_all
    gc.collect()
    if torch.cuda.is_available():
        # Return cached blocks to the driver so the next model starts clean
        torch.cuda.empty_cache()

    return test_data

## 排序模型 1

In [17]:
class CFG:
    """Settings for ranking model 1, consumed by `predict_func`."""
    # Checkpoint directory and the fine-tuned weight file inside it
    model_class_path = CLF_CFG_1.model_path
    model_finetuned_weight = 'mdeberta-v3-base_fold0_simcsemodel_1k_cv6895_best.pth'
    token_path = None  # falsy: the tokenizer is loaded from model_class_path itself

    # Text columns to pair up, and the replacement for '[SEP]' found inside them
    topic_col = 'topic_text_all'
    content_col = 'content_text_all'
    sep_rep = ' '

    # Tokenization length and data loading
    max_seq_len = CLF_CFG_1.max_seq_len
    batch_size = CLF_CFG_1.batch_size
    num_workers = 4

    # Column of test_data that receives the predicted probabilities
    prob_colname = 'prop_3'


test_data = predict_func(CFG=CFG, test_data=test_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/8 [00:00<?, ?it/s]

## 排序模型 2

In [18]:
class CFG:
    """Settings for ranking model 2, consumed by `predict_func`."""
    # Checkpoint directory and the fine-tuned weight file inside it
    model_class_path = CLF_CFG_2.model_path
    model_finetuned_weight = 'mdeberta-v3-base_fold0_simcsemodel_lr_cv6647_best.pth'
    token_path = None  # falsy: the tokenizer is loaded from model_class_path itself

    # Text columns to pair up, and the replacement for '[SEP]' found inside them
    topic_col = 'topic_text'
    content_col = 'content_text'
    sep_rep = ' '

    # Tokenization length and data loading
    max_seq_len = CLF_CFG_2.max_seq_len
    batch_size = CLF_CFG_2.batch_size
    num_workers = 4

    # Column of test_data that receives the predicted probabilities
    prob_colname = 'prop_6'


test_data = predict_func(CFG=CFG, test_data=test_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/8 [00:00<?, ?it/s]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## 排序模型 3

In [19]:
class CFG:
    """Settings for ranking model 3, consumed by `predict_func`."""
    # Checkpoint directory and the fine-tuned weight file inside it
    model_class_path = CLF_CFG_2.model_path
    model_finetuned_weight = 'mdeberta-v3-base_fold0_simcsemodel_fgm_ema_cv6963_best.pth'
    token_path = None  # falsy: the tokenizer is loaded from model_class_path itself

    # Text columns to pair up, and the replacement for '[SEP]' found inside them
    topic_col = 'topic_text'
    content_col = 'content_text'
    sep_rep = ' '

    # Tokenization length and data loading
    max_seq_len = CLF_CFG_2.max_seq_len
    batch_size = CLF_CFG_2.batch_size
    num_workers = 4

    # Column of test_data that receives the predicted probabilities
    prob_colname = 'prop_7'


test_data = predict_func(CFG=CFG, test_data=test_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/8 [00:00<?, ?it/s]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## 排序模型 4

In [20]:
class CFG:
    """Settings for ranking model 4, consumed by `predict_func`."""
    # Checkpoint directory and the fine-tuned weight file inside it
    model_class_path = CLF_CFG_3.model_path
    model_finetuned_weight = 'microsoft-mdeberta-v3-base_fold0_best.pth'
    token_path = 'tokenizer'  # truthy: tokenizer comes from the 'tokenizer' sub-folder

    # Text columns to pair up, and the replacement for '[SEP]' found inside them
    topic_col = 'topic_text_all'
    content_col = 'content_text_all'
    sep_rep = ' '

    # Tokenization length and data loading
    max_seq_len = CLF_CFG_3.max_seq_len
    batch_size = CLF_CFG_3.batch_size
    num_workers = 4

    # Column of test_data that receives the predicted probabilities
    prob_colname = 'prop_8'


test_data = predict_func(CFG=CFG, test_data=test_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/8 [00:00<?, ?it/s]

## 排序模型 5

In [21]:
class CFG:
    """Settings for ranking model 5, consumed by `predict_func`."""
    # Checkpoint directory and the fine-tuned weight file inside it
    model_class_path = CLF_CFG_2.model_path
    model_finetuned_weight = 'mdeberta-v3-base_fold0_warmup0_valid1k_cv6832_best.pth'
    token_path = None  # falsy: the tokenizer is loaded from model_class_path itself

    # Text columns to pair up, and the replacement for '[SEP]' found inside them
    topic_col = 'topic_text_all'
    content_col = 'content_text_all'
    sep_rep = ' '

    # Tokenization length and data loading
    max_seq_len = CLF_CFG_2.max_seq_len
    batch_size = CLF_CFG_2.batch_size
    num_workers = 4

    # Column of test_data that receives the predicted probabilities
    prob_colname = 'prop_9'


test_data = predict_func(CFG=CFG, test_data=test_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/8 [00:00<?, ?it/s]

## 排序模型 6

In [22]:
class CFG:
    """Settings for ranking model 6, consumed by `predict_func`."""
    # Checkpoint directory and the fine-tuned weight file inside it
    model_class_path = CLF_CFG_5.model_path
    model_finetuned_weight = 'microsoft-mdeberta-v3-base_fold0_best.pth'
    token_path = 'tokenizer'  # truthy: tokenizer comes from the 'tokenizer' sub-folder

    # Text columns to pair up, and the replacement for '[SEP]' found inside them
    topic_col = 'topic_text_all'
    content_col = 'content_text_all'
    sep_rep = ' '

    # Tokenization length and data loading
    max_seq_len = CLF_CFG_5.max_seq_len
    batch_size = CLF_CFG_5.batch_size
    num_workers = 4

    # Column of test_data that receives the predicted probabilities
    prob_colname = 'prop_11'


test_data = predict_func(CFG=CFG, test_data=test_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/8 [00:00<?, ?it/s]

## 排序模型 7

In [23]:
class CFG:
    """Settings for ranking model 7, consumed by `predict_func`."""
    # Checkpoint directory and the fine-tuned weight file inside it
    model_class_path = CLF_CFG_2.model_path
    model_finetuned_weight = 'mdeberta-v3-base_fold0_valid1k_simcsemodel_fgm_ema_cv7149_best.pth'
    token_path = None  # falsy: the tokenizer is loaded from model_class_path itself

    # Text columns to pair up, and the replacement for '[SEP]' found inside them
    topic_col = 'topic_text_all'
    content_col = 'content_text_all'
    sep_rep = ' '

    # Tokenization length and data loading
    max_seq_len = CLF_CFG_2.max_seq_len
    batch_size = CLF_CFG_2.batch_size
    num_workers = 4

    # Column of test_data that receives the predicted probabilities
    prob_colname = 'prop_12'


test_data = predict_func(CFG=CFG, test_data=test_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/8 [00:00<?, ?it/s]

## 排序模型 8

In [24]:
class CFG:
    """Settings for ranking model 8, consumed by `predict_func`."""
    # Checkpoint directory and the fine-tuned weight file inside it
    model_class_path = CLF_CFG_2.model_path
    model_finetuned_weight = 'mdeberta-v3-base_fold0_pretrain_valid1k_cv6756_best.pth'
    token_path = None  # falsy: the tokenizer is loaded from model_class_path itself

    # Text columns to pair up, and the replacement for '[SEP]' found inside them
    topic_col = 'topic_text_all'
    content_col = 'content_text_all'
    sep_rep = ' '

    # Tokenization length and data loading
    max_seq_len = CLF_CFG_2.max_seq_len
    batch_size = CLF_CFG_2.batch_size
    num_workers = 4

    # Column of test_data that receives the predicted probabilities
    prob_colname = 'prop_13'


test_data = predict_func(CFG=CFG, test_data=test_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/8 [00:00<?, ?it/s]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## 排序模型 9

In [25]:
class CFG:
    """Settings for ranking model 9, consumed by `predict_func`."""
    # Checkpoint directory and the fine-tuned weight file inside it
    model_class_path = CLF_CFG_2.model_path
    model_finetuned_weight = 'mdeberta-v3-base_fold0_recall50_valid1k_fgm_ema_cv7086_best.pth'
    token_path = None  # falsy: the tokenizer is loaded from model_class_path itself

    # Text columns to pair up; here '[SEP]' found inside them becomes '[UNK]'
    topic_col = 'topic_text_all'
    content_col = 'content_text_all'
    sep_rep = '[UNK]'

    # Tokenization length and data loading
    max_seq_len = CLF_CFG_2.max_seq_len
    batch_size = CLF_CFG_2.batch_size
    num_workers = 4

    # Column of test_data that receives the predicted probabilities
    prob_colname = 'prop_14'


test_data = predict_func(CFG=CFG, test_data=test_data)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/8 [00:00<?, ?it/s]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## 排序模型 10

In [26]:
class CFG:
    """Settings for ranking model 10, consumed by `predict_func2`."""
    # Backbone used for the tokenizer, config and base model
    model_name = '/kaggle/input/mdeberta-v3-base/mdeberta-v3-base'
    # Directory holding the fine-tuned 'best_model.bin'
    model_class_path = CLF_CFG_4.model_path

    # Tokenization length and data loading
    max_seq_len = CLF_CFG_4.max_seq_len
    batch_size = CLF_CFG_4.batch_size
    num_workers = 4

    # Column of test_data that receives the predicted probabilities
    prob_colname = 'prop_5'


test_data = predict_func2(CFG=CFG, test_data=test_data)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at /kaggle/input/mdeberta-v3-base/mdeberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifer.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifer.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining mod

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/8 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

## 排序模型 11

In [27]:
class CFG:
    """Inference settings for this ranking model; shared values come from ``CLF_CFG_6``."""
    # Model locations: base backbone path and the directory taken from CLF_CFG_6.
    model_name = '/kaggle/input/xlmroberta/xlm-roberta-base'
    model_class_path = CLF_CFG_6.model_path
    # Sequence length and batching.
    max_seq_len = CLF_CFG_6.max_seq_len
    batch_size = CLF_CFG_6.batch_size
    # NOTE(review): the captured output shows repeated huggingface/tokenizers
    # fork warnings around this call; presumably caused by worker processes --
    # confirm against predict_func2.
    num_workers = 4
    # Presumably the name of the column predict_func2 writes its probabilities
    # to (a `prop_15` column exists in test_data afterwards) -- verify.
    prob_colname = 'prop_15'


# Run this model over the candidate pairs; predict_func2 returns the updated frame.
test_data = predict_func2(CFG, test_data)

Some weights of the model checkpoint at /kaggle/input/xlmroberta/xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/6 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

## 权重融合 

In [28]:
# Side-by-side view of every `prop_*` probability column.
test_data.loc[:, [name for name in test_data.columns if name.startswith('prop_')]]

Unnamed: 0,prop_3,prop_6,prop_7,prop_8,prop_9,prop_11,prop_12,prop_13,prop_14,prop_5,prop_15
0,0.000003,0.000003,8.132501e-07,0.000004,0.000004,0.005722,5.055330e-07,0.000004,0.000003,0.000003,0.000012
1,0.000003,0.000003,9.961373e-07,0.000006,0.000020,0.024047,1.490672e-06,0.000006,0.000009,0.000005,0.000052
2,0.000004,0.000003,1.125036e-06,0.000012,0.000006,0.008615,7.348532e-07,0.000004,0.000003,0.000003,0.000051
3,0.000012,0.000003,1.091572e-06,0.000045,0.000008,0.039374,7.357309e-06,0.000012,0.000004,0.000040,0.004492
4,0.001029,0.000007,1.606668e-06,0.000023,0.000021,0.050640,3.875716e-05,0.000017,0.000009,0.000003,0.000012
...,...,...,...,...,...,...,...,...,...,...,...
245,0.000003,0.000003,8.694565e-07,0.000004,0.000004,0.005722,6.407964e-07,0.000004,0.000003,0.000003,0.000012
246,0.000005,0.000003,1.098923e-06,0.000004,0.000004,0.005872,8.311657e-07,0.000005,0.000007,0.000004,0.000012
247,0.000003,0.000003,1.153533e-06,0.000004,0.000004,0.005887,8.664973e-07,0.000004,0.000003,0.000003,0.000012
248,0.000004,0.000003,1.002914e-06,0.000004,0.000004,0.006074,7.320742e-07,0.000004,0.000006,0.000004,0.000012


In [29]:
# The cells below key everything on `topic_id`, so rename the `id` column in place.
test_data.rename({'id': 'topic_id'}, axis='columns', inplace=True)

In [30]:
# First-round selection parameters: minimum fused score a candidate must reach
# and the maximum number of contents kept per topic.
best_thres = 0.15
best_n_rec = 43

# Blending weights: cols[i] is multiplied by weights[i].
cols = ['prop_3', 'prop_6', 'prop_5', 'prop_15', 'prop_7', 'prop_8', 'prop_9', 'prop_11', 'prop_12', 'prop_13', 'prop_14']
weights = [
    0.06261942, 0.01710989, 0.07492457, 0.04533291, 0.1506641,  0.05495807,
    0.08113562, 0.07582885, 0.22189808, 0.05397026, 0.14677599
]
# zip() stops at the shorter sequence, so a column added without a weight (or
# vice versa) would be dropped from the blend silently. Fail loudly instead.
assert len(cols) == len(weights), (
    f'{len(cols)} probability columns but {len(weights)} weights'
)

# Weighted sum of the per-model probabilities, accumulated column by column.
test_data['score'] = 0.
for c, w in zip(cols, weights):
    test_data['score'] += w * test_data[c]

# Within each topic, best-scoring candidates first; later cells rely on this
# order when they take the first rows of every topic.
test_data = test_data.sort_values(['topic_id', 'score'], ascending=[True, False]).reset_index(drop=True)

In [31]:
# Backup of the fused ranking, used later to fill in topics left without predictions.
test_copy = test_data.loc[:, ['topic_id', 'content_id', 'score']].copy()

In [32]:
# Compute the local validation score (mean F2) of the first-round predictions.

if Local_Test:
    df_target_metric = pd.merge(df_test, df_target, on='topic_id', how='left')

    # Keep candidates whose fused score reaches the threshold, at most
    # `best_n_rec` per topic. test_data is sorted by score (descending) inside
    # each topic, so GroupBy.head keeps the highest-scoring rows. GroupBy.head is
    # used instead of apply(lambda g: g.head(n)) because it does not depend on
    # how `apply` treats the grouping column.
    test_sub = test_data[test_data['score'] >= best_thres].reset_index(drop=True)
    sub_df = (
        test_sub.groupby('topic_id').head(best_n_rec)
        .groupby('topic_id')['content_id'].agg(list)
        .to_frame(name='preds')
        .reset_index()
    )
    sub_df['preds'] = sub_df['preds'].apply(' '.join)

    df_test_metric = df_target_metric[['topic_id', 'content_ids']].copy()
    df_test_metric = pd.merge(df_test_metric, sub_df, on='topic_id', how='left')
    df_metric = df_test_metric[['content_ids', 'preds']].copy()
    # astype(str) turns a missing value into the single token 'nan'; a topic
    # without predictions therefore matches nothing and scores 0.
    df_metric['content_ids'] = df_metric['content_ids'].astype(str).str.split()
    df_metric['preds'] = df_metric['preds'].astype(str).str.split()

    f2_scores = []
    for true_ids, pred_ids in zip(df_metric['content_ids'], df_metric['preds']):
        true_content_ids = set(true_ids)
        pred_content_ids = set(pred_ids)
        tp = len(true_content_ids & pred_content_ids)
        fp = len(pred_content_ids - true_content_ids)
        fn = len(true_content_ids - pred_content_ids)
        # F2 = 5*tp / (5*tp + fp + 4*fn), written with the factor 5 divided out.
        # The unused precision/recall values were dropped: computing recall
        # divided by zero whenever the ground truth was empty.
        denom = tp + 0.2 * fp + 0.8 * fn
        f2_scores.append(tp / denom if denom else 0)
    score = np.mean(f2_scores)
    print('reranker f2 score:', score)

In [33]:
# recall@50 results

# Candidates whose fused score reaches the first-round threshold.
df1 = test_data[test_data['score'] >= best_thres].reset_index(drop=True)
# At most `best_n_rec` contents per topic, joined into one space-separated
# string. test_data is already sorted by score within each topic, so
# GroupBy.head keeps the best ones; it replaces apply(lambda g: g.head(n)),
# whose handling of the grouping column differs between pandas versions.
sub_df1 = (
    df1.groupby('topic_id').head(best_n_rec)
    .groupby('topic_id')['content_id'].agg(list)
    .to_frame(name='preds')
    .reset_index()
)
sub_df1['preds'] = sub_df1['preds'].apply(' '.join)

sub_df1

Unnamed: 0,topic_id,preds
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c_76231f9d0b5e
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c_ebb7fdf10a7e c_14bf71640ecd
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_5e375cf14c47 c_d7a0d7eaf799 c_b972646631cb c_1c57a1316568
4,t_4054df11a74e,c_3695c5dc1df6


## Second-round rerank for topics left without predictions

In [34]:
# sub_df1 only contains topics that received at least one first-round
# prediction; fewer rows than df_test means some topics still have none
# (assumes df_test holds one row per topic -- TODO confirm).
if len(sub_df1) < len(df_test):

    topics_succ = sub_df1.topic_id.unique().tolist()

    # Topics that need a second ranking pass.
    df_topic2 = (
        df_topic_copy[~df_topic_copy['id'].isin(topics_succ)]
        .reset_index(drop=True)
        .rename(columns={'text2': 'topic_text', 'text2_all': 'topic_text_all'})
    )

    # One row per (topic, recalled content) pair, with the content texts attached.
    df_topic2['content_id'] = df_topic2['recall_ids'].astype(str).apply(lambda x: x.split())
    df_topic2 = df_topic2.explode("content_id")
    df_topic2 = pd.merge(df_topic2, df_content, on='content_id', how='left')

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of df_topic2.
    test_data2 = df_topic2[['id', 'content_id', 'topic_text', 'content_text','topic_text_all','content_text_all']].copy()
    test_data2['text_all'] = test_data2['topic_text'] + '[SEP]' + test_data2['content_text']
    test_data2['text_all_all'] = test_data2['topic_text_all'] + '[SEP]' + test_data2['content_text_all']
    test_data2['text_len'] = test_data2['text_all'].apply(len)
    # Order rows by text length (presumably so batches hold similar lengths -- verify).
    test_data2 = test_data2.sort_values(['text_len']).reset_index(drop=True)

    display(test_data2)

    # Use the single model with the highest CV first.

    class CFG:
        batch_size = CLF_CFG_2.batch_size
        max_seq_len = CLF_CFG_2.max_seq_len
        model_class_path = CLF_CFG_2.model_path
        model_finetuned_weight = 'mdeberta-v3-base_fold0_valid1k_simcsemodel_fgm_ema_cv7149_best.pth'
        num_workers = 4
        topic_col = 'topic_text_all'
        content_col = 'content_text_all'
        prob_colname = 'prop'
        token_path = None
        sep_rep = ' '


    test_data2 = predict_func(CFG, test_data2)

    test_data2.rename(columns={'id':'topic_id'}, inplace=True)
    test_data2['score'] = 1.0*test_data2['prop']
    test_data2 = test_data2.sort_values(['topic_id', 'score'], ascending=[True, False]).reset_index(drop=True)

    # Keep the top candidates of every remaining topic, with no score threshold.
    # NOTE(review): the previous version also built a thresholded selection
    # (score >= 0.003) but overwrote it on the very next statement, so the
    # unthresholded selection is what actually ran and is what is kept here.
    # A dedicated name is used so the first-round `best_thres` / `best_n_rec`
    # are not clobbered if earlier cells are re-run.
    second_round_n_rec = 4
    sub_df2 = (
        test_data2.groupby('topic_id').head(second_round_n_rec)
        .groupby('topic_id')['content_id'].agg(list)
        .to_frame(name='preds')
        .reset_index()
    )
    sub_df2['preds'] = sub_df2['preds'].apply(' '.join)

    display(sub_df2)

    # Merge first- and second-round predictions.
    sub_df = pd.concat([sub_df1, sub_df2]).reset_index(drop=True)

else:
    sub_df = sub_df1.copy()

### 补全

In [35]:
# Align predictions with df_test; topics without predictions get NaN in `preds`.
sub_df = pd.merge(df_test[['topic_id']], sub_df, on='topic_id', how='left')

# A final fill-in is still needed for the few topics that remain empty.
n_fallback = 4  # contents taken per topic from the first-round ranking
if sub_df['preds'].isna().any():
    # test_copy is sorted by score within each topic, so GroupBy.head returns
    # the top-scoring first-round candidates regardless of the threshold.
    test_orig = (
        test_copy.groupby('topic_id').head(n_fallback)
        .groupby('topic_id')['content_id'].agg(list)
        .to_frame(name='preds_orig')
        .reset_index()
    )
    test_orig['preds_orig'] = test_orig['preds_orig'].apply(' '.join)
    sub_df = sub_df.merge(test_orig, on='topic_id', how='left')
    # Only rows with missing predictions take the fallback value.
    sub_df['preds'] = sub_df['preds'].fillna(sub_df['preds_orig'])

# Select and rename by name rather than by column position.
sub_df = sub_df[['topic_id', 'preds']].rename(columns={'preds': 'content_ids'})

display(sub_df)
# Number of topics that still have no prediction.
print(sub_df['content_ids'].isna().sum())

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c_76231f9d0b5e
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c_ebb7fdf10a7e c_14bf71640ecd
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_5e375cf14c47 c_d7a0d7eaf799 c_b972646631cb c_1c57a1316568
4,t_4054df11a74e,c_3695c5dc1df6


0


In [36]:
# Compute the local validation score (mean F2) again, now on the final predictions.

if Local_Test:
    df_target_metric = pd.merge(df_test, df_target, on='topic_id', how='left')
    # Both frames carry a `content_ids` column, so the merge suffixes them:
    # `_x` is the ground truth from df_target, `_y` the predictions from sub_df.
    df_test_metric = pd.merge(df_target_metric, sub_df, on='topic_id', how='left')
    df_metric = df_test_metric[['content_ids_x', 'content_ids_y']].copy()
    # astype(str) turns a missing value into the single token 'nan', which
    # matches nothing and therefore scores 0.
    df_metric['content_ids'] = df_metric['content_ids_x'].astype(str).str.split()
    df_metric['preds'] = df_metric['content_ids_y'].astype(str).str.split()

    f2_scores = []
    for true_ids, pred_ids in zip(df_metric['content_ids'], df_metric['preds']):
        true_content_ids = set(true_ids)
        pred_content_ids = set(pred_ids)
        tp = len(true_content_ids & pred_content_ids)
        fp = len(pred_content_ids - true_content_ids)
        fn = len(true_content_ids - pred_content_ids)
        # F2 = 5*tp / (5*tp + fp + 4*fn), written with the factor 5 divided out.
        # The unused precision/recall values were dropped: computing recall
        # divided by zero whenever the ground truth was empty.
        denom = tp + 0.2 * fp + 0.8 * fn
        f2_scores.append(tp / denom if denom else 0)
    score = np.mean(f2_scores)
    print('reranker f2 score:', score)

# Generate submission

In [37]:
# Final look at the submission frame (topic_id, content_ids) before it is written to disk.
display(sub_df)

Unnamed: 0,topic_id,content_ids
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c_76231f9d0b5e
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c_ebb7fdf10a7e c_14bf71640ecd
2,t_00069b63a70a,c_11a1dc0bfb99
3,t_0006d41a73a8,c_0c6473c3480d c_5e375cf14c47 c_d7a0d7eaf799 c_b972646631cb c_1c57a1316568
4,t_4054df11a74e,c_3695c5dc1df6


In [38]:
# Write the submission file without the DataFrame index.
sub_df.to_csv(path_or_buf='submission.csv', index=False)