In [1]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory in use)
# before any training is started.
!nvidia-smi

Wed Feb  8 01:47:01 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.47.03    Driver Version: 510.47.03    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    28W /  70W |      0MiB / 15360MiB |     10%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!mkdir -p .kaggle
!cp "./kaggle.json" .kaggle/
!chmod 600 .kaggle/kaggle.json
!cp -r .kaggle /root

!kaggle -v

Kaggle API 1.5.12


In [3]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125938 sha256=39a6c7a64bb3aacb5ba9f0104ce1ec63556deb14d1d7730d2a5d0c536983a513
  Stored in directory: /root/.cache/pip/wheels/bf/06/fb/d59c1e5bd1dac7f6cf61ec0036cc3a10ab8fecaa6b2c3d3ee9
Successfully built sentence_transformers
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.2.2
[0m

In [3]:
!kaggle competitions download -c learning-equality-curriculum-recommendations
!mkdir -p input_data/learning-equality-curriculum-recommendations
!unzip learning-equality-curriculum-recommendations.zip -d input_data/learning-equality-curriculum-recommendations
!rm learning-equality-curriculum-recommendations.zip

!dname="lecr-1st-train-data" && \
 kaggle datasets download -d takamichitoda/"$dname" && \
 mkdir -p input_data/"$dname"  && \
 unzip "$dname".zip -d input_data/"$dname"  && \
 rm "$dname".zip

!dname="lecr-cut-train-dataset" && \
 kaggle datasets download -d takamichitoda/"$dname" && \
 mkdir -p input_data/"$dname"  && \
 unzip "$dname".zip -d input_data/"$dname"  && \
 rm "$dname".zip

!kaggle datasets download -d takamichitoda/lecr-topic-with-parent
!mkdir -p input_data/lecr-topic-with-parent
!unzip lecr-topic-with-parent.zip -d input_data/lecr-topic-with-parent
!rm lecr-topic-with-parent.zip

!dname="lecr-cv-data" && \
 kaggle datasets download -d takamichitoda/"$dname" && \
 mkdir -p input_data/"$dname"  && \
 unzip "$dname".zip -d input_data/"$dname"  && \
 rm "$dname".zip

Downloading learning-equality-curriculum-recommendations.zip to /home/jupyter
 95%|██████████████████████████████████████▉  | 241M/254M [00:02<00:00, 121MB/s]
100%|█████████████████████████████████████████| 254M/254M [00:02<00:00, 126MB/s]


In [1]:
import os
import random
import numpy as np
import torch
import json
import pandas as pd
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, InputExample
from sentence_transformers import evaluation
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Set TOKENIZERS_PARALLELISM in the kernel's environment before any tokenizer is created.
# NOTE(review): presumably read by the Hugging Face `tokenizers` library to keep its
# internal parallelism enabled — confirm this is the intended value when DataLoader
# worker processes are used.
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [2]:
class CFG:
    """Paths and hyper-parameters shared by the cells of this notebook."""

    # ---- input locations (created by the download cell) ----
    INPUT = './input_data/learning-equality-curriculum-recommendations'
    TRAIN_1ST = './input_data/lecr-1st-train-data/first_stage_train.json'
    CUT_TEXT = './input_data/lecr-cut-train-dataset'
    CV_SPLIT = './input_data/lecr-cv-data'
    TOPIC_WITH_PARENT = './input_data/lecr-topic-with-parent/topics_has_content_with_parent_title.csv'

    # ---- backbone and output directory ----
    # Other checkpoints that were tried:
    #   'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    #   'sentence-transformers/all-MiniLM-L6-v2'
    MODEL_NAME = 'xlm-roberta-base'
    OUTPUT = './output/xlm-roberta'

    # ---- optimisation ----
    SEED = 42
    N_EPOCH = 50
    BS = 128  # 256 was also tried
    LR = 5e-05
    WARMUP_RATE = 0.0     # fraction of all training steps spent warming up
    MAX_GRAD_NORM = 1.0   # forwarded to fit() as max_grad_norm
    MARGIN = 0.5          # only referenced by the commented-out ContrastiveLoss

In [3]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation of the running interpreter is fixed at start-up; this
    # value only reaches processes that inherit the environment afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers every visible GPU, not only the current one
    torch.backends.cudnn.deterministic = True
    # Autotuning may select a different convolution algorithm from run to run,
    # so it has to be off for results to be repeatable.
    torch.backends.cudnn.benchmark = False

In [4]:
# ---- tabular inputs ----
# Missing text fields become empty strings so later string handling never sees NaN.
content_df = pd.read_csv(f'{CFG.INPUT}/content.csv').fillna('')

topics_df = pd.read_csv(CFG.TOPIC_WITH_PARENT)
correlations_df = pd.read_csv(f'{CFG.CV_SPLIT}/correlations_with_fold.csv')

# Attach the topic columns to every (topic_id, content_ids, fold) row; `id` duplicates
# `topic_id` after the join and is dropped.
correlations_with_topics_df = correlations_df.merge(topics_df, left_on='topic_id', right_on='id').drop(columns='id')

# Fold 0 goes to the validation frame, every other fold to the training frame.
valid_correlations_with_topics_df = correlations_with_topics_df.query('fold==0').fillna('')
train_correlations_with_topics_df = correlations_with_topics_df.query('fold!=0').fillna('')

# ---- JSON inputs ----
# Opened explicitly as UTF-8: open()'s default encoding depends on the locale,
# while JSON text is UTF-8.

# Records with 'topic_id', 'content_id' and 'label' keys.
with open(CFG.TRAIN_1ST, 'r', encoding='utf-8') as f:
    first_stage_train = json.load(f)

# content id -> text fed to the encoder.
with open(f'{CFG.CUT_TEXT}/content_joint_sentences.json', 'r', encoding='utf-8') as f:
    content_joint_sentences = json.load(f)
# topic id -> text fed to the encoder.
with open(f'{CFG.CUT_TEXT}/topic_joint_sentence.json', 'r', encoding='utf-8') as f:
    topic_joint_sentence = json.load(f)

In [5]:
#from sentence_transformers.util import dot_score
from sentence_transformers import SentenceTransformer, models

In [6]:
# Seed before building the model: resizing the embedding matrix below creates a
# randomly initialised row for the new token, so seeding only right before
# training would leave that initialisation different on every run.
seed_everything(CFG.SEED)

# The backbone is assembled from modules instead of SentenceTransformer(CFG.MODEL_NAME)
# so the tokenizer can be extended before the model is wrapped.
word_embedding_model = models.Transformer(CFG.MODEL_NAME)

# Register the custom separator as a special token and grow the embedding matrix to
# match the new vocabulary size.
# NOTE(review): presumably "<my_sep>" is the separator used inside the pre-built
# joint sentences — confirm against the dataset that produced them.
tokens = ["<my_sep>"]
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

# Pooling with the library's default settings turns token embeddings into one
# fixed-size sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Losses that were tried before settling on the one below:
#   losses.ContrastiveLoss(model=model, margin=CFG.MARGIN)
#       (optionally distance_metric=losses.SiameseDistanceMetric.EUCLIDEAN)
#   losses.OnlineContrastiveLoss(model=model)
#   losses.MultipleNegativesRankingLoss(model=model, scale=1.0, similarity_fct=dot_score)
train_loss = losses.MultipleNegativesRankingLoss(model=model)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaModel: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
# Training pairs for MultipleNegativesRankingLoss: only (topic, content) pairs whose
# label is non-zero are kept, and no label is attached to the example.
# The label is checked before the two text lookups so that discarded records cost
# nothing beyond the filter itself.
examples = [
    InputExample(texts=[
        topic_joint_sentence[ids['topic_id']],
        content_joint_sentences[ids['content_id']],
    ])
    for ids in tqdm(first_stage_train)
    if ids['label'] != 0
]

  0%|          | 0/2668159 [00:00<?, ?it/s]

In [8]:
# Inputs for the information-retrieval evaluator, built from the validation fold.

# topic id -> set of content ids that are correct for that topic
# (`content_ids` is a whitespace-separated string).
relevant_docs = {
    topic_id: set(content_ids.split())
    for topic_id, content_ids in tqdm(
        zip(valid_correlations_with_topics_df['topic_id'],
            valid_correlations_with_topics_df['content_ids']),
        total=len(valid_correlations_with_topics_df),
    )
}

# topic id -> query text.
queries = {
    topic_id: topic_joint_sentence[topic_id]
    for topic_id in tqdm(
        valid_correlations_with_topics_df['topic_id'],
        total=len(valid_correlations_with_topics_df),
    )
}

# content id -> document text; the corpus is every content item, not only the
# ones referenced by the validation fold.
corpus = {
    content_id: content_joint_sentences[content_id]
    for content_id in tqdm(content_df['id'], total=len(content_df))
}

  0%|          | 0/11113 [00:00<?, ?it/s]

  0%|          | 0/11113 [00:00<?, ?it/s]

  0%|          | 0/154047 [00:00<?, ?it/s]

In [10]:
# Retrieval evaluator handed to model.fit(): validation topics are the queries,
# all contents form the corpus, and precision/recall are reported at k=50 and k=100.
# Results are also written to a CSV file (write_csv=True).
ir_evaluator = evaluation.InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    precision_recall_at_k=[50, 100],
    show_progress_bar=True,
    name='ir',
    write_csv=True,
)

In [None]:
# Re-seed right before training so the shuffling order of the DataLoader is repeatable.
seed_everything(CFG.SEED)
train_dataloader = DataLoader(examples, shuffle=True, batch_size=CFG.BS)

# Number of optimisation steps in one epoch; also used as the checkpoint interval.
n_step = len(train_dataloader)
print("org n_step:", n_step)
# Uncomment to override the interval by hand:
#n_step = 2500
print("update n_step:", n_step)

# Warm-up length is a fraction (CFG.WARMUP_RATE) of all training steps.
warmup_steps = int(len(train_dataloader) * CFG.N_EPOCH * CFG.WARMUP_RATE)
print('warmup_steps:', warmup_steps)

# evaluation_steps is left at its default, so the evaluator runs after every epoch.
model.fit([(train_dataloader, train_loss)],
          epochs=CFG.N_EPOCH,
          evaluator=ir_evaluator,
          #evaluation_steps=n_step,
          warmup_steps=warmup_steps,
          max_grad_norm=CFG.MAX_GRAD_NORM,
          optimizer_params={'lr': CFG.LR},
          scheduler='warmupcosine',
          use_amp=True,
          output_path=f'{CFG.OUTPUT}',
          checkpoint_path=f'{CFG.OUTPUT}/checkpoints/',
          # With checkpoint_path set, fit() otherwise saves every 500 steps and keeps
          # every checkpoint; over N_EPOCH * n_step steps that fills the disk with
          # full model copies. Save once per epoch and keep only the newest few.
          checkpoint_save_steps=n_step,
          checkpoint_save_total_limit=3,
          show_progress_bar=True,
)

org n_step: 1884
update n_step: 1884
warmup_steps: 0


Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1884 [00:00<?, ?it/s]

Batches:   0%|          | 0/348 [00:00<?, ?it/s]


Corpus Chunks:   0% 0/4 [00:00<?, ?it/s][A
Corpus Chunks:  25% 1/4 [02:26<07:18, 146.23s/it][A
Corpus Chunks:  50% 2/4 [04:53<04:53, 146.97s/it][A
Corpus Chunks:  75% 3/4 [07:19<02:26, 146.41s/it][A
Corpus Chunks: 100% 4/4 [07:32<00:00, 113.19s/it][A


Iteration:   0%|          | 0/1884 [00:00<?, ?it/s]

Batches:   0%|          | 0/348 [00:00<?, ?it/s]


Corpus Chunks:   0% 0/4 [00:00<?, ?it/s][A
Corpus Chunks:  25% 1/4 [02:27<07:21, 147.12s/it][A
Corpus Chunks:  50% 2/4 [04:54<04:54, 147.09s/it][A
Corpus Chunks:  75% 3/4 [07:22<02:27, 147.70s/it][A
Corpus Chunks: 100% 4/4 [07:36<00:00, 114.11s/it][A


Iteration:   0%|          | 0/1884 [00:00<?, ?it/s]

Batches:   0%|          | 0/348 [00:00<?, ?it/s]


Corpus Chunks:   0% 0/4 [00:00<?, ?it/s][A
Corpus Chunks:  25% 1/4 [02:28<07:26, 148.88s/it][A
Corpus Chunks:  50% 2/4 [04:55<04:55, 147.76s/it][A
Corpus Chunks:  75% 3/4 [07:22<02:27, 147.32s/it][A
Corpus Chunks: 100% 4/4 [07:36<00:00, 114.12s/it][A


Iteration:   0%|          | 0/1884 [00:00<?, ?it/s]

Batches:   0%|          | 0/348 [00:00<?, ?it/s]


Corpus Chunks:   0% 0/4 [00:00<?, ?it/s][A

In [None]:
# List the current working directory.
# NOTE(review): looks like a leftover scratch cell (presumably to check that the
# output folder was written) — consider removing it from the final notebook.
!ls