In [1]:
!pip install transformers 

[0m

In [31]:
import pandas as pd 
import numpy as np
import transformers 
import torch


In [34]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5)

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_tok

In [35]:
model = model.to('cuda')

In [36]:
df = pd.read_json('/kaggle/input/vc-it-cup-ranking/train_preprocessed.json')

In [37]:
df_raw = df.explode(['comments', 'score'])

In [38]:
df_raw['text'] = df_raw['posts'] + ' ' + df_raw['comments']

In [39]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df_raw, random_state=42)

In [40]:
train_df = train_df.rename(columns={'score': 'labels'})
val_df = val_df.rename(columns={'score': 'labels'})

In [41]:
from tqdm.notebook import tqdm
from datasets import Dataset

# Tokenize Data 

In [42]:
train = Dataset.from_pandas(train_df)
val = Dataset.from_pandas(val_df)

In [44]:
train_tokenized = train.map(lambda x: tokenizer(x['text'], truncation=True, max_length=128, padding=True, return_tensors='pt'))

  0%|          | 0/330401 [00:00<?, ?ex/s]

In [45]:
val_tokenized = val.map(lambda x: tokenizer(x['text'], truncation=True,max_length=128, padding=True, return_tensors='pt'))

  0%|          | 0/110134 [00:00<?, ?ex/s]

In [None]:
train_tokenized = train_tokenized.map(lambda x: {"input_ids": x['input_ids'][0], "attention_mask": x['attention_mask'][0]})

In [73]:
val_tokenized = val_tokenized.map(lambda x: {"input_ids": x['input_ids'][0], "attention_mask": x['attention_mask'][0]})

  0%|          | 0/110134 [00:00<?, ?ex/s]

# Tune model

In [52]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

In [66]:
from datasets import load_metric
metric = load_metric("precision")

Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [84]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    return metric.compute(predictions=predictions, references=labels, average='macro')

In [87]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    learning_rate = 1e-5,
    warmup_steps = 600,
    num_train_epochs = 3.0
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using cuda_amp half precision backend


In [88]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, comments, posts, __index_level_0__. If text, comments, posts, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 330401
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 61953
  Number of trainable parameters = 66957317
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss,Precision
1,1.5513,1.545814,0.264947
2,1.5315,1.548313,0.269698
3,1.5219,1.559641,0.271373


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, comments, posts, __index_level_0__. If text, comments, posts, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 110134
  Batch size = 16
Saving model checkpoint to my_awesome_model/checkpoint-20651
Configuration saved in my_awesome_model/checkpoint-20651/config.json
Model weights saved in my_awesome_model/checkpoint-20651/pytorch_model.bin
tokenizer config file saved in my_awesome_model/checkpoint-20651/tokenizer_config.json
Special tokens file saved in my_awesome_model/checkpoint-20651/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, comments, posts, __index_level_0__. If text, com

TrainOutput(global_step=61953, training_loss=1.5331353127572516, metrics={'train_runtime': 8074.2896, 'train_samples_per_second': 122.76, 'train_steps_per_second': 7.673, 'total_flos': 3.2607822333222452e+16, 'train_loss': 1.5331353127572516, 'epoch': 3.0})

In [121]:
import torch 

torch.save(model.state_dict(), 'classifier_for_ranking')

# Get logits for val set

In [145]:
logits_val= trainer.predict(val_tokenized)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, comments, posts, __index_level_0__. If text, comments, posts, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 110134
  Batch size = 16


In [155]:
logits_val.predictions.shape

(110134, 5)

In [151]:
np.save("logits_val.npy", logits_val.predictions)


Exception in thread SystemMonitor:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/internal/system/system_monitor.py", line 118, in _start
    asset.start()
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/internal/system/assets/cpu.py", line 166, in start
    self.metrics_monitor.start()
  File "/opt/conda/lib/python3.7/site-packages/wandb/sdk/internal/system/assets/interfaces.py", line 168, in start
    logger.info(f"Started {self._process.name}")
AttributeError: 'NoneType' object has no attribute 'name'

Exception in thread SystemMonitor:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
  

# Get logits for train set

In [156]:
logits_train = trainer.predict(train_tokenized)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, comments, posts, __index_level_0__. If text, comments, posts, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 330401
  Batch size = 16


In [157]:
np.save("logits_train.npy", logits_train.predictions)

# Get logits for test set

In [158]:
test_set = pd.read_json("/kaggle/input/vc-it-cup-ranking/test_preprocessed.json")

In [160]:
test_raw = test_set.explode(['comments'])

In [161]:
test_raw['text'] = test_raw['posts'] + ' ' + test_raw['comments']

In [162]:
test = Dataset.from_pandas(test_raw)

In [163]:
test_tokenized = test.map(lambda x: tokenizer(x['text'], truncation=True, max_length=128, padding=True, return_tensors='pt'))

  0%|          | 0/70020 [00:00<?, ?ex/s]

In [166]:
test_tokenized = test_tokenized.map(lambda x: {"input_ids": x['input_ids'][0], "attention_mask": x['attention_mask'][0]})

  0%|          | 0/70020 [00:00<?, ?ex/s]

In [167]:
logits_test = trainer.predict(test_tokenized)

The following columns in the test set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, comments, posts, __index_level_0__. If text, comments, posts, __index_level_0__ are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 70020
  Batch size = 16


In [170]:
np.save('logits_test.npy', logits_test.predictions)