In [1]:
from kaggle_secrets import UserSecretsClient
import wandb
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_api")

wandb.login(key=wandb_api)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
!pip install transformers

[0m

In [3]:
from transformers import AutoTokenizer, AlbertTokenizer
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer

df = pd.read_parquet("/kaggle/input/custom-pog/dataset.parquet")
train, test = train_test_split(df, test_size=0.2)

In [4]:
train.columns

Index(['index', 'video_id', 'title', 'publishedAt', 'channelId', 'category',
       'channelTitle', 'categoryId', 'trending_date', 'tags', 'view_count',
       'likes', 'dislikes', 'comment_count', 'thumbnail_link',
       'comments_disabled', 'ratings_disabled', 'description', 'id',
       'duration_seconds', 'has_thumbnail', 'target', 'publish_day',
       'publish_hour', 'trending_date-publishedAt', 'tags_len',
       'description_len', 'description_title'],
      dtype='object')

In [5]:
!pip install sentencepiece

[0m

In [6]:
# tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-multilingual-cased")

train_encodings = tokenizer(list(train["description_title"].values), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(test["description_title"].values), truncation=True, padding=True, max_length=128)

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466 [00:00<?, ?B/s]

In [7]:
class NSMCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = NSMCDataset(train_encodings, train["view_count"].values.astype(np.float32))
test_dataset = NSMCDataset(test_encodings, np.expand_dims(test["view_count"].values.astype(np.float32), 0))

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=100.0,
    per_device_train_batch_size=256,
    per_device_eval_batch_size=256,
    warmup_steps=340,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=680,
    save_steps=680,
    eval_steps=100,
    report_to="wandb",
    gradient_accumulation_steps=8,
    evaluation_strategy="steps"
)

In [9]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased", problem_type="regression")


Downloading:   0%|          | 0.00/517M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'pre_classifie

In [10]:
model.config.dim

768

In [11]:
model.num_labels = 1

In [12]:
model.pre_classifier = nn.Linear(model.config.dim, model.config.dim)
model.classifier = nn.Linear(model.config.dim, 1)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

***** Running training *****
  Num examples = 70548
  Num Epochs = 100
  Instantaneous batch size per device = 256
  Total train batch size (w. parallel, distributed & accumulation) = 2048
  Gradient Accumulation steps = 8
  Total optimization steps = 3400
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mjtiger958[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
100,No log,69.195633
200,No log,1.310448
300,No log,1.31353
400,No log,1.449688
500,No log,2.540612
600,No log,2.289674
700,26.620300,2.477746
800,26.620300,2.274256
900,26.620300,2.276162
1000,26.620300,3.397683


***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
  return F.mse_loss(input, target, reduction=self.reduction)
***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
Saving model checkpoint to ./results/checkpoint-680
Configuration saved in ./results/checkpoint-680/config.json
Model weights saved in ./results/checkpoint-680/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
  return F.mse_loss(input, target, reduction=self.reduction)
***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
***** Runn

TrainOutput(global_step=3400, training_loss=5.54996867909151, metrics={'train_runtime': 22042.3111, 'train_samples_per_second': 320.057, 'train_steps_per_second': 0.154, 'total_flos': 1.4052394727033862e+17, 'train_loss': 5.54996867909151, 'epoch': 99.99})

In [13]:
trainer.evaluate()
trainer.save_model("./results")

***** Running Evaluation *****
  Num examples = 1
  Batch size = 256
  return F.mse_loss(input, target, reduction=self.reduction)


Saving model checkpoint to ./results
Configuration saved in ./results/config.json
Model weights saved in ./results/pytorch_model.bin


In [14]:
wandb.finish()

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▃▂▂▂▂▂▂▂▂▂▂▁▂▂▂▁▂▂▂▂▂▂▂▁▁▁▂▂▂▂▂▄▂▂█
eval/samples_per_second,▄▆▆▇▇▇▆▆▇▆▆▇▆▆▆█▆▆▆▇▆▆▆▇▇█▆▇▇▇▆▃▇▆▁
eval/steps_per_second,▄▆▆▇▇▇▆▆▇▆▆▇▆▆▆█▆▆▆▇▆▆▆▇▇█▆▇▇▇▆▃▇▆▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇█████
train/learning_rate,█▆▅▃▁
train/loss,█▁▁▁▁
train/total_flos,▁
train/train_loss,▁

0,1
eval/loss,2.51393
eval/runtime,0.0423
eval/samples_per_second,23.639
eval/steps_per_second,23.639
train/epoch,99.99
train/global_step,3400.0
train/learning_rate,0.0
train/loss,0.2588
train/total_flos,1.4052394727033862e+17
train/train_loss,5.54997
