In [1]:
from datasets import Dataset, DatasetDict
from transformers import AutoModel, AutoTokenizer, DataCollatorWithPadding
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pandas as pd

import pytorch_lightning as pl
import numpy as np



In [2]:
import warnings, logging, os
warnings.simplefilter('ignore')
logging.disable(logging.WARNING)

In [3]:
pl.seed_everything(34)

34

In [4]:
model_name = 'microsoft/deberta-v3-base'

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [6]:
%%time
class_names = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train = pd.read_csv('../input/train.csv.zip').fillna(' ')[['comment_text']]
test = pd.read_csv('../input/test.csv.zip').fillna(' ')[['comment_text']]

CPU times: user 1.47 s, sys: 31.9 ms, total: 1.51 s
Wall time: 1.5 s


In [7]:
train.head()

Unnamed: 0,comment_text
0,Explanation\nWhy the edits made under my usern...
1,D'aww! He matches this background colour I'm s...
2,"Hey man, I'm really not trying to edit war. It..."
3,"""\nMore\nI can't make any real suggestions on ..."
4,"You, sir, are my hero. Any chance you remember..."


In [8]:
ds = DatasetDict({'train': Dataset.from_pandas(train), 'test':Dataset.from_pandas(test)})
train_tok_ds = ds['train'].map(
    lambda x: tokenizer(x['comment_text'], max_length=320, truncation=True, padding=True), 
    batched=True,
    remove_columns='comment_text'
)
test_tok_ds = ds['test'].map(
    lambda x: tokenizer(x['comment_text'], max_length=320, truncation=True, padding=True), 
    batched=True,
    remove_columns='comment_text'
)



  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/154 [00:00<?, ?ba/s]

In [9]:
train_ds = list(train_tok_ds.flatten())
test_ds = list(test_tok_ds.flatten())



In [10]:
train_dl = DataLoader(
    train_ds, 
    batch_size=16, 
    num_workers=4,
    collate_fn=DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
)
test_dl = DataLoader(
    test_ds, 
    batch_size=16, 
    num_workers=4,
    collate_fn=DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
)

In [11]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [12]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state.detach().cpu()
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

In [13]:
model = model.cuda()
model.eval()

train_embs = []
for batch in tqdm(train_dl):
    batch = {k: v.cuda() for k, v in batch.items() if k != 'labels'}
    with torch.no_grad():
        with torch.cuda.amp.autocast():
            model_output = model(**batch)
    embeddings = mean_pooling(model_output, batch['attention_mask'].detach().cpu())
    embeddings = F.normalize(embeddings, p=2, dim=1)
    embeddings = embeddings.squeeze(0).detach().cpu().numpy()
    train_embs.extend(embeddings)

train_embs = np.array(train_embs)

test_embs = []
for batch in tqdm(test_dl):
    batch = {k: v.cuda() for k, v in batch.items() if k != 'labels'}
    with torch.no_grad():
        with torch.cuda.amp.autocast():
            model_output = model(**batch)
    embeddings = mean_pooling(model_output, batch['attention_mask'].detach().cpu())
    embeddings = F.normalize(embeddings, p=2, dim=1)
    embeddings = embeddings.squeeze(0).detach().cpu().numpy()
    test_embs.extend(embeddings)

test_embs = np.array(test_embs)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9974/9974 [19:13<00:00,  8.65it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9573/9573 [18:26<00:00,  8.65it/s]


In [14]:
np.save('../input/train_embs.npy', train_embs)

In [15]:
np.save('../input/test_embs.npy', test_embs)

In [16]:
train_embs.shape

(159571, 768)

In [17]:
test_embs.shape

(153164, 768)