In [1]:
import csv
import pandas as pd
# Load the raw test split. QUOTE_NONE disables quote handling entirely, so a
# backslash is used as the escape character for embedded delimiters.
test = pd.read_csv(
    "../input/bert-18000-128/dataset/test.csv",
    quoting=csv.QUOTE_NONE,
    escapechar="\\",
)

In [2]:
from transformers import BertTokenizerFast
# WordPiece tokenizer for the stock 'bert-base-uncased' vocabulary.
# NOTE(review): assumes the fine-tuned checkpoint loaded later was trained with
# this same vocabulary — confirm against the training notebook.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [3]:
import pickle
# label_map is used by the Dataset class to turn a BROWSE_NODE_ID into a class
# index; it is loaded from the pickle saved alongside the checkpoint.
# NOTE(review): pickle.load can execute arbitrary code — only load trusted files.
# The context manager guarantees the handle is closed even if unpickling fails.
with open("../input/bert-18000-128/lable_map.pickle",'rb') as file:
    label_map = pickle.load(file)

# Inverse mapping (class index -> BROWSE_NODE_ID) used to decode predictions.
label_to_id = {value: key for key, value in label_map.items()}

In [4]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that turns product rows into fixed-length BERT inputs.

    Each item is built from ``TITLE``, optionally ``DESCRIPTION`` and optionally
    ``BULLET_POINTS``, joined with ``' ~ '``, tokenized, wrapped in
    ``[CLS]``/``[SEP]`` and truncated/zero-padded to ``max_length`` ids.

    Args:
        df: DataFrame with string columns ``TITLE``, ``DESCRIPTION`` and
            ``BULLET_POINTS`` (and ``BROWSE_NODE_ID`` when ``is_train``).
            Missing values must already be replaced by strings, because the
            fields are concatenated and ``len()`` is called on the bullets.
        tokenizer: object exposing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.
        is_train: when True, items also carry a ``labels`` tensor.
        label_map: mapping from ``BROWSE_NODE_ID`` to class index; only read
            when ``is_train`` is True. ``None`` is treated as an empty mapping.
        max_length: total sequence length including the two special tokens.
        load_desc: include the description text in the input string.
        load_bullets: include the bullet-point text in the input string.
    """

    def __init__(self, df, tokenizer, is_train=True, label_map=None, max_length=128, load_desc = True, load_bullets = True):
        # Avoid sharing one mutable default dict between instances.
        if label_map is None:
            label_map = {}
        self.df = df
        self.load_desc = load_desc
        self.load_bullets = load_bullets
        self.title = df.TITLE.values
        self.desc = df.DESCRIPTION.values
        # Drop the first and last character when the text starts with '['.
        self.bullets = df.BULLET_POINTS.apply(lambda x: x[1:-1] if len(x)>0 and x[0]=='[' else x).values
        self.tokenizer = tokenizer
        if is_train:
            self.labels = df.BROWSE_NODE_ID.apply(lambda x: label_map[x]).values
            self.label_map = label_map
        self.is_train = is_train
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the ``input_ids``/``attention_mask``/``token_type_ids`` tensors
        for row ``idx`` (plus ``labels`` when the dataset is in training mode)."""
        # The two separators are always emitted, even when a field is skipped,
        # so the string layout is "title ~ [desc] ~ [bullets]".
        description = self.desc[idx] if self.load_desc else ''
        bullets = self.bullets[idx] if self.load_bullets else ''
        req_string = self.title[idx] + ' ~ ' + description + ' ~ ' + bullets

        # Use the tokenizer given to the constructor rather than a module-level
        # global, so the dataset works with whichever tokenizer the caller passed.
        tokenized_data = self.tokenizer.tokenize(req_string)
        # Reserve two positions for the special tokens; the rest is truncated.
        to_append = ["[CLS]"] + tokenized_data[:self.max_length - 2] + ["[SEP]"]
        input_ids = list(self.tokenizer.convert_tokens_to_ids(to_append))
        input_mask = [1] * len(input_ids)
        # Pad ids and mask with zeros up to max_length.
        padding = [0] * (self.max_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        item = {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(input_mask),
            # Single-segment input: every position gets segment id 0.
            "token_type_ids": torch.tensor([0]*self.max_length)
        }
        if self.is_train:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)



In [5]:
from transformers import BertPreTrainedModel, Trainer, BertModel
from transformers.modeling_outputs import SequenceClassifierOutput
from torch import nn

class BertForSequenceClassification(BertPreTrainedModel):
    """BERT encoder followed by dropout and one linear classification layer.

    Args:
        config: model configuration; ``hidden_dropout_prob`` and ``hidden_size``
            are read from it to size the head.
        num_labels: number of output classes. The default, ``len(label_map)``,
            is evaluated once when this class statement runs, so ``label_map``
            must already be loaded at that point.
    """

    def __init__(self, config, num_labels=len(label_map)):
        super().__init__(config)
        self.num_labels = num_labels
        self.config = config

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Run the encoder and classification head, optionally computing a loss.

        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            self.num_labels - 1]`. If :obj:`self.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`self.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns a :obj:`SequenceClassifierOutput` when ``return_dict`` is truthy, otherwise a tuple
        ``(loss?, logits, *extra_encoder_outputs)`` where ``loss`` is present only if ``labels`` was given.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Second element of the encoder output; for BertModel this is the
        # pooled sequence representation (see the Transformers BertModel docs).
        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            # Infer the problem type once, from the head size and label dtype,
            # and cache it on the config for subsequent calls.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            # The loss classes are referenced through `nn` because only
            # `from torch import nn` is imported; the bare names were undefined.
            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            # Tuple form: logits first, then whatever the encoder returned
            # after its first two elements.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [6]:
# Load the fine-tuned weights from the saved checkpoint directory. The
# classifier head is sized by the class's default num_labels (len(label_map)),
# so the label-map cell must have run before the class was defined.
model = BertForSequenceClassification.from_pretrained("../input/bert-18000-128/results/checkpoint-18000")

In [7]:
# Ablation: title + bullet points only (description text left out).
# fillna("") is required because the Dataset concatenates the text columns as strings.
test_dataset = Dataset(test.fillna(""), tokenizer, is_train=False, load_desc = False, load_bullets = True)

In [8]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
# Prefer the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.eval()  # inference mode for layers such as dropout

# shuffle=False keeps predictions in the same order as the rows of the test
# frame, which the submission step relies on.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
logits = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Move each batch to host memory immediately so the accelerator does
        # not have to hold the logits of the entire test set at once.
        logits.append(outputs["logits"].cpu())
    # Predicted class index per row.
    labels = torch.cat(logits).argmax(1).cpu()

  0%|          | 0/6924 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors


In [9]:
# Decode predicted class indices back to BROWSE_NODE_ID values. tolist() yields
# plain Python ints, which are safe dictionary keys and DataFrame values.
predicted_nodes = [label_to_id[index] for index in labels.tolist()]
data_dict = {
    # IDs run 1..N; N is taken from the predictions instead of being
    # hard-coded (it was range(1, 110776)).
    # NOTE(review): assumes test rows are ordered by PRODUCT_ID starting at 1 — confirm.
    "PRODUCT_ID": range(1, len(predicted_nodes) + 1),
    "BROWSE_NODE_ID": predicted_nodes
}
submit = pd.DataFrame.from_dict(data_dict)
submit.to_csv("BERTbase_18000_no_desc.csv", index=False)

# Save the raw logits as one CPU tensor. The guard keeps the cell re-runnable:
# after the first run `logits` is already a tensor and must not be passed to
# torch.cat again.
if isinstance(logits, (list, tuple)):
    logits = torch.cat(logits).cpu()
with open('logits_bert_18000_no_desc.pickle', 'wb') as handle:
    pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Last expression of the cell so the preview is actually displayed.
submit.head()

In [10]:
# Ablation: title + description only (bullet points left out).
# fillna("") is required because the Dataset concatenates the text columns as strings.
test_dataset = Dataset(test.fillna(""), tokenizer, is_train=False, load_desc = True, load_bullets = False)

In [11]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
# Prefer the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.eval()  # inference mode for layers such as dropout

# shuffle=False keeps predictions in the same order as the rows of the test
# frame, which the submission step relies on.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
logits = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Move each batch to host memory immediately so the accelerator does
        # not have to hold the logits of the entire test set at once.
        logits.append(outputs["logits"].cpu())
    # Predicted class index per row.
    labels = torch.cat(logits).argmax(1).cpu()

  0%|          | 0/6924 [00:00<?, ?it/s]

In [12]:
# Decode predicted class indices back to BROWSE_NODE_ID values. tolist() yields
# plain Python ints, which are safe dictionary keys and DataFrame values.
predicted_nodes = [label_to_id[index] for index in labels.tolist()]
data_dict = {
    # IDs run 1..N; N is taken from the predictions instead of being
    # hard-coded (it was range(1, 110776)).
    # NOTE(review): assumes test rows are ordered by PRODUCT_ID starting at 1 — confirm.
    "PRODUCT_ID": range(1, len(predicted_nodes) + 1),
    "BROWSE_NODE_ID": predicted_nodes
}
submit = pd.DataFrame.from_dict(data_dict)
submit.to_csv("BERTbase_18000_no_bullet.csv", index=False)

# Save the raw logits as one CPU tensor. The guard keeps the cell re-runnable:
# after the first run `logits` is already a tensor and must not be passed to
# torch.cat again.
if isinstance(logits, (list, tuple)):
    logits = torch.cat(logits).cpu()
with open('logits_bert_18000_no_bullet.pickle', 'wb') as handle:
    pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Last expression of the cell so the preview is actually displayed.
submit.head()

In [13]:
# Ablation: title only (neither description nor bullet points).
# fillna("") is required because the Dataset concatenates the text columns as strings.
test_dataset = Dataset(test.fillna(""), tokenizer, is_train=False, load_desc = False, load_bullets = False)

In [14]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
# Prefer the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.eval()  # inference mode for layers such as dropout

# shuffle=False keeps predictions in the same order as the rows of the test
# frame, which the submission step relies on.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
logits = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Move each batch to host memory immediately so the accelerator does
        # not have to hold the logits of the entire test set at once.
        logits.append(outputs["logits"].cpu())
    # Predicted class index per row.
    labels = torch.cat(logits).argmax(1).cpu()

  0%|          | 0/6924 [00:00<?, ?it/s]

In [15]:
# Decode predicted class indices back to BROWSE_NODE_ID values. tolist() yields
# plain Python ints, which are safe dictionary keys and DataFrame values.
predicted_nodes = [label_to_id[index] for index in labels.tolist()]
data_dict = {
    # IDs run 1..N; N is taken from the predictions instead of being
    # hard-coded (it was range(1, 110776)).
    # NOTE(review): assumes test rows are ordered by PRODUCT_ID starting at 1 — confirm.
    "PRODUCT_ID": range(1, len(predicted_nodes) + 1),
    "BROWSE_NODE_ID": predicted_nodes
}
submit = pd.DataFrame.from_dict(data_dict)
submit.to_csv("BERTbase_18000_no_desc_no_bullet.csv", index=False)

# Save the raw logits as one CPU tensor. The guard keeps the cell re-runnable:
# after the first run `logits` is already a tensor and must not be passed to
# torch.cat again.
if isinstance(logits, (list, tuple)):
    logits = torch.cat(logits).cpu()
with open('logits_bert_18000_no_desc_no_bullet.pickle', 'wb') as handle:
    pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Last expression of the cell so the preview is actually displayed.
submit.head()

In [16]:
# Full input: title + description + bullet points.
# fillna("") is required because the Dataset concatenates the text columns as strings.
test_dataset = Dataset(test.fillna(""), tokenizer, is_train=False, load_desc = True, load_bullets = True)

In [17]:
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
# Prefer the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.eval()  # inference mode for layers such as dropout

# shuffle=False keeps predictions in the same order as the rows of the test
# frame, which the submission step relies on.
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
logits = []
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Move each batch to host memory immediately so the accelerator does
        # not have to hold the logits of the entire test set at once.
        logits.append(outputs["logits"].cpu())
    # Predicted class index per row.
    labels = torch.cat(logits).argmax(1).cpu()

  0%|          | 0/6924 [00:00<?, ?it/s]

In [18]:
# Decode predicted class indices back to BROWSE_NODE_ID values. tolist() yields
# plain Python ints, which are safe dictionary keys and DataFrame values.
predicted_nodes = [label_to_id[index] for index in labels.tolist()]
data_dict = {
    # IDs run 1..N; N is taken from the predictions instead of being
    # hard-coded (it was range(1, 110776)).
    # NOTE(review): assumes test rows are ordered by PRODUCT_ID starting at 1 — confirm.
    "PRODUCT_ID": range(1, len(predicted_nodes) + 1),
    "BROWSE_NODE_ID": predicted_nodes
}
submit = pd.DataFrame.from_dict(data_dict)
submit.to_csv("BERTbase_18000.csv", index=False)

# Save the raw logits as one CPU tensor. The guard keeps the cell re-runnable:
# after the first run `logits` is already a tensor and must not be passed to
# torch.cat again.
if isinstance(logits, (list, tuple)):
    logits = torch.cat(logits).cpu()
with open('logits_bert_18000.pickle', 'wb') as handle:
    pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Last expression of the cell so the preview is actually displayed.
submit.head()