In [1]:
import pandas as pd
import re
import os
import torch
import numpy as np

Install the Hugging Face `transformers` library, which provides the BERT implementation used below (https://huggingface.co/).

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 2.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/1f/8e/ed5364a06a9ba720fddd9820155cc57300d28f5f43a6fd7b7e817177e642/sacremoses-0.0.35.tar.gz (859kB)
[K     |████████████████████████████████| 860kB 8.8MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.35-cp36-none-any.whl size=883999 sha256=633f5de7eb3e57d618e754eeedd12518a7c94000701735c3680ceaa17c22df83
  Stored in directory: /tmp/.cache/pip/wheels/63/2a/db/63e2909042c634ef551d0d9ac825b2b0b32dede4a6d87ddc94
Successfully built sacremoses
Installing collected packages: sacremoses, transformers
Successfully in

In [3]:
from transformers import BertModel, BertTokenizer

In [4]:
# Directory holding the competition CSV files read by this notebook
# (test.csv and sample_submission.csv).
path_to_dataset = '/kaggle/input/nlp-getting-started/'

In [5]:
# Test split; its `text` column is what gets encoded and scored further down.
test_df = pd.read_csv(os.path.join(path_to_dataset, 'test.csv'))

Define a simple model: logistic regression on top of the BERT base model.

In [6]:
class Model(torch.nn.Module):
    """BERT base encoder followed by a single linear unit (one raw score per input)."""

    def __init__(self):
        super().__init__()
        # Pretrained BERT base (uncased) encoder.
        self.base_model = BertModel.from_pretrained('bert-base-uncased')
        # Logistic-regression head: maps the 768-wide encoder output to one score.
        self.fc1 = torch.nn.Linear(768, 1)

    def forward(self, ids, masks):
        """Score a batch of encoded texts.

        Args:
            ids: token ids passed to the encoder as its first argument.
            masks: attention mask passed to the encoder as ``attention_mask``.

        Returns:
            The output of ``fc1`` applied to element 1 of the encoder outputs
            (the pooled output, per the transformers BertModel documentation).
        """
        encoder_outputs = self.base_model(ids, attention_mask=masks)
        return self.fc1(encoder_outputs[1])
        

In [7]:
# Directory containing the saved model checkpoint (model.pth).
path_to_model = '/kaggle/input/nlpgetstartedbertbasemoel/'

Load the pretrained model

In [8]:
# NOTE(security): torch.load unpickles arbitrary Python objects -- only load
# checkpoints from a source you trust.
# The file holds the whole pickled module (not just a state dict), so the `Model`
# class must already be defined in this session before loading.
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a CPU-only
# runtime as well; the model is moved to the selected device in a later cell.
model = torch.load(os.path.join(path_to_model, 'model.pth'), map_location='cpu')

In [9]:
# Run on the first GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

In [10]:
# Move the loaded model onto the device selected above before running inference.
model = model.to(device)

In [11]:
# Tokenizer for the same 'bert-base-uncased' checkpoint name that Model uses for its
# encoder; bert_encode() below reads this module-level object.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [12]:
def bert_encode(text, max_len=512):
    """Turn one text into fixed-length token ids plus a padding mask.

    The text is tokenized with the module-level ``tokenizer``, cut to at most
    ``max_len - 2`` tokens, wrapped in ``[CLS]`` / ``[SEP]`` and converted to ids.
    Both returned lists are right-padded with 0 up to ``max_len``.

    Args:
        text: the raw string to encode.
        max_len: total output length, special tokens included.

    Returns:
        ``(tokens, pad_masks)``: the id list and a same-length list holding 1 for
        real positions and 0 for padding.
    """
    word_pieces = tokenizer.tokenize(text)[:max_len - 2]
    sequence = ["[CLS]", *word_pieces, "[SEP]"]

    real_count = len(sequence)
    pad_count = max_len - real_count

    tokens = tokenizer.convert_tokens_to_ids(sequence) + [0] * pad_count
    pad_masks = [1] * real_count + [0] * pad_count
    return tokens, pad_masks

In [13]:
class TestDataset(torch.utils.data.Dataset):
    """Index-aligned view over pre-encoded token ids and their padding masks."""

    def __init__(self, test_tokens, test_pad_masks):
        super().__init__()
        self.test_tokens = test_tokens
        self.test_pad_masks = test_pad_masks

    def __len__(self):
        """Number of examples, taken from the token container."""
        return len(self.test_tokens)

    def __getitem__(self, index):
        """Return the ``(tokens, masks)`` pair stored at ``index``."""
        return self.test_tokens[index], self.test_pad_masks[index]

In [14]:
# Encode every test text once, then split the (tokens, masks) pairs into two arrays.
encoded_pairs = [bert_encode(text) for text in test_df.text]
test_tokens = np.array([pair[0] for pair in encoded_pairs])
test_pad_masks = np.array([pair[1] for pair in encoded_pairs])

In [15]:
# Wrap the encoded arrays so a DataLoader can batch them.
test_dataset = TestDataset(test_tokens, test_pad_masks)

In [16]:
# Batches of 3, served in the dataset's own order (no shuffling) so predictions
# stay aligned with the rows they were computed from.
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=3,
    shuffle=False,
)

In [17]:
# Inference over the test set: collect one raw score per example in `y_preds`.
model.eval()  # evaluation mode (e.g. disables dropout)
y_preds = []
# no_grad: gradients are never used here, so skip building autograd graphs.
with torch.no_grad():
    for tokens, masks in test_dataloader:
        # The DataLoader already yields tensors; convert dtype / move device with
        # .to() rather than re-wrapping them in torch.tensor(), which copies and
        # emits a copy-construct UserWarning.
        scores = model(
            tokens.to(device=device, dtype=torch.long),
            masks.to(device=device, dtype=torch.long),
        )
        # Flatten to 1-D so tolist() always yields a list. A bare squeeze() would
        # collapse a final batch of size 1 to a scalar, and `+=` with a float fails.
        y_preds.extend(scores.reshape(-1).cpu().tolist())

  
  import sys


In [18]:
# Submission template; its `target` column is overwritten with the predictions below.
submission_df = pd.read_csv(os.path.join(path_to_dataset, 'sample_submission.csv'))

Target is 1 if the model output is greater than or equal to 0, and 0 otherwise.

In [19]:
# Threshold the raw model outputs at zero: non-negative -> 1, negative -> 0.
submission_df['target'] = np.greater_equal(np.array(y_preds), 0).astype('int')

In [20]:
# Sanity check: how many rows were assigned to each predicted class.
submission_df.target.value_counts()

0    2148
1    1115
Name: target, dtype: int64

In [21]:
# Preview the first rows of the submission frame before writing it out.
submission_df.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


Writing to submission.csv file

In [22]:
# index=False keeps the DataFrame index out of the file, so only the frame's own
# columns are written.
submission_df.to_csv('submission.csv', index=False)