In [1]:
import pandas as pd
import re
import string
import os
import torch
import numpy as np
import tqdm

In [2]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/10/aeefced99c8a59d828a92cc11d213e2743212d3641c87c82d61b035a7d5c/transformers-2.3.0-py3-none-any.whl (447kB)
[K     |████████████████████████████████| 450kB 2.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/1f/8e/ed5364a06a9ba720fddd9820155cc57300d28f5f43a6fd7b7e817177e642/sacremoses-0.0.35.tar.gz (859kB)
[K     |████████████████████████████████| 860kB 8.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.35-cp36-none-any.whl size=883999 sha256=8ea59287501ca06f879641dafb1ece9da09adf7d662e9fcbe738f39a92a78eac
  Stored in directory: /tmp/.cache/pip/wheels/63/2a/db/63e2909042c634ef551d0d9ac825b2b0b32dede4a6d87ddc94
Successfully built sacremoses
Installing collected packages: sacremoses, transformers
Successfully in

In [3]:
from transformers import BertModel, BertTokenizer

In [4]:
path_to_dataset = '/kaggle/input/nlp-getting-started/'

In [5]:
test = pd.read_csv(os.path.join(path_to_dataset, 'test.csv'))
train = pd.read_csv(os.path.join(path_to_dataset, 'train.csv'))

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
class Model(torch.nn.Module):
    
    def __init__(self, ):
        
        super(Model, self).__init__()
        self.base_model = BertModel.from_pretrained('bert-base-uncased') # use pre-trained BERT model by HuggingFace
        self.fc1 = torch.nn.Linear(768, 1) # simple logistic regression above the bert model
        
    def forward(self, ids, masks):
        
        x = self.base_model(ids, attention_mask=masks)[1]
        x = self.fc1(x)
        return x
        

In [8]:
model = Model()

In [9]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [10]:
model = model.to(device)

In [11]:
def bert_encode(text, max_len=512):
    
    text = tokenizer.tokenize(text)
    text = text[:max_len-2]
    input_sequence = ["[CLS]"] + text + ["[SEP]"]
    tokens = tokenizer.convert_tokens_to_ids(input_sequence)
    tokens += [0] * (max_len - len(input_sequence))
    pad_masks = [1] * len(input_sequence) + [0] * (max_len - len(input_sequence))

    return tokens, pad_masks

In [12]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9] \n', '', text)
    return text

Use first 6000 for training, rest for validation

In [13]:
train_text = train.text[:6000]
val_text = train.text[6000:]

In [14]:
train_text = train_text.apply(clean_text)
val_text = val_text.apply(clean_text)

In [15]:
train_tokens = []
train_pad_masks = []
for text in train_text:
    tokens, masks = bert_encode(text)
    train_tokens.append(tokens)
    train_pad_masks.append(masks)
    
train_tokens = np.array(train_tokens)
train_pad_masks = np.array(train_pad_masks)

In [16]:
val_tokens = []
val_pad_masks = []
for text in val_text:
    tokens, masks = bert_encode(text)
    val_tokens.append(tokens)
    val_pad_masks.append(masks)
    
val_tokens = np.array(val_tokens)
val_pad_masks = np.array(val_pad_masks)

In [17]:

class Dataset(torch.utils.data.Dataset):
    
    def __init__(self, train_tokens, train_pad_masks, targets):
        
        super(Dataset, self).__init__()
        self.train_tokens = train_tokens
        self.train_pad_masks = train_pad_masks
        self.targets = targets
        
    def __getitem__(self, index):
        
        tokens = self.train_tokens[index]
        masks = self.train_pad_masks[index]
        target = self.targets[index]
        
        return (tokens, masks), target
    
    def __len__(self,):
        
        return len(self.train_tokens)

In [18]:
train_dataset = Dataset(
                    train_tokens=train_tokens,
                    train_pad_masks=train_pad_masks,
                    targets=train.target[:6000]
)

In [19]:
batch_size = 6
EPOCHS = 2

In [20]:
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [21]:
criterion = torch.nn.BCEWithLogitsLoss()

Use Adam Optimizer with learning rate of 0.00001

In [22]:
opt = torch.optim.Adam(model.parameters(), lr=0.00001)

Train for 2 epochs.

In [23]:
model.train()
y_preds = []

for epoch in range(EPOCHS):
        for i, ((tokens, masks), target) in enumerate(train_dataloader):

            y_pred = model(
                        tokens.long().to(device), 
                        masks.long().to(device)
                    )
            loss = criterion(y_pred, target[:, None].float().to(device))
            opt.zero_grad()
            loss.backward()
            opt.step()
            print('\rEpoch: %d/%d, %f%% loss: %0.2f'% (epoch+1, EPOCHS, i/len(train_dataloader)*100, loss.item()), end='')
        print()

Epoch: 1/2, 99.900000% loss: 0.49
Epoch: 2/2, 99.900000% loss: 0.29


Test the model on the validation dataset

In [24]:
val_dataset = Dataset(
                    train_tokens=val_tokens,
                    train_pad_masks=val_pad_masks,
                    targets=train.target[6000:].reset_index(drop=True)
)

In [25]:
val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=3, shuffle=False)

Define accuracy metric

In [26]:
def accuracy(y_actual, y_pred):
    y_ = y_pred > 0
    return np.sum(y_actual == y_).astype('int') / y_actual.shape[0]

In [27]:
model.eval()
avg_acc = 0
for i, ((tokens, masks), target) in enumerate(val_dataloader):

    y_pred = model(
                tokens.long().to(device), 
                masks.long().to(device), 
            )
    loss = criterion(y_pred,  target[:, None].float().to(device))
    acc = accuracy(target.cpu().numpy(), y_pred.detach().cpu().numpy().squeeze())
    avg_acc += acc
    print('\r%0.2f%% loss: %0.2f, accuracy %0.2f'% (i/len(val_dataloader)*100, loss.item(), acc), end='')
print('\nAverage accuracy: ', avg_acc / len(val_dataloader))

99.81% loss: 0.01, accuracy 1.00
Average accuracy:  0.8395291201982663


In [28]:
class TestDataset(torch.utils.data.Dataset):
    
    def __init__(self, test_tokens, test_pad_masks):
        
        super(TestDataset, self).__init__()
        self.test_tokens = test_tokens
        self.test_pad_masks = test_pad_masks
        
    def __getitem__(self, index):
        
        tokens = self.test_tokens[index]
        masks = self.test_pad_masks[index]
        
        return (tokens, masks)
    
    def __len__(self,):
        
        return len(self.test_tokens)

In [29]:
test_tokens = []
test_pad_masks = []
for text in test.text:
    tokens, masks = bert_encode(text)
    test_tokens.append(tokens)
    test_pad_masks.append(masks)
    
test_tokens = np.array(test_tokens)
test_pad_masks = np.array(test_pad_masks)

In [30]:
test_dataset = TestDataset(
    test_tokens=test_tokens,
    test_pad_masks=test_pad_masks
)

In [31]:
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=3, shuffle=False)

In [32]:
model.eval()
y_preds = []
for (tokens, masks) in test_dataloader:

    y_pred = model(
                tokens.long().to(device), 
                masks.long().to(device), 
            )
    y_preds += y_pred.detach().cpu().numpy().squeeze().tolist()

In [33]:
submission_df = pd.read_csv(os.path.join(path_to_dataset, 'sample_submission.csv'))

In [34]:
submission_df['target'] = (np.array(y_preds) > 0).astype('int')

In [35]:
submission_df.target.value_counts()

0    2018
1    1245
Name: target, dtype: int64

In [36]:
submission_df

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [37]:
submission_df.to_csv('submission.csv', index=False)