In [32]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
import random

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from collections import Counter
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

import pytorch_lightning as pl

from tqdm import tqdm_notebook as tqdm

In [33]:
def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything()

In [34]:
DIR = Path('../data/tweet-sentiment-extraction')
df_train = pd.read_csv(DIR / 'train.csv')
df_test = pd.read_csv(DIR / 'test.csv')

In [35]:
df_train['text'] = df_train['text'].apply(lambda x: str(x))
df_test['text'] = df_test['text'].apply(lambda x: str(x))
df_train['uncased_text'] = df_train['text'].apply(lambda x: x.lower())
df_test['uncased_text'] = df_test['text'].apply(lambda x: x.lower())
df_train['selected_text'] = df_train['selected_text'].apply(lambda x: str(x).lower())

In [36]:
TOKENIZER_DIR = Path('../tokenizers')
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_DIR / 'berttokenizer-base-uncased')

In [37]:
# Tokenize
df_train['tokenized_text'] = df_train['uncased_text'].apply(tokenizer.tokenize)
df_test['tokenized_text'] = df_test['uncased_text'].apply(tokenizer.tokenize)
df_train['tokenized_selected_text'] = df_train['selected_text'].apply(tokenizer.tokenize)

In [38]:
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment,uncased_text,tokenized_text,tokenized_selected_text
0,cb774db0d1,"I`d have responded, if I were going","i`d have responded, if i were going",neutral,"i`d have responded, if i were going","[i, `, d, have, responded, ,, if, i, were, going]","[i, `, d, have, responded, ,, if, i, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,sooo sad,negative,sooo sad i will miss you here in san diego!!!,"[soo, ##o, sad, i, will, miss, you, here, in, ...","[soo, ##o, sad]"
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me...,"[my, boss, is, bullying, me, ., ., .]","[bullying, me]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview! leave me alone,"[what, interview, !, leave, me, alone]","[leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","sons of ****,",negative,"sons of ****, why couldn`t they put them on t...","[sons, of, *, *, *, *, ,, why, couldn, `, t, t...","[sons, of, *, *, *, *, ,]"


In [39]:
# Filter train data
# 开始结束词
start_position_candidates = []
end_position_candidates = []
df_train['select_length'] = df_train['tokenized_selected_text'].map(len)

for i in tqdm(range(len(df_train))):
    start_position_candidate = [j for j, tok in enumerate(df_train['tokenized_text'].iloc[i]) if
                                tok == df_train['tokenized_selected_text'].iloc[i][0]]
    end_position_candidate = [j for j, tok in enumerate(df_train['tokenized_text'].iloc[i]) if
                              tok == df_train['tokenized_selected_text'].iloc[i][-1]]

    start_position_candidate = [idx for idx in start_position_candidate if
                                idx + df_train['select_length'].iloc[i] - 1 in end_position_candidate]
    end_position_candidate = [idx for idx in end_position_candidate if
                              idx - df_train['select_length'].iloc[i] + 1 in start_position_candidate]

    start_position_candidates.append(start_position_candidate)
    end_position_candidates.append(end_position_candidate)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(len(df_train))):


  0%|          | 0/27481 [00:00<?, ?it/s]

In [40]:
# 如果存在多个候选词，取第一个
start_position_candidates = [l[0] if len(l) > 0 else -1 for l in start_position_candidates]
end_position_candidates = [l[0] if len(l) > 0 else -1 for l in end_position_candidates]

In [41]:
# 测试集合中的开始和结束位置置空 -1
df_train['start_position'] = start_position_candidates
df_train['end_position'] = end_position_candidates
df_test['start_position'] = -1
df_test['end_position'] = -1

In [42]:
df_train = df_train.query('start_position!=-1')

In [44]:
df_train, df_val = train_test_split(df_train, train_size=0.8)

In [45]:
pos_train = df_train.query('sentiment=="positive"')
neg_train = df_train.query('sentiment=="negative"')
neu_train = df_train.query('sentiment=="neutral"')

pos_val = df_train.query('sentiment=="positive"')
neg_val = df_train.query('sentiment=="negative"')
neu_val = df_train.query('sentiment=="neutral"')

pos_test = df_test.query('sentiment=="positive"')
neg_test = df_test.query('sentiment=="negative"')
neu_test = df_test.query('sentiment=="neutral"')

In [46]:
MODEL_DIR = Path('../models/bertforquestionanswering-base-uncased')
pos_model = BertForQuestionAnswering.from_pretrained(MODEL_DIR)
neg_model = BertForQuestionAnswering.from_pretrained(MODEL_DIR)

In [47]:
MAX_LENGTH = 128
BATCH_SIZE = 32

In [48]:
class TestDataset(Dataset):
    def __init__(self, df):
        super().__init__()
        self.texts = df['uncased_text'].values
        self.hash_index = df['textID'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        returns = {
            'text': self.texts[idx],
            'idx': idx
        }
        return returns

class TrainDataset(Dataset):
    def __init__(self, df):
        super().__init__()
        self.texts = df['uncased_text'].values
        self.start_ids = df['start_position'].values
        self.end_ids = df['end_position'].values
        self.hash_index = df['textID'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        returns = {
            'text': self.texts[idx],
            'start': self.start_ids[idx],
            'end': self.end_ids[idx],
            'idx': idx
        }
        return returns

class DataModule(pl.LightningDataModule):
    def __init__(self, df_train, df_val, df_test):
        super().__init__()
        self.save_hyperparameters()

    def train_dataloader(self):
        return DataLoader(TrainDataset(df_train), batch_size=BATCH_SIZE, shuffle=True)
    def val_dataloader(self):
        return DataLoader(TrainDataset(df_val), batch_size=BATCH_SIZE, shuffle=False)
    def test_dataloader(self):
        return DataLoader(TestDataset(df_test), batch_size=BATCH_SIZE, shuffle=False)


In [49]:
class BaseSuperModule(pl.LightningModule):
    def __init__(self, bertmodel, tokenizer, prediction_save_path):
        super().__init__()
        self.bertmodel = bertmodel
        self.tokenizer = tokenizer
        self.prediction_save_path = prediction_save_path

    def get_device(self):
        return self.bertmodel.state_dict()['bert.embeddings.word_embeddings.weight'].device

    def save_predictions(self, start_positions, end_positions):
        d = pd.DataFrame({'start_position': start_positions, 'end_position': end_positions})
        d.to_csv(self.prediction_save_path, index=False)

    def forward(self, batch):
        """
        Input:
            batch(dict), where
                batch['text'] = uncased text: str
                batch['idx'] = raw text: list(int)
                batch['start'] = start position indices : list(int) (for train & val batch only)
                batch['end'] = end position indices : list(int) (for train & val batch only)

        Output:
            For train batch, which has 'start' key and 'end' key:
                Tuple of (loss(int), start_score(torch.tensor), end_score(torch.tensor))
            For test batch, without 'start' key and 'end' key:
                Tuple of (start_score(torch.tensor), end_score(torch.tensor))
        """
        encoded_batch = tokenizer.batch_encode_plus(batch['text'], max_length=MAX_LENGTH, pad_to_max_length=True)
        input_ids = torch.tensor(encoded_batch['input_ids']).to(self.get_device())
        attention_mask = torch.tensor(encoded_batch['attention_mask']).to(self.get_device())
        start_positions = batch['start'].to(self.get_device()) + 1 if 'start' in batch.keys() else None
        end_positions = batch['end'].to(self.get_device()) + 1 if 'end' in batch.keys() else None

        model_inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'start_positions': start_positions,
            'end_positions': end_positions
        }

        return self.bertmodel(**model_inputs)

    def training_step(self, batch, batch_nb):
        """
        (batch) -> (dict or OrderedDict)
        # Caution: key for loss function must exactly be 'loss'.
        """
        idx = batch['idx']
        loss = self.forward(batch)[0]
        return {'loss': loss, 'idx': idx}

    def validation_step(self, batch, batch_nb):
        """
        (batch) -> (dict or OrderedDict)
        # Caution: key for loss function must exactly be 'loss'.
        """
        idx = batch['idx']
        loss = self.forward(batch)[0]
        return {'loss': loss, 'idx': idx}

    def test_step(self, batch, batch_nb):
        """
        (batch) -> (dict or OrderedDict)
        """
        idx = batch['idx']
        start_scores = self.forward(batch)[0]
        end_scores = self.forward(batch)[1]
        return {'start_scores': start_scores, 'end_scores': end_scores, 'idx': idx}

    def training_end(self, outputs):
        """
        outputs(dict) -> loss(dict or OrderedDict)
        # Caution: key must exactly be 'loss'.
        """
        return {'loss': outputs['loss']}

    def validation_end(self, outputs):
        """
        For single dataloader:
            outputs(list of dict) -> (dict or OrderedDict)
        For multiple dataloaders:
            outputs(list of (list of dict)) -> (dict or OrderedDict)
        """
        return {'loss': torch.mean(torch.tensor([output['loss'] for output in outputs])).detach()}

    def test_end(self, outputs):
        """
        For single dataloader:
            outputs(list of dict) -> (dict or OrderedDict)
        For multiple dataloaders:
            outputs(list of (list of dict)) -> (dict or OrderedDict)
        """
        start_scores = torch.cat([output['start_scores'] for output in outputs]).detach().cpu().numpy()
        start_positions = np.argmax(start_scores, axis=1) - 1

        end_scores = torch.cat([output['end_scores'] for output in outputs]).detach().cpu().numpy()
        end_positions = np.argmax(end_scores, axis=1) - 1
        self.save_predictions(start_positions, end_positions)
        return {}

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=2e-5)

In [50]:
pos_module = BaseSuperModule(pos_model, tokenizer, 'pos_pred.csv')
neg_module = BaseSuperModule(neg_model, tokenizer, 'neg_pred.csv')

In [51]:
device = torch.device('cuda:0') if torch.cuda.is_available() else 'cpu'

In [52]:
pos_module.to(device)
neg_module.to(device)

BaseSuperModule(
  (bertmodel): BertForQuestionAnswering(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tr

In [54]:
DEBUG_MODE = False
pos_trainer = pl.Trainer(max_epochs=3, fast_dev_run=DEBUG_MODE)
neg_trainer = pl.Trainer(max_epochs=3, fast_dev_run=DEBUG_MODE)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(
GPU available: True, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [55]:
pos_trainer.fit(pos_module, datamodule=DataModule(pos_train, pos_val, pos_test))

Missing logger folder: /home/seeyou/projects/模板/lightning-hydra-template/notebooks/lightning_logs

  | Name      | Type                     | Params
-------------------------------------------------------
0 | bertmodel | BertForQuestionAnswering | 108 M 
-------------------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
435.573   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [None]:
neg_trainer.fit(neg_module, datamodule=DataModule(neg_train, neg_val, neg_test))