In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

import os
import random

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from collections import Counter
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

import pytorch_lightning as pl

from tqdm import tqdm_notebook as tqdm

In [2]:
def seed_everything(seed=1234):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's global RNG and PyTorch (CPU and all
    CUDA devices), and configures cuDNN for deterministic behaviour so that
    repeated runs of the notebook produce the same results.

    Args:
        seed: Integer seed applied to every generator. Defaults to 1234.
    """
    random.seed(seed)
    # Exported for child processes; hash randomisation of the already
    # running interpreter is decided at start-up and is not changed by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current device (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select different kernels from run to run, which would
    # defeat the deterministic flag above.
    torch.backends.cudnn.benchmark = False
seed_everything()

In [3]:
# Location of the competition CSV files, relative to this notebook.
DIR = Path('../data/tweet-sentiment-extraction')

# Read the train and test splits in one pass over their file names.
df_train, df_test = (pd.read_csv(DIR / csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Force the raw text to `str` (missing values become the literal 'nan'),
# then keep a lower-cased copy alongside the original.
df_train['text'] = df_train['text'].map(str)
df_train['uncased_text'] = df_train['text'].map(str.lower)
# The target span is lower-cased in place so it can be compared with `uncased_text`.
df_train['selected_text'] = df_train['selected_text'].map(lambda raw: str(raw).lower())

# The test split has no `selected_text` processing here; only the text columns are prepared.
df_test['text'] = df_test['text'].map(str)
df_test['uncased_text'] = df_test['text'].map(str.lower)

In [6]:
# Load the BERT tokenizer from a local directory under DIR.
# The directory name suggests an uncased base vocabulary, which would explain
# why the text is lower-cased before tokenization — TODO confirm.
tokenizer = BertTokenizer.from_pretrained(DIR / 'berttokenizer-base-uncased')

In [7]:
# Tokenize
# Each row's lower-cased text (and, for train, the lower-cased target span)
# is turned into a list of tokens by the shared tokenizer.
tokenize = tokenizer.tokenize
df_train['tokenized_text'] = df_train['uncased_text'].map(tokenize)
df_train['tokenized_selected_text'] = df_train['selected_text'].map(tokenize)
df_test['tokenized_text'] = df_test['uncased_text'].map(tokenize)

In [8]:
# Preview the first rows to sanity-check the new uncased and tokenized columns.
df_train.head()

Unnamed: 0,textID,text,selected_text,sentiment,uncased_text,tokenized_text,tokenized_selected_text
0,cb774db0d1,"I`d have responded, if I were going","i`d have responded, if i were going",neutral,"i`d have responded, if i were going","[i, `, d, have, responded, ,, if, i, were, going]","[i, `, d, have, responded, ,, if, i, were, going]"
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,sooo sad,negative,sooo sad i will miss you here in san diego!!!,"[soo, ##o, sad, i, will, miss, you, here, in, ...","[soo, ##o, sad]"
2,088c60f138,my boss is bullying me...,bullying me,negative,my boss is bullying me...,"[my, boss, is, bullying, me, ., ., .]","[bullying, me]"
3,9642c003ef,what interview! leave me alone,leave me alone,negative,what interview! leave me alone,"[what, interview, !, leave, me, alone]","[leave, me, alone]"
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","sons of ****,",negative,"sons of ****, why couldn`t they put them on t...","[sons, of, *, *, *, *, ,, why, couldn, `, t, t...","[sons, of, *, *, *, *, ,]"


In [9]:
# Filter train data
# For every training row, find each position at which the tokenized
# `selected_text` occurs as a contiguous run inside the tokenized tweet.
# Rows without any occurrence end up with empty candidate lists.


def _find_span_starts(tokens, span):
    """Return every start index at which `span` occurs contiguously in `tokens`.

    Args:
        tokens: List of tokens for the full text.
        span: List of tokens for the selected text.

    Returns:
        Ascending list of start indices; empty when `span` is empty or
        does not occur in `tokens`.
    """
    span_len = len(span)
    if span_len == 0:
        # Nothing to locate; avoids indexing into an empty token list.
        return []
    return [
        start
        for start in range(len(tokens) - span_len + 1)
        # Compare the whole window, not only its first and last token, so a
        # span whose endpoints merely coincide is not accepted.
        if tokens[start:start + span_len] == span
    ]


start_position_candidates = []
end_position_candidates = []
df_train['select_length'] = df_train['tokenized_selected_text'].map(len)

# Iterate the two token columns in lock-step instead of repeated `.iloc` lookups.
token_pairs = zip(df_train['tokenized_text'], df_train['tokenized_selected_text'])
for text_tokens, selected_tokens in tqdm(token_pairs, total=len(df_train)):
    span_starts = _find_span_starts(text_tokens, selected_tokens)
    # Inclusive end index of each match; stays aligned with `span_starts`.
    span_ends = [start + len(selected_tokens) - 1 for start in span_starts]

    start_position_candidates.append(span_starts)
    end_position_candidates.append(span_ends)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm(range(len(df_train))):


  0%|          | 0/27481 [00:00<?, ?it/s]

In [10]:
# Collapse each per-row candidate list to its first entry; -1 marks rows
# for which no candidate position was found.
start_position_candidates = [next(iter(cands), -1) for cands in start_position_candidates]
end_position_candidates = [next(iter(cands), -1) for cands in end_position_candidates]

In [11]:
# Attach the located span boundaries to the training frame.
df_train['start_position'], df_train['end_position'] = (
    start_position_candidates,
    end_position_candidates,
)
# The test frame gets the same columns filled with the -1 placeholder.
df_test['start_position'] = df_test['end_position'] = -1

In [12]:
# Keep only the rows whose selected span was located (-1 means "not found").
span_found = df_train['start_position'] != -1
df_train = df_train.loc[span_found]

In [13]:
# Hold out 20% of the remaining rows for validation.
# No `random_state` is passed, so the shuffle draws from NumPy's global RNG;
# it is reproducible only when the cells run in order after seed_everything().
# NOTE: `df_train` is rebound here, so re-running this cell splits the
# already reduced training frame again.
df_train, df_val = train_test_split(df_train, train_size=0.8)

In [14]:
# Partition every split into one subset per sentiment label.
pos_train = df_train.query('sentiment=="positive"')
neg_train = df_train.query('sentiment=="negative"')
neu_train = df_train.query('sentiment=="neutral"')

# Validation subsets are taken from the held-out frame `df_val`; slicing
# `df_train` here would make them identical to the training subsets.
pos_val = df_val.query('sentiment=="positive"')
neg_val = df_val.query('sentiment=="negative"')
neu_val = df_val.query('sentiment=="neutral"')

pos_test = df_test.query('sentiment=="positive"')
neg_test = df_test.query('sentiment=="negative"')
neu_test = df_test.query('sentiment=="neutral"')

In [16]:
# Section marker: the cells below set up BERT fine-tuning. The bare string is
# echoed as this cell's output.
"""BERT fine-tuning"""

'BERT fine-tuning'

In [None]:
# Two separate question-answering models are created from the same pretrained
# directory; presumably one is fine-tuned per sentiment (positive / negative).
# NOTE(review): this path lives under ../input, whereas the tokenizer and CSVs
# are read from DIR (../data/...) — confirm both locations exist where this runs.
pos_model = BertForQuestionAnswering.from_pretrained('../input/bertforquestionanswering-base-uncased')
neg_model = BertForQuestionAnswering.from_pretrained('../input/bertforquestionanswering-base-uncased')