In [1]:
import transformers
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

import utils

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

[nltk_data] Downloading package punkt to /home/ruecker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
torch.cuda.is_available()

True

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
PRE_TRAINED_MODEL_NAME = 'bert-base-german-cased' # 'distilbert-base-german-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

cuda:0


In [4]:
sample_text = "Das hier ist ein deutscher Beispieltext. Und einen zweiten müssen wir auch noch haben."
tokens = tokenizer.tokenize(sample_text) # just tokenizes
token_ids = tokenizer.convert_tokens_to_ids(tokens)
ids = tokenizer.encode(sample_text) # already adds special tokens
encoded_plus = tokenizer.encode_plus(sample_text,
                                     max_length = 10,
                                     return_token_type_ids=False,
                                     pad_to_max_length=True,
                                     return_attention_mask=True,)

print(tokens)
print(token_ids)
print(ids)
print("---")
print(encoded_plus)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


['Das', 'hier', 'ist', 'ein', 'deutscher', 'Beispiel', '##text', '.', 'Und', 'einen', 'zweiten', 'müssen', 'wir', 'auch', 'noch', 'haben', '.']
[295, 702, 127, 39, 2433, 2249, 8859, 26914, 1356, 303, 1909, 1475, 232, 194, 357, 474, 26914]
[3, 295, 702, 127, 39, 2433, 2249, 8859, 26914, 1356, 303, 1909, 1475, 232, 194, 357, 474, 26914, 4]
---
{'input_ids': [3, 295, 702, 127, 39, 2433, 2249, 8859, 26914, 4], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [5]:
#tokenizer.get_vocab() # shows tokenizer vocab (subwords!)

In [6]:
tokenizer.sep_token, tokenizer.sep_token_id, tokenizer.cls_token, tokenizer.cls_token_id, tokenizer.pad_token, tokenizer.pad_token_id

('[SEP]', 4, '[CLS]', 3, '[PAD]', 0)

In [7]:
#ROOT = Path('/Volumes/INWT/Daten_NLP/') # local (Laptop)
ROOT = Path('/home/ruecker/data/Daten_INWT/') # JULIE-Server

DATA = ROOT / '200707_aachener_zeitung_modified.csv' # text is already minimal preprocessed

In [8]:
class INWT_Dataset(Dataset):

    def __init__(self, df, target, text_base, tokenizer, max_len):
        self.df = df
        self.text_base = text_base
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        text = str(self.df.loc[item, self.text_base])
        target = np.array(self.df.loc[item, self.target])

        # hier einfach encode() nehmen? brauche ich die attention_mask etc?
        encoding = self.tokenizer.encode_plus(text,
                                              max_length=self.max_len,
                                              truncation=True,
                                              return_token_type_ids=False,
                                              pad_to_max_length=True,
                                              return_attention_mask=True,
                                              return_tensors='pt',
                                              )

        return {'text': text,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'target': torch.tensor(target, dtype=torch.float).unsqueeze(dim=-1) # unsqueezing so shape (batch_size,1)
                }

In [9]:
df_raw = pd.read_csv(DATA)
df_raw = df_raw.fillna('') # replacing Nan with emtpy string
df_raw.head()

Unnamed: 0,articleId,pageviews,entrances,exits,bounces,timeOnPage,conversions,avgTimeOnPage,stickiness,entranceRate,...,nr_tokens,mean_token_length,nr_tokens_teaser,nr_tokens_titelH1,nr_char,nr_sentences,mean_sentence_length,avgTimeOnPage/wordcount,avgTimeOnPage/nr_char,pageviews-exits
0,48620281,21,7,12,7,1012,,112.444444,42.857143,33.333333,...,796,5.359296,29,9,5148,52.0,15.307692,0.146222,0.021842,9
1,48620381,19,6,11,5,1484,,185.5,42.105263,31.578947,...,452,5.938053,33,8,3182,28.0,16.142857,0.420635,0.058297,8
2,48622639,2,2,2,2,0,,0.0,0.0,100.0,...,396,5.848485,30,7,2776,26.0,15.230769,0.0,0.0,0
3,48623085,32,9,20,9,974,,81.166667,37.5,28.125,...,367,5.594005,30,7,2442,23.0,15.956522,0.235266,0.033238,12
4,48623259,24,2,7,2,3797,,223.352941,70.833333,8.333333,...,183,5.622951,22,8,1243,8.0,22.875,1.227214,0.179689,17


In [10]:
# just take articles where ...
#df = df_raw.loc[(df_raw['pageviews'] >= 5)]

#df = df_raw.loc[(df_raw['pageviews'] >= 5) &
#                (df_raw['avgTimeOnPage/wordcount'] <= 5) &
#                (df_raw['avgTimeOnPage/wordcount'] >= 0.1)]

# or all...
df = df_raw

print(len(df))

891


In [11]:
#creating train, dev, test
RANDOM_SEED = 123
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)
df_dev, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED, shuffle=True)
df_train.reset_index(drop=True, inplace=True) # so that index starts with 0 again
df_dev.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
print(df_train.shape, df_dev.shape, df_test.shape)

(712, 41) (89, 41) (90, 41)


In [12]:
df_train.head()

Unnamed: 0,articleId,pageviews,entrances,exits,bounces,timeOnPage,conversions,avgTimeOnPage,stickiness,entranceRate,...,nr_tokens,mean_token_length,nr_tokens_teaser,nr_tokens_titelH1,nr_char,nr_sentences,mean_sentence_length,avgTimeOnPage/wordcount,avgTimeOnPage/nr_char,pageviews-exits
0,49583837,28,8,14,7,3680,,262.857143,50.0,28.571429,...,655,6.09771,27,7,4710,40.0,16.375,0.417234,0.055808,14
1,51204297,4,0,1,0,417,,139.0,75.0,0.0,...,77,6.363636,24,6,579,6.0,12.833333,1.7375,0.240069,3
2,49230331,4,1,1,1,171,,57.0,75.0,25.0,...,124,6.201613,24,6,910,10.0,12.4,0.448819,0.062637,3
3,49825661,5,2,2,2,1501,,500.333333,60.0,40.0,...,178,5.52809,28,8,1180,10.0,17.8,2.842803,0.424011,3
4,48897105,5,2,2,2,124,,41.333333,60.0,40.0,...,146,6.410959,25,7,1104,10.0,14.6,0.277405,0.03744,3


In [13]:
def create_DataLoaders(target, text_base, tokenizer, max_len, batch_size):
    # creating DataSets
    ds_train = INWT_Dataset(df=df_train,
                  target = target,
                  text_base = text_base,
                  tokenizer=tokenizer,
                  max_len = max_len)
    ds_dev = INWT_Dataset(df=df_dev,
                  target = target,
                  text_base = text_base,
                  tokenizer=tokenizer,
                  max_len = max_len)
    ds_test = INWT_Dataset(df=df_test,
                  target = target,
                  text_base = text_base,
                  tokenizer=tokenizer,
                  max_len = max_len)
    
    # creating DataLoaders
    dl_train = DataLoader(ds_train, batch_size=batch_size, num_workers=4, shuffle=True)
    dl_dev = DataLoader(ds_dev, batch_size=batch_size, num_workers=4)
    dl_test = DataLoader(ds_test, batch_size=batch_size, num_workers=4)
    
    return dl_train, dl_dev, dl_test

In [14]:
dl_train, dl_dev, dl_test = create_DataLoaders(target = 'avgTimeOnPage/wordcount',
                                               text_base = 'text_preprocessed', # 'titelH1',
                                               tokenizer = tokenizer, 
                                               max_len = 100,            # change depending on used text_base!
                                               batch_size = 8)

In [15]:
# have a look at one batch
data = next(iter(dl_train))
print(data.keys())
input_ids = data['input_ids']
#print(input_ids)
print(input_ids.shape)
attention_mask = data['attention_mask']
#print(attention_mask)
print(attention_mask.shape)
print(data['target'].shape)

dict_keys(['text', 'input_ids', 'attention_mask', 'target'])
torch.Size([8, 100])
torch.Size([8, 100])
torch.Size([8, 1])


## Model

In [16]:
# das ist umständlich (und auch falsch), ich habe stattdessen bereits BertForSequenceClassification genommen
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification

#class Bert_regression(nn.Module):
#    
#    def __init__(self, n_outputs): # maybe train pageviews and timeOnPage simultaneously?
#        super(Bert_regression, self).__init__()
#        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
#        self.drop = nn.Dropout(p=0.3)
#        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)
#
#    def forward(self, input_ids, attention_mask):
#        _, pooled_output = self.bert(input_ids=input_ids,           # das hier ist glaube ich nicht sinnvoll bei mir
#                                     attention_mask=attention_mask)
#        output = self.drop(pooled_output)        
#        return self.out(output)

In [17]:
#model = Bert_regression(n_outputs = 1)
#model = model.to(device)

In [18]:
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME,
                                                      num_labels = 1, # turns "classification" into regression?
                                                      output_attentions = False,
                                                      output_hidden_states = False,
                                                      )
model = model.to(device)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

In [19]:
# try ut with one batch (model is not trained yet so nothing exciting to be expected)
model(input_ids.to(device), attention_mask.to(device)) # semms good!

(tensor([[0.4910],
         [0.1137],
         [0.2864],
         [0.0126],
         [0.0241],
         [0.2552],
         [0.1826],
         [0.2770]], device='cuda:0', grad_fn=<AddmmBackward>),)

## Training

In [19]:
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.MSELoss()  # mean squared loss

In [20]:
EPOCHS = 20


for epoch in range(EPOCHS):
    print("Epoch", epoch)
    
    ### TRAINING on train
    print("training")
    model = model.train()
    train_losses = []
    
    for nr, d in enumerate(dl_train):
        print("-Batch", nr, end='\r')
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["target"].to(device)
        #print(targets.shape)    
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)[0] # stimmt das so? ist [0] die logits?
        #print(outputs.shape)
        
        loss = loss_fn(outputs, targets)
        train_losses.append(loss.item())
        loss.backward()

        #nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()
    
        #print(np.mean(train_losses))
    print("Mean train loss:", np.mean(train_losses))
    
    ### EVALUATING on dev
    print("evaluating")
    model = model.eval()
    eval_losses = []
    
    with torch.no_grad():
        for nr, d in enumerate(dl_dev):
            print("-Batch", nr, end='\r')
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["target"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)[0] # stimmt das so?
        
            loss = loss_fn(outputs, targets)
            eval_losses.append(loss.item())
            #print(np.mean(eval_losses))
        print("Mean eval loss:", np.mean(eval_losses))

Epoch 0
training
Mean train loss: 0.2738990727812052
evaluating
Mean eval loss: 0.8107139143206344
Epoch 1
training
Mean train loss: 0.2092793703162938
evaluating
Mean eval loss: 0.8263743265997618
Epoch 2
training
Mean train loss: 0.14616796801264367
evaluating
Mean eval loss: 0.8370372052304447
Epoch 3
training
Mean train loss: 0.10060928159215476
evaluating
Mean eval loss: 0.8128398333986601
Epoch 4
training
Mean train loss: 0.059061058796942234
evaluating
Mean eval loss: 0.8307045791320604
Epoch 5
training
Mean train loss: 0.04686081623972467
evaluating
Mean eval loss: 0.8392508854934325
Epoch 6
training
-Batch 61

Process Process-54:
Process Process-53:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ruecker/miniconda3/envs/GPU/lib/python3.7/multiprocessing/process.py", line 300, in _bootstrap
    util._exit_function()
  File "/home/ruecker/miniconda3/envs/GPU/lib/python3.7/multiprocessing/util.py", line 337, in _exit_function
    _run_finalizers()
  File "/home/ruecker/miniconda3/envs/GPU/lib/python3.7/multiprocessing/util.py", line 277, in _run_finalizers
    finalizer()
  File "/home/ruecker/miniconda3/envs/GPU/lib/python3.7/multiprocessing/util.py", line 201, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/home/ruecker/miniconda3/envs/GPU/lib/python3.7/multiprocessing/queues.py", line 192, in _finalize_join
    thread.join()
  File "/home/ruecker/miniconda3/envs/GPU/lib/python3.7/threading.py", line 1044, in join
    self._wait_for_tstate_lock()
  File "/home/ruecker/miniconda3/envs/GPU/lib/python3.7/multiprocessing/process.py", 

KeyboardInterrupt: 

## predict dev set

In [21]:
import scipy.stats as st

In [28]:
_, dl_dev_oneBatch, _ = create_DataLoaders(target = 'avgTimeOnPage/wordcount',
                                               text_base = 'text_preprocessed', # 'teaser'
                                               tokenizer = tokenizer, 
                                               max_len = 100,            # change depending on used text_base!
                                               batch_size = 89) # for now: just one batch

In [29]:
with torch.no_grad():
    for d in dl_dev_oneBatch:
        pred_dev = model(input_ids=d["input_ids"].to(device), attention_mask=d["attention_mask"].to(device))[0] # just the logits?
        y_dev = d["target"].to(device)
        
        pred_dev = pred_dev.squeeze().cpu()
        y_dev = y_dev.squeeze().cpu()
        print(pred_dev)
        print(y_dev)
        print(st.pearsonr(pred_dev, y_dev))

tensor([ 0.5329,  0.2562,  0.4639,  0.3975,  0.3378,  0.1366,  0.3866,  0.5034,
         0.4795,  0.3761,  0.3445,  0.4251,  0.8253,  0.2645,  0.3172,  0.6630,
         0.3116,  0.1757,  0.4417,  0.0116,  0.5449,  0.9076,  0.1101,  0.7867,
         1.0569,  0.3484,  0.5437,  0.1672,  0.3955,  0.1168,  0.7323,  0.6364,
         0.1067,  0.4060,  0.7389,  0.3821,  0.5545,  0.4256,  0.4132,  1.2090,
         0.2423,  0.3605, -0.0208,  0.1978,  0.2604,  0.3660,  0.2688,  0.3128,
         0.3575,  0.2891,  0.1810,  0.1984,  0.1762,  0.3948,  0.4105,  0.7182,
         0.3595,  0.6303,  0.7492,  0.5521,  0.5746,  0.3578, -0.0190,  0.9274,
         0.2742,  0.1476,  0.2406,  0.3628,  0.4523,  0.3634,  0.4850,  0.3599,
         0.5240,  0.1677,  0.9379,  0.4969,  0.4776,  0.4500,  0.3288,  0.3787,
         0.3845,  0.1362,  0.2795,  0.4424,  0.5441,  0.2259,  0.1552,  0.3783,
         0.1311])
tensor([0.4269, 0.2815, 0.0705, 0.0231, 0.0000, 0.2753, 6.7727, 0.4919, 0.1215,
        0.2277, 0.0000

## Kommentare:
* ist jetzt auf GPU
* Overfitting! Dropout? zu wenig Daten?