In [236]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

In [237]:
# Name of the pretrained checkpoint; the tokenizer is loaded from it here.
# Alternative checkpoint: 'distilbert-base-german-cased'
PRE_TRAINED_MODEL_NAME = 'bert-base-german-cased'
tokenizer = BertTokenizer.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
)

In [238]:
# Quick look at what the tokenizer produces for a German sample sentence.
sample_text = "Das hier ist ein deutscher Beispieltext. Und ein zweiter."

# tokenize() only splits the text into word pieces; ids are looked up separately
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# encode() goes straight from text to ids and already adds the special tokens
ids = tokenizer.encode(sample_text)

print(tokens, token_ids, ids, sep='\n')

['Das', 'hier', 'ist', 'ein', 'deutscher', 'Beispiel', '##text', '.', 'Und', 'ein', 'zweiter', '.']
[295, 702, 127, 39, 2433, 2249, 8859, 26914, 1356, 39, 8266, 26914]
[3, 295, 702, 127, 39, 2433, 2249, 8859, 26914, 1356, 39, 8266, 26914, 4]


In [239]:
# Special tokens of the tokenizer, each followed by its vocabulary id.
(
    tokenizer.sep_token, tokenizer.sep_token_id,
    tokenizer.cls_token, tokenizer.cls_token_id,
    tokenizer.pad_token, tokenizer.pad_token_id,
)

('[SEP]', 4, '[CLS]', 3, '[PAD]', 0)

In [240]:
# Location of the data. The default is the encrypted volume used so far; set
# the INWT_DATA_ROOT environment variable to run the notebook on a machine
# where the data is mounted somewhere else.
ROOT = Path(os.environ.get('INWT_DATA_ROOT', '/Volumes/INWT/Daten_NLP/'))  # encrypted folder!
DATA = ROOT / '200707_aachener_zeitung_modified.csv'  # text is already minimally preprocessed

In [241]:
class INWT_Dataset(Dataset):
    """Torch dataset that yields one tokenized text and its target per row.

    Args:
        df: DataFrame with one sample per row.
        target: Name of the column holding the (numeric) target value.
        text_base: Name of the column holding the text to tokenize.
        tokenizer: Object providing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every encoded sequence is truncated / padded to.
    """

    def __init__(self, df, target, text_base, tokenizer, max_len):
        self.df = df
        self.text_base = text_base
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (samples) in the underlying frame."""
        return len(self.df)

    def __getitem__(self, item):
        """Return the sample at position ``item``.

        Returns:
            Dict with the raw ``text``, the flattened ``input_ids`` and
            ``attention_mask`` tensors and the float ``target`` tensor.
        """
        # Samplers hand over positions (0 .. len-1), so index by position; this
        # also works when the frame's index was not reset after filtering.
        text = str(self.df[self.text_base].iloc[item])
        target = np.array(self.df[self.target].iloc[item])

        # encode_plus (instead of encode) because the attention mask is needed too
        encoding = self.tokenizer.encode_plus(text,
                                              max_length=self.max_len,
                                              truncation=True,
                                              #return_token_type_ids=False,
                                              padding='max_length',  # replaces deprecated pad_to_max_length=True
                                              return_attention_mask=True,
                                              return_tensors='pt',
                                              )

        return {'text': text,
                # return_tensors='pt' yields a leading batch axis of size 1; drop it
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                # unsqueeze so that a collated batch has shape (batch_size, 1)
                'target': torch.tensor(target, dtype=torch.float).unsqueeze(dim=-1)
                }

In [242]:
# Read the article table and replace every NaN with an empty string.
df_raw = pd.read_csv(DATA).fillna('')
df_raw.head()

Unnamed: 0,articleId,pageviews,entrances,exits,bounces,timeOnPage,conversions,avgTimeOnPage,stickiness,entranceRate,...,titelH3,wordcount,category,city,text_preprocessed,nr_tokens,mean_token_length,nr_tokens_teaser,nr_tokens_titelH1,avgTimeOnPage/wordcount
0,48620281,21,7,12,7,1012,,112.444444,42.857143,33.333333,...,,769,vm,München/Stuttgart,Frische Luft und Bewegung: Diese Kombination r...,796,5.359296,29,9,0.146222
1,48620381,19,6,11,5,1484,,185.5,42.105263,31.578947,...,,441,vm,Berlin/Frankfurt/Main,"Der Wecker klingelt, aufstehen! Doch gerade im...",452,5.938053,33,8,0.420635
2,48622639,2,2,2,2,0,,0.0,0.0,100.0,...,,390,vm,Berlin,Eltern auf der Suche nach einem guten Babyphon...,396,5.848485,30,7,0.0
3,48623085,32,9,20,9,974,,81.166667,37.5,28.125,...,,345,vm,Berlin,Spülmaschinentabs sollen kleine Alleskönner se...,367,5.594005,30,7,0.235266
4,48623259,24,2,7,2,3797,,223.352941,70.833333,8.333333,...,,182,vm,Berlin,Make-up hat heutzutage einen Zweck: Es soll da...,183,5.622951,22,8,1.227214


In [243]:
# Keep only articles with at least 5 pageviews whose avgTimeOnPage/wordcount
# lies within [0.1, 5].
# (earlier variant, pageviews only:)
#df = df_raw.loc[(df_raw['pageviews'] >= 5)]

df = df_raw.loc[
    lambda frame: (frame['pageviews'] >= 5)
    & frame['avgTimeOnPage/wordcount'].between(0.1, 5)
]

print(len(df))

667


In [244]:
# Split into train / dev / test with a fixed seed: 20 % is held out first and
# then halved into dev and test.
RANDOM_SEED = 123
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=RANDOM_SEED)
df_dev, df_test = train_test_split(df_test, test_size=0.5, shuffle=True, random_state=RANDOM_SEED)

# give every split a fresh index that starts at 0 again
df_train, df_dev, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_dev, df_test)
)
print(df_train.shape, df_dev.shape, df_test.shape)

(533, 36) (67, 36) (67, 36)


In [245]:
df_train.head()

Unnamed: 0,articleId,pageviews,entrances,exits,bounces,timeOnPage,conversions,avgTimeOnPage,stickiness,entranceRate,...,titelH3,wordcount,category,city,text_preprocessed,nr_tokens,mean_token_length,nr_tokens_teaser,nr_tokens_titelH1,avgTimeOnPage/wordcount
0,48859325,15,8,8,7,2002,,286.0,46.666667,53.333333,...,,578,vm,Hamburg,"Weiße Wände, helle Möbel und wenige Accessoire...",594,5.558923,31,5,0.49481
1,50185755,29,1,11,1,2947,,163.722222,62.068966,3.448276,...,,533,vm,Berlin,Maske auf beim Einkaufen oder in Bus und Bahn:...,523,5.502868,27,11,0.307171
2,49278955,7,1,3,1,296,,74.0,57.142857,14.285714,...,,252,vm,Stuttgart,Oldtimer sind gut in Schuss. Je älter die Auto...,253,5.407115,25,7,0.293651
3,49875299,12,1,4,1,622,,77.75,66.666667,8.333333,...,,402,vm,Berlin,"Der Drill-Instuctor brüllt in sein Mikrofon, d...",413,5.372881,27,10,0.193408
4,48885853,17,7,10,6,576,,82.285714,41.176471,41.176471,...,,520,vm,Berlin,Die «Aida Nova»: 2500 Kabinen. Die «Mein Schif...,535,5.685981,29,8,0.158242


In [246]:
def create_DataLoaders(target, text_base, tokenizer, max_len, batch_size):
    """Build the train / dev / test DataLoaders from the notebook-level splits.

    Reads the global frames ``df_train``, ``df_dev`` and ``df_test`` and wraps
    each one in an ``INWT_Dataset`` and a ``DataLoader``.

    Args:
        target: Name of the target column.
        text_base: Name of the text column that gets tokenized.
        tokenizer: Tokenizer handed on to ``INWT_Dataset``.
        max_len: Sequence length handed on to ``INWT_Dataset``.
        batch_size: Batch size used for all three loaders.

    Returns:
        Tuple ``(dl_train, dl_dev, dl_test)``.
    """
    def _make_loader(frame, shuffle):
        # one dataset + loader per split; all splits share the same settings
        dataset = INWT_Dataset(df=frame,
                               target=target,
                               text_base=text_base,
                               tokenizer=tokenizer,
                               max_len=max_len)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)  # , num_workers=4)

    # Only the training data is reshuffled every epoch; dev and test keep their
    # order so that evaluation results stay comparable between runs.
    dl_train = _make_loader(df_train, shuffle=True)
    dl_dev = _make_loader(df_dev, shuffle=False)
    dl_test = _make_loader(df_test, shuffle=False)

    return dl_train, dl_dev, dl_test

In [257]:
# Loaders for the current experiment (alternative text_base: 'titelH1').
dl_train, dl_dev, dl_test = create_DataLoaders(
    target='avgTimeOnPage/wordcount',
    text_base='text_preprocessed',
    tokenizer=tokenizer,
    max_len=50,  # change depending on used text_base!
    batch_size=32,
)

In [258]:
# Inspect a single training batch: its keys, then the tensors and their shapes.
data = next(iter(dl_train))
print(data.keys())

input_ids, attention_mask = data['input_ids'], data['attention_mask']
print(input_ids, input_ids.shape, sep='\n')
print(attention_mask, attention_mask.shape, sep='\n')
print(data['target'].shape)

dict_keys(['text', 'input_ids', 'attention_mask', 'target'])
tensor([[    3, 19206, 18285,  ..., 26914,     2,     4],
        [    3,  8359,   772,  ..., 26901,   287,     4],
        [    3, 19024, 21258,  ...,  4140, 10687,     4],
        ...,
        [    3, 10796,   961,  ..., 20665,  5032,     4],
        [    3,  1309,   498,  ...,   204,  2197,     4],
        [    3,   198,  8630,  ..., 12122, 15333,     4]])
torch.Size([32, 50])
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])
torch.Size([32, 50])
torch.Size([32, 1])


## Model

In [141]:
# das ist umständlich (und auch falsch), ich habe stattdessen bereits BertForSequenceClassification genommen
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification

#class Bert_regression(nn.Module):
#    
#    def __init__(self, n_outputs): # maybe train pageviews and timeOnPage simultaneously?
#        super(Bert_regression, self).__init__()
#        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
#        self.drop = nn.Dropout(p=0.3)
#        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)
#
#    def forward(self, input_ids, attention_mask):
#        _, pooled_output = self.bert(input_ids=input_ids,           # das hier ist glaube ich nicht sinnvoll bei mir
#                                     attention_mask=attention_mask)
#        output = self.drop(pooled_output)
#   
#        return self.out(output)

In [None]:
#model = Bert_regression(n_outputs = 1)

In [None]:
#model = model.to(device)

In [259]:
# Pretrained BERT with a freshly initialised head that has a single output.
# Per the transformers documentation, num_labels == 1 makes the model a
# regression model (its built-in loss is then mean squared error).
model = BertForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    num_labels=1,
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

In [None]:
#model = model.to(device)

In [260]:
# Smoke test: run the batch inspected above through the model. The head is
# untrained, so the values mean nothing yet; only the output shape matters.
model(input_ids=input_ids, attention_mask=attention_mask)

(tensor([[-0.0722],
         [-0.2745],
         [-0.2264],
         [-0.3081],
         [-0.0234],
         [-0.3807],
         [-0.2915],
         [-0.3310],
         [-0.5103],
         [-0.0482],
         [-0.3297],
         [-0.3032],
         [-0.2080],
         [-0.2565],
         [-0.4692],
         [-0.0347],
         [-0.2431],
         [-0.0141],
         [-0.2056],
         [-0.4307],
         [-0.1809],
         [-0.2822],
         [-0.4090],
         [-0.2577],
         [-0.2373],
         [-0.2155],
         [-0.1640],
         [-0.3332],
         [-0.3144],
         [-0.2914],
         [-0.1223],
         [-0.2668]], grad_fn=<AddmmBackward>),)

## Training

In [261]:
# Training setup: mean squared error as loss, AdamW as optimizer.
loss_fn = nn.MSELoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5)

In [262]:
EPOCHS = 10


def _run_epoch(model, loader, loss_fn, optimizer=None):
    """Run one pass over ``loader`` and return the list of per-batch losses.

    With an ``optimizer`` the model is put into training mode and updated
    after every batch. Without one the model is put into eval mode and the
    pass runs with gradient tracking disabled.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is the same as eval()
    losses = []

    with torch.set_grad_enabled(training):
        for nr, batch in enumerate(loader):
            print("-Batch", nr, end='\r')
            # The model is called without labels, so element [0] of the
            # returned tuple is the tensor of raw outputs (logits).
            outputs = model(input_ids=batch["input_ids"],
                            attention_mask=batch["attention_mask"])[0]

            loss = loss_fn(outputs, batch["target"])
            losses.append(loss.item())

            if training:
                # Zero first: gradients left over from an interrupted earlier
                # run of this cell must not leak into the current update.
                optimizer.zero_grad()
                loss.backward()
                #nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

    return losses


for epoch in range(EPOCHS):
    print("Epoch", epoch)

    ### TRAINING on train
    print("training")
    train_losses = _run_epoch(model, dl_train, loss_fn, optimizer)
    print("Mean train loss:", np.mean(train_losses))

    ### EVALUATING on dev
    print("evaluating")
    eval_losses = _run_epoch(model, dl_dev, loss_fn)
    print("Mean eval loss:", np.mean(eval_losses))

Epoch 0
training
-Batch 6

KeyboardInterrupt: 

## predict dev set

In [218]:
import scipy.stats as st

In [219]:
# Dev loader that delivers the whole dev split as one batch. The batch size is
# derived from the split instead of being hard-coded, so it stays a single
# batch when the row filter (and with it the split size) changes.
# NOTE(review): max_len is 100 here while the training loaders are built with
# max_len = 50 - confirm that this difference is intended.
_, dl_dev_oneBatch, _ = create_DataLoaders(target = 'avgTimeOnPage/wordcount',
                                           text_base = 'text_preprocessed', # 'teaser'
                                           tokenizer = tokenizer,
                                           max_len = 100,            # change depending on used text_base!
                                           batch_size = max(len(df_dev), 1)) # whole dev split in one batch

In [220]:
# Predict the dev split and correlate the predictions with the true targets.
# Switch to eval mode first: an interrupted training run leaves the model in
# training mode, where dropout would make the predictions noisy.
model.eval()

batch_preds, batch_targets = [], []
with torch.no_grad():
    for d in dl_dev_oneBatch:
        # no labels are passed, so element [0] of the output is the logits tensor
        logits = model(input_ids=d["input_ids"], attention_mask=d["attention_mask"])[0]
        # squeeze only the last axis so that a batch of size 1 stays 1-d
        batch_preds.append(logits.squeeze(dim=-1))
        batch_targets.append(d["target"].squeeze(dim=-1))

# Evaluate once over all batches, so the result does not depend on batch size.
pred_dev = torch.cat(batch_preds)
y_dev = torch.cat(batch_targets)
print(pred_dev)
print(y_dev)
print(st.pearsonr(pred_dev.cpu().numpy(), y_dev.cpu().numpy()))

tensor([12.2933, 12.3086, 12.3034, 12.2823, 12.2906, 12.2941, 12.3136, 12.2987,
        12.2861, 12.2873, 12.2507, 12.2997, 12.2819, 12.2853, 12.3088, 12.2795,
        12.2993, 12.2814, 12.2881, 12.3007, 12.2934, 12.2879, 12.2927, 12.2814,
        12.2832, 12.3043, 12.3022, 12.2989, 12.2966, 12.3052, 12.2808, 12.3051,
        12.2821, 12.2927, 12.2764, 12.2815, 12.2991, 12.2824, 12.2783, 12.2737,
        12.2980, 12.3045, 12.2785, 12.2990, 12.2982, 12.3028, 12.2725, 12.2922,
        12.2856, 12.2970, 12.2983, 12.2904, 12.2839, 12.2974, 12.2993, 12.2653,
        12.3095, 12.2745, 12.2972, 12.2916, 12.3087, 12.2794, 12.2994, 12.2933,
        12.2878, 12.3078, 12.2545, 12.2969, 12.2767, 12.2955, 12.3029, 12.3008,
        12.2982, 12.2705, 12.2950, 12.2845, 12.2903, 12.3097, 12.3059, 12.2845,
        12.2968, 12.2984, 12.3076, 12.3052, 12.2739, 12.2729, 12.2879, 12.3032,
        12.2816])
tensor([3.0470e+03, 3.3000e+01, 2.0000e+01, 6.4000e+01, 1.0000e+00, 1.2000e+01,
        2.0000e+00, 2.

## Kommentare:
* Das Modell lernt nicht! Liegt das einfach an der geringen Datenmenge oder ist im Training was falsch?
* anderer Optimizer? Ist Loss richtig berechnet?
* Oder liegt das Problem bereits beim DataSet, Dataloader?