In [1]:
import transformers
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

from master_thesis.src import utils

from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
torch.cuda.is_available()

False

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
PRE_TRAINED_MODEL_NAME = 'bert-base-german-cased' # 'distilbert-base-german-cased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

cpu


In [7]:
sample_text = "Das hier ist ein deutscher Beispieltext. Und einen zweiten müssen wir auch noch haben."
tokens = tokenizer.tokenize(sample_text) # just tokenizes
token_ids = tokenizer.convert_tokens_to_ids(tokens)
ids = tokenizer.encode(sample_text) # already adds special tokens
encoded_plus = tokenizer.encode_plus(sample_text,
                                     max_length = 10,
                                     return_token_type_ids=False,
                                     pad_to_max_length=True,
                                     return_attention_mask=True,)

print(tokens)
print(token_ids)
print(ids)
print("---")
print(encoded_plus)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


['Das', 'hier', 'ist', 'ein', 'deutscher', 'Beispiel', '##text', '.', 'Und', 'einen', 'zweiten', 'müssen', 'wir', 'auch', 'noch', 'haben', '.']
[295, 702, 127, 39, 2433, 2249, 8859, 26914, 1356, 303, 1909, 1475, 232, 194, 357, 474, 26914]
[3, 295, 702, 127, 39, 2433, 2249, 8859, 26914, 1356, 303, 1909, 1475, 232, 194, 357, 474, 26914, 4]
---
{'input_ids': [3, 295, 702, 127, 39, 2433, 2249, 8859, 26914, 4], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [8]:
#tokenizer.get_vocab() # shows tokenizer vocab (subwords!)

In [6]:
tokenizer.sep_token, tokenizer.sep_token_id, tokenizer.cls_token, tokenizer.cls_token_id, tokenizer.pad_token, tokenizer.pad_token_id

('[SEP]', 4, '[CLS]', 3, '[PAD]', 0)

In [9]:
#ROOT = Path('/Volumes/INWT/Daten_NLP/') # local (Laptop)
#ROOT = Path('/home/ruecker/data/Daten_INWT/') # JULIE-Server

#DATA = ROOT / '200707_aachener_zeitung_modified.csv' # text is already minimal preprocessed

In [10]:
class INWT_Dataset(Dataset):

    def __init__(self, df, target, text_base, tokenizer, max_len):
        self.df = df
        self.text_base = text_base
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, item):
        text = str(self.df.loc[item, self.text_base])
        target = np.array(self.df.loc[item, self.target])

        # hier einfach encode() nehmen? brauche ich die attention_mask etc?
        encoding = self.tokenizer.encode_plus(text,
                                              max_length=self.max_len,
                                              truncation=True,
                                              return_token_type_ids=False,
                                              pad_to_max_length=True,
                                              return_attention_mask=True,
                                              return_tensors='pt',
                                              )

        return {'text': text,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'target': torch.tensor(target, dtype=torch.float).unsqueeze(dim=-1) # unsqueezing so shape (batch_size,1)
                }

In [11]:
df_raw = utils.get_raw_df()
#df_raw = df_raw.fillna('') # replacing Nan with emtpy string
df_raw.head()

Shared columns: {'dpaGuid', 'avgTimeOnPage', 'bounces', 'avgTimeOnPagePerWordcount', 'pageviews_percentile', 'titel', 'nr_tokens_text', 'zeilen', 'nr_tokens_publisher', 'prozentVerlag', 'entrances', 'article_text', 'pagePath', 'avgTimeOnPage_percentile', 'prozentDpa', 'timeOnPage', 'pageviews', 'category', 'rubric', 'publisher', 'exits', 'date'}
Shape of raw df: (92020, 22)


Unnamed: 0_level_0,dpaGuid,avgTimeOnPage,bounces,avgTimeOnPagePerWordcount,pageviews_percentile,titel,nr_tokens_text,zeilen,nr_tokens_publisher,prozentVerlag,...,pagePath,avgTimeOnPage_percentile,prozentDpa,timeOnPage,pageviews,category,rubric,publisher,exits,date
articleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SZ_16591,,47.587949,76,0.088949,90.448376,Sparbuch der Kinder ist tabu : Urteil: Vater d...,535,,535,,...,,56.993679,,48968.0,2411,sz-spezial,recht,SZ,1382,
SZ_16595,,40.826923,6,0.163308,3.960089,Gewalt in der Schule : Faustschlag ins Gesicht...,250,,250,,...,,47.765865,,2123.0,92,sz-spezial,recht,SZ,40,
SZ_16723,,38.501931,26,0.166675,70.071579,Abflussrohr der Dachrinne defekt: Gebäudeversi...,231,,231,,...,,43.811973,,9972.0,567,sz-spezial,recht,SZ,308,
SZ_17146,,98.246154,1,0.125154,6.789167,„Er war die Liebe meines Lebens“,785,,785,,...,,84.367253,,6386.0,101,magazine,momente,SZ,36,
SZ_17184,,70.817518,9,0.177933,49.02857,Neuer Laptop kaputt - Rücktritt vom Kauf möglich?,398,,398,,...,,74.262519,,9702.0,261,sz-spezial,recht,SZ,124,


In [25]:
df = df_raw
df = df[df.publisher == "NOZ"]
df = df.sample(frac=0.1, replace=False, random_state=1) # take 20% for faster processing # TODO: change back

print(len(df))

3826


In [26]:
#creating train, dev, test
RANDOM_SEED = 123
df_train, df_test = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)
df_dev, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED, shuffle=True)
df_train.reset_index(drop=True, inplace=True) # so that index starts with 0 again
df_dev.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)
print(df_train.shape, df_dev.shape, df_test.shape)

(3060, 22) (383, 22) (383, 22)


In [27]:
df_train.head()

Unnamed: 0,dpaGuid,avgTimeOnPage,bounces,avgTimeOnPagePerWordcount,pageviews_percentile,titel,nr_tokens_text,zeilen,nr_tokens_publisher,prozentVerlag,...,pagePath,avgTimeOnPage_percentile,prozentDpa,timeOnPage,pageviews,category,rubric,publisher,exits,date
0,,169.016973,2948,0.433377,94.172121,,390,,390,,...,,50.152259,,119495.0,4921,lokales,werlte,NOZ,4214,
1,191219-99-211934,136.306452,3,0.411802,4.195073,""": Demokrat wechselt zu Trumps\nRepublikanern",331,36.0,331,,...,https://www.noz.de/deutschland-welt/politik/ar...,28.956459,96.0,8451.0,100,deutschland-und-welt,politik,NOZ,38,20191220.0
2,,127.804762,44,0.383798,46.674166,,333,,333,,...,,24.12508,,26839.0,362,,www.noz.de,NOZ,152,
3,,164.671674,292,0.47593,75.670394,,346,,346,,...,,47.288883,,76737.0,1169,,www.noz.de,NOZ,703,
4,,575.76,514,0.618432,75.179529,,931,,931,,...,,98.259249,,71970.0,1136,deutschland-und-welt,gut-zu-wissen,NOZ,1011,


In [28]:
def create_DataLoaders(target, text_base, tokenizer, max_len, batch_size):
    # creating DataSets
    ds_train = INWT_Dataset(df=df_train,
                  target = target,
                  text_base = text_base,
                  tokenizer=tokenizer,
                  max_len = max_len)
    ds_dev = INWT_Dataset(df=df_dev,
                  target = target,
                  text_base = text_base,
                  tokenizer=tokenizer,
                  max_len = max_len)
    ds_test = INWT_Dataset(df=df_test,
                  target = target,
                  text_base = text_base,
                  tokenizer=tokenizer,
                  max_len = max_len)
    
    # creating DataLoaders
    dl_train = DataLoader(ds_train, batch_size=batch_size, num_workers=4, shuffle=True)
    dl_dev = DataLoader(ds_dev, batch_size=batch_size, num_workers=4)
    dl_test = DataLoader(ds_test, batch_size=batch_size, num_workers=4)
    
    return dl_train, dl_dev, dl_test

In [37]:
dl_train, dl_dev, dl_test = create_DataLoaders(target = 'avgTimeOnPage',
                                               text_base = 'article_text', # 'titelH1',
                                               tokenizer = tokenizer, 
                                               max_len = 300,            # change depending on used text_base!
                                               batch_size = 12)

In [38]:
# have a look at one batch
data = next(iter(dl_train))
print(data.keys())
input_ids = data['input_ids']
#print(input_ids)
print(input_ids.shape)
attention_mask = data['attention_mask']
#print(attention_mask)
print(attention_mask.shape)
print(data['target'].shape)

dict_keys(['text', 'input_ids', 'attention_mask', 'target'])
torch.Size([12, 300])
torch.Size([12, 300])
torch.Size([12, 1])


## Model

In [39]:
# das ist umständlich (und auch falsch), ich habe stattdessen bereits BertForSequenceClassification genommen
# https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification

#class Bert_regression(nn.Module):
#    
#    def __init__(self, n_outputs): # maybe train pageviews and timeOnPage simultaneously?
#        super(Bert_regression, self).__init__()
#        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
#        self.drop = nn.Dropout(p=0.3)
#        self.out = nn.Linear(self.bert.config.hidden_size, n_outputs)
#
#    def forward(self, input_ids, attention_mask):
#        _, pooled_output = self.bert(input_ids=input_ids,           # das hier ist glaube ich nicht sinnvoll bei mir
#                                     attention_mask=attention_mask)
#        output = self.drop(pooled_output)        
#        return self.out(output)

In [40]:
#model = Bert_regression(n_outputs = 1)
#model = model.to(device)

In [41]:
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME,
                                                      num_labels = 1, # turns "classification" into regression?
                                                      output_attentions = False,
                                                      output_hidden_states = False,
                                                      )
model = model.to(device)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

In [42]:
# try ut with one batch (model is not trained yet so nothing exciting to be expected)
model(input_ids.to(device), attention_mask.to(device)) # semms good!

(tensor([[-0.3009],
         [-0.0678],
         [-0.2119],
         [-0.2000],
         [-0.4979],
         [-0.1847],
         [-0.1303],
         [-0.0585],
         [-0.2620],
         [ 0.2044],
         [-0.2960],
         [-0.2276]], grad_fn=<AddmmBackward>),)

## Training

In [43]:
optimizer = optim.AdamW(model.parameters(), lr=1e-5)
loss_fn = nn.MSELoss()  # mean squared loss

In [None]:
EPOCHS = 20


for epoch in range(EPOCHS):
    print("Epoch", epoch)
    
    ### TRAINING on train
    print("training")
    model = model.train()
    train_losses = []
    
    for nr, d in enumerate(dl_train):
        print("-Batch", nr, end='\r')
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["target"].to(device)
        #print(targets.shape)    
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)[0] # stimmt das so? ist [0] die logits?
        #print(outputs.shape)
        
        loss = loss_fn(outputs, targets)
        train_losses.append(loss.item())
        loss.backward()

        #nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()
    
        #print(np.mean(train_losses))
    print("Mean train loss:", np.mean(train_losses))
    
    ### EVALUATING on dev
    print("evaluating")
    model = model.eval()
    eval_losses = []
    
    with torch.no_grad():
        for nr, d in enumerate(dl_dev):
            print("-Batch", nr, end='\r')
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["target"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)[0] # stimmt das so?
        
            loss = loss_fn(outputs, targets)
            eval_losses.append(loss.item())
            #print(np.mean(eval_losses))
        print("Mean eval loss:", np.mean(eval_losses))

Epoch 0
training
-Batch 249

## predict dev set

In [21]:
import scipy.stats as st

In [28]:
_, dl_dev_oneBatch, _ = create_DataLoaders(target = 'avgTimeOnPage/wordcount',
                                               text_base = 'text_preprocessed', # 'teaser'
                                               tokenizer = tokenizer, 
                                               max_len = 100,            # change depending on used text_base!
                                               batch_size = 89) # for now: just one batch

In [29]:
with torch.no_grad():
    for d in dl_dev_oneBatch:
        pred_dev = model(input_ids=d["input_ids"].to(device), attention_mask=d["attention_mask"].to(device))[0] # just the logits?
        y_dev = d["target"].to(device)
        
        pred_dev = pred_dev.squeeze().cpu()
        y_dev = y_dev.squeeze().cpu()
        print(pred_dev)
        print(y_dev)
        print(st.pearsonr(pred_dev, y_dev))

tensor([ 0.5329,  0.2562,  0.4639,  0.3975,  0.3378,  0.1366,  0.3866,  0.5034,
         0.4795,  0.3761,  0.3445,  0.4251,  0.8253,  0.2645,  0.3172,  0.6630,
         0.3116,  0.1757,  0.4417,  0.0116,  0.5449,  0.9076,  0.1101,  0.7867,
         1.0569,  0.3484,  0.5437,  0.1672,  0.3955,  0.1168,  0.7323,  0.6364,
         0.1067,  0.4060,  0.7389,  0.3821,  0.5545,  0.4256,  0.4132,  1.2090,
         0.2423,  0.3605, -0.0208,  0.1978,  0.2604,  0.3660,  0.2688,  0.3128,
         0.3575,  0.2891,  0.1810,  0.1984,  0.1762,  0.3948,  0.4105,  0.7182,
         0.3595,  0.6303,  0.7492,  0.5521,  0.5746,  0.3578, -0.0190,  0.9274,
         0.2742,  0.1476,  0.2406,  0.3628,  0.4523,  0.3634,  0.4850,  0.3599,
         0.5240,  0.1677,  0.9379,  0.4969,  0.4776,  0.4500,  0.3288,  0.3787,
         0.3845,  0.1362,  0.2795,  0.4424,  0.5441,  0.2259,  0.1552,  0.3783,
         0.1311])
tensor([0.4269, 0.2815, 0.0705, 0.0231, 0.0000, 0.2753, 6.7727, 0.4919, 0.1215,
        0.2277, 0.0000

## Kommentare:
* ist jetzt auf GPU
* Overfitting! Dropout? zu wenig Daten?