In [1]:
import torch
from torch import optim, nn
import pandas as pd
import numpy as np

from master_thesis.src import utils, data, models

In [2]:
assert torch.cuda.is_available()
device = torch.device('cuda:0')
print("Device is: ", device)

Device is:  cuda:0


In [3]:
# get pretrained model and tokenizer from huggingface's transformer library
model, tokenizer = models.get_model_and_tokenizer()
model.to(device)

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoi

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [4]:
# get raw data
df = pd.read_csv(utils.DATA / 'combined.tsv', sep = '\t')
df = df.fillna('') # replacing Nan with emtpy string
print("Shape of raw df:", df.shape)

# just take articles with ...
df = df.loc[(df['pageviews'] >= 100) & # hier war vorher 20
            (df['publisher'] == 'bonn')
            #(df['nr_tokens'] >= 10) &  # to delete articles without text or false text
            #(df['avgTimeOnPagePerNr_tokens'] <= 3) & # hier war vorher 4
            #(df['avgTimeOnPagePerNr_tokens'] >= 0.1) # hier war vorher 0.01
            ]
print("Remaining df after conditioning:", df.shape)

  interactivity=interactivity, compiler=compiler, result=result)


Shape of raw df: (104046, 42)
Remaining df after conditioning: (3061, 42)


In [8]:
df.head().titelH1

0     Meteorologischer Winter endet mit Sturm, Glätt...
4     Palästinenser weisen Trumps Nahost-Plan scharf...
12    Coronavirus breitet sich aus - Patienten in De...
18      Staus in deutschen Großstädten nehmen weiter zu
24    Baustellen in Sicht: Bahn investiert 12,2 Mill...
Name: titelH1, dtype: object

In [10]:
BATCH_SIZE = 40 # 16
MAX_LEN = 50
dl_train, dl_dev, dl_test = data.create_DataLoaders(df = df,
                                                    target = 'pageviews', # 'avgTimeOnPagePerNr_tokens',
                                                    text_base = 'titelH1', # 'text_preprocessed', # 'titelH1',
                                                    tokenizer = tokenizer,
                                                    max_len = MAX_LEN,            # change depending on used text_base!
                                                    batch_size = BATCH_SIZE)

In [13]:
len(dl_train)

62

In [16]:
# have a look at one batch in dl_train to see if shapes make sense
data = next(iter(dl_train))
print(data.keys())
input_ids = data['input_ids']
print(input_ids)
print(input_ids.shape)
attention_mask = data['attention_mask']
print(attention_mask)
print(attention_mask.shape)
print(data['target'].shape)
print(data['target'])

dict_keys(['input_ids', 'attention_mask', 'target'])
tensor([[    3, 23405, 11929,  ...,     0,     0,     0],
        [    3,  1108, 26935,  ...,     0,     0,     0],
        [    3,  1972,   153,  ...,     0,     0,     0],
        ...,
        [    3,   103, 11574,  ...,     0,     0,     0],
        [    3,  5418, 26848,  ...,     0,     0,     0],
        [    3,  5418, 16792,  ...,     0,     0,     0]])
torch.Size([40, 50])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
torch.Size([40, 50])
torch.Size([40, 1])
tensor([[ 144.],
        [ 285.],
        [ 986.],
        [ 143.],
        [ 593.],
        [ 218.],
        [7181.],
        [ 143.],
        [ 300.],
        [ 148.],
        [ 137.],
        [1102.],
        [1022.],
        [ 208.],
        [ 155.],
        [ 164.],
        [ 437.],
        [1401.],

In [21]:
# loss and optimizer
LEARNING_RATE = 0.01 #0.001 # 0.00001
#optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

loss_fn = nn.MSELoss()  # mean squared error

In [20]:
print(optimizer, loss_fn)

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.01
    weight_decay: 0.01
) MSELoss()


In [27]:
model(input_ids=input_ids, attention_mask=attention_mask)[0]  # stimmt das so? ist [0] die logits?

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 10.91 GiB total capacity; 5.81 GiB already allocated; 12.56 MiB free; 108.80 MiB cached)

In [28]:
##### TRAINING AND EVALUATING #####

EPOCHS = 20

for epoch in range(EPOCHS):
    print("Epoch", epoch)

    ### TRAINING on train
    print("training")
    model = model.train()
    train_losses = []

    for nr, d in enumerate(dl_train):
        print("-Batch", nr, end='\r')
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["target"].to(device)
        # print(targets.shape)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)[0]  # stimmt das so? ist [0] die logits?
        # print(outputs.shape)
        #print(outputs[:10])
        loss = loss_fn(outputs, targets)
        train_losses.append(loss.item())
        loss.backward()

        # nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

        # print(np.mean(train_losses))
    print("Mean train loss:", np.mean(train_losses))

    ### EVALUATING on dev
    print("evaluating")
    model = model.eval()
    eval_losses = []

    with torch.no_grad():
        for nr, d in enumerate(dl_dev):
            print("-Batch", nr, end='\r')
            input_ids = d["input_ids"].to(device)
            #print(input_ids[:10])
            attention_mask = d["attention_mask"].to(device)
            #print(attention_mask[:10])
            targets = d["target"].to(device)
            #print(targets[:10])
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)[0]  # stimmt das so?
            print(outputs[:10])

            loss = loss_fn(outputs, targets)
            eval_losses.append(loss.item())
            # print(np.mean(eval_losses))
        print("Mean eval loss:", np.mean(eval_losses))

Epoch 0
training
-Batch 0

RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 10.91 GiB total capacity; 5.81 GiB already allocated; 12.56 MiB free; 108.74 MiB cached)