In [119]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer

In [5]:
# Identifier of the pretrained checkpoint (cased German BERT) whose vocabulary
# the tokenizer is built from.
PRE_TRAINED_MODEL_NAME = 'bert-base-german-cased'
# Load the tokenizer that belongs to that checkpoint; the vocabulary file is
# downloaded when it is not available locally (see the progress bar output).
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=254728.0, style=ProgressStyle(descripti…




In [28]:
# Walk through the tokenizer API on a short German example.
sample_text = "Das hier ist ein deutscher Beispieltext. Und ein zweiter."

# Step by step: split into sub-word tokens, then look up their vocabulary ids.
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# In one call: encode() additionally wraps the ids in the special tokens
# (compare the leading 3 and trailing 4 in the last printed list).
ids = tokenizer.encode(sample_text)

print(tokens, token_ids, ids, sep='\n')

['Das', 'hier', 'ist', 'ein', 'deutscher', 'Beispiel', '##text', '.', 'Und', 'ein', 'zweiter', '.']
[295, 702, 127, 39, 2433, 2249, 8859, 26914, 1356, 39, 8266, 26914]
[3, 295, 702, 127, 39, 2433, 2249, 8859, 26914, 1356, 39, 8266, 26914, 4]


In [17]:
# Special tokens with their vocabulary ids: separator, classifier, padding.
(
    tokenizer.sep_token, tokenizer.sep_token_id,
    tokenizer.cls_token, tokenizer.cls_token_id,
    tokenizer.pad_token, tokenizer.pad_token_id,
)

('[SEP]', 4, '[CLS]', 3, '[PAD]', 0)

In [35]:
# Data location. The default is the original (encrypted!) volume; set the
# INWT_DATA_DIR environment variable to run the notebook on another machine
# without editing this cell.
ROOT = Path(os.environ.get('INWT_DATA_DIR', '/Volumes/INWT/Daten_NLP/'))
DATA = ROOT / '200707_aachener_zeitung_modified.csv'  # text is already minimally preprocessed

In [145]:
class INWT_Dataset(Dataset):
    """Map-style dataset that tokenizes one text column of a DataFrame.

    Each item is a dict with the keys ``'text'`` (the raw string),
    ``'input_ids'`` and ``'attention_mask'`` (the tokenizer output flattened
    to 1-D tensors) and ``'target'`` (a scalar ``torch.long`` tensor).

    Args:
        df: DataFrame holding both the text column and the target column.
            Rows are addressed by position, so the index does not have to
            be ``0..n-1``.
        target: Name of the column that holds the target value. The value is
            converted to ``torch.long``.
        text_base: Name of the column that holds the text to tokenize.
        tokenizer: Object providing ``encode_plus`` with the keyword arguments
            used in ``__getitem__`` (e.g. a ``BertTokenizer``).
        max_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, df, target, text_base, tokenizer, max_len):
        self.df = df
        self.text_base = text_base
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, item):
        """Return the encoded example at integer position ``item``."""
        # Positional access (.iloc): samplers hand out positions 0..len-1, so
        # label-based .loc would fail or pick the wrong row whenever the
        # frame's index was not reset after splitting or filtering.
        text = str(self.df[self.text_base].iloc[item])
        target = self.df[self.target].iloc[item]

        # encode_plus() is used instead of encode() because it returns the
        # attention mask together with the token ids.
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {'text': text,
                # return_tensors='pt' adds a leading batch axis; drop it so the
                # DataLoader can stack items into (batch, max_len).
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'target': torch.tensor(target, dtype=torch.long)
                }

In [157]:
# Read the article table and replace every NaN with an empty string.
# NOTE(review): fillna('') is applied to all columns, so NaN in numeric columns
# (e.g. `conversions` in the preview) also becomes '' — confirm this is intended.
df = pd.read_csv(DATA).fillna('')
df.head()

Unnamed: 0,articleId,pageviews,entrances,exits,bounces,timeOnPage,conversions,avgTimeOnPage,stickiness,entranceRate,...,titelH3,wordcount,category,city,text_preprocessed,avgTimeOnPage/wordcount,nr_tokens,mean_token_length,nr_tokens_teaser,nr_tokens_titelH1
0,48620281,21,7,12,7,1012,,112.444444,42.857143,33.333333,...,,769,vm,München/Stuttgart,Frische Luft und Bewegung: Diese Kombination r...,0.146222,796,5.359296,29,9
1,48620381,19,6,11,5,1484,,185.5,42.105263,31.578947,...,,441,vm,Berlin/Frankfurt/Main,"Der Wecker klingelt, aufstehen! Doch gerade im...",0.420635,452,5.938053,33,8
2,48622639,2,2,2,2,0,,0.0,0.0,100.0,...,,390,vm,Berlin,Eltern auf der Suche nach einem guten Babyphon...,0.0,396,5.848485,30,7
3,48623085,32,9,20,9,974,,81.166667,37.5,28.125,...,,345,vm,Berlin,Spülmaschinentabs sollen kleine Alleskönner se...,0.235266,367,5.594005,30,7
4,48623259,24,2,7,2,3797,,223.352941,70.833333,8.333333,...,,182,vm,Berlin,Make-up hat heutzutage einen Zweck: Es soll da...,1.227214,183,5.622951,22,8


In [206]:
# Split into train / dev / test: 20 % is held out first and then halved.
RANDOM_SEED = 123
df_train, df_holdout = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)
df_dev, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED, shuffle=True)

# Renumber the rows of every split so that each index runs from 0 again.
df_train = df_train.reset_index(drop=True)
df_dev = df_dev.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

print(df_train.shape, df_dev.shape, df_test.shape)

(712, 36) (89, 36) (90, 36)


In [207]:
# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,articleId,pageviews,entrances,exits,bounces,timeOnPage,conversions,avgTimeOnPage,stickiness,entranceRate,...,titelH3,wordcount,category,city,text_preprocessed,avgTimeOnPage/wordcount,nr_tokens,mean_token_length,nr_tokens_teaser,nr_tokens_titelH1
0,49583837,28,8,14,7,3680,,262.857143,50.0,28.571429,...,,630,vm,Berlin,Viele Deutsche dürften bis zuletzt die Hoffnun...,0.417234,655,6.09771,27,7
1,51204297,4,0,1,0,417,,139.0,75.0,0.0,...,,80,vm,Stuttgart,"Geht in einem Auto ein Assistenzsystem kaputt,...",1.7375,77,6.363636,24,6
2,49230331,4,1,1,1,171,,57.0,75.0,25.0,...,,127,vm,Berlin,Vor der Unterschrift unter den Bauvertrag brau...,0.448819,124,6.201613,24,6
3,49825661,5,2,2,2,1501,,500.333333,60.0,40.0,...,,176,vm,Losheim am See,Beim Kauf von Kettenspray können Motorradbesit...,2.842803,178,5.52809,28,8
4,48897105,5,2,2,2,124,,41.333333,60.0,40.0,...,,149,vm,Krefeld,"Wer häufiger Schmerzen im Gesicht hat, kann Ze...",0.277405,146,6.410959,25,7


In [208]:
# The three datasets differ only in the DataFrame they wrap; target column,
# text column, tokenizer and maximum length are shared.
dataset_kwargs = dict(target='pageviews',
                      text_base='teaser',
                      tokenizer=tokenizer,
                      max_len=200)

ds_train = INWT_Dataset(df=df_train, **dataset_kwargs)
ds_dev = INWT_Dataset(df=df_dev, **dataset_kwargs)
ds_test = INWT_Dataset(df=df_test, **dataset_kwargs)

In [209]:
# Number of examples in the train / dev / test datasets.
tuple(len(split) for split in (ds_train, ds_dev, ds_test))

(712, 89, 90)

In [210]:
# Inspect one encoded training example: raw text, token ids, attention mask
# and target, as returned by INWT_Dataset.__getitem__.
ex = 4
print(ds_train[ex])

{'text': 'Kopfschmerzen sind eine Volkskrankheit - Gesichtsschmerzen eher nicht. Für Betroffene sind sie aber oft eine große Belastung. Wichtig ist dann, die Symptome ganz genau zu beobachten.', 'input_ids': tensor([    3,  3506, 22330,   287,   155,  2238, 11853, 26917,   484, 26935,
        23645, 22330,  3077,   149, 26914,   864, 11946,   287,   213,   386,
         2177,   155,  1856, 10703, 26914, 18840,   127,   670, 26918,    30,
        21800,  1346,  2971,    81, 12530, 26914,     4,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            

In [211]:
# Wrap every split in a DataLoader. All loaders share the batch size and keep
# the DataLoader defaults for ordering and worker processes.
# NOTE(review): dl_train is created without shuffle=True — confirm that a fixed
# batch order during training is intended.
BATCH_SIZE = 16
dl_train, dl_dev, dl_test = (
    DataLoader(split, batch_size=BATCH_SIZE)
    for split in (ds_train, ds_dev, ds_test)
)

In [217]:
# Pull a single batch from the training loader and check its structure:
# the available keys first, then the shape of each tensor field.
data = next(iter(dl_train))
print(data.keys())
print(*(data[field].shape for field in ('input_ids', 'attention_mask', 'target')), sep='\n')

dict_keys(['text', 'input_ids', 'attention_mask', 'target'])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
