In [1]:
from itertools import chain

def get_dict_map(data, token_or_tag):
    tok2idx = {}
    idx2tok = {}
    
    if token_or_tag == 'token':
        vocab = list(set(data['Wort'].to_list()))
    else:
        vocab = list(set(data['Attribut'].to_list()))
    
    idx2tok = {idx:tok for  idx, tok in enumerate(vocab)}
    tok2idx = {tok:idx for  idx, tok in enumerate(vocab)}
    return tok2idx, idx2tok

In [2]:
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
#from keras.utils import to_categorical
from keras.utils.np_utils import to_categorical

def get_pad_train_test_val(data_group, data):
    n_token = len(list(set(data['Wort'].to_list())))
    n_tag = len(list(set(data['Attribut'].to_list())))

    tokens = data_group['Word_idx'].to_list()
    maxlen = max([len(s) for s in tokens])
    pad_tokens = pad_sequences(tokens, maxlen=maxlen, dtype='int32', padding='post', value=n_token-1)

    tags = data_group['Tag_idx'].to_list()
    pad_tags = pad_sequences(tags, maxlen=maxlen, dtype='int32', padding='post', value=tag2idx["O"])

    n_tags = len(tag2idx)
    pad_tags = [to_categorical(i, num_classes=n_tags) for i in pad_tags]

    train_tokens, test_tokens, train_tags, test_tags = train_test_split(pad_tokens, pad_tags, test_size=0.1, train_size=0.9, random_state=2020)

    print(
        'train_tokens length:', len(train_tokens),
        '\ntest_tokens length: ', len(test_tokens),
        '\ntrain_tags length: ', len(train_tags),
        '\ntest_tags length: ', len(test_tags)
    )

    return train_tokens, test_tokens, train_tags, test_tags

In [3]:
## class weights berechnen

import numpy as np
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import MultiLabelBinarizer


def generate_class_weights(class_series, multi_class=True, one_hot_encoded=False):
  """
  Method to generate class weights given a set of multi-class or multi-label labels, both one-hot-encoded or not.
  Some examples of different formats of class_series and their outputs are:
    - generate_class_weights(['mango', 'lemon', 'banana', 'mango'], multi_class=True, one_hot_encoded=False)
    {'banana': 1.3333333333333333, 'lemon': 1.3333333333333333, 'mango': 0.6666666666666666}
    - generate_class_weights([[1, 0, 0], [0, 1, 0], [0, 0, 1], [1, 0, 0]], multi_class=True, one_hot_encoded=True)
    {0: 0.6666666666666666, 1: 1.3333333333333333, 2: 1.3333333333333333}
    - generate_class_weights([['mango', 'lemon'], ['mango'], ['lemon', 'banana'], ['lemon']], multi_class=False, one_hot_encoded=False)
    {'banana': 1.3333333333333333, 'lemon': 0.4444444444444444, 'mango': 0.6666666666666666}
    - generate_class_weights([[0, 1, 1], [0, 0, 1], [1, 1, 0], [0, 1, 0]], multi_class=False, one_hot_encoded=True)
    {0: 1.3333333333333333, 1: 0.4444444444444444, 2: 0.6666666666666666}
  The output is a dictionary in the format { class_label: class_weight }. In case the input is one hot encoded, the class_label would be index
  of appareance of the label when the dataset was processed. 
  In multi_class this is np.unique(class_series) and in multi-label np.unique(np.concatenate(class_series)).
  Author: Angel Igareta (angel@igareta.com)
  """
  if multi_class:
    # If class is one hot encoded, transform to categorical labels to use compute_class_weight   
    if one_hot_encoded:
      class_series = np.argmax(class_series, axis=1)
  
    # Compute class weights with sklearn method
    class_labels = np.unique(class_series)
    class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=class_series)
    return dict(zip(class_labels, class_weights))
  else:
    # It is neccessary that the multi-label values are one-hot encoded
    mlb = None
    if not one_hot_encoded:
      mlb = MultiLabelBinarizer()
      class_series = mlb.fit_transform(class_series)

    n_samples = len(class_series)
    n_classes = len(class_series[0])

    # Count each class frequency
    class_count = [0] * n_classes
    for classes in class_series:
        for index in range(n_classes):
            if classes[index] != 0:
                class_count[index] += 1
    
    # Compute class weights using balanced method
    class_weights = [n_samples / (n_classes * freq) if freq > 0 else 1 for freq in class_count]
    class_labels = range(len(class_weights)) if mlb is None else mlb.classes_
    return dict(zip(class_labels, class_weights))

In [4]:
import pandas as pd


df = pd.read_csv('data/trainingdata.csv',escapechar="\\",sep=",",error_bad_lines=False,warn_bad_lines=False)
#print(len(df))

token2idx, idx2token = get_dict_map(df, 'token')
tag2idx, idx2tag = get_dict_map(df, 'tag')

df['Word_idx'] = df['Wort'].map(token2idx)
df['Tag_idx'] = df['Attribut'].map(tag2idx)
print(df)

df_group = df.groupby(by = ['satzId'], as_index=False)['Wort', 'Attribut', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))
print(df_group)
train_tokens, test_tokens, train_tags, test_tags = get_pad_train_test_val(df_group, df)

         satzId           Wort       Attribut  Word_idx  Tag_idx
0             0          Broan        B-Brand      7419        5
1             0  TEN136WWBroan              O     21047        1
2             0       TEN136WW  B-Modelnumber     50616        4
3             0       Overview              O     51588        1
4             0            The              O     32135        1
...         ...            ...            ...       ...      ...
5441801   16000            any              O     23027        1
5441802   16000           size              O     44899        1
5441803   16000           hood              O      2191        1
5441804   16000              /              O     21107        1
5441805   16000     ventilator              O     31326        1

[5441806 rows x 5 columns]
  df_group = df.groupby(by = ['satzId'], as_index=False)['Wort', 'Attribut', 'Word_idx', 'Tag_idx'].agg(lambda x: list(x))
       satzId                                               Wort  \
0

In [5]:
print(train_tags[0])

[[0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 ...
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]]


In [7]:

input_dim = len(list(set(df['Wort'].to_list())))+1
output_dim = 32
input_length = max([len(s) for s in df_group['Word_idx'].tolist()])
n_tags = len(tag2idx)
print('input_dim: ', input_dim, '\noutput_dim: ', output_dim, '\ninput_length: ', input_length, '\nn_tags: ', n_tags)

input_dim:  51605 
output_dim:  32 
input_length:  963 
n_tags:  6


In [8]:
from numpy.random import seed
import tensorflow
seed(1)
tensorflow.random.set_seed(2)

In [9]:
import numpy as np
#import tensorflow
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

def get_bilstm_lstm_model():
    model = Sequential()

    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))

    model.add(Bidirectional(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2), merge_mode='concat'))

    model.add(LSTM(units=output_dim, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))

    model.add(TimeDistributed(Dense(n_tags, activation="softmax")))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'], sample_weight_mode='temporal')
    model.summary()

    return model

In [12]:
array = []

for x in range(0, 5000):
    for y in range(0, 962):
        array.append(train_tags[x][y])

#print(array)

In [13]:
class_weights = generate_class_weights(array, multi_class=True, one_hot_encoded=True)
print(class_weights)

{0: 27643.67816091954, 1: 0.16753142137140872, 2: 633.2280147446024, 3: 438.7885422368181, 4: 154.5828512662296, 5: 48.527037933817596}


In [14]:
df_zu_testzwecken = pd.DataFrame(data=[0,1,2,3,4,5], columns=['IDX'])
df_zu_testzwecken['Tag'] = df_zu_testzwecken['IDX'].map(idx2tag)
print(df_zu_testzwecken)

   IDX            Tag
0    0  E-Modelnumber
1    1              O
2    2        I-Brand
3    3        E-Brand
4    4  B-Modelnumber
5    5        B-Brand


In [15]:
# ToDo: Array korreckt befüllen:
# https://github.com/keras-team/keras/issues/3653#issuecomment-761085597
def generate_sample_weights(train_tags, class_weights): 
    #replaces values for up to 3 classes with the values from class_weights#
    # ToDo: Train Tags sind OnHot Encodings 
        # If class is one hot encoded, transform to categorical labels to use compute_class_weight   

    train_tags = np.argmax(train_tags, axis=1)

    sample_weights = [np.where(y==0,class_weights[0],
                        np.where(y==1,class_weights[1],
                        np.where(y==2,class_weights[2],
                        np.where(y==3,class_weights[3],
                        np.where(y==4,class_weights[4],
                        np.where(y==5,class_weights[5],y)))))) for y in train_tags]
    return np.asarray(sample_weights)


#class_weights = generate_class_weights(array, multi_class=True, one_hot_encoded=True)
#print(class_weights)
#class_weights = {0: 26722.222222222223, 1: 0.167523264314445, 2: 619.047619047619, 3: 513.8888888888889, 4: 157.03558602677114, 5: 48.278630934457496} 
# generiert mit 1000 Beispiel Daten
class_weights = {0: 27643.67816091954, 1: 0.16753142137140872, 2: 633.2280147446024, 3: 438.7885422368181, 4: 154.5828512662296, 5: 48.527037933817596}
# generiert mit 5000 Beispiel Daten
# 0 = E-Modelnumber, 1 = O , 2 = I-Brand, 3 = E-Brand, 4 = B-Modelnumber, 5 = B-Brand

sample_weights = np.zeros((11520, 963))
for x in range(0, 11519):
    sample_weights[x] = generate_sample_weights(train_tags[x], class_weights)
    
print(sample_weights)

[[ 48.52703793   0.16753142 154.58285127 ...   0.16753142   0.16753142
    0.16753142]
 [ 48.52703793   0.16753142 154.58285127 ...   0.16753142   0.16753142
    0.16753142]
 [ 48.52703793   0.16753142 154.58285127 ...   0.16753142   0.16753142
    0.16753142]
 ...
 [ 48.52703793   0.16753142 154.58285127 ...   0.16753142   0.16753142
    0.16753142]
 [ 48.52703793   0.16753142 154.58285127 ...   0.16753142   0.16753142
    0.16753142]
 [  0.           0.           0.         ...   0.           0.
    0.        ]]


In [16]:
print(sample_weights[12])

[4.85270379e+01 6.33228015e+02 4.38788542e+02 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.54582851e+02 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 4.85270379e+01 6.33228015e+02
 4.38788542e+02 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.67531421e-01 1.67531421e-01 1.67531421e-01
 1.67531421e-01 1.675314

In [17]:

def train_model(X, y, model):
    loss  = list()
    for _ in range(1):
        hist = model.fit(X, y, batch_size=256, verbose=1, epochs=1, validation_split=0.2, sample_weight=sample_weights)
        loss.append(hist.history['loss'][0])
    return loss

In [19]:
model_bilstm_lstm = get_bilstm_lstm_model()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 963, 32)           1651360   
_________________________________________________________________
bidirectional (Bidirectional (None, 963, 64)           16640     
_________________________________________________________________
lstm_1 (LSTM)                (None, 963, 32)           12416     
_________________________________________________________________
time_distributed (TimeDistri (None, 963, 6)            198       
Total params: 1,680,614
Trainable params: 1,680,614
Non-trainable params: 0
_________________________________________________________________


In [20]:
results = pd.DataFrame()
results['with_add_lstm'] = train_model(train_tokens, np.array(train_tags), model_bilstm_lstm)



In [34]:
#model_bilstm_lstm.save('models/mein_model_first_try_mit_gewichten_400_fuer_alle_Klassen')

INFO:tensorflow:Assets written to: models/mein_model_first_try_mit_gewichten_400_fuer_alle_Klassen\assets


In [35]:
#from tensorflow import keras
#reconstructed_model = keras.models.load_model("models/mein_model_first_try_mit_gewichten_400_fuer_alle_Klassen")

In [35]:
test_pred_array = model_bilstm_lstm.predict(test_tokens[56])

In [32]:
test_pred_array

array([[[0.17599447, 0.15667543, 0.16210331, 0.16892578, 0.17187174,
         0.16442932]],

       [[0.17531826, 0.1583959 , 0.16269267, 0.1687426 , 0.17056361,
         0.16428699]],

       [[0.17489554, 0.15950447, 0.16221257, 0.16812703, 0.17050196,
         0.16475841]],

       ...,

       [[0.17324765, 0.16633794, 0.16230917, 0.16630983, 0.16806856,
         0.16372687]],

       [[0.17324765, 0.16633794, 0.16230917, 0.16630983, 0.16806856,
         0.16372687]],

       [[0.17324765, 0.16633794, 0.16230917, 0.16630983, 0.16806856,
         0.16372687]]], dtype=float32)

In [36]:
import numpy as np
predictions = np.argmax(test_pred_array, axis=1)

In [34]:
train_tags[29]

array([[0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]], dtype=float32)

In [30]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [37]:
predictions

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]], dtype=int64)