# 6.4.4 Rechtschreibkorrektur mit einem Generator anlernen

## 01 - Fertige Spelling-Daten laden 
Zur Erzeugung der Spelling-Daten vgl. das vorherige Jupyter Notebook im gleichen Ordner.

In [9]:
from os.path import join
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
from math import ceil

# Data directory, built with join() so the path also works on non-Windows systems.
path = join('..', 'Data')

# na_filter=False disables missing-value detection, so tokens such as
# "null" or "NA" stay text instead of becoming NaN.
df_spelling = pd.read_csv(join(path, 'spelling_data.csv'), na_filter=False)

# Split into train/test partitions (80 % / 20 %). The fixed seed makes the
# split, and therefore the CSV files written below, reproducible across runs.
df_train = df_spelling.sample(frac=.8, random_state=42)
df_test = df_spelling.drop(index=df_train.index)

# Save both partitions as CSV files (read later by the generator).
# index=False is the documented way to leave out the row index.
df_train.to_csv(join(path, 'train_spelling.csv'), index=False)
df_test.to_csv(join(path, 'test_spelling.csv'), index=False)

df_train.shape, df_test.shape, df_train.head()

((12737, 2),
 (3184, 2),
           misspelling            word
 11595          elwern          eltern
 11628       gemeinsan       gemeinsam
 14713  beispielsweiße  beispielsweise
 15648        promramm        programm
 8182           yeinen          keinen)

## 02 - Generator zur Anlieferung der Daten

#### Zunächst: Laden des Objekts seq_encoder der Klasse *SequenceEncoder*. 
Dazu verwenden wir ein bereits angelerntes Objekt der Klasse *SequenceEncoder*; 
vgl. das vorherige Jupyter Notebook im gleichen Ordner.

In [5]:
import joblib
from sequence_encoder import SequenceEncoder

# Load the previously fitted SequenceEncoder from disk.
# NOTE: joblib.load unpickles the file and can execute arbitrary code while
# doing so -- only load files that come from a trusted source.
seq_encoder = joblib.load('seq_encoder.pkl')
seq_encoder

<sequence_encoder.SequenceEncoder at 0x161a3615e48>

#### Generator-Funktion erzeugen

In [7]:
def generator_spelling_data(filepath: str,
                            batch_size: int,
                            epochs: int,
                            seq_encoder: "SequenceEncoder"):
    """Yield encoded (X, y) batches from a spelling CSV, one pass per epoch.

    The file is re-read from the start for every epoch and streamed in chunks
    of ``batch_size`` rows.

    Args:
        filepath: CSV file with the columns ``misspelling`` and ``word``.
        batch_size: Number of rows per yielded batch (the last batch of a
            pass may be smaller).
        epochs: Number of complete passes over the file before the generator
            is exhausted.
        seq_encoder: Encoder providing ``gen_one_hot_data`` and
            ``word_to_int``.

    Yields:
        Tuple ``(X, y)``. ``X`` is the result of
        ``seq_encoder.gen_one_hot_data`` for the batch's misspellings; ``y``
        is a NumPy array with ``seq_encoder.word_to_int`` of each target word.
    """
    for _ in range(epochs):
        # na_filter=False: otherwise pandas converts tokens such as "null" or
        # "NA" into NaN, so those rows would no longer contain strings.
        chunks = pd.read_csv(filepath, chunksize=batch_size, na_filter=False)
        for chunk in chunks:
            X = seq_encoder.gen_one_hot_data(chunk['misspelling'].values)
            y = np.array([seq_encoder.word_to_int(word)
                          for word in chunk['word'].values])
            yield X, y

#### Generator-Funktion aufrufen und testen

In [13]:
# Stream the complete spelling file once, one row per batch, to try out the generator.
spelling_csv = join(path, 'spelling_data.csv')
gen = generator_spelling_data(filepath=spelling_csv,
                              batch_size=1,
                              epochs=1,
                              seq_encoder=seq_encoder)

# Look at the first data batch
next(gen)

(array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0.

## 03 - Schätzmodell zusammenstellen

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Dense, 
                                    GRU, Bidirectional)

# Model dimensions are taken from the loaded encoder.
seq_length = seq_encoder.max_word_length  # = 15
num_chars = len(seq_encoder.charset)  # = 32
num_words = len(seq_encoder.word_idx)  # = 500

# Bidirectional GRU over the character sequence, followed by a softmax
# over all known words.
model = Sequential([
    Bidirectional(GRU(units=num_chars),
                  input_shape=(seq_length, num_chars)),
    Dense(units=num_words, activation='softmax'),
])
model.summary()

# The generator delivers targets as integer word ids, hence the sparse
# variant of the categorical cross-entropy.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 64)                12672     
_________________________________________________________________
dense (Dense)                (None, 500)               32500     
Total params: 45,172
Trainable params: 45,172
Non-trainable params: 0
_________________________________________________________________


## 04 - Modell anlernen

#### Bestimmung der Steps per Epoch für Trainings- und Testdaten

In [15]:
# Number of 32-row batches needed to cover each partition once
# (rounded up, because the last batch may be partial).
steps_train, steps_test = (ceil(len(part) / 32) for part in (df_train, df_test))
steps_train, steps_test

(399, 100)

#### Callbacks zusammenstellen, Generator-Funktionen aufrufen und Modell anlernen

In [16]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop once val_loss has not improved for 3 epochs and restore the best weights.
stopping = EarlyStopping(monitor='val_loss',
                         patience=3,
                         restore_best_weights=True)
# Write the model to disk only when val_loss improves.
checkpoint = ModelCheckpoint(filepath='model_auto_correction_bid.h5',
                             monitor='val_loss',
                             save_best_only=True)

# Shared settings, defined once so the generators and fit() cannot drift apart.
# batch_size must match the value used to compute steps_train / steps_test.
batch_size = 32
max_epochs = 200

gen_train = generator_spelling_data(join(path, 'train_spelling.csv'),
                                    batch_size=batch_size,
                                    epochs=max_epochs,
                                    seq_encoder=seq_encoder)
gen_test = generator_spelling_data(join(path, 'test_spelling.csv'),
                                   batch_size=batch_size,
                                   epochs=max_epochs,
                                   seq_encoder=seq_encoder)

# Use the computed step counts instead of literals (399 / 100): one Keras
# epoch then always equals exactly one pass over the respective CSV file,
# even if the size of the data set changes.
history = model.fit(gen_train,
                    epochs=max_epochs,
                    steps_per_epoch=steps_train,
                    callbacks=[stopping, checkpoint],
                    validation_data=gen_test,
                    validation_steps=steps_test)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 32/200
Epoch 33/200
Epoch 34/200


## 05 - Schätzungen durchführen

In [17]:
# 1) Encode the misspelled words with the sequence encoder
test_example = seq_encoder.gen_one_hot_data(
    np.array(['alllein', 'frahe', 'beistiel']))

# 2) Predict a probability per known word and keep the most likely index
pred_word_prob = model.predict(test_example)
pred_word_idx = np.argmax(pred_word_prob, axis=1)

# 3) Decode the predicted indices back into words
for word_id in pred_word_idx:
    print(seq_encoder.int_to_word(word_id))

allein
frage
beispiel
