# 6.3	Praxis rekurrenter Netze: eine automatische Rechtschreibkorrektur

## 01 - Rechtschreibfehler aus einer Liste korrekter Wörter produzieren

#### Daten mit korrekten Wörtern laden 

In [2]:
from os.path import join
import pandas as pd
import numpy as np

# Directory and file name of the tab-separated word list.
# The directory is assembled with os.path.join instead of a hard-coded
# backslash so that the relative path also resolves on non-Windows systems.
path = join('..', 'Data')
file = 'german-word-list-total.csv'

df = pd.read_csv(join(path, file), sep='\t')
print(df.head())

# Convert the words to lower case
y = df['word form'].map(lambda x: x.lower())
y[:5]

   Unnamed: 0 word form  frequence
0           1       die  527056159
1           2       und  488790440
2           3       der  477357554
3           4        in  267256433
4           5       das  201678723


0    die
1    und
2    der
3     in
4    das
Name: word form, dtype: object

#### Klasse zur Produktion von Rechtschreibfehlern 

In [3]:
class CreateSpellingMistakes:
    """Generate artificial misspellings for correctly spelled words.

    Three kinds of mistakes are produced at random:

    * insertion of a random character,
    * duplication of a character of the word,
    * replacement of a character by a random character.

    Randomness comes from ``np.random``; seed it for reproducible output.
    """

    def __init__(self):
        # Pool of characters used for random insertions/replacements.
        self.alphabet = list('abcdefghijklmnopqrstuvwxyzßäöü')

    def gen_misspellings_as_df(self, y: np.ndarray):
        """Return a DataFrame of generated misspellings.

        Args:
            y: Iterable of correctly spelled words.

        Returns:
            DataFrame with the columns ``'misspelling'`` and ``'word'``;
            ``len(word) ** 2`` rows are generated per input word. An empty
            input yields an empty DataFrame with both columns present.
        """
        spelling_data = self.__gen_misspellings(y)
        # Passing the column names to the constructor also works when no
        # rows were generated (assigning them afterwards would raise).
        return pd.DataFrame(spelling_data, columns=['misspelling', 'word'])

    def __gen_misspellings(self, y: np.ndarray):
        """Build a list of ``(misspelling, word)`` tuples for all words in *y*."""
        spelling_data = []
        for word in y:
            # Longer words get quadratically more samples.
            for _ in range(len(word) * len(word)):
                rand_num = np.random.randint(0, 6)
                # The elif chain guarantees that exactly one mistake type is
                # applied: 0 -> insertion, 1/2 -> duplication, 3-5 -> replacement.
                if rand_num == 0:
                    mistake = self.__random_char_exchange(word, add=True)
                elif rand_num in (1, 2):
                    mistake = self.__char_dublication(word)
                else:
                    mistake = self.__random_char_exchange(word)
                # Words longer than five characters occasionally (1 in 10)
                # receive a second mistake: an additional replacement.
                if len(word) > 5 and np.random.randint(0, 10) == 0:
                    mistake = self.__random_char_exchange(mistake)
                spelling_data.append((mistake, word))
        return spelling_data

    def __char_dublication(self, word):
        """Return *word* with one randomly chosen character doubled."""
        idx = np.random.randint(0, len(word))
        # Inserting the character in front of itself doubles it; this also
        # covers the last position, so no special case is required.
        return word[:idx] + word[idx] + word[idx:]

    def __random_char_exchange(self, word, add=False):
        """Replace (default) or insert (``add=True``) a random character.

        With ``add=True`` the character is inserted in front of the chosen
        position, except for the last position, where it is appended to the
        end of the word.
        """
        char = self.alphabet[np.random.randint(0, len(self.alphabet))]
        idx = np.random.randint(0, len(word))
        if not add:
            # For the last position the trailing slice is simply empty.
            return word[:idx] + char + word[idx + 1:]
        if idx == len(word) - 1:
            return word + char
        return word[:idx] + char + word[idx:]

#### Fehlerhafte aus korrekten Wörtern produzieren

In [7]:
# Generate misspellings for every lower-cased word and keep both columns
# at hand as separate Series.
mistake_generator = CreateSpellingMistakes()
df_spelling = mistake_generator.gen_misspellings_as_df(y.values)
misspelling = df_spelling['misspelling']
word = df_spelling['word']
misspelling[-5:], word[-5:]

(15916    werrt
 15917     wcrt
 15918     lert
 15919     gert
 15920     werd
 Name: misspelling, dtype: object,
 15916    wert
 15917    wert
 15918    wert
 15919    wert
 15920    wert
 Name: word, dtype: object)

## 02 - Daten encodieren 

In [None]:
#### Funktion für Encodierung erzeugen: SequenceEncoder

In [8]:
import numpy as np
import pandas as pd

class SequenceEncoder:
    """One-hot encode character sequences and map target words to integers.

    The character set and the maximum sequence length are derived from ``x``;
    the target vocabulary is derived from ``y``.

    Attributes:
        max_word_length: Length of the longest sequence in ``x``.
        charset: Sorted characters occurring in ``x`` followed by the filler.
        char_idx / idx_char: Character <-> column index lookups.
        word_idx / idx_word: Target word <-> integer id lookups.
    """

    def __init__(self, x: np.ndarray,
                 y: np.ndarray,
                 filler='\t'):
        """Fit the encoder on input sequences *x* and target words *y*.

        Args:
            x: Input sequences (strings) to be one-hot encoded.
            y: Target words, one per entry of *x*; must provide ``tolist()``.
            filler: Padding character; must not occur in *x*.

        Raises:
            ValueError: If *x* and *y* differ in length, if *x* is empty, or
                if *x* contains the filler character.
        """
        if len(y) != len(x):
            raise ValueError('x and y must have same length!')
        self.y = y
        self.x = x
        self.filler = filler
        self.__variables_one_hot_encoding()
        self.__variables_target()

    def gen_feature_target_data(self):
        """Return ``(X, y)``: one-hot encoded ``x`` and integer-encoded ``y``."""
        X = self.gen_one_hot_data(self.x)
        y = np.array([self.word_to_int(word) for word in self.y])
        return X, y

    def gen_one_hot_data(self, data: np.ndarray):
        """One-hot encode the sequences in *data*.

        Returns an array of shape
        ``(len(data), max_word_length, len(charset))``. Positions after the
        end of a word are filled with the filler character. Characters that
        are not part of the charset leave an all-zero row. Words longer than
        ``max_word_length`` are truncated to that length so that unseen,
        longer input can still be encoded.
        """
        one_hot_data = np.zeros(shape=(len(data),
                                       self.max_word_length,
                                       len(self.char_idx)))
        filler_col = self.char_idx[self.filler]
        for idx, word in enumerate(data):
            # Guard against IndexError for words exceeding the fitted length.
            word = word[:self.max_word_length]
            for i, char in enumerate(word):
                if char in self.char_idx:
                    one_hot_data[idx, i, self.char_idx[char]] = 1
            # Pad the remaining positions with the filler character.
            one_hot_data[idx, len(word):, filler_col] = 1
        return one_hot_data

    def one_hot_to_word(self, one_hot: np.ndarray):
        """Decode a single one-hot matrix (positions x charset) to a string.

        Each row is decoded via ``argmax``; an all-zero row therefore decodes
        to the first character of the charset.
        """
        return ''.join(self.idx_char[np.argmax(col)] for col in one_hot)

    def int_to_word(self, word_int: int):
        """Return the target word belonging to the integer id *word_int*."""
        return self.idx_word[word_int]

    def word_to_int(self, word: str):
        """Return the integer id of the target word *word*."""
        return self.word_idx[word]

    def __variables_one_hot_encoding(self):
        """Derive maximum word length, charset and character lookups from x."""
        self.max_word_length = len(max(self.x, key=len))
        self.__gen_charset()
        self.char_idx = {char: idx for idx, char in enumerate(self.charset)}
        self.idx_char = {idx: char for idx, char in enumerate(self.charset)}

    def __variables_target(self):
        """Derive the word <-> integer lookups from the unique words in y."""
        target_word_list = sorted(set(self.y.tolist()))
        self.word_idx = {word: i for i, word in enumerate(target_word_list)}
        self.idx_word = {i: word for i, word in enumerate(target_word_list)}

    def __gen_charset(self):
        """Collect the sorted characters of x and append the filler last."""
        charset = sorted({char for word in self.x for char in word})
        if self.filler in charset:
            raise ValueError('x contains filler!')
        charset.append(self.filler)
        self.charset = charset

#### Objekt von SequenceEncoder erzeugen

In [13]:
import joblib 

# a) List of the alphabet characters plus '.' and the tab separator.
#    It is only printed below; the encoder derives its own charset from the data.
charset = list('abcdefghijklmnopqrstuvwxyzßäöü.\t')

# b) Instantiate the SequenceEncoder: misspellings are the inputs,
#    the correct words are the targets.
misspelled_values = df_spelling['misspelling'].values
correct_values = df_spelling['word'].values
seq_encoder = SequenceEncoder(misspelled_values, correct_values)
print(charset, '\n', seq_encoder)

# c) Serialise the SequenceEncoder object
joblib.dump(seq_encoder, 'seq_encoder.pkl')

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ß', 'ä', 'ö', 'ü', '.', '\t'] 
 <__main__.SequenceEncoder object at 0x00000208C35EA188>


#### Mit SequenceEncoder X, y - Trainingsdaten erzeugen

In [14]:
# Encode the misspellings as one-hot features X and the correct words as
# integer labels y.
# NOTE: this rebinds y, which previously held the lower-cased word Series.
X, y = seq_encoder.gen_feature_target_data()
X.shape, y.shape

((15921, 15, 32), (15921,))

#### Überprüfen, ob SequenceEncoder funktioniert

In [15]:
# Decode the first sample back to text to check the encoder round trip.
# NOTE: misspelling and word are rebound from Series to plain strings here.
misspelling = seq_encoder.one_hot_to_word(X[0])
word = seq_encoder.int_to_word(y[0])
misspelling, word

('diee\t\t\t\t\t\t\t\t\t\t\t', 'die')

#### Train-/Test-Split erzeugen

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20 % of the samples; the fixed random_state makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                    test_size=.2, random_state=11)
X_train.shape, y_train.shape

((12736, 15, 32), (12736,))

## 03 - Rekurrentes Modell mit TensorFlow aufbauen

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU

# Input and output dimensions are taken from the fitted encoder.
seq_length = seq_encoder.max_word_length  # time steps per sample (15 in this run)
num_chars = len(seq_encoder.charset)      # one-hot width incl. filler (32 in this run)
num_words = len(seq_encoder.word_idx)     # number of target classes (500 in this run)

# A single GRU layer reads the character sequence; the softmax layer
# outputs one probability per target word.
model = Sequential([
    GRU(units=num_chars * 2, input_shape=(seq_length, num_chars)),
    Dense(units=num_words, activation='softmax'),
])

# Sparse loss because the targets are integer ids, not one-hot vectors.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, 64)                18816     
_________________________________________________________________
dense (Dense)                (None, 500)               32500     
Total params: 51,316
Trainable params: 51,316
Non-trainable params: 0
_________________________________________________________________


## 04 - Training mit Callbacks starten

In [18]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop training once val_loss has not improved for 10 epochs and restore
# the weights of the best epoch.
stopping = EarlyStopping(monitor='val_loss',
                         patience=10,
                         restore_best_weights=True)

# Write the model to disk only when val_loss improves.
checkpoint = ModelCheckpoint(filepath='model_auto_correction_1.h5',
                             monitor='val_loss',
                             save_best_only=True)

# NOTE: the test split is used as validation data here.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=200,
    batch_size=32,
    callbacks=[stopping, checkpoint],
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 75/200
Epoch 76/200
Epoch 77/200


## 05 - Schätzungen durchführen 

#### Modell nach dem Training wieder laden

In [20]:
from tensorflow.keras.models import load_model
# Reload the model file written by the ModelCheckpoint callback during training.
model = load_model('model_auto_correction_1.h5')

#### Schätzungen mit falschen Wörtern durchführen

In [21]:
# 1) One-hot encode a few misspelled words
misspelled_words = np.array(['alllein', 'frahe', 'beistiel'])
test_example = seq_encoder.gen_one_hot_data(misspelled_words)

# 2) Predict class probabilities and pick the most likely word id per sample
pred_word_prob = model.predict(test_example)
pred_word_idx = pred_word_prob.argmax(axis=1)

# 3) Decode the predicted ids back to words
for word_id in pred_word_idx:
    print(seq_encoder.int_to_word(word_id))

allein
frage
beispiel
