<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# NLP Basics

**Prediction of Sequences of Characters**

&copy; Dr. Yves J. Hilpisch

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## Imports

In [None]:
!git clone https://github.com/tpq-classes/natural_language_processing.git
import sys
sys.path.append('natural_language_processing')


In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Print floating point numbers in fixed-point notation rather than
# scientific notation when NumPy arrays are displayed.
np.set_printoptions(suppress=True)
# Ask TensorFlow's native logging to be as quiet as possible.
# NOTE(review): tensorflow is already imported in an earlier cell; this
# variable is normally set before that import to take full effect — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
from pylab import plt
plt.style.use('seaborn-v0_8')
%config InlineBackend.figure_format = 'svg'

## From Characters to Numbers and Back

In [None]:
from string import ascii_lowercase

In [None]:
ab = ascii_lowercase + ' .'
ab

In [None]:
# Character-to-integer lookup: each character of the alphabet string ``ab``
# maps to its position in that string.
cti = {char: idx for idx, char in enumerate(ab)}

In [None]:
# cti

In [None]:
cti['a']

In [None]:
cti['m']

In [None]:
iv = [cti[c] for c in 'word']
iv

In [None]:
# Integer-to-character lookup: the inverse of ``cti``, mapping each position
# in the alphabet string ``ab`` back to the character found there.
itc = dict(enumerate(ab))

In [None]:
# itc

In [None]:
itc[4]

In [None]:
cl = [itc[i] for i in iv]
cl

In [None]:
''.join(cl)

In [None]:
s = 'this is a short sentence.'

In [None]:
iv = [cti[c] for c in s]
iv[:10]

In [None]:
cl = [itc[i] for i in iv]
''.join(cl)

## Predicting the Alphabet

In [None]:
# Restrict the alphabet to the 26 lowercase letters (no blank, no period).
# Rebinding to ``ascii_lowercase`` instead of slicing the previous value
# keeps this cell idempotent: re-running it does not strip further letters.
ab = ascii_lowercase

In [None]:
abi = np.array([cti[c] for c in ab])
abi[:10]

In [None]:
lags = 3

In [None]:
def create_matrix(n, lags=lags):
    """Build a matrix of overlapping windows of length ``lags + 1``.

    Row ``i`` of the result holds ``n[i:i + lags + 1]``, i.e. ``lags``
    consecutive values followed by the value that comes right after them.

    Parameters
    ----------
    n : sequence
        Sliceable sequence supporting ``len``.
    lags : int
        Number of leading values per row. The default is the value of the
        global ``lags`` at the time this function is defined.

    Returns
    -------
    numpy.ndarray
        Array with ``len(n) - lags`` rows. When ``n`` has ``lags`` or fewer
        elements, an empty array of shape ``(0, lags + 1)`` is returned.
    """
    windows = [n[i:i + lags + 1] for i in range(len(n) - lags)]
    if not windows:
        # Keep the result two-dimensional so that column slicing such as
        # ``data[:, :lags]`` still works when no complete window exists.
        return np.empty((0, lags + 1), dtype=np.asarray(n).dtype)
    return np.array(windows)

In [None]:
data = create_matrix(abi)
data[:3]

In [None]:
def generate_model(hu=24, lags=lags):
    """Create and compile a small fully connected regression network.

    The network consists of two hidden ``Dense`` layers with ReLU
    activation and a single linear output unit. It is compiled with
    mean-squared-error loss and a default ``Adam`` optimizer.

    Parameters
    ----------
    hu : int
        Number of units in each of the two hidden layers.
    lags : int
        Number of input values per sample. The default is the value of the
        global ``lags`` at the time this function is defined.

    Returns
    -------
    Sequential
        The compiled, untrained model.
    """
    model = Sequential()
    # Declare the input explicitly instead of passing the legacy
    # ``input_dim`` keyword to the first layer, which newer Keras versions
    # warn about.
    model.add(keras.Input(shape=(lags,)))
    model.add(Dense(hu, activation='relu'))
    model.add(Dense(hu, activation='relu'))
    model.add(Dense(1, activation='linear'))
    model.compile(loss='mse', optimizer=Adam())
    return model

In [None]:
model = generate_model()

In [None]:
# Inputs for the model: the first ``lags`` columns of every row.
f = data[:, :lags]
# Target for the model: the column that directly follows those inputs.
l = data[:, lags]

In [None]:
%time model.fit(f, l, epochs=250, verbose=False)

In [None]:
model.predict(f)[:5].flatten()

In [None]:
model.predict(f)[:5].round().flatten()

In [None]:
model.evaluate(f, l)

In [None]:
sample = 'lmn'

In [None]:
sample_ = np.array([cti[c] for c in sample])
sample_

In [None]:
p = model.predict(sample_.reshape(1, -1))
p

In [None]:
p[0, 0]

In [None]:
itc[round(p[0, 0])]

## Predicting Characters in Words

In [None]:
import requests

In [None]:
# Download the raw text. The timeout prevents the cell from hanging forever
# on a stalled connection, and ``raise_for_status`` stops an HTTP error page
# from silently being used as the corpus.
response = requests.get('https://hilpisch.com/walden.txt', timeout=30)
response.raise_for_status()
text = response.text

In [None]:
text[:500]

In [None]:
text = text.replace('\n', ' ')

In [None]:
tokens = text.split()
tokens[:5]

In [None]:
tokens = [t.lower() for t in tokens if len(t) > 3]

In [None]:
tokens[:5]

In [None]:
# Keep only four-letter tokens made up purely of ASCII letters. ``isalpha``
# alone also accepts accented and other non-ASCII letters, which have no
# entry in the ``cti`` lookup and would raise a KeyError when the words are
# converted to integers.
words = [t for t in tokens if len(t) == 4 and t.isascii() and t.isalpha()]

In [None]:
len(words)

In [None]:
words[:10]

In [None]:
len(set(words))

In [None]:
# Drop duplicate words and put the remaining ones in sorted order.
unique_words = set(words)
words = sorted(unique_words)

In [None]:
words[50:60]

In [None]:
# Translate every word into its list of integer character codes via the
# ``cti`` lookup and collect all of them in a single NumPy array.
wl = np.array([[cti[ch] for ch in word] for word in words])

In [None]:
wl[50:60]

In [None]:
n = 500

In [None]:
words[n:n + 10]

In [None]:
data = wl[n:n + 10]
data

In [None]:
f = data[:, :lags]
l = data[:, lags]

In [None]:
model = generate_model(hu=128)

In [None]:
%time model.fit(f, l, epochs=2500, verbose=False)

In [None]:
model.evaluate(f, l)

In [None]:
sample = 'nic'
# sample = 'nin'
# sample = 'new'

In [None]:
sample_ = np.array([cti[c] for c in sample])
sample_

In [None]:
p = model.predict(sample_.reshape(1, -1))
p

In [None]:
p[0, 0]

In [None]:
itc[round(p[0, 0])]

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>