<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# NLP Basics

**Prediction of Sequences of Numbers**

&copy; Dr. Yves J. Hilpisch

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## Imports

In [None]:
!git clone https://github.com/tpq-classes/natural_language_processing.git
import sys
sys.path.append('natural_language_processing')


In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers.legacy import Adam
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Raise the minimum log level of TensorFlow's native (C++) logging.
# NOTE(review): tensorflow is already imported in an earlier cell; the usual
# advice is to set this variable before that import -- confirm that setting
# it here still has the intended effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
import warnings
warnings.simplefilter('ignore')

In [None]:
from pylab import plt
plt.style.use('seaborn-v0_8')
%config InlineBackend.figure_format = 'svg'

## Binary Classification

In [None]:
# 15 samples of 5 random bits (0 or 1) each.
# A larger number of binary features improves the prediction results
# because fewer rows share an identical pattern (fewer overlaps).
n = np.random.randint(low=0, high=2, size=(15, 5))

In [None]:
lags = n.shape[1]

In [None]:
n[:3]

In [None]:
# Binary classifier: two hidden ReLU layers with 24 units each, followed by
# a single sigmoid output unit.
model = Sequential([
    Dense(24, activation='relu', input_dim=lags),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
f = n[:, :lags]
f[:4]

In [None]:
mu = f.mean(axis=0)
mu

In [None]:
std = f.std(axis=0)
std

In [None]:
# Standardize every feature column to zero mean and unit standard deviation.
# A column whose standard deviation is zero (all values equal) is divided by
# 1 instead, so it becomes 0 rather than NaN from 0 / 0.
f_ = (f - mu) / np.where(std == 0, 1, std)

In [None]:
f_.mean(axis=0)

In [None]:
f_.std(axis=0)

In [None]:
# Labels: the column of n at index 3.
# NOTE(review): lags is set to n.shape[1], so the feature matrix
# f = n[:, :lags] contains every column of n, including this one; the label
# is therefore also an input feature -- confirm that this is intended.
l = n[:, 3]
l[:4]

In [None]:
%time model.fit(f, l, epochs=1000, verbose=False)

In [None]:
model.predict(f)

In [None]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 class labels
# as a flat array.
(model.predict(f) > 0.5).astype(int).flatten()

In [None]:
l

In [None]:
model.evaluate(f, l)

## Non-Binary Classification

In [None]:
# Integer sequence 0, 1, ..., 14.
n = np.arange(15)
n

In [None]:
lags = 3

In [None]:
def create_matrix(n, lags=lags):
    """Stack overlapping windows of ``lags + 1`` consecutive elements of *n*.

    Row ``i`` of the result is ``n[i:i + lags + 1]``, and there are
    ``len(n) - lags`` rows in total.

    Parameters
    ----------
    n : sequence supporting ``len`` and slicing
        The values to cut into windows.
    lags : int
        Window length minus one. The default is the value the module-level
        ``lags`` holds at the moment this function is defined.

    Returns
    -------
    numpy.ndarray
        The windows stacked row by row.
    """
    window = lags + 1
    return np.array([n[start:start + window]
                     for start in range(len(n) - lags)])

In [None]:
data = create_matrix(n)

In [None]:
data

In [None]:
f = data[:, :lags]
# f

In [None]:
l = data[:, lags]
l

In [None]:
# Number of distinct label values.
nc = np.unique(l).size
nc

In [None]:
# Multi-class classifier: one hidden ReLU layer with 24 units and a softmax
# output layer with one unit per class.
model = Sequential([
    Dense(24, activation='relu', input_dim=lags),
    Dense(nc, activation='softmax'),
])
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
)

In [None]:
encoder = OneHotEncoder(sparse_output=False)

In [None]:
l_ = encoder.fit_transform(l.reshape(-1, 1))

In [None]:
%time model.fit(f, l_, epochs=1000, verbose=False)

In [None]:
model.predict(f)[:3]

In [None]:
np.argmax(model.predict(f), axis=1)

In [None]:
# Map the predicted class indices back to label values through the encoder's
# own category list. Indexing ``l`` with the class index is only correct
# while ``l`` is sorted and free of duplicates.
encoder.categories_[0][np.argmax(model.predict(f), axis=1)]

In [None]:
# model.weights

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>