<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# NLP Basics

**Prediction of Text (based on Characters)**

&copy; Dr. Yves J. Hilpisch

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>

## Imports

In [None]:
# Clone the course repository into the current working directory
# (IPython shell escape).
!git clone https://github.com/tpq-classes/natural_language_processing.git
import sys
# Add the cloned directory to the module search path.
sys.path.append('natural_language_processing')


In [None]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, SimpleRNN, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from keras.preprocessing.sequence import TimeseriesGenerator

In [None]:
# Print small NumPy floats in fixed-point instead of scientific notation.
np.set_printoptions(suppress=True)
# Raise TensorFlow's native (C++) log threshold to suppress its
# INFO/WARNING/ERROR output.
# NOTE(review): tensorflow is already imported in the previous cell; this
# variable is usually set before the import -- confirm it still takes effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
import warnings
# Ignore all Python warnings from here on.
warnings.simplefilter('ignore')

In [None]:
from pylab import plt
# Select the 'seaborn-v0_8' matplotlib style sheet for all figures.
plt.style.use('seaborn-v0_8')
# Render inline figures as SVG (IPython magic).
%config InlineBackend.figure_format = 'svg'

## The Text

In [None]:
# text = 'this is a short sentence. this is another one. and yet another one.'

In [None]:
# text = '''this is a short sentence. this is another one. and yet another one. but what about adding even more text to the string? this might be more difficult.'''

In [None]:
# Training corpus: the source code of a small Black-Scholes call pricing
# function, kept as one multi-line string (newlines included).
text = '''import math
from scipy.stats import norm

def black_scholes_call(S, K, T, r, sigma):
    d1 = (math.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * math.sqrt(T))
    d2 = d1 - sigma * math.sqrt(T)
    call_price = S * norm.cdf(d1) - K * math.exp(-r * T) * norm.cdf(d2)
    return call_price'''

In [None]:
# Lower-case the corpus so upper and lower case map to the same character;
# newlines are kept (the optional replacement by spaces stays disabled).
text = text.lower()  # .replace('\n', ' ')

In [None]:
text

In [None]:
# Number of consecutive characters in each input snippet (window size).
length = 10

In [None]:
# Two independent, initially empty containers: the input windows and the
# character that follows each window.
snippets, next_chars = [], []

In [None]:
# Slide a window of `length` characters over the text: every window becomes
# one snippet and the character right after it becomes the matching target.
n_windows = len(text) - length
snippets.extend(text[k:k + length] for k in range(n_windows))
next_chars.extend(text[k + length] for k in range(n_windows))

In [None]:
snippets[:5]

In [None]:
next_chars[:5]

In [None]:
# Vocabulary: the distinct characters of the text in sorted order.
chars = sorted(set(text))
# Show the first ten vocabulary entries.
chars[:10]

In [None]:
len(chars)

In [None]:
# Character-to-index lookup: each character maps to its position in `chars`.
cti = dict(zip(chars, range(len(chars))))

In [None]:
# Index-to-character lookup: the inverse of the character-to-index mapping.
itc = dict(enumerate(chars))

In [None]:
# Integer-encode the snippets: one row per snippet, one column per character
# position, each entry being the character's index from `cti`.
X = np.array([[cti[ch] for ch in snippet] for snippet in snippets])

In [None]:
X[:5]

In [None]:
# Integer-encoded targets: the index of the character following each snippet.
y = np.asarray([cti[ch] for ch in next_chars])

In [None]:
y[:5]

## RNNs for Classification 

In [None]:
# Dense one-hot encoder for the integer targets. The categories are pinned to
# 0..len(chars)-1 so that column k always stands for character index k, even
# if some character never occurs as a "next character". Without this the
# encoder infers its columns from the data and the encoded targets can end up
# narrower than the model's len(chars)-unit output layer.
encoder = OneHotEncoder(sparse_output=False,
                        categories=[np.arange(len(chars))])

In [None]:
# One-hot encode the targets; the 1-D index vector is reshaped into a single
# column because the encoder works on 2-D input.
y_ = encoder.fit_transform(y.reshape(-1, 1))

In [None]:
len(chars)

In [None]:
y_.shape

In [None]:
# Two stacked LSTM layers followed by a softmax layer with one unit per
# vocabulary character. The first LSTM returns its full output sequence so
# that the second LSTM receives one vector per time step.
# NOTE(review): X as built above is 2-D (samples, length) while the declared
# input shape is (length, 1); this relies on Keras adding the trailing axis --
# confirm on the installed version.
model = Sequential([
    LSTM(64, activation='relu', return_sequences=True,
         input_shape=(length, 1)),
    LSTM(64, activation='relu'),
    Dense(len(chars), activation='softmax'),
])
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='categorical_crossentropy')

In [None]:
# Train for 750 epochs without progress output; the %time magic reports how
# long the call took.
%time model.fit(X, y_, epochs=750, verbose=False)

In [None]:
# Softmax output (one value per vocabulary character) for the first snippet.
model.predict(X)[:1]

In [None]:
# Predicted class per snippet: the position of the largest softmax output.
p = model.predict(X).argmax(axis=1)
# Show the first ten predicted character indices.
p[:10]

In [None]:
# Translate the predicted class indices back into characters and join them
# into one string. The indices are argmax positions and therefore never
# negative, so they can be used as lookup keys without clamping.
tp = [itc[i] for i in p]
textp = ''.join(tp)
textp

In [None]:
print(textp)

In [None]:
# Share of correctly predicted characters: textp[k] is the prediction for
# text[length + k], so the predictions are compared pairwise with the text
# from position `length` onwards. The slice is taken once instead of once per
# compared character.
sum(a == b for a, b in zip(text[length:], textp)) / len(textp)

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">team@tpq.io</a>