<img src="http://certificate.tpq.io/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# AI in Finance

**Workshop at Texas State University (October 2023)**

**_Simple Financial Examples_**

Dr. Yves J. Hilpisch | The Python Quants GmbH | http://tpq.io

## Imports

In [None]:
import numpy as np
import pandas as pd
from pylab import plt
plt.style.use('seaborn-v0_8')
%config InlineBackend.figure_format = 'svg'

## Stock Clustering

Data from [EODHistoricalData](https://eodhistoricaldata.com/r/?ref=X8R79ISB).

### The Data

In [None]:
# Load the fundamentals table; the first CSV column becomes the row index.
# NOTE(review): presumably one row per metric and one column per ticker,
# since the frame is transposed below before metrics are selected -- confirm.
f = pd.read_csv('https://certificate.tpq.io/eod_fundamentals.csv', index_col=0)

In [None]:
f

In [None]:
# Transpose the fundamentals table, keep the two metrics used for
# clustering and cast them to float.
data = f.T.loc[:, ['QuarterlyRevenueGrowthYOY', 'ReturnOnEquityTTM']].astype(float)

In [None]:
data

In [None]:
# Short labels for plotting; the order matches the column selection above
# (revenue growth first, return on equity second).
data.columns = ['Growth', 'ROE']

In [None]:
data.info()

### Raw Data

In [None]:
from sklearn.cluster import KMeans

In [None]:
model = KMeans(n_clusters=3, n_init=2)  # 1. step: model instantiation

In [None]:
model.fit(data)  # 2. step: model fitting

In [None]:
c = model.predict(data)  # 3. step: prediction
c

In [None]:
data.plot.scatter(x='Growth', y='ROE', c=c, cmap='brg');

### Normalized Data

In [None]:
# z-score normalization: center every column on zero and scale it by its
# (sample) standard deviation so that both features are on a comparable scale.
data_ = data.sub(data.mean()).div(data.std())

In [None]:
data_

In [None]:
# Five clusters, random centroid initialization, classic Lloyd iterations.
# NOTE(review): no random_state is passed, so the cluster labels (and thus
# the plot colors) can differ from run to run.
model = KMeans(n_clusters=5, init='random', n_init='auto', algorithm='lloyd')

In [None]:
model.fit(data_)

In [None]:
c = model.predict(data_)
c

In [None]:
data_.plot.scatter(x='Growth', y='ROE', c=c, cmap='brg');

In [None]:
# Same cluster labels `c` (fitted on the normalized data), but plotted
# against the raw, un-normalized values to show the clusters in original units.
data.plot.scatter(x='Growth', y='ROE', c=c, cmap='brg');

### Adding a 3rd Feature

In [None]:
cols = ['QuarterlyRevenueGrowthYOY', 'ReturnOnEquityTTM', 'DividendYield']

In [None]:
data = f.T[cols].astype(float)

In [None]:
data

In [None]:
data.columns = ['Growth', 'ROE', 'DY']

In [None]:
model = KMeans(n_clusters=4, n_init=2)

In [None]:
model.fit(data)

In [None]:
c = model.predict(data)
c

In [None]:
from mpl_toolkits import mplot3d

In [None]:
# 3D scatter of the three fundamentals, one color per cluster label in `c`.
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')
ax.scatter(data['Growth'], data['ROE'], data['DY'],
           s=100, c=c, cmap='brg')
ax.set(xlabel='Growth', ylabel='ROE', zlabel='DY')
ax.view_init(elev=15, azim=-30);

In [None]:
data_ = (data - data.mean()) / data.std()

In [None]:
data_

In [None]:
model = KMeans(n_clusters=5, init='random', n_init='auto', algorithm='lloyd')

In [None]:
model.fit(data_)

In [None]:
c = model.predict(data_)
c

In [None]:
# 3D scatter of the raw fundamentals, colored by the current labels in `c`
# (here: the clusters fitted on the normalized three-feature data).
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(projection='3d')
ax.scatter(data['Growth'], data['ROE'], data['DY'],
           s=100, c=c, cmap='brg')
ax.set(xlabel='Growth', ylabel='ROE', zlabel='DY')
ax.view_init(elev=15, azim=-30);

## Stock Price Prediction

### The Data

In [None]:
path = 'https://certificate.tpq.io/eod_prices.csv'

In [None]:
# Load the price table: the first CSV column becomes the index and is
# parsed as dates.
raw = pd.read_csv(path, index_col=0, parse_dates=True)

In [None]:
raw.info()

In [None]:
symbol = 'GOOG'

In [None]:
# Single-column frame holding only the selected symbol's price series;
# the feature columns are added to it below.
data = raw[symbol].to_frame()

In [None]:
data.plot();

### Generating the Features (Prices)

In [None]:
lags = 5

In [None]:
# Feature names lag_1 ... lag_<lags>; column lag_k holds the price k rows earlier.
cols = [f'lag_{lag}' for lag in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data[symbol].shift(lag)

In [None]:
data.head(7)

In [None]:
# Shifting leaves NaN in the first `lags` rows of the lag columns; drop every
# row containing NaN (in place) so the regression sees complete rows only.
data.dropna(inplace=True)

### Predicting Stock Prices (OLS)

In [None]:
# Least-squares fit of the current price on its lagged prices. There is no
# intercept term because the regressor matrix holds only the lag columns.
# [0] picks the coefficient vector out of lstsq's result tuple;
# rcond=-1 keeps NumPy's legacy cutoff for small singular values.
reg = np.linalg.lstsq(data[cols], data[symbol], rcond=-1)[0]
reg

In [None]:
# In-sample OLS prediction: lag matrix times the fitted coefficient vector.
data['pred_ols'] = data[cols].dot(reg)

In [None]:
data[[symbol, 'pred_ols']].iloc[-100:].plot();

In [None]:
# In-sample mean squared error of the OLS prediction.
# Vectorized `.mean()` replaces built-in sum()/len(), which loops over the
# Series element by element in Python. The rows were already NaN-filtered by
# dropna(), so both forms average over the same observations.
((data[symbol] - data['pred_ols']) ** 2).mean()

### Predicting Stock Prices (ML)

In [None]:
data_ = (data - data.mean()) / data.std()

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
# Feed-forward regressor with two hidden layers of 24 units each and up to
# 1000 training iterations.
model = MLPRegressor(
    hidden_layer_sizes=[24, 24],
    max_iter=1000,
)

In [None]:
# Time the training run. The features are the z-scored lag columns, while the
# target is the un-normalized price, so predictions come out in price units.
%time model.fit(data_[cols], data[symbol])

In [None]:
data['pred_mlp'] = model.predict(data_[cols])

In [None]:
data[[symbol, 'pred_mlp']].plot();

In [None]:
data[[symbol, 'pred_mlp']].iloc[-100:].plot();

In [None]:
# In-sample mean squared error of the MLP prediction.
# Vectorized `.mean()` replaces built-in sum()/len(), which loops over the
# Series element by element in Python. The rows were already NaN-filtered by
# dropna(), so both forms average over the same observations.
((data[symbol] - data['pred_mlp']) ** 2).mean()

### Generating the Features (Log Returns)

In [None]:
data = pd.DataFrame(raw[symbol])

In [None]:
# One-period log return ln(P_t / P_{t-1}); the first row is NaN because it
# has no predecessor.
data['r'] = np.log(data[symbol] / data[symbol].shift(1))

In [None]:
# Direction label derived from the return: +1 for up, -1 for down and
# 0 for an unchanged price (NaN returns stay NaN).
data['d'] = np.sign(data['r'])

In [None]:
data.head()

In [None]:
lags = 5

In [None]:
# Feature names lag_1 ... lag_<lags>; column lag_k holds the log return k rows earlier.
cols = [f'lag_{lag}' for lag in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data['r'].shift(lag)

In [None]:
data.head(7)

In [None]:
data.dropna(inplace=True)

### Predicting Stock Prices (OLS)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Least-squares fit of the direction label `d` on the lagged log returns
# (no intercept: the regressor matrix holds only the lag columns).
# [0] picks the coefficient vector out of lstsq's result tuple.
reg = np.linalg.lstsq(data[cols], data['d'], rcond=-1)[0]
reg

In [None]:
data['pred_ols'] = np.dot(data[cols], reg)

In [None]:
data.head()

In [None]:
data['pred_ols'].plot();

In [None]:
# Turn the real-valued regression output into a predicted direction by
# keeping only its sign; show the first ten predictions.
p = np.sign(data['pred_ols'])
p[:10]

In [None]:
accuracy_score(data['d'], p)

In [None]:
# Strategy log return: predicted direction (used as the position) times the
# realized log return of the same row. The model was fitted on these same
# rows, so this is an in-sample figure.
data['s'] = p * data['r']

In [None]:
# Gross performance: exponentiate the cumulative log returns of the
# passive position ('r') and of the strategy ('s').
np.exp(data[['r', 's']].cumsum()).plot();

### Predicting Stock Prices (ML)

In [None]:
data_ = (data - data.mean()) / data.std()

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Feed-forward classifier with a single hidden layer of 24 units and up to
# 1000 training iterations.
model = MLPClassifier(
    hidden_layer_sizes=[24],
    max_iter=1000,
)

In [None]:
%time model.fit(data_[cols], data['d'])

In [None]:
data['pred_mlp'] = model.predict(data_[cols])

In [None]:
data['pred_mlp'].value_counts()

In [None]:
accuracy_score(data['d'], data['pred_mlp'])

In [None]:
data['s'] = data['pred_mlp'] * data['r']

In [None]:
data[['r', 's']].cumsum().apply(np.exp).plot();

In [None]:
model.predict_proba(data_[cols])[:10]

In [None]:
# argmax yields the column position of the most probable class in the
# predict_proba output, i.e. an index into model.classes_ -- not the class
# label itself, which is what model.predict() returns.
np.argmax(model.predict_proba(data_[cols]), axis=1)[:10]

In [None]:
model.predict(data_[cols])[:10]

## Exercise

Using NumPy, generate a random walk with a fixed seed.

Try to predict the future movement (direction) of the random walk and a real financial time series with

* OLS regression and
* ML supervised learning (e.g. MLP)

Implement the analysis both

* (only in-sample and)
* with train-**test** split (!!!)

What can you say about the accuracy ratios for all cases?

In [None]:
# Simulated random walk with standard normal increments, starting at 100.
# The seed is fixed (as the exercise demands) so that the path and every
# accuracy figure derived from it are reproducible.
rng = np.random.default_rng(seed=100)
steps = rng.standard_normal(len(raw))
steps[0] = 0.0  # no innovation on the first date: the walk starts at exactly 100
# Build the path on the plain array and assign the column once; the chained
# form `raw['RWALK'].iloc[0] = ...` is not guaranteed to write back into `raw`.
# NOTE(review): an additive Gaussian walk can in principle reach values <= 0,
# for which the log return is undefined -- check the path's minimum if the
# seed or the length is changed.
raw['RWALK'] = steps.cumsum() + 100

In [None]:
raw['RWALK'].plot();

In [None]:
# Instrument to analyze. Switch to 'GS' or 'AAPL' to compare the simulated
# random walk with a real price series.
symbol = 'RWALK'
data = pd.DataFrame(raw[symbol])
# One-period log return. The price column is selected explicitly so that the
# expression does not depend on `data` having exactly one column.
data['r'] = np.log(data[symbol] / data[symbol].shift(1))
# Direction label: +1 up, -1 down, 0 unchanged.
data['d'] = np.sign(data['r'])

In [None]:
# Build lag_1 ... lag_<lags> from the log returns, then drop (in place) the
# rows that contain NaN.
lags = 5
cols = [f'lag_{lag}' for lag in range(1, lags + 1)]
for lag, col in enumerate(cols, start=1):
    data[col] = data['r'].shift(lag)
data.dropna(inplace=True)

In [None]:
# Row position separating the first 80% of the rows (training) from the
# remaining 20% (testing); the row order is kept, nothing is shuffled.
split = int(len(data) * 0.8)
split

In [None]:
train = data.iloc[:split].copy()

In [None]:
# Normalization statistics are taken from the training rows only; they are
# applied to both the training and the test rows below, so no information
# from the test period enters the features.
mu, std = train.mean(), train.std()

In [None]:
train_ = (train - mu) / std

In [None]:
test = data.iloc[split:].copy()

In [None]:
# Normalize the test rows with the moments estimated on the training rows.
test_ = test.sub(mu).div(std)

In [None]:
model = MLPClassifier(max_iter=1000)

In [None]:
%time model.fit(train_[cols], train['d'])

In [None]:
accuracy_score(train['d'], model.predict(train_[cols]))

In [None]:
test['p'] = model.predict(test_[cols])

In [None]:
test['p'].value_counts()

In [None]:
# Out-of-sample hit ratio: share of test rows whose predicted direction
# equals the realized one.
accuracy_score(y_true=test['d'], y_pred=model.predict(test_[cols]))

In [None]:
# Out-of-sample strategy log return: predicted direction (used as the
# position) times the realized log return of the same row.
test['s'] = test['p'] * test['r']

In [None]:
# Gross out-of-sample performance: exponentiate the cumulative log returns
# of the passive position ('r') and of the strategy ('s').
np.exp(test[['r', 's']].cumsum()).plot();

<img src="http://certificate.tpq.io/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<img src='http://hilpisch.com/tpq_logo.png' width="35%" align="right">

<br><br><a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:team@tpq.io">ai@tpq.io</a>