<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Python for Asset Management

### Machine Learning (Basics)

&copy; Dr. Yves J. Hilpisch | The Python Quants GmbH

http://tpq.io | [training@tpq.io](mailto:training@tpq.io) | [@dyjh](http://twitter.com/dyjh)


<img src="http://hilpisch.com/images/py4fi_2nd.png" width="35%" align="left">

### The use of the "Python 3.10, Numpy 1.26.4" kernel is recommended.

## Machine Learning

Topics of interest include:

* unsupervised learning
* supervised learning
* feature transforms
* time series prediction

# Imports & Configurations

In [None]:
!git clone https://github.com/tpq-classes/python_for_asset_management.git
import sys
sys.path.append('python_for_asset_management')


In [None]:
import numpy as np
import pandas as pd
import datetime as dt
from pylab import mpl, plt
import warnings

In [None]:
# warnings.simplefilter('ignore')
np.set_printoptions(suppress=True, precision=4)
plt.style.use('seaborn-v0_8')
np.random.seed(1000)
%config InlineBackend.figure_format = 'svg'
# %matplotlib inline

## Unsupervised Learning

### The Data

In [None]:
from sklearn.datasets import make_blobs

In [None]:
X, y = make_blobs(n_samples=250, centers=4,
                  random_state=500, cluster_std=1.25)

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 0], X[:, 1], s=50);

### K-Means Clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
model = KMeans(n_clusters=4, random_state=0)  # 1. step

In [None]:
model.fit(X)  # 2. step 

In [None]:
y_kmeans = model.predict(X)  # 3. prediction

In [None]:
y_kmeans[:12]

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans,  cmap='coolwarm');

### Gaussian Mixtures

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
model = GaussianMixture(n_components=4, random_state=0)

In [None]:
model.fit(X)

In [None]:
y_gm = model.predict(X)

In [None]:
y_gm[:12]

In [None]:
(y_gm == y_kmeans).all()

## Supervised Learning

### The Data

In [None]:
from sklearn.datasets import make_classification

In [None]:
n_samples = 100

In [None]:
X, y = make_classification(n_samples=n_samples, n_features=2, n_informative=2,
                           n_redundant=0, n_repeated=0, random_state=250)

In [None]:
X[:5]

In [None]:
X.shape

In [None]:
y[:5]

In [None]:
y.shape

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(x=X[:, 0], y=X[:, 1], c=y, cmap='coolwarm');

### Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [None]:
model = GaussianNB()

In [None]:
# model.fit(X)  # unsupervised learning (features only)

In [None]:
model.fit(X, y)  # supervised learning (features and labels)

In [None]:
model.predict_proba(X).round(4)[:5]

In [None]:
pred = model.predict(X)

In [None]:
pred

In [None]:
pred == y

In [None]:
accuracy_score(y, pred)

In [None]:
Xc = X[y == pred]
Xf = X[y != pred]

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(x=Xc[:, 0], y=Xc[:, 1], c=y[y == pred],
            marker='o', cmap='coolwarm')
plt.scatter(x=Xf[:, 0], y=Xf[:, 1], c=y[y != pred],
            marker='x', cmap='coolwarm');

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model = LogisticRegression(C=1)

In [None]:
model.fit(X, y)

In [None]:
model.predict_proba(X).round(4)[:5]

In [None]:
pred = model.predict(X)

In [None]:
pred

In [None]:
accuracy_score(y, pred)

In [None]:
Xc = X[y == pred]
Xf = X[y != pred]

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(x=Xc[:, 0], y=Xc[:, 1], c=y[y == pred],
            marker='o', cmap='coolwarm')
plt.scatter(x=Xf[:, 0], y=Xf[:, 1], c=y[y != pred],
            marker='x', cmap='coolwarm');

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier(max_depth=1)

In [None]:
model.fit(X, y)

In [None]:
model.predict_proba(X).round(4)[:5]

In [None]:
pred = model.predict(X)

In [None]:
pred

In [None]:
accuracy_score(y, pred)

In [None]:
Xc = X[y == pred]
Xf = X[y != pred]

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(x=Xc[:, 0], y=Xc[:, 1], c=y[y == pred],
            marker='o', cmap='coolwarm')
plt.scatter(x=Xf[:, 0], y=Xf[:, 1], c=y[y != pred],
            marker='x', cmap='coolwarm');

In [None]:
# In-sample accuracy of decision trees for maximum depths 1 through 6.
# The trees are scored on the same X, y they were fitted on.
print('{:>8s} | {:8s}'.format('depth', 'accuracy'), '-' * 20, sep='\n')
row = '{:8d} | {:8.2f}'
for depth in range(1, 7):
    model = DecisionTreeClassifier(max_depth=depth)
    model.fit(X, y)
    acc = accuracy_score(y, model.predict(X))
    print(row.format(depth, acc))

### Deep Neural Network

#### scikit-learn

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=2 * [75], random_state=10,
                     max_iter=500)

In [None]:
%time model.fit(X, y)

In [None]:
pred = model.predict(X)
pred

In [None]:
accuracy_score(y, pred)

#### Keras

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense

In [None]:
keras.__version__

In [None]:
model = Sequential()
model.add(Dense(256, activation='relu',
                input_shape=[2]))
model.add(Dense(256, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
             metrics=['accuracy'])

In [None]:
%time model.fit(X, y, epochs=50, verbose=False)

In [None]:
model.evaluate(X, y)

In [None]:
# model.predict(X)

In [None]:
pred =np.where(model.predict(X) > 0.5, 1, 0)
pred[:10]

In [None]:
%time model.fit(X, y, epochs=1000, verbose=False)

In [None]:
model.evaluate(X, y)

## Feature Transforms

In [None]:
from sklearn import preprocessing

In [None]:
X[:5]

In [None]:
Xs = preprocessing.StandardScaler().fit_transform(X)
Xs[:5]

In [None]:
Xm = preprocessing.MinMaxScaler().fit_transform(X)
Xm[:5]

In [None]:
Xn1 = preprocessing.Normalizer(norm='l1').transform(X)
Xn1[:5]

In [None]:
Xn2 = preprocessing.Normalizer(norm='l2').transform(X)
Xn2[:5]

In [None]:
# Overlay the raw features and every rescaled variant in a single scatter
# plot: the marker identifies the transform, the colour the class label y.
plt.figure(figsize=(10, 6))
data_sets = [X, Xs, Xm, Xn1, Xn2]
markers = ['o', '.', 'x', '^', 'v']
labels = ['raw', 'standard', 'minmax', 'norm(1)', 'norm(2)']
for features, marker, label in zip(data_sets, markers, labels):
    plt.scatter(x=features[:, 0], y=features[:, 1], c=y,
                marker=marker, cmap='coolwarm', label=label)
plt.legend();

In [None]:
X[:5]

In [None]:
Xb = preprocessing.Binarizer().fit_transform(X)
Xb[:8]

In [None]:
2 ** 2

In [None]:
Xd = np.digitize(X, bins=[-1, 0, 1])
Xd[:5]

In [None]:
4 ** 2

## Train-Test Splits 

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [None]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.33,
                                                    random_state=0)

In [None]:
model = SVC(C=1, kernel='linear')

In [None]:
model.fit(train_x, train_y)

In [None]:
pred_train = model.predict(train_x)

In [None]:
accuracy_score(train_y, pred_train)

In [None]:
pred_test = model.predict(test_x)

In [None]:
test_y == pred_test

In [None]:
accuracy_score(test_y, pred_test)

In [None]:
test_c = test_x[test_y == pred_test]
test_f = test_x[test_y != pred_test]

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(x=test_c[:, 0], y=test_c[:, 1], c=test_y[test_y == pred_test],
            marker='o', cmap='coolwarm')
plt.scatter(x=test_f[:, 0], y=test_f[:, 1], c=test_y[test_y != pred_test],
            marker='x', cmap='coolwarm');

In [None]:
bins = np.linspace(-4.5, 4.5, 50)
bins

In [None]:
Xd = np.digitize(X, bins=bins)

In [None]:
Xd[:5]

In [None]:
train_x, test_x, train_y, test_y = train_test_split(Xd, y, test_size=0.33,
                                                    random_state=0)

In [None]:
# Out-of-sample accuracy of an SVC for each kernel: fitted on the training
# split, scored on the test split.
print('{:>8s} | {:8s}'.format('kernel', 'accuracy'), '-' * 20, sep='\n')
row = '{:>8s} | {:8.3f}'
for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
    model = SVC(C=1, kernel=kernel)
    model.fit(train_x, train_y)
    acc = accuracy_score(test_y, model.predict(test_x))
    print(row.format(kernel, acc))

# Algorithmic Trading Strategies

## Linear OLS Regression

### The Data

In [None]:
raw = pd.read_csv('http://hilpisch.com/aiif_eikon_eod_data.csv',
                  index_col=0, parse_dates=True).dropna()

In [None]:
raw.columns

In [None]:
raw.info()

In [None]:
symbol = 'EUR='

In [None]:
data = pd.DataFrame(raw[symbol])

In [None]:
data['returns'] = np.log(data / data.shift(1))

In [None]:
data.dropna(inplace=True)

In [None]:
data['direction'] = np.sign(data['returns']).astype(int)

In [None]:
data.head()

In [None]:
data['returns'].hist(bins=35, figsize=(10, 6));

In [None]:
lags = 2

In [None]:
def create_lags(data):
    """Add lagged copies of ``data['returns']`` as new columns, in place.

    The number of lags is read from the global ``lags``. For every
    ``k`` in ``1..lags`` a column ``lag_k`` holding the returns shifted by
    ``k`` rows is written to ``data``. The list of created column names is
    published as the global ``cols``.
    """
    global cols
    cols = []
    for shift_by in range(1, lags + 1):
        name = f'lag_{shift_by}'
        data[name] = data['returns'].shift(shift_by)
        cols += [name]

In [None]:
create_lags(data)

In [None]:
data.head()

In [None]:
data.dropna(inplace=True)

In [None]:
data.plot.scatter(x='lag_1', y='lag_2', c='returns', 
                  cmap='coolwarm', figsize=(10, 6), colorbar=True)
plt.axvline(0, c='r', ls='--')
plt.axhline(0, c='r', ls='--');

### Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model = LinearRegression()

In [None]:
data['pos_ols_1'] = model.fit(data[cols], data['returns']).predict(data[cols])

In [None]:
data['pos_ols_2'] = model.fit(data[cols], data['direction']).predict(data[cols])

In [None]:
data[['pos_ols_1', 'pos_ols_2']].head()

In [None]:
data[['pos_ols_1', 'pos_ols_2']] = np.where(
            data[['pos_ols_1', 'pos_ols_2']] > 0, 1, -1)

In [None]:
data['pos_ols_1'].value_counts()

In [None]:
data['pos_ols_2'].value_counts()

In [None]:
(data['pos_ols_1'].diff() != 0).sum()

In [None]:
(data['pos_ols_2'].diff() != 0).sum()

In [None]:
data['strat_ols_1'] = data['pos_ols_1'] * data['returns']

In [None]:
data['strat_ols_2'] = data['pos_ols_2'] * data['returns']

In [None]:
data[['returns', 'strat_ols_1', 'strat_ols_2']].sum().apply(np.exp)

In [None]:
(data['direction'] == data['pos_ols_1']).value_counts()

In [None]:
(data['direction'] == data['pos_ols_2']).value_counts()

In [None]:
data[['returns', 'strat_ols_1', 'strat_ols_2']].cumsum(
        ).apply(np.exp).plot(figsize=(10, 6));

## Classification Algorithms

In [None]:
from sklearn import linear_model
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [None]:
def create_bins(data, bins=(0,)):
    """Digitize every lag column of ``data`` into integer bin indices.

    For each column name in the global ``cols`` a companion column
    ``<name>_bin`` is written to ``data`` in place, holding the result of
    ``np.digitize`` for that column. The list of created column names is
    published as the global ``cols_bin``.

    Parameters
    ----------
    data : DataFrame containing the columns listed in ``cols``.
    bins : sequence of bin edges handed to ``np.digitize``. The default
        single edge at 0 maps negative values to 0 and all others to 1.
        An immutable tuple is used so the default cannot be shared and
        mutated across calls.
    """
    global cols_bin
    cols_bin = []
    for col in cols:
        col_bin = col + '_bin'
        data[col_bin] = np.digitize(data[col], bins=bins)
        cols_bin.append(col_bin)

In [None]:
create_bins(data)

In [None]:
data[cols_bin + ['direction']].head()

In [None]:
C = 1

In [None]:
models = {
    'log_reg': linear_model.LogisticRegression(C=C),
    'gauss_nb': GaussianNB(),
    'svm': SVC(C=C)
}

In [None]:
def fit_models(data):
    """Fit every estimator in the global ``models`` dict.

    Each estimator is trained on the binned feature columns listed in the
    global ``cols_bin``, with ``data['direction']`` as the label.

    Parameters
    ----------
    data : DataFrame providing the ``cols_bin`` columns and a
        ``direction`` column.

    Returns
    -------
    None. Callers reach the estimators through ``models``.
    """
    # A plain loop, because fitting is a side effect: the dict that used to
    # collect the return values of ``fit`` was never read.
    for estimator in models.values():
        estimator.fit(data[cols_bin], data['direction'])

In [None]:
fit_models(data)

In [None]:
def derive_positions(data):
    """Store each model's prediction as a ``pos_<model>`` column of ``data``.

    Every estimator in the global ``models`` dict predicts from the binned
    feature columns listed in the global ``cols_bin``. ``data`` is modified
    in place.
    """
    for name, estimator in models.items():
        features = data[cols_bin]
        data['pos_' + name] = estimator.predict(features)

In [None]:
derive_positions(data)

In [None]:
def evaluate_strats(data):
    """Compute per-model strategy returns and record the column names.

    For every key in the global ``models`` dict, the column
    ``strat_<model>`` is set to ``pos_<model>`` times ``returns``. ``data``
    is modified in place. The global ``sel`` is set to ``'returns'``
    followed by the strategy column names in ``models`` order.
    """
    global sel
    sel = ['returns']
    for name in models:
        strat_col = 'strat_' + name
        data[strat_col] = data['pos_' + name] * data['returns']
        sel.append(strat_col)

In [None]:
evaluate_strats(data)

In [None]:
data[sel].sum().apply(np.exp)

In [None]:
data[sel].cumsum().apply(np.exp).plot(figsize=(10, 6));

In [None]:
data = pd.DataFrame(raw[symbol])

In [None]:
data['returns'] = np.log(data / data.shift(1))

In [None]:
data['direction'] = np.sign(data['returns'])

In [None]:
lags = 5
create_lags(data)
data.dropna(inplace=True)

In [None]:
create_bins(data)
cols_bin

In [None]:
data[cols_bin].head()

In [None]:
data.dropna(inplace=True)

In [None]:
fit_models(data)

In [None]:
derive_positions(data)

In [None]:
evaluate_strats(data)

In [None]:
data[sel].sum().apply(np.exp)

In [None]:
data[sel].cumsum().apply(np.exp).plot(figsize=(10, 6));

In [None]:
mu = data['returns'].mean()
v = data['returns'].std()

In [None]:
bins = [mu - v, mu, mu + v]
bins

In [None]:
create_bins(data, bins)

In [None]:
data[cols_bin].head()

In [None]:
fit_models(data)

In [None]:
derive_positions(data)

In [None]:
evaluate_strats(data)

In [None]:
data[sel].sum().apply(np.exp)

In [None]:
data[sel].cumsum().apply(np.exp).plot(figsize=(10, 6));

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> | <a href="mailto:training@tpq.io">training@tpq.io</a>