<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Python for Finance Basics

&copy; Dr. Yves J. Hilpisch | The Python Quants GmbH

http://tpq.io | [training@tpq.io](mailto:training@tpq.io) | [@dyjh](http://twitter.com/dyjh)

## `scikit-learn` package

In [None]:
!git clone https://github.com/tpq-classes/pff_basics.git
import sys
sys.path.append('pff_basics')


In [None]:
import numpy as np
import pandas as pd
from pylab import plt
plt.style.use('seaborn-v0_8')
%config InlineBackend.figure_format = 'svg'

## Unsupervised Learning

### Sample Data

In [None]:
from sklearn.datasets import make_blobs

In [None]:
# Generate 250 two-dimensional sample points scattered around three
# centers; the fixed random_state makes the sample reproducible.
X, y = make_blobs(
    n_samples=250,
    n_features=2,
    centers=3,
    cluster_std=1,
    random_state=500,
)

In [None]:
# first five rows of the feature matrix
X[:5]

In [None]:
# first five labels returned by make_blobs
y[:5]

In [None]:
# first feature on the horizontal axis, second feature on the vertical axis
plt.scatter(x=X[:, 0], y=X[:, 1]);

### `KMeans` 

In [None]:
from sklearn.cluster import KMeans

In [None]:
# KMeans?

In [None]:
# 1. step: model instantiation
# n_init and random_state are pinned so that repeated runs (and different
# scikit-learn versions, whose n_init default differs) give the same result.
model = KMeans(n_clusters=3, n_init=10, random_state=500)

In [None]:
# learn the cluster centers from the sample features
model.fit(X)  # 2. step: model fitting

In [None]:
p = model.predict(X)  # 3. step: label/cluster prediction
# first ten predicted cluster labels
p[:10]

Note: the cluster numbers assigned by the algorithm are arbitrary, so they need not coincide with the labels in `y` even when the clusters themselves match.

In [None]:
# first ten labels from make_blobs, for comparison with the predictions
y[:10]

In [None]:
# element-wise comparison of generated and predicted labels (first ten);
# a False does not necessarily mean a wrong cluster, since the numbering
# used by the model need not match the numbering in y
(y == p)[:10]

In [None]:
# plt.scatter?

In [None]:
# color every sample point by its predicted cluster label
plt.scatter(
    X[:, 0],
    X[:, 1],
    cmap='coolwarm',
    c=p,
);

### `GaussianMixture`

In [None]:
from sklearn.mixture import GaussianMixture

In [None]:
# 1. step: model instantiation
# random_state is fixed so that the fitted mixture (and therefore the
# predicted component labels) is the same on every run.
model = GaussianMixture(n_components=3, random_state=500)

In [None]:
# estimate the mixture parameters from the sample features
model.fit(X)  # 2. step

In [None]:
p = model.predict(X)  # 3. step
# first ten predicted component labels
p[:10]

In [None]:
# color every sample point by its predicted mixture component
plt.scatter(
    X[:, 0],
    X[:, 1],
    cmap='coolwarm',
    c=p,
);

## Financial Data

In [None]:
# location of the CSV file with the financial time series data
url = 'https://certificate.tpq.io/mlfin.csv'

In [None]:
# use the first CSV column as the index and try to parse it as dates
raw = pd.read_csv(url, index_col=0, parse_dates=True)

In [None]:
# summary of the index, columns, non-null counts and dtypes
raw.info()

In [None]:
# keep only the '.SPX' column as a one-column DataFrame and
# discard the rows for which it has no value
data = raw[['.SPX']].dropna()

In [None]:
# summary of the selected data after dropping missing values
data.info()

In [None]:
# simple (percentage) returns of the '.SPX' column; the column is selected
# explicitly so that the cell does not depend on `data` having exactly one
# column and can be re-run after 'r' (or other columns) have been added
data['r'] = data['.SPX'].pct_change()

In [None]:
# frequency distribution of the returns, using 50 bins
data['r'].hist(bins=50);

### Creating Features Data

In [None]:
# names of the feature columns used for the clustering further below
f = ['mom', 'vol']  # momentum & volatility as features

In [None]:
# length of the rolling window (number of rows) for the feature calculation
window = 42

In [None]:
# momentum feature: rolling mean of the returns over `window` rows
data['mom'] = data['r'].rolling(window).mean()

In [None]:
# volatility feature: rolling standard deviation of the returns over `window` rows
data['vol'] = data['r'].rolling(window).std()

In [None]:
# first rows; the feature columns are NaN until the rolling window is filled
data.head()

In [None]:
# last rows of the data set including the feature columns
data.tail()

In [None]:
# remove, in place, every row that contains a NaN value (e.g. the leading
# rows for which the rolling window is not yet filled)
data.dropna(inplace=True)

In [None]:
# line plot of both feature columns against the index
data[f].plot();

In [None]:
# annualizing the features data
data['mom'] = data['mom'] * 252
data['vol'] = data['vol'] * 252 ** 0.5

In [None]:
# volatility on the horizontal axis against momentum on the vertical axis
data[f].plot(
    x='vol',
    y='mom',
    kind='scatter',
);

In [None]:
# standardize every column: subtract the column mean and divide by the
# column standard deviation, so that the features are on a comparable scale
data_ = (data - data.mean()) / data.std()

In [None]:
# sanity check: column means of the standardized data should be zero
# (rounded to hide floating-point noise)
data_.mean().round(9)

In [None]:
# sanity check: column standard deviations of the standardized data should be one
data_.std()

### Clustering

The basic idea of applying clustering as an unsupervised learning method is to identify _**regimes**_ based on the two features "momentum" and "volatility".

In [None]:
# 1. step: model instantiation
# n_init and random_state are pinned so that repeated runs (and different
# scikit-learn versions, whose n_init default differs) identify the same
# four clusters.
model = KMeans(n_clusters=4, n_init=10, random_state=500)

In [None]:
# fit on the feature columns of the standardized data only
model.fit(data_[f])  # 2. step

In [None]:
p = model.predict(data_[f])  # 3. step
# first fifteen predicted cluster labels
p[:15]

In [None]:
# the axes show the (non-standardized) feature values from `data`, while the
# colors are the cluster labels predicted from the standardized features
plt.scatter(data['vol'], data['mom'],
            c=p, cmap='coolwarm')
plt.xlabel('volatility')
plt.ylabel('momentum');

In [None]:
# plt.scatter?

In [None]:
# '.SPX' values over the index, each point colored by its predicted cluster
plt.scatter(data.index, data['.SPX'],
            c=p, cmap='coolwarm', marker='.');

<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

<a href="http://tpq.io" target="_blank">http://tpq.io</a> | <a href="mailto:training@tpq.io">training@tpq.io</a> | <a href="http://twitter.com/dyjh" target="_blank">@dyjh</a> 