In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

%matplotlib inline
# Print NumPy arrays with 5 decimal places and without exponent notation so
# the distance and linkage matrices printed below are easy to read.
np.set_printoptions(precision=5, suppress=True)  # suppress scientific float notation

In [None]:
from scipy.spatial.distance import pdist

# Four corners of a 4 x 3 rectangle: sides are 3 and 4, diagonals are 5.
X = [
    [0, 0],
    [0, 3],
    [4, 0],
    [4, 3],
]
# Condensed pairwise distances in the order
# (0,1), (0,2), (0,3), (1,2), (1,3), (2,3).
pd = pdist(X)
print(pd)

In [None]:
# Single linkage is scipy's default method; it is spelled out here so the
# contrast with Ward linkage below is explicit.
Z = linkage(X, method='single')
print(Z)
# Ward linkage on the same points; Z keeps this second result.
Z = linkage(X, method='ward')
print(Z)

Reference: SciPy hierarchical clustering and dendrogram tutorial — https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/

In [None]:
# Two Gaussian blobs that share one covariance matrix:
# `a` has 100 points centred on (10, 0), `b` has 50 points centred on (0, 20).
np.random.seed(4711)  # fixed seed so every run draws the same samples
blob_cov = [[3, 1], [1, 4]]
a = np.random.multivariate_normal(mean=[10, 0], cov=blob_cov, size=100)
b = np.random.multivariate_normal(mean=[0, 20], cov=blob_cov, size=50)
# Stack the blobs row-wise: rows 0-99 are `a`, rows 100-149 are `b`.
X = np.vstack((a, b))
print(X.shape)  # 150 samples with 2 dimensions
# X.T has one row per coordinate, so unpacking it passes column 0 as x
# and column 1 as y.
plt.scatter(*X.T)
plt.show()

In [None]:
# Build the Ward linkage matrix for all samples in X and report its shape.
Z = linkage(X, method='ward')
print(Z.shape)

In [None]:
# First 20 rows of the linkage matrix.
print(Z[0:20])

In [None]:
# Rows of the linkage matrix from index 129 to the end (all columns).
# NOTE(review): with the 150-sample X built earlier these are presumably
# the last 20 merges - confirm against the printed Z.shape.
print(Z[129:])

In [None]:
# Draw the complete dendrogram for Z on a wide figure.
plt.figure(figsize=(25, 10))
# Rotate the leaf labels by 90 degrees and shrink them so they stay legible.
dendrogram(Z, leaf_rotation=90., leaf_font_size=8.)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
plt.show()

<b>Scikit-Learn 4.2  Feature Extraction</b>

In [None]:
# (city, temperature) observations; each becomes one dict record with a
# string-valued 'city' key and a float-valued 'temperature' key.
city_temperatures = [
    ('Dubai', 33.),
    ('London', 12.),
    ('San Francisco', 18.),
    ('San Francisco', 16.),
]
measurements = [
    {'city': city, 'temperature': temperature}
    for city, temperature in city_temperatures
]
measurements

In [None]:
from sklearn.feature_extraction import DictVectorizer

# Fit the vectorizer on the records and transform them in one step, then
# convert the result to a dense array so it displays as a plain matrix.
vec = DictVectorizer()
encoded = vec.fit_transform(measurements)
encoded.toarray()

In [None]:
# Column names of the encoded matrix, in column order.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement. Fall back to
# the old method so the cell still runs on older installations.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
feature_names

<b>Sandbox</b>

In [None]:
# Histogram of 10,000 standard-normal draws, split into 100 bins.
r = np.random.standard_normal(size=10000)
plt.hist(r, bins=100)
plt.show()

In [None]:
from scipy.stats import norm
# 1,000 points whose two coordinates are independent standard normals.
r = np.random.standard_normal(size=(1000, 2))
# r.T has one row per coordinate: column 0 becomes x, column 1 becomes y.
plt.scatter(*r.T)

In [None]:
# Scale the second coordinate by 5 and shift it by 2.
# NOTE: this modifies `r` in place, so re-running this cell applies the
# transform again; re-run the cell that creates `r` first.
r[:, 1] *= 5
r[:, 1] += 2

plt.scatter(*r.T)

In [None]:
# Use the same data scale on both axes so the difference in spread between
# the two coordinates is visible in the plot.
plt.axis('equal')
plt.scatter(x=r[:, 0], y=r[:, 1])

<b>Multivariate Normal (using Scipy)</b>

In [None]:
from scipy.stats import multivariate_normal as mvn

# Target distribution: mean (0, 2), variances 1 and 3, covariance 0.8.
mu = np.array([0, 2])
cov = np.array([[1, 0.8], [0.8, 3]])
# Draw 1,000 samples; the result has one row per sample.
r = mvn.rvs(mean=mu, cov=cov, size=1000)
plt.scatter(*r.T)
# Figure is 5 in wide and 5 * sqrt(3) in tall.
# NOTE(review): the sqrt(3) ratio presumably mirrors the ratio of the two
# standard deviations (variances 1 and 3) - confirm intent.
plt.gcf().set_size_inches(w=5, h=5 * 3**0.5)
plt.show()

<b>Multivariate Normal (using Numpy)</b>

In [None]:
# Same mean and covariance as the SciPy cell, sampled with NumPy instead.
r = np.random.multivariate_normal(mu, cov, 1000)
plt.scatter(*r.T)
# Same 5 in x 5 * sqrt(3) in figure as the SciPy cell, for comparison.
plt.gcf().set_size_inches(w=5, h=5 * 3**0.5)
plt.show()

<b>Numpy Fourier Transform</b>

In [None]:
# 10,000 evenly spaced sample points on [0, 100].
x = np.linspace(start=0, stop=100, num=10000)
# Sum of three sines with angular frequencies 1, 3 and 5.
y = sum(np.sin(k * x) for k in (1, 3, 5))
# Plot the signal against its sample index (only y is passed, so the
# horizontal axis is 0..len(y)-1 rather than the values in x).
plt.plot(y)
plt.show()

In [None]:
# Discrete Fourier transform of the signal; Y holds one complex coefficient
# per frequency bin.
Y = np.fft.fft(y)
# Display the coefficient at bin 100 as the cell output.
Y[100]

In [None]:
# Magnitude spectrum of the signal, zoomed to bins 0-100 on the x axis and
# magnitudes 0-5000 on the y axis.
plt.plot(np.abs(Y))
plt.axis([0, 100, 0, 5000])
plt.show()

In [None]:
# Magnitudes in three windows of frequency bins: 10-19, 40-49 and 75-84.
magnitude = np.abs(Y)
print(magnitude[10:20])
print(magnitude[40:50])
print(magnitude[75:85])

In [None]:
# Convert a frequency-bin index k to an angular frequency 2*pi*k / 100,
# where 100 is the length of the interval sampled by x.
# NOTE(review): 16, 48 and 80 are presumably the peak bins seen in the
# magnitude printout; the results should be close to 1, 3 and 5.
for peak_bin in (16, 48, 80):
    print(peak_bin / 100 * 2 * np.pi)

<b>Central Limit Theorem</b>

In [None]:
# Draw N samples from the uniform distribution on [0, 1) and print their
# sample variance and mean (theoretical values: 1/12 and 1/2).
N = 1000
X = np.random.random(N)
# var() gives the variance directly instead of squaring the standard deviation.
print(X.var())
print(X.mean())
#plt.hist(X)
#plt.show()

In [None]:
# M independent realisations of the sum of N uniform [0, 1) samples.
# Each sum has theoretical mean N/2 and variance N/12.
M = 10000

# Draw all M * N samples at once, one row per realisation, and sum each row.
# The temporary (M, N) float64 array takes M * N * 8 bytes (about 80 MB
# when N = 1000).
Y = np.random.random((M, N)).sum(axis=1)

# var() gives the variance directly instead of squaring the standard deviation.
print(Y.var())
print(Y.mean())
#plt.hist(Y, bins=100)
#plt.show()

In [None]:
# Theoretical variance of a single uniform [0, 1) sample, for comparison
# with the sample variances printed in the cells above.
1 / 12