# 1.1 - Unsupervised learning

#### > k-means clustering with scikit-learn

In [None]:
import numpy as np
from sklearn.cluster import KMeans

# Five example rows with four numeric features each.  Stored as a NumPy array
# (rows separated by commas) so that 2-D slicing such as samples[:, 0] works.
samples = np.array([
    [5.0, 3.3, 1.4, 0.2],
    [5.0, 3.5, 1.3, 0.3],
    [4.9, 2.4, 3.3, 1.0],
    [6.3, 2.8, 5.1, 1.5],
    [7.2, 3.2, 6.0, 1.8],
])

# Fit k-means with three clusters; a fixed seed keeps the labels repeatable
# across Restart & Run All.
model = KMeans(n_clusters=3, random_state=42)
model.fit(samples)
# KMeans(algorithm='auto')   <- appears to be pasted cell output; kept as a comment

# Cluster index assigned to each row of `samples`
labels = model.predict(samples)
print(labels)

#### > Cluster labels for new samples

In [None]:
import numpy as np

# `new_samples` was never defined in the notebook, so this cell failed on a
# fresh kernel.  The values below are the ones shown in the recorded output.
new_samples = np.array([
    [5.7, 4.4, 1.5, 0.4],
    [6.5, 3.0, 5.5, 1.8],
    [5.8, 2.7, 5.1, 1.9],
])
print(new_samples)

# Assign each new row to one of the clusters of the already-fitted `model`
new_labels = model.predict(new_samples)
print(new_labels)
# Recorded output from an earlier run: [0 2 1]
# (cluster indices are arbitrary, so they may differ between fits)

#### > Scatter plots

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Column slicing with [:, i] is not supported on a plain list of lists,
# so coerce to an ndarray first (a no-op if `samples` already is one).
points = np.asarray(samples)
xs = points[:, 0]
ys = points[:, 2]

# One point per sample, coloured by its cluster label
plt.scatter(xs, ys, c=labels)
plt.xlabel('feature 0')
plt.ylabel('feature 2')
plt.show()

# 1.2 - Evaluating a clustering

#### > Aligning labels and species

In [None]:
import pandas as pd

# Two-column table pairing `labels` and `species` row by row
df = pd.DataFrame(dict(labels=labels, species=species))
print(df)

#### > Crosstab of labels and species

# Count the rows for every (label, species) combination
ct = pd.crosstab(index=df['labels'], columns=df['species'])
print(ct)

#### > Inertia measures clustering quality

In [None]:
from sklearn.cluster import KMeans

# fit() returns the estimator itself, so construction and fitting are chained
model = KMeans(n_clusters=3).fit(samples)

# inertia_ is available once the model has been fitted
print(model.inertia_)

# 1.3 - Transforming features for better clusterings

#### > Clustering the wines

In [None]:
from sklearn.cluster import KMeans

# Fit first, then read the training-set assignments from the fitted model
# (the same values fit_predict() returns).
model = KMeans(n_clusters=3)
model.fit(samples)
labels = model.labels_

#### > Clusters vs. varieties

In [None]:
# Pair every cluster label with the corresponding entry of `varieties`
df = pd.DataFrame(dict(labels=labels, varieties=varieties))

# Contingency table: rows are cluster labels, columns are varieties
ct = pd.crosstab(index=df['labels'], columns=df['varieties'])
print(ct)

#### > sklearn StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

# fit() learns the scaling statistics from `samples` and returns the scaler
scaler = StandardScaler().fit(samples)

# Apply the learned scaling to the same data
samples_scaled = scaler.transform(samples)

#### > Pipelines combine multiple steps

In [None]:
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The two stages: feature scaling, then 3-cluster k-means
scaler = StandardScaler()
kmeans = KMeans(n_clusters=3)

# Chain the stages and fit them in one call (fit() returns the pipeline)
pipeline = make_pipeline(scaler, kmeans).fit(samples)

# Cluster labels for the training samples, produced through the whole pipeline
labels = pipeline.predict(samples)

#### > Feature standardization improves clustering

In [None]:
# Rebuild the label/variety table from the current `labels`
df = pd.DataFrame(dict(labels=labels, varieties=varieties))

# Cross-tabulate labels (rows) against varieties (columns)
ct = pd.crosstab(index=df['labels'], columns=df['varieties'])
print(ct)

# 2.1 - Visualizing hierarchies

#### > Hierarchical clustering with SciPy

In [None]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# Hierarchical clustering of `samples` using complete linkage
mergings = linkage(samples, method='complete')

# Draw the merge tree with one leaf per entry of `country_names`
dendrogram(
    mergings,
    labels=country_names,
    leaf_rotation=90,
    leaf_font_size=6,
)
plt.show()

# 2.2 - Cluster labels in hierarchical clustering

#### > Extracting cluster labels using fcluster

In [None]:
from scipy.cluster.hierarchy import fcluster, linkage

# Complete-linkage hierarchy over `samples`
mergings = linkage(samples, method='complete')

# Flatten the hierarchy using a distance threshold of 15
labels = fcluster(mergings, t=15, criterion='distance')
print(labels)

#### > Aligning cluster labels with country names

In [None]:
import pandas as pd

# One row per country: its cluster label next to its name
pairs = pd.DataFrame(dict(labels=labels, countries=country_names))

# Sorting by label groups members of the same cluster together
print(pairs.sort_values(by='labels'))

# 2.3 - t-SNE for 2-dimensional maps

#### > t-SNE in sklearn

In [None]:
print(samples)
# Recorded output (kept as a comment; uncommented it was a SyntaxError):
# [[ 5. 3.3 1.4 0.2]
#  [ 5. 3.5 1.3 0.3]
#  [ 4.9 2.4 3.3 1. ]
#  [ 6.3 2.8 5.1 1.5]
#  [ 4.9 3.1 1.5 0.1]]
print(species)

In [None]:
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# t-SNE offers fit_transform() only, so fitting and mapping happen together
model = TSNE(learning_rate=100)
transformed = model.fit_transform(samples)

# First and second embedding coordinates become the plot axes
xs, ys = transformed[:, 0], transformed[:, 1]
plt.scatter(xs, ys, c=species)
plt.show()

# 3.1 - Visualizing the PCA transformation

#### > Using scikit-learn PCA

In [None]:
from sklearn.decomposition import PCA

print(samples)
# Recorded output (kept as a comment; uncommented it was a SyntaxError):
# [[ 2.8 3.92]
#  [ 2.65 3.4 ]
#  [ 2.05 1.6 ]]

# Learn the principal components from `samples`, then project the same
# samples onto them.
model = PCA()
model.fit(samples)
transformed = model.transform(samples)

#### > PCA features

In [None]:
# Display the array produced by the PCA transform in the previous cell
print(transformed)

# 3.2 - Intrinsic dimension

#### > Plotting the variances of PCA features

In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Full PCA (no component limit); fit() returns the fitted estimator
pca = PCA().fit(samples)

# Integer positions 0 .. n_components_-1, used as x coordinates when plotting
features = range(pca.n_components_)

In [None]:
# One bar per PCA feature, with height equal to its explained variance
plt.bar(x=features, height=pca.explained_variance_)
plt.xticks(ticks=features)

# Axis captions
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.show()

# 3.3 - Dimension reduction with PCA

#### > Dimension reduction of iris dataset

In [None]:
from sklearn.decomposition import PCA

# Keep only the first two principal components
pca = PCA(n_components=2).fit(samples)

# Project the samples and report the shape of the result
transformed = pca.transform(samples)
print(transformed.shape)

#### > Iris dataset in 2 dimensions

In [None]:
import matplotlib.pyplot as plt

# The two PCA features become the x and y coordinates
xs, ys = transformed[:, 0], transformed[:, 1]

# Colour every point by its entry in `species`
plt.scatter(xs, ys, c=species)
plt.show()

#### > TruncatedSVD and csr_matrix

In [None]:
from sklearn.decomposition import TruncatedSVD

# documents is csr_matrix
model = TruncatedSVD(n_components=3).fit(documents)

# Project the same documents onto the three fitted components
transformed = model.transform(documents)

# 4.1 - Non-negative matrix factorization (NMF)

#### > Example usage of NMF

In [None]:
from sklearn.decomposition import NMF

# Two-component NMF; fit() returns the fitted model
model = NMF(n_components=2).fit(samples)

# Feature values of each sample with respect to the learned components
nmf_features = model.transform(samples)

# 4.2 - NMF learns interpretable parts

#### > Applying NMF to the articles

In [None]:
from sklearn.decomposition import NMF

# Shape of the input array
print(articles.shape)

# Ten-component NMF fitted on the articles
nmf = NMF(n_components=10).fit(articles)

# Shape of the learned components array
print(nmf.components_.shape)

#### > Visualizing samples

In [None]:
import matplotlib.pyplot as plt

# Flat view of the sample
print(sample)

# The same values arranged as 2 rows by 3 columns
bitmap = sample.reshape((2, 3))
print(bitmap)

# Render the grid as a grayscale image without smoothing between cells
plt.imshow(bitmap, cmap='gray', interpolation='nearest')
plt.show()

# 4.3 - Building recommender systems using NMF 

#### > Apply NMF to the word-frequency array

In [None]:
from sklearn.decomposition import NMF

# Six-component NMF model
nmf = NMF(n_components=6)

# Fit on the articles and keep the resulting per-article feature values
nmf_features = nmf.fit_transform(X=articles)

#### > Calculating the cosine similarities

In [None]:
from sklearn.preprocessing import normalize

# Rescale every row of the NMF features
norm_features = normalize(nmf_features)

# Row of the article of interest (assumes it sits at index 23)
current_article = norm_features[23]

# Dot product of every normalized row with the selected row
similarities = norm_features @ current_article
print(similarities)

#### > DataFrames and labels

In [None]:
import pandas as pd

# Normalized NMF features, indexed by article title
norm_features = normalize(nmf_features)
df = pd.DataFrame(data=norm_features, index=titles)

# Look the article up by title instead of by row number
current_article = df.loc['Dog bites man']

# Dot product of every row with the selected article's row
similarities = df @ current_article

# 4.4 - Building recommender systems using NMF

#### > Apply NMF to the word-frequency array

In [None]:
from sklearn.decomposition import NMF

# NMF with six components
nmf = NMF(n_components=6)

# Fit and transform in one step, keeping the per-article feature values
nmf_features = nmf.fit_transform(X=articles)

#### > Calculating the cosine similarities

In [None]:
from sklearn.preprocessing import normalize

# Row-normalize the NMF features
norm_features = normalize(nmf_features)

# Select the article of interest (assumes it sits at index 23)
current_article = norm_features[23]

# Dot product of each normalized row with the selected row
similarities = norm_features @ current_article
print(similarities)

#### > DataFrames and labels

In [None]:
import pandas as pd

# Title-indexed table of the normalized NMF features
norm_features = normalize(nmf_features)
df = pd.DataFrame(data=norm_features, index=titles)

# Select the reference article by its title
current_article = df.loc['Dog bites man']

# Dot product of every article's row with the reference row
similarities = df @ current_article