In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Gather the path of every file below the Kaggle input directory, then print
# them one per line in the order os.walk yields them.
input_file_paths = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.datasets import fetch_olivetti_faces

In [None]:
# Load the Olivetti faces dataset through scikit-learn and show the type of
# the returned container.
data = fetch_olivetti_faces()
type(data)

In [None]:
# List the keys available on the loaded dataset object.
data.keys()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Draw the first image of the dataset on its own axes with the axis hidden.
first_face = data['images'][0]
fig, ax = plt.subplots()
ax.imshow(first_face)
ax.axis('off')
plt.show()

In [None]:
# Shape of the array stored under the 'data' key.
data['data'].shape

In [None]:
# Peek at the first 20 entries of the target array.
data['target'][:20]

In [None]:
# Features and labels used by every split and model below.
X, y = data['data'], data['target']

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Keep 80% of the samples for training; the remaining 20% (the "not train"
# part) is held out. The split is stratified on the labels and seeded.
X_train, X_n_train, y_train, y_n_train = train_test_split(
    X,
    y,
    random_state=42,
    stratify=y,
    test_size=0.2,
)

In [None]:
# Halve the held-out samples: the first half becomes the test set, the second
# half the validation set. Stratified on the held-out labels and seeded.
X_test, X_val, y_test, y_val = train_test_split(
    X_n_train,
    y_n_train,
    random_state=42,
    stratify=y_n_train,
    test_size=0.5,
)

In [None]:
from sklearn.cluster import KMeans

### Finding the best number of clusters

In [None]:
from sklearn.metrics import silhouette_score

In [None]:
# Sweep candidate cluster counts and record, for each one, the K-Means inertia
# and the silhouette score measured on the training set.
k_values = list(range(2, 200, 5))
silhouette_scores = []
inertia_values = []
for k in k_values:
    # Fixed seed (the same value the train/test splits use) so the curves, and
    # therefore the k selected from them, are reproducible across runs.
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_train)
    inertia_values.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_train, kmeans.labels_))

In [None]:
# Inertia as a function of the number of clusters; labelled so the figure can
# be read on its own.
fig, ax = plt.subplots()
ax.plot(k_values, inertia_values)
ax.set(xlabel="Number of clusters (k)", ylabel="Inertia", title="K-Means inertia vs. k")
plt.show()

In [None]:
# Silhouette score as a function of the number of clusters; labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(k_values, silhouette_scores)
ax.set(xlabel="Number of clusters (k)", ylabel="Silhouette score", title="Silhouette score vs. k")
plt.show()

In [None]:
# Select the k with the highest silhouette score.
best_score_index = int(np.argmax(silhouette_scores))
best_k_value = k_values[best_score_index]

In [None]:
# Final clustering model with the selected k; seeded so the resulting clusters
# are reproducible across runs.
kmeans = KMeans(n_clusters=best_k_value, random_state=42)

In [None]:
# Fit the K-Means model configured with best_k_value on the training set.
kmeans.fit(X_train)

In [None]:
def ShowImage(x, shape=(64, 64)):
    """Display a single flattened image with the axes hidden.

    Args:
        x: Array-like holding the pixel values of one image; it must contain
            exactly ``shape[0] * shape[1]`` values.
        shape: ``(height, width)`` that ``x`` is reshaped to before plotting.
            Defaults to ``(64, 64)``, the size this function used to
            hard-code, so existing calls behave as before.

    Raises:
        ValueError: If ``x`` cannot be reshaped to ``shape``.
    """
    x_image = np.asarray(x).reshape(shape)
    plt.imshow(x_image)
    plt.axis('off')
    plt.show()

In [None]:
# Cluster label assigned to each training sample by the fitted model.
kmeans.labels_

In [None]:
# Number of clusters the model was configured with.
kmeans.n_clusters

In [None]:
# Group the training images by assigned cluster: entry i holds every training
# image whose K-Means label equals i.
representative_images = [
    X_train[kmeans.labels_ == label]
    for label in range(kmeans.n_clusters)
]

### Visualizing the clusters

In [None]:
def PlotFaces(faces):
    """Plot a sequence of flattened 64x64 images side by side in a single row.

    Args:
        faces: Sequence of images (for example a 2-D array with one image per
            row); each item must be reshapeable to ``(64, 64)``.

    Returns:
        None. An empty ``faces`` is a no-op: nothing is drawn, since
        ``plt.subplots`` cannot create a grid with zero columns.
    """
    n_faces = len(faces)
    if n_faces == 0:
        return
    # squeeze=False makes subplots always return a 2-D array of axes, so a
    # single image needs no special handling.
    fig, axes = plt.subplots(1, n_faces, figsize=(12, 7), squeeze=False)
    for ax, face in zip(axes[0], faces):
        ax.imshow(face.reshape((64, 64)))
        ax.axis('off')
    plt.show()

In [None]:
# Show the images of every cluster, preceded by the cluster's index.
# enumerate replaces the hand-maintained counter.
for cluster_id, cluster_images in enumerate(representative_images):
    print("Cluster: ", cluster_id)
    PlotFaces(cluster_images)

## Using KMeans as a dimensionality reduction technique

### Training a Classifier 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Baseline: logistic regression trained directly on the training features.
# max_iter is raised above scikit-learn's default of 100 to give the solver
# room to converge on this input; if it already converges sooner, the extra
# budget is simply unused.
log_clf = LogisticRegression(max_iter=1000)
log_clf.fit(X_train, y_train)

### Testing on Validation set

In [None]:
# Score the logistic regression baseline on the validation set.
y_pred_log = log_clf.predict(X_val)
# accuracy_score takes (y_true, y_pred): ground truth goes first.
accuracy_log = accuracy_score(y_val, y_pred_log)
print("accuracy of logistic regression: ", round(accuracy_log*100,2))

In [None]:
# Baseline: random forest trained directly on the training features. Seeded
# (same value as the data splits) so the reported accuracy is reproducible.
rnd_clf = RandomForestClassifier(random_state=42)
rnd_clf.fit(X_train, y_train)

In [None]:
# Score the random forest baseline on the validation set.
y_pred_rnd = rnd_clf.predict(X_val)
# accuracy_score takes (y_true, y_pred): ground truth goes first.
accuracy_rnd = accuracy_score(y_val, y_pred_rnd)
print("accuracy of random forest:", round(accuracy_rnd*100, 2))

### Dimensionality Reduction with KMeans

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
# Two-step pipeline: KMeans is the preprocessing step and logistic regression
# classifies its output.
pipeline = Pipeline([
    # Seeded so the fitted centroids, and the accuracy that depends on them,
    # are reproducible across runs.
    ('kmeans', KMeans(n_clusters=300, random_state=42)),
    # max_iter raised above the default of 100 to give the solver room to
    # converge; unused if convergence is reached sooner.
    ('log_reg', LogisticRegression(max_iter=1000))
])

In [None]:
# Fit the whole pipeline on the training set.
pipeline.fit(X_train, y_train)

In [None]:
# Validation accuracy of the pipeline, shown as a percentage with two decimals.
pipeline_val_accuracy = pipeline.score(X_val, y_val)
print("accuracy: ", round(pipeline_val_accuracy*100, 2))

In [None]:
# KMeans cannot fit more clusters than it is given samples. The grid search
# over this grid uses cv=3, so each fit sees only about two thirds of X_train;
# cap the candidates at that size, otherwise every larger value fails to fit
# and contributes nothing to the search.
max_clusters = (len(X_train) * 2) // 3
grid_params = {'kmeans__n_clusters': np.arange(120, min(500, max_clusters + 1), 10)}

In [None]:
# 3-fold cross-validated search over the cluster-count grid, with n_jobs=-1
# for parallel fits.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=grid_params,
    cv=3,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
grid_search.best_params_

In [None]:
# Keep a handle on the pipeline the grid search selected.
pipeline_best = grid_search.best_estimator_

In [None]:
# Fit the selected pipeline on the full training set.
# NOTE(review): GridSearchCV's default refit behaviour presumably already
# fitted best_estimator_ on X_train, which would make this call redundant —
# confirm before removing.
pipeline_best.fit(X_train, y_train)

In [None]:
# Validation accuracy of the selected pipeline, as a percentage with two decimals.
best_val_accuracy = pipeline_best.score(X_val, y_val)
print("accuracy with dimensionality reduction using Kmeans: ", round(best_val_accuracy*100,2))