In [1]:
import numpy as np
import pandas as pd
import random
import pickle

from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from scipy.sparse.linalg import svds
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt

np.random.seed(1299827)

In [2]:
class PCA:
    
    def get_best_k(self, X, error_percent):
        m = len(X)
        for col in X:
            mean = X[col].mean()
            std = X[col].std()
            if std == 0:
                zeros = [0]*m
                X[col] = zeros
            else:
                X[col] = (X[col] - mean)/std
#             X[col] = (X[col]-mean)/std
        max_cols = X.shape[1]
        for K in range(1,max_cols):
            U,S,V = svds(X,k=K)
            X_approx = np.dot(U,np.dot(np.diag(S),V))
            new_s = np.sum(np.sum(np.square(np.subtract(X,X_approx))))
            old_s = np.sum(np.sum(np.square(X)))
            diff = float(new_s)/float(old_s)
#             print diff
            if diff < error_percent:
                return K
    
    def reduce_dimensions(self, X, K):
        U, S, V = svds(X, k=K)
        return U

In [3]:
digits = load_digits()
X_data = digits.data
Y_data = np.reshape(digits.target,(digits.target.shape[0],1))
data_np = np.append(X_data,Y_data,axis=1)
data = pd.DataFrame(data_np)
cols = list()
for i in range(1,65):
    cols.append('A'+str(i))
cols.append('digit')
data.columns = cols
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A56,A57,A58,A59,A60,A61,A62,A63,A64,digit
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4.0


In [4]:
pca = PCA()
cols = list()
for i in range(1,65):
    cols.append('A'+str(i))
X = data[cols]
k = pca.get_best_k(X,0.1)
X_reduced = pca.reduce_dimensions(X,k)

cols = list()
for i in range(0,k):
    cols.append('A'+str(i+1))

data_reduced = pd.DataFrame(X_reduced)
data_reduced.columns = cols
data_reduced['digit'] = data['digit'].tolist()
data_reduced.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,A23,A24,A25,A26,A27,A28,A29,A30,A31,digit
0,-0.018959,0.011347,0.013693,-0.005185,-0.010642,0.022754,0.014206,0.022288,0.014233,0.014963,...,-0.002179,-0.024544,-0.02153,0.007803,0.00366,0.024037,-0.041014,-0.009324,-0.016667,0.0
1,-0.03894,0.009686,-0.028943,0.024518,0.0496,0.002107,-0.024176,-0.014425,-0.013384,-0.006493,...,-0.018837,-0.009127,0.028578,-0.009941,0.01361,-0.021088,0.040793,0.009032,-0.005128,1.0
2,0.026586,-0.041293,-0.012247,-0.006773,-0.012141,-0.003779,0.012465,0.011437,-0.035698,0.001855,...,-0.013704,-0.018968,-0.019718,0.013759,0.028513,-0.024211,0.031424,-0.003098,-0.011337,2.0
3,-0.009524,0.043507,0.009873,0.013316,-0.005014,-0.014507,0.002579,0.015654,0.016038,-0.004499,...,-0.004519,0.022683,0.0146,0.010698,0.007629,-0.025913,-0.008333,-0.008486,0.026301,3.0
4,0.023419,-0.010406,0.036398,-0.024656,-0.002096,-0.066583,-0.017249,0.005423,0.026878,-0.004973,...,0.02896,0.015885,0.016333,0.021063,0.023498,-0.016819,0.010114,-0.010681,-0.039433,4.0


In [5]:
params = {'bandwidth': np.logspace(-1, 1, 20)}
grid = GridSearchCV(KernelDensity(), params, cv=5)
grid.fit(data_reduced[cols])

print("best bandwidth: {0}".format(grid.best_estimator_.bandwidth))
kde = grid.best_estimator_

# sample 44 new points from the data
new_data = kde.sample(44, random_state=0)
new_data = pca.inverse_transform(new_data)

# turn data into a 4x11 grid
new_data = new_data.reshape((4, 11, -1))
real_data = digits.data[:44].reshape((4, 11, -1))

# plot real digits and resampled digits
fig, ax = plt.subplots(9, 11, subplot_kw=dict(xticks=[], yticks=[]))
for j in range(11):
    ax[4, j].set_visible(False)
    for i in range(4):
        im = ax[i, j].imshow(real_data[i, j].reshape((8, 8)),
                             cmap=plt.cm.binary, interpolation='nearest')
        im.set_clim(0, 16)
        im = ax[i + 5, j].imshow(new_data[i, j].reshape((8, 8)),
                                 cmap=plt.cm.binary, interpolation='nearest')
        im.set_clim(0, 16)

ax[0, 5].set_title('Selection from the input data')
ax[5, 5].set_title('"New" digits drawn from the kernel density model')

plt.show()


best bandwidth: 0.1


AttributeError: PCA instance has no attribute 'inverse_transform'