In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imsave
from sklearn import preprocessing
import seaborn as sns
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.manifold import TSNE
from ipywidgets import interactive, HBox, VBox
from sklearn.cluster import KMeans

# Raise pandas' display limits so wide/long frames are rendered without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

import warnings
# Install a catch-all "ignore" filter: every warning category is silenced from here on.
warnings.simplefilter('ignore')

In [5]:
from plotly.offline import init_notebook_mode, iplot
import plotly.offline as py
import plotly.graph_objs as go
# Initialise plotly's notebook rendering for this session.
# NOTE(review): connected=True is understood to load plotly.js from the CDN
# instead of embedding it in the notebook - confirm against the plotly docs.
init_notebook_mode(connected=True)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import datasets

In [262]:
# Shared module-level state, filled in by the dataset loaders below:
#   sources - one image path per data row (looked up by the click handler),
#   imgs    - the dataset's raw images, when it has any,
#   NCOMP   - n_components value(s) for PCA; pca() reads NCOMP[0].
sources, imgs, NCOMP = [], [], []

def saveImgs(images, imgp):
    """Write each image to ``<imgp><position>.png`` with the reversed gray colormap.

    ``imgp`` is used as a plain string prefix (directory and/or file stem);
    the zero-based position of the image in ``images`` is appended to it.
    """
    position = 0
    for picture in images:
        target = imgp + "%d.png" % position
        imsave(target, picture, cmap=plt.cm.gray_r)
        position += 1
        
def Iris():
    """Load the iris dataset as a DataFrame and point the shared state at it.

    Side effects on the module-level lists (all updated in place, so existing
    references to them stay valid):

    * ``NCOMP``   becomes ``[2]`` (two PCA components),
    * ``sources`` becomes one path per row, ``./imgs/iris/<label>.jpg``,
      built from the last column of the frame,
    * ``imgs``    is emptied, because this loader provides no raw images.

    The lists are replaced rather than extended so that calling a loader
    again (re-running the cell, or switching from another dataset) does not
    leave stale entries behind: ``pca`` only reads ``NCOMP[0]`` and treats a
    non-empty ``imgs`` as "reconstruct and save images".

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``load_iris(as_frame=True)``.
    """
    df = datasets.load_iris(as_frame=True).frame
    NCOMP[:] = [2]
    imgs.clear()
    sources[:] = ["./imgs/iris/" + str(label) + ".jpg"
                  for label in df.iloc[:, -1].values]
    return df
    
    
def Digits():
    """Load the digits dataset as a DataFrame and point the shared state at it.

    Side effects on the module-level lists (all updated in place, so existing
    references to them stay valid):

    * ``NCOMP``   becomes ``[0.95]`` (handed to PCA as ``n_components``),
    * ``imgs``    becomes the dataset's images, one entry per row,
    * ``sources`` becomes one path per row, ``./imgs/digits/<row>.png``.

    The lists are replaced rather than extended so that calling a loader
    again (re-running the cell, or switching from another dataset) does not
    leave stale entries behind. The dataset is loaded once; the frame and
    the images are taken from the same bunch.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``load_digits(as_frame=True)``.
    """
    bunch = datasets.load_digits(as_frame=True)
    df = bunch.frame
    NCOMP[:] = [0.95]
    imgs[:] = bunch.images
    sources[:] = ["./imgs/digits/" + str(i) + ".png" for i in range(len(df))]
    return df

def saveDImgs(originals, apprs, ncomp):
    """Save one "original vs. reconstruction" comparison figure per image.

    For every image a two-panel figure is written to
    ``./imgs/digits/<position>.png``: the original on the left and the
    matching entry of ``apprs`` (reshaped to the original's shape) on the right.

    Parameters
    ----------
    originals : iterable of arrays
        Images to show in the left panel; each must expose ``.shape``.
    apprs : indexable of arrays
        Reconstructions, indexed by the position of the original; each must
        support ``.reshape``.
    ncomp : object
        Number of components used for the reconstruction; only formatted
        into the right panel's axis label.
    """
    for i, img in enumerate(originals):
        fig, (ax_original, ax_approx) = plt.subplots(1, 2, figsize=(8, 4))
        try:
            # Left panel: the original image, labelled with its pixel count
            # (the label used to be a hard-coded 784).
            ax_original.imshow(img, cmap=plt.cm.gray_r, interpolation='nearest')
            ax_original.set_xlabel(
                '{} components'.format(int(np.prod(img.shape))), fontsize=14)
            ax_original.set_title('Original Image', fontsize=20)
            ax_original.axis('off')

            # Right panel: the reconstruction. The label is now really
            # formatted with ncomp (it used to print the literal "{ncomp}").
            ax_approx.imshow(apprs[i].reshape(img.shape), cmap=plt.cm.gray_r,
                             interpolation='nearest')
            ax_approx.set_xlabel('{} components'.format(ncomp), fontsize=14)
            ax_approx.set_title('95% of Explained Variance', fontsize=20)
            ax_approx.axis('off')

            fig.savefig("./imgs/digits/" + str(i) + ".png")
        finally:
            # Release the figure; otherwise one open figure per image piles up.
            plt.close(fig)
        
    

In [263]:
# Choose the dataset to explore: Iris() here; swap in Digits() for the digit images.
df = Iris()

# Every column but the last is a (raw, unscaled) feature and the last column is
# the class label. To standardise the features instead, wrap them in
# StandardScaler().fit_transform(...).
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [264]:
def get_cmap(n, name='hsv'):
    '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct
    RGB color; the keyword argument name must be a standard mpl colormap name.

    Uses ``plt.get_cmap`` because ``plt.cm.get_cmap`` was deprecated in
    Matplotlib 3.7 and removed in 3.9; the pyplot function takes the same
    ``(name, lut)`` arguments and is available in old and new releases.'''
    return plt.get_cmap(name, n)

In [265]:
def formatColor(color):
    """Turn a colour triple into a ``'rgb(r,g,b)'`` string.

    Only the first three entries of ``color`` are read, so an RGBA tuple is
    accepted as well. Each entry is multiplied by 255 and truncated with
    ``int`` (no rounding).
    """
    red, green, blue = (int(color[channel] * 255) for channel in range(3))
    return 'rgb({},{},{})'.format(red, green, blue)

    
def pca(df, currentLabel,attributes,n_clusters=[7]):
    """Run PCA on the module-level ``X`` and return a scatter ``FigureWidget``.

    The PCA is built with ``n_components=NCOMP[0]`` and fitted on the global
    feature matrix ``X``. The first two projected columns are plotted with one
    marker trace per distinct value of the global label vector ``y``; traces
    are created in ``np.unique(y)`` order and named after the label.

    If the module-level ``imgs`` list is non-empty, the PCA reconstructions of
    ``X`` are reshaped to the shape of ``imgs`` and written to
    ``./imgs/digits/`` through ``saveImgs``.

    Parameters
    ----------
    df, currentLabel, attributes, n_clusters
        Kept so existing calls keep working; none of them influences the
        returned figure. (They previously fed only values that were computed
        and then discarded, which have been removed.) The list default of
        ``n_clusters`` is never mutated.

    Returns
    -------
    plotly.graph_objs.FigureWidget
        The scatter plot of the first two principal components.
    """
    sklearn_pca = sklearnPCA(n_components=NCOMP[0])
    X_embedded = sklearn_pca.fit_transform(X)

    if len(imgs):
        # Reconstructions are only needed when there are images to write out.
        approximations = sklearn_pca.inverse_transform(X_embedded)
        approximations = np.array(approximations).reshape(np.array(imgs).shape)
        saveImgs(approximations, "./imgs/digits/")

    labelArray = np.unique(y)
    numColors = len(labelArray)

    # Ask for one colour more than needed and skip index 0.
    cmap = get_cmap(numColors+1)
    colorList = [cmap(i) for i in range(1, numColors+1)]

    traceArr = []
    for lab, col in zip(labelArray, colorList):
        members = y == lab
        traceArr.append(go.Scatter(
            x=X_embedded[members, 0],
            y=X_embedded[members, 1],
            mode='markers',
            name=str(lab),
            marker=dict(
                size=5,
                color=formatColor(col),
                opacity=0.8
            ),
        ))

    return go.FigureWidget(data=traceArr, layout=go.Layout(
        margin=dict(l=0, r=0, b=0, t=0)))
       

In [266]:
f = pca(df,4,df.columns.values.tolist())

In [267]:
# Collect the plotted coordinates of every trace to learn the extent of the data.
allX, allY = [], []
for d in f.data:
    allX += list(d.x)
    allY += list(d.y)

maxx, maxy = np.amax(allX), np.amax(allY)

# Register a single layout image (no source yet) sized to a quarter of the
# data range on each axis; the click handler later fills in source/position.
f.add_layout_image(dict(
    xref="x",
    yref="y",
    sizex=0.25 * (maxx - np.amin(allX)),
    sizey=0.25 * (maxy - np.amin(allY)),
))


from PIL import Image; 
def showImg(trace, points, selector):
    """Click handler: show the image of the clicked point next to it.

    Reads the module-level ``f`` (figure widget), ``sources`` (one image path
    per data row), ``y`` (label per data row) and ``maxx`` / ``maxy``.

    Parameters
    ----------
    trace : unused
    points : object with ``point_inds``, ``xs``, ``ys`` and ``trace_index``
        Description of the clicked point(s); only the first point is used.
    selector : unused

    Notes
    -----
    Trace ``k`` is assumed to hold the rows whose label is ``np.unique(y)[k]``
    in their original order, which is how ``pca`` builds the traces. The
    clicked point is mapped back to its data row through that label mask; the
    previous ``(trace_index + 1) * point_ind`` formula only gave the right row
    for the first trace.
    """
    # Nothing to do for an empty selection (presumably what the handler gets
    # for traces that were not hit - confirm against the plotly callbacks docs).
    if not points.point_inds:
        return

    label = np.unique(y)[points.trace_index]
    rows_of_trace = np.flatnonzero(y == label)
    row = int(rows_of_trace[points.point_inds[0]])

    clicked_x = points.xs[0]
    clicked_y = points.ys[0]

    # Flip the anchor near the top/right edge so the image grows inwards.
    # NOTE(review): the margin of 15 is in data units and is not scaled to the
    # plotted range - confirm it suits the dataset in use.
    nyanchor = 'top' if clicked_y > maxy - 15 else 'bottom'
    nxanchor = 'right' if clicked_x > maxx - 15 else 'left'

    with f.batch_update():
        f.update_layout_images(dict(
            source=sources[row],
            x=clicked_x,
            y=clicked_y,
            yanchor=nyanchor,
            xanchor=nxanchor,
        ))

In [268]:
# Register showImg as the click callback on every trace of the figure.
for d in f.data:
    d.on_click(showImg)
    
# Bare last expression so the notebook displays the widget.
f

FigureWidget({
    'data': [{'marker': {'color': 'rgb(7,255,0)', 'opacity': 0.8, 'size': 5},
              'mo…