In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imsave
import seaborn as sns
from ipywidgets import interactive, HBox, VBox
from decimal import *

# Raise pandas' display limits so wide/long frames are printed in full:
# up to 500 rows, up to 500 columns, 1000 characters per output line.
# set_option accepts several (option, value) pairs in a single call.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [3]:
from plotly.offline import init_notebook_mode, iplot
import plotly.offline as py
import plotly.graph_objs as go
from plotly.subplots import make_subplots
# Enable plotly's offline notebook rendering before any figure is shown.
# NOTE(review): connected=True presumably makes the notebook fetch plotly.js
# from a CDN instead of embedding it (needs network access) -- confirm.
init_notebook_mode(connected=True)

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.manifold import MDS
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.cluster import KMeans
from sklearn import datasets

In [5]:
def rData():
    '''Read both feature tables and derive one image path per row.

    Reads 'ImagensCorel.data' and 'Medical12Classes.data' (';'-separated)
    from the working directory. Each row's index label is appended to the
    matching image folder prefix, so the labels must be strings.
    NOTE(review): presumably the index holds the image file names -- confirm
    against the data files.

    Returns:
        (dfC, dfM, srcCorel, srcMedical): the Corel frame, the Medical
        frame, and the corresponding lists of image paths, in row order.
    '''
    corel_frame = pd.read_csv('ImagensCorel.data', sep=';')
    medical_frame = pd.read_csv('Medical12Classes.data', sep=';')

    corel_paths = []
    for image_name in corel_frame.index.values:
        corel_paths.append("./imgs/corel/" + image_name)

    medical_paths = []
    for image_name in medical_frame.index.values:
        medical_paths.append("./imgs/medical/" + image_name)

    return corel_frame, medical_frame, corel_paths, medical_paths

dfC, dfM, srcCorel, srcMedical = rData()

# Targets: the last column of each table.
YC = dfC.iloc[:, -1].values
YM = dfM.iloc[:, -1].values

# Inputs: every column except the last, standardized per column.
# NOTE(review): the scaler is fit on all rows here, i.e. before the
# train/test split performed later, so held-out rows influence the scaling.
XC = StandardScaler().fit_transform(dfC.iloc[:, :-1].values)
XM = StandardScaler().fit_transform(dfM.iloc[:, :-1].values)


In [6]:
'''This section analyzes how the number of PCA components 
affects the final F1 score.'''

def applypca(trainX, testX, ncomp):
    '''Project train and test data onto principal components.

    The PCA is fitted on trainX only; the same fitted projection is then
    applied to both trainX and testX.

    Args:
        trainX: training feature matrix used to fit the PCA.
        testX: test feature matrix, only transformed.
        ncomp: forwarded unchanged as PCA's n_components.

    Returns:
        (reduced trainX, reduced testX, number of components kept).
    '''
    reducer = PCA(n_components=ncomp).fit(trainX)
    return (
        reducer.transform(trainX),
        reducer.transform(testX),
        reducer.n_components_,
    )

def classifier(trainX, trainY, testX, testY):
    '''Train a linear-kernel SVC and score it on the test set.

    Args:
        trainX, trainY: features and labels used for fitting.
        testX, testY: features to predict and the labels to compare with.

    Returns:
        The classification report for the test predictions, as a dict
        (output_dict=True).
    '''
    model = svm.SVC(kernel='linear').fit(trainX, trainY)
    predictions = model.predict(testX)
    return classification_report(testY, predictions, output_dict=True)


In [7]:
# Seed the split so the train/test partition -- and with it every F1 score
# plotted below -- is identical on each run of the notebook.
SPLIT_SEED = 42
trainXC, testXC, trainYC, testYC = train_test_split(XC, YC, random_state=SPLIT_SEED)
trainXM, testXM, trainYM, testYM = train_test_split(XM, YM, random_state=SPLIT_SEED)


def _f1_per_variance_target(trainX, trainY, testX, testY, percents):
    '''Run PCA + linear SVC once per explained-variance target.

    Args:
        trainX, trainY, testX, testY: one train/test partition.
        percents: iterable of integer percentages; each is passed to PCA as
            the fraction percent/100, except 100, which is passed as None
            so that PCA keeps every component.

    Returns:
        (ncomps, f1scores): components actually kept by PCA and the
        weighted-average F1 score, one entry per percentage, in order.
    '''
    ncomps, f1scores = [], []
    for percent in percents:
        nc = None if percent == 100 else percent / 100
        redtrainX, redtestX, ncomponents = applypca(trainX, testX, nc)
        report = classifier(redtrainX, trainY, redtestX, testY)
        ncomps.append(ncomponents)
        f1scores.append(report["weighted avg"]["f1-score"])
    return ncomps, f1scores


# Sweep 80 %, 81 %, ..., 100 % for both data sets.
variance_percents = range(80, 101)
ncompsC, f1scoreC = _f1_per_variance_target(
    trainXC, trainYC, testXC, testYC, variance_percents)
ncompsM, f1scoreM = _f1_per_variance_target(
    trainXM, trainYM, testXM, testYM, variance_percents)

# Not used in this cell; kept defined in case other cells refer to them.
traceC, traceM = [], []

# One subplot per data set, sharing the axis titles.
fig = make_subplots(rows=2, cols=1, x_title ="number of components",
               y_title ="F1 score")
fig.add_trace(go.Scatter
  (x=ncompsC, y=f1scoreC, mode='lines+markers', 
   name="Corel (150 components)"), row=1, col=1)
fig.add_trace(go.Scatter
  (x=ncompsM, y=f1scoreM, mode='lines+markers', 
   name="Medical 28 components"), row=2, col=1)

fig.show()

In [8]:
'''------ Data visualization using multidimensional scaling (MDS) ------'''

def applymds(X, ncomp, random_state=None):
    '''Embed X into ncomp dimensions with multidimensional scaling.

    Args:
        X: feature matrix to embed.
        ncomp: number of output dimensions (MDS n_components).
        random_state: optional seed forwarded to MDS. The default (None)
            leaves MDS unseeded, as before, so the embedding can differ
            between runs.

    Returns:
        The embedded coordinates returned by MDS.fit_transform.
    '''
    mds = MDS(n_components=ncomp, random_state=random_state)
    return mds.fit_transform(X)

# Seeded so the 2-D layouts plotted further down are reproducible.
MDS_SEED = 42
mdsXC = applymds(XC, 2, random_state=MDS_SEED)
mdsXM = applymds(XM, 2, random_state=MDS_SEED)

In [9]:
def get_cmap(n, name='hsv'):
    '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct 
    RGB color; the keyword argument name must be a standard mpl colormap name.'''
    # matplotlib.cm.get_cmap (reached here as plt.cm.get_cmap) was deprecated
    # in matplotlib 3.7 and removed in 3.9. pyplot.get_cmap takes the same
    # (name, lut) arguments and exists in both old and new releases.
    return plt.get_cmap(name, n)

def formatColor(color):
    '''Convert a color with channels in [0, 1] to a CSS 'rgb(r,g,b)' string.

    Only the first three entries of color are read; each is scaled by 255
    and truncated to an integer.
    '''
    red, green, blue = (int(color[channel] * 255) for channel in range(3))
    return f'rgb({red},{green},{blue})'

def getTraces(X, Y):
    '''Build one marker trace per distinct label in Y.

    Args:
        X: 2-D coordinates; column 0 is plotted on x, column 1 on y.
        Y: label per row of X.

    Returns:
        (traceArr, mapPoints): the go.Scatter traces in np.unique(Y) order,
        and, aligned with them, the arrays of row positions in Y that
        belong to each trace (point i of trace t is row mapPoints[t][i]).
    '''
    classes = np.unique(Y)

    # Ask for one color more than there are classes and start at index 1.
    # NOTE(review): presumably because the first and last entries of the
    # default 'hsv' map are nearly the same color -- confirm.
    cmap = get_cmap(len(classes) + 1)

    traceArr, mapPoints = [], []
    for color_index, lab in enumerate(classes, start=1):
        in_class = Y == lab
        # np.where returns a tuple of index arrays; extend() unpacks it.
        mapPoints.extend(np.where(in_class))
        traceArr.append(go.Scatter(
            x=X[in_class, 0],
            y=X[in_class, 1],
            mode='markers',
            name=str(lab),
            marker=dict(
                size=5,
                color=formatColor(cmap(color_index)),
                opacity=0.8,
            ),
        ))
    return traceArr, mapPoints

def Fig(X, Y, labels):
    '''Create a FigureWidget with one trace per class and filter buttons.

    Button 0 is captioned "All" and shows every trace; button i (i >= 1)
    shows only trace i-1. Clicking button i also sets the title to
    labels[i], so labels[0] is the title of the all-traces view.

    Args:
        X: 2-D coordinates, forwarded to getTraces.
        Y: label per row of X, forwarded to getTraces.
        labels: sequence with (number of traces + 1) entries. It is only
            read; the caller's sequence is not modified.

    Returns:
        (f, mapp): the FigureWidget and the point-to-row lookup returned
        by getTraces.
    '''
    traces, mapp = getTraces(X, Y)
    n_traces = len(traces)

    # Row 0 shows everything; row i+1 shows only trace i.
    mask = [np.repeat(True, n_traces)]
    mask.extend(np.eye(n_traces, dtype=bool))

    # Work on a copy: assigning "All" through an alias of `labels` would
    # change the caller's sequence and also overwrite labels[0], which is
    # needed below as the title of the all-traces view.
    button_labels = list(labels)
    button_labels[0] = "All"

    butts = [
        dict(label=button_labels[i],
             method="update",
             args=[{"visible": mask[i]},
                   {"title": labels[i]}])
        for i in range(n_traces + 1)
    ]

    f = go.FigureWidget(data=traces, layout=go.Layout(
        margin=dict(l=0, r=0, b=0, t=0)))

    f.update_layout(
        updatemenus=[
            dict(
                type="buttons",
                buttons=butts
            )
        ])

    return f, mapp
       

In [10]:
# Labels handed to Fig: entry 0 names the whole data set, the remaining
# entries name the classes. Fig pairs entry i+1 with the i-th trace, and
# traces follow np.unique order of the class column.
# NOTE(review): assumes the sorted class ids in YC correspond to this
# order of names -- confirm against the data file.
lC = ["Corel data", "African", "Beach", "Building", "Bus", "Dino", 
      "Elephant", "Flower", "Horse", "Mountain", "Food"]
# The Medical classes are labelled with their raw values from YM.
lM = np.append("Medical data", np.unique(YM))

fc, mapC = Fig(mdsXC, YC, lC)
fm, mapM = Fig(mdsXM, YM, lM)

In [11]:
# Largest y / x coordinate of the most recently configured figure.
maxy = np.array([])
maxx = np.array([])
# (max x, max y) per configured figure kind, keyed by config()'s `corel`
# flag, so the Corel and Medical click handlers each compare against the
# bounds of their own figure.
_click_bounds = {}


def config(f, corel=True):
    '''Prepare figure f for click-to-preview.

    Adds one layout image sized to a quarter of the data range on each
    axis, records the figure's maximum x and y for the click handlers,
    and registers the click handler on every trace.

    Args:
        f: figure whose traces expose .x and .y and accept on_click.
        corel: True registers showImgC, False registers showImgM.
    '''
    # Without this declaration the assignments below would create locals
    # and the module-level maxx/maxy would stay empty arrays.
    global maxx, maxy

    allX, allY = [], []
    for d in f.data:
        allX.extend(d.x)
        allY.extend(d.y)

    maxx = np.amax(allX)
    maxy = np.amax(allY)
    _click_bounds[bool(corel)] = (maxx, maxy)

    f.add_layout_image(dict(
        xref="x",
        yref="y",
        sizex=0.25 * (maxx - np.amin(allX)),
        sizey=0.25 * (maxy - np.amin(allY)),
    ))

    handler = showImgC if corel else showImgM
    for d in f.data:
        d.on_click(handler)


def _show_clicked_image(fig, sources, point_map, bounds, points, edge_margin):
    '''Move fig's layout image(s) to the clicked point and load its picture.'''
    # Nothing to do when the click event carries no point for this trace.
    if not points.point_inds:
        return

    max_x, max_y = bounds
    x, y = points.xs[0], points.ys[0]
    # Row of the clicked point in the original data set.
    row = point_map[points.trace_index][points.point_inds[0]]

    with fig.batch_update():
        fig.update_layout_images(dict(
            source=sources[row],
            x=x,
            y=y,
            # Within edge_margin of the top/right extreme the image is
            # anchored on the opposite side of the point.
            yanchor='top' if y > max_y - edge_margin else 'bottom',
            xanchor='right' if x > max_x - edge_margin else 'left',
        ))


def showImgC(trace, points, selector):
    '''Click handler for the Corel figure (fc); edge margin of 15 units.'''
    _show_clicked_image(fc, srcCorel, mapC,
                        _click_bounds.get(True, (maxx, maxy)), points, 15)


def showImgM(trace, points, selector):
    '''Click handler for the Medical figure (fm); edge margin of 30 units.'''
    _show_clicked_image(fm, srcMedical, mapM,
                        _click_bounds.get(False, (maxx, maxy)), points, 30)
            


In [12]:

'''
- The image size is computed from the x-axis and y-axis ranges of the 
plot with all the data, so when only one class is shown the images 
can look large.
- The title is not shown.'''
# Add the preview image and the Corel click handlers to the figure, then
# display the widget as the cell output.
config(fc)
fc



FigureWidget({
    'data': [{'marker': {'color': 'rgb(255,150,0)', 'opacity': 0.8, 'size': 5},
              '…

In [13]:
# Same setup for the Medical figure; corel=False selects the Medical
# click handler. The widget is displayed as the cell output.
config(fm, False)
fm

FigureWidget({
    'data': [{'marker': {'color': 'rgb(255,125,0)', 'opacity': 0.8, 'size': 5},
              '…