In [7]:
from sklearn.decomposition import KernelPCA
from sklearn.cross_validation import KFold
from sklearn import svm
import numpy as np
import random
import math
from sklearn.metrics import confusion_matrix as CM
import random
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import mutual_info_score as MI

##linear correlation with tree

In [8]:
from sklearn.datasets import load_iris
from sklearn.datasets import load_digits

In [9]:
def IndOfSortedFea(feaInfo):
    """return sorted index of a list"""
    sortInfo=sorted(feaInfo,reverse=True)
    ##for test
    ##print sortInfo
    return [feaInfo.index(i) for i in sortInfo]
#a=[3,4,6,2]
#print IndOfSortedFea(a)

In [10]:
def corDis(x,y):
    """
    calculate the correlation distance, given x and y calculate how similar the x and y is.
    return a value, 0 means x and y are independent, higher mean x are determinstic of y.
    """
    assert len(x)==len(y)
    n=len(x)
    a=np.zeros((n,n))
    b=np.zeros((n,n))
    ##calculate pair wise distance
    for i in xrange(n):
        for j in xrange(n):
            a[i,j]=np.linalg.norm(x[i]-x[j])
            b[i,j]=np.linalg.norm(y[i]-y[j])
    aRowMean=np.mean(a,axis=1)
    aColMean=np.mean(a,axis=0)
    bRowMean=np.mean(b,axis=1)
    bColMean=np.mean(b,axis=0)
    aMean=np.mean(a)
    bMean=np.mean(b)
    A=np.zeros((n,n))
    B=np.zeros((n,n))
    ##normalization
    for i in xrange(n):
        for j in xrange(n):
            A[i,j]=a[i,j]-aRowMean[i]-aColMean[j]+aMean
            B[i,j]=b[i,j]-bRowMean[i]-bColMean[j]+bMean
    cordis=0.
    for i in xrange(n):
        for j in xrange(n):
            cordis += A[i,j]*B[i,j]
    return cordis*(1./n**2)

In [11]:
def cosine_distance(u, v):
    return np.dot(u, v) / (math.sqrt(np.dot(u, u)) * math.sqrt(np.dot(v, v)))

In [12]:
%matplotlib inline
from matplotlib import pyplot as plt
def plotFunction(x,y,text):
    n=x.shape[1]
    cols=2
    rows=int(math.ceil(n/2.))
    fig,ax=plt.subplots(ncols=cols,nrows=rows)
    if rows==1:
        for j in range(cols):
            if j<n:
                ax[j].scatter(x[:,j],y,c='b')
    else:
        for i in range(rows):
            for j in range(cols):
                if i*cols+j<n:
                    ax[i,j].scatter(x[:,i*cols+j],y,c='b')
                    #ax[i].title('x%d'%i,fontsize='small')
    plt.title(text)
    plt.show()


In [13]:
def crossValidation(x,y):
    """
    version 2
    according to the true y, predict
    greedy algorithm: first using index
    return
    """
    crossErrors=[]
    confusion_matrix=[]
    #svc=svm.LinearSVC()
    svc=svm.SVC(kernel='linear')
    kf = KFold(x.shape[0], n_folds=5,shuffle=True)
    for train,test in kf:
        svc.fit(x[train],y[train])
        prediction=svc.predict(x[test])
        crossErrors.append(((prediction!=y[test]).sum())/float(len(y[test])))
        confusion_matrix.append(CM(y[test],prediction))
    return np.mean(crossErrors),confusion_matrix

In [14]:
def crossValidation_forest(x,y):
    """
    version 3
    
    """
    crossErrors=[]
    confusion_matrix=[]
    #svc=svm.LinearSVC()
    rf=RFC(n_estimators=100,max_features=None)
    kf = KFold(x.shape[0], n_folds=5,shuffle=True)
    for train,test in kf:
        rf.fit(x[train],y[train])
        prediction=rf.predict(x[test])
        crossErrors.append(((prediction!=y[test]).sum())/float(len(y[test])))
        confusion_matrix.append(CM(y[test],prediction))
    return np.mean(crossErrors),confusion_matrix

In [58]:
def layer_rbf(x,y):
    """version 2"""
    kpca=KernelPCA(kernel='rbf',gamma=10.,n_components=x.shape[1])
    xT=kpca.fit_transform(x)
    rank=[]
    numOfFeatures=xT.shape[1]
    for i in xrange(numOfFeatures):
        rank.append(corDis(xT[:,i],y))
    feaSelected=IndOfSortedFea(rank)
    
    featureSelectedErrors=[]
    for i in xrange(numOfFeatures):
        crossErrors,cms=crossValidation_forest(xT[:,feaSelected[:i+1]],y)
        featureSelectedErrors.append(crossErrors)
    choice=featureSelectedErrors.index(min(featureSelectedErrors))
    return xT[:,feaSelected[:choice+1]],featureSelectedErrors,cms

In [16]:
def pretrain(x,y):
    rank=[]
    numOfFeatures=x.shape[1]
    for i in xrange(numOfFeatures):
        rank.append(corDis(x[:,i],y))
    feaSelected=IndOfSortedFea(rank)
    
    featureSelectedErrors=[]
    for i in xrange(numOfFeatures):
        crossErrors,cms=crossValidation_forest(x[:,feaSelected[:i+1]],y)
        featureSelectedErrors.append(crossErrors)
    choice=featureSelectedErrors.index(min(featureSelectedErrors))
    return x[:,feaSelected[:choice+1]],featureSelectedErrors,cms 

In [17]:
from matplotlib import pyplot as plt
def plotdata(x,y):
    n=x.shape[1]
    fig,ax=plt.subplots(ncols=n,nrows=n)
    fig.set_size_inches(9,6)
    for i in xrange(n):
        for j in xrange(i+1):
            for t,color in zip(xrange(3),'rgb'):
                ax[i][j].scatter(x[y == t,i],x[y == t,j],c=color)
    plt.show()
#plotdata(x,y)

In [18]:
def plotErr(err):
    x=range(len(err))
    #y=[e/min(err) for e in err]
    plt.bar(x,err)
    plt.title('Error Bar_Chart')
    plt.show()

In [19]:
def printCM(cm):
    """
    print cm in a nice way
    """
    for i in cm:
        print i
        print 

In [20]:
digits=load_digits()
x=digits.data
y=digits.target

In [21]:
sample=random.sample(range(len(y)), int(len(y)*0.2))

In [22]:
x=x[sample]
y=y[sample]

In [23]:
print len(sample)
print len(x),len(y)

359
359 359


In [74]:
def layer_rbf(x,y):
    """version 2"""
    kpca=KernelPCA(kernel='rbf',gamma=10.,n_components=x.shape[1])
    xT=kpca.fit_transform(x)
    
    #xT=np.concatenate((x,x_kpca),axis=1)
    
#     rank=[]
    numOfFeatures=xT.shape[1]
#     for i in xrange(numOfFeatures):
#         rank.append(cosine_distance(xT[:,i],y))
    selecting_tree=RFC(n_estimators=100,max_depth=2,n_jobs=2)
    selecting_tree.fit(xT,y)
    rank=[i for i in selecting_tree.feature_importances_]
    feaSelected=IndOfSortedFea(rank)
    
    featureSelectedErrors=[]
    for i in xrange(numOfFeatures):
        crossErrors,cms=crossValidation_forest(xT[:,feaSelected[:i+1]],y)
        featureSelectedErrors.append(crossErrors)
    choice=featureSelectedErrors.index(min(featureSelectedErrors))
    return xT[:,feaSelected[:choice+1]],featureSelectedErrors,cms

In [75]:
def pretrain(x,y):
#     rank=[]
#     numOfFeatures=x.shape[1]
#     for i in xrange(numOfFeatures):
#         rank.append(cosine_distance(x[:,i],y))
#     feaSelected=IndOfSortedFea(rank)
    numOfFeatures=x.shape[1]
    selecting_tree=RFC(n_estimators=100,max_depth=2,n_jobs=2)
    selecting_tree.fit(x,y)
    rank=[i for i in selecting_tree.feature_importances_]
    feaSelected=IndOfSortedFea(rank)
    
    featureSelectedErrors=[]
    for i in xrange(numOfFeatures):
        crossErrors,cms=crossValidation_forest(x[:,feaSelected[:i+1]],y)
        featureSelectedErrors.append(crossErrors)
    choice=featureSelectedErrors.index(min(featureSelectedErrors))
    return x[:,feaSelected[:choice+1]],featureSelectedErrors,cms

In [76]:
result,error,cm=pretrain(x,y)

In [61]:
%matplotlib

Using matplotlib backend: WXAgg


In [77]:
printCM(cm)

[[ 9  0  0  0  0  0  0  0  0  0]
 [ 0  9  0  0  0  0  1  0  0  0]
 [ 0  0  4  0  0  0  0  0  0  0]
 [ 0  0  0  4  0  0  0  0  0  0]
 [ 0  0  0  0  8  0  0  0  0  0]
 [ 0  0  0  0  0  7  0  0  0  0]
 [ 0  0  0  0  0  0  5  0  0  0]
 [ 0  0  0  0  0  0  0 11  1  0]
 [ 0  1  0  1  0  0  0  0  5  1]
 [ 0  0  0  0  0  0  0  0  0  5]]

[[8 0 0 0 0 0 0 0 0 0]
 [0 9 0 0 1 0 0 0 1 0]
 [0 1 6 0 0 0 0 1 1 0]
 [0 0 0 6 0 1 0 0 0 0]
 [0 0 0 0 4 0 0 0 0 0]
 [0 0 0 0 0 6 0 0 0 0]
 [0 0 0 0 0 0 6 0 0 0]
 [0 0 0 0 0 0 0 8 0 0]
 [0 2 0 0 0 0 0 0 3 3]
 [0 0 0 0 0 0 0 0 0 5]]

[[ 6  0  0  0  0  0  0  0  0  0]
 [ 0  6  0  1  0  0  0  1  0  0]
 [ 0  0 11  0  1  0  0  0  0  0]
 [ 0  0  0  9  0  0  0  0  1  0]
 [ 1  0  0  0  6  0  0  0  0  0]
 [ 0  0  0  1  0  2  1  0  0  0]
 [ 0  0  0  0  0  1 10  0  0  0]
 [ 0  0  0  0  0  0  0  2  0  0]
 [ 0  0  0  0  0  0  0  1  4  0]
 [ 0  0  0  0  0  0  0  0  0  7]]

[[ 7  0  0  0  0  0  0  0  0  0]
 [ 0  4  0  1  0  0  0  0  0  0]
 [ 0  0  8  1  0  0  0  0  0  0]
 [ 0 

In [80]:
plotErr(error)

In [81]:
print result.shape

(359L, 46L)


In [82]:
result1,e1,cm1=layer_rbf(result,y)
printCM(cm1)

[[1 2 0 2 0 1 1 1 0 1]
 [1 0 0 1 0 1 2 1 3 0]
 [2 0 0 1 1 1 3 2 0 1]
 [1 2 0 0 0 0 1 0 2 0]
 [0 1 1 1 1 1 0 2 1 2]
 [0 0 1 2 0 0 1 1 1 0]
 [1 0 1 0 0 0 2 0 0 0]
 [1 0 0 5 1 0 1 0 0 2]
 [0 1 0 3 0 0 2 0 0 0]
 [0 0 0 0 1 0 0 0 0 0]]

[[1 1 2 1 0 0 1 0 0 0]
 [1 0 0 1 0 0 5 1 0 0]
 [1 2 0 1 0 0 0 0 0 1]
 [2 1 0 2 0 0 2 0 0 0]
 [4 2 2 0 0 0 0 1 0 0]
 [2 2 2 1 0 0 1 0 1 0]
 [1 0 1 0 1 0 0 0 1 1]
 [4 1 3 1 0 0 0 0 0 0]
 [1 1 3 0 0 0 2 0 1 0]
 [1 1 2 0 2 0 0 0 0 0]]

[[1 2 0 3 2 0 1 3 0 0]
 [3 0 0 0 1 0 1 1 0 0]
 [0 0 0 1 0 1 0 3 0 2]
 [0 1 1 1 0 0 3 2 5 0]
 [2 0 1 0 0 0 0 1 0 0]
 [1 0 2 1 0 1 0 1 0 0]
 [1 0 0 1 0 0 1 0 1 0]
 [0 3 0 0 0 0 0 1 1 0]
 [1 3 0 2 0 0 0 1 1 0]
 [0 1 1 0 0 1 2 2 0 0]]

[[0 1 1 2 1 1 0 0 2 0]
 [1 2 1 1 0 0 1 3 1 0]
 [1 1 1 0 0 0 2 1 1 1]
 [0 0 1 0 0 0 0 2 1 0]
 [2 0 1 1 0 1 2 0 0 0]
 [0 1 1 0 0 1 0 0 0 0]
 [1 5 0 2 0 1 0 1 0 1]
 [0 1 2 0 1 0 1 1 1 0]
 [3 1 0 2 1 0 1 0 0 1]
 [0 0 1 2 0 0 0 1 1 0]]

[[0 1 3 0 1 0 0 0 1 0]
 [1 3 1 1 0 0 0 0 0 0]
 [1 1 0 0 2 0 0 0 1 1]
 [3

In [83]:
plotErr(e1)

In [84]:
print result1.shape

(359L, 2L)


In [48]:
##wrong result

In [85]:

result2,e2,cm2=layer_rbf(result1,y)
printCM(cm2)

[[2 2 2 2 0 0 1 0 2 0]
 [1 1 0 0 0 1 0 1 0 0]
 [1 1 0 1 0 0 0 4 1 0]
 [0 0 0 3 0 2 0 1 1 3]
 [0 1 1 1 2 0 0 1 0 0]
 [0 0 0 0 3 1 0 2 0 0]
 [0 2 0 0 1 1 2 0 1 0]
 [0 0 2 0 1 0 0 1 2 0]
 [2 1 0 0 0 0 0 1 0 0]
 [1 1 1 1 3 0 0 2 1 0]]

[[0 0 1 0 0 1 1 0 1 2]
 [1 2 0 1 0 0 1 1 0 0]
 [1 0 1 2 1 1 3 1 0 0]
 [0 0 0 0 2 0 1 0 2 0]
 [1 1 0 0 1 2 1 1 1 2]
 [0 1 1 0 0 0 1 1 1 0]
 [0 1 0 0 0 1 1 0 0 0]
 [2 0 1 3 1 0 1 2 1 1]
 [0 0 0 2 0 0 2 1 2 0]
 [0 1 1 1 1 1 3 0 0 0]]

[[1 2 2 0 1 0 0 1 0 0]
 [1 2 0 0 0 0 1 1 2 2]
 [1 1 0 0 0 1 1 1 0 1]
 [0 0 1 3 0 1 0 2 0 0]
 [1 0 0 5 0 2 0 0 0 1]
 [0 0 0 2 1 0 1 0 0 0]
 [2 4 1 0 0 1 2 0 2 0]
 [0 1 2 0 2 0 2 0 0 0]
 [2 1 0 1 0 1 1 0 2 1]
 [0 0 0 0 0 0 0 1 0 1]]

[[2 0 1 0 0 1 0 3 0 1]
 [2 2 1 0 1 0 1 0 1 1]
 [2 0 1 0 0 0 0 2 0 0]
 [0 2 0 2 0 1 0 1 1 1]
 [0 0 1 0 0 1 1 1 0 1]
 [0 1 1 2 2 0 2 0 0 0]
 [1 0 0 1 1 0 1 3 0 0]
 [1 1 1 2 0 1 1 1 0 0]
 [3 1 1 1 0 2 0 0 1 0]
 [2 0 0 1 0 0 0 1 0 1]]

[[1 0 0 1 1 0 3 1 1 1]
 [0 1 0 2 3 0 3 0 1 1]
 [1 1 1 0 1 0 1 1 2 0]
 [0

In [86]:
plotErr(e2)

In [87]:
print result2.shape

(359L, 2L)


In [88]:
result3,e3,cm3=layer_rbf(result2,y)
printCM(cm3)

[[2 0 1 1 1 0 1 0 1 0]
 [2 0 2 1 1 0 1 0 0 0]
 [2 0 1 1 1 1 2 0 1 0]
 [1 0 0 1 2 1 0 0 3 1]
 [0 1 0 0 3 2 2 1 1 1]
 [1 2 0 0 0 0 2 0 0 0]
 [0 0 1 0 0 1 0 0 2 0]
 [3 0 2 0 0 1 1 1 1 0]
 [1 0 0 0 0 0 1 0 1 0]
 [1 2 0 3 0 1 0 0 1 0]]

[[2 1 1 1 0 0 0 0 1 0]
 [0 0 1 0 2 0 1 0 1 2]
 [2 0 1 0 0 3 0 1 1 0]
 [2 2 1 2 1 1 0 0 0 2]
 [1 0 1 0 3 1 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [3 0 0 0 2 1 1 0 0 2]
 [0 4 2 1 0 1 0 0 1 0]
 [1 0 1 2 0 0 2 0 2 2]
 [0 2 1 0 1 1 0 0 0 0]]

[[0 0 0 0 1 0 4 1 3 2]
 [0 0 3 2 1 1 1 1 2 1]
 [0 1 0 2 2 0 0 0 1 0]
 [0 1 1 1 1 0 0 0 0 0]
 [1 2 0 0 2 0 0 1 0 0]
 [0 1 1 1 1 0 0 1 1 0]
 [0 1 0 0 1 1 1 1 1 1]
 [1 0 1 1 0 0 0 2 1 2]
 [1 0 0 2 1 2 0 1 0 0]
 [0 0 0 1 1 0 1 0 1 1]]

[[0 2 2 0 0 1 0 2 3 1]
 [0 1 0 0 1 1 0 2 1 1]
 [0 4 1 1 0 0 0 1 1 1]
 [0 0 2 0 2 0 1 1 1 1]
 [0 0 1 2 1 0 0 0 2 0]
 [1 0 1 0 3 0 0 2 0 0]
 [0 2 0 0 0 0 3 1 0 1]
 [0 0 1 0 0 0 1 2 0 0]
 [2 2 1 1 0 0 1 1 0 0]
 [2 1 0 1 0 0 0 0 1 0]]

[[2 1 0 1 1 0 0 0 0 1]
 [0 1 0 2 2 0 0 1 0 0]
 [1 0 1 0 0 1 0 1 1 0]
 [0

In [89]:
plotErr(e3)

In [56]:
result4,e4,cm4=layer_rbf(result3,y)
printCM(cm3)

[[2 0 1 2 0 1 0 2 0 1]
 [1 0 2 0 0 0 1 1 2 1]
 [0 1 1 2 0 0 2 0 1 0]
 [1 3 0 1 1 1 0 0 1 0]
 [0 0 2 1 1 1 0 2 0 0]
 [0 2 1 0 1 0 0 0 1 0]
 [2 3 0 0 0 1 1 1 0 1]
 [0 2 1 0 0 0 0 1 1 0]
 [1 1 2 0 1 0 0 0 0 0]
 [2 2 1 1 0 1 0 1 1 0]]

[[2 4 0 0 0 2 2 0 0 1]
 [0 1 0 2 0 0 1 2 0 0]
 [2 0 1 2 1 1 1 2 0 0]
 [0 0 1 0 0 0 0 0 1 0]
 [0 0 0 1 0 0 1 1 2 1]
 [2 3 0 1 2 0 0 0 1 0]
 [0 0 3 0 1 1 0 1 0 0]
 [2 0 0 1 3 0 0 2 1 0]
 [1 2 1 0 1 0 1 1 0 0]
 [0 2 1 0 0 0 1 1 1 0]]

[[1 0 0 2 2 1 0 1 2 2]
 [1 0 2 1 0 2 1 0 1 2]
 [0 2 0 0 0 0 0 1 0 1]
 [0 1 0 0 0 2 2 0 1 1]
 [0 1 0 1 0 1 2 0 1 2]
 [1 0 0 1 0 1 2 0 0 0]
 [0 2 1 0 2 0 0 0 3 2]
 [0 1 1 0 1 2 0 0 1 2]
 [0 2 0 0 0 0 0 2 1 1]
 [1 0 1 0 0 0 0 1 0 0]]

[[1 0 0 1 0 1 0 1 1 0]
 [1 0 0 1 0 0 0 1 0 1]
 [0 2 2 0 1 2 1 0 0 0]
 [1 1 3 2 0 0 2 1 1 0]
 [2 0 1 0 1 1 1 0 1 0]
 [1 1 0 0 1 1 1 0 0 1]
 [1 0 0 2 1 0 1 0 0 0]
 [2 0 2 0 2 0 2 0 2 2]
 [0 0 0 1 1 1 2 0 1 0]
 [1 0 0 0 2 0 2 1 1 1]]

[[1 0 1 0 1 0 0 0 1 1]
 [0 0 2 1 2 1 3 2 0 0]
 [1 0 1 0 2 0 1 1 0 2]
 [0

In [57]:
plotErr(e4)