# 1. Import libraries

In [1]:
#----------------------------Reproducible----------------------------------------------------------------------------------------
import numpy as np
import random as rn
import os

# Single seed reused below for NumPy, the stdlib `random` module, the
# train/test split and the classifier.
seed=0
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed the global NumPy generator and the stdlib generator (imported as `rn`).
np.random.seed(seed)
rn.seed(seed)

#----------------------------Reproducible----------------------------------------------------------------------------------------

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

#--------------------------------------------------------------------------------------------------------------------------------
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
matplotlib.style.use('ggplot')

import random
import scipy.sparse as sparse
import scipy.io

from keras.utils import to_categorical
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from skfeature.function.similarity_based import lap_score
from skfeature.utility import construct_W
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.linalg import qr
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import time
import pandas as pd

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def column_subset_selector(A, k):
    """Pick ``k`` column indices of ``A`` via pivoted QR on its leading right singular vectors.

    The columns of ``A`` are rescaled by ``sqrt(sum(A**2, axis=0) / (n_rows - 1))``,
    the rescaled matrix is decomposed with an SVD, and a column-pivoted QR of the
    top-``k`` right singular vectors orders the columns; the first ``k`` pivots
    are returned.

    Parameters
    ----------
    A : ndarray of shape (n_rows, n_columns)
        Input matrix. A column that is entirely zero is divided by zero during
        the rescaling step; this function does not guard against that.
    k : int
        Number of columns to select. When ``k`` exceeds the numerical rank of
        ``A`` (number of singular values >= 1e-6) it is lowered to that rank
        and a notice is printed.

    Returns
    -------
    ndarray of int
        Indices of the selected columns of ``A``.
    """
    eps = 1e-6
    A_scaled = A / np.sqrt(np.sum(np.square(A), axis=0) / (A.shape[0] - 1))

    # numpy returns V^H: ROW i of `vh` is the i-th right singular vector.
    _, _, vh = np.linalg.svd(A_scaled)

    # Only the singular values of the unscaled matrix are needed for the rank
    # check (the second positional argument of svd is `full_matrices`, not a
    # truncation level, so it must not receive `k`).
    singular_values = np.linalg.svd(A, compute_uv=False)
    negligible = np.where(singular_values < eps)[0]
    if len(negligible) > 0 and k > negligible[0]:
        # Index of the first negligible singular value == numerical rank.
        k = negligible[0]
        print("k was reduced to match the rank of A")

    # vh[:k, :] is V_k^T (k x n_columns): its columns correspond to the columns
    # of A, so the pivot order returned by QR indexes features of A.
    _, _, pivots = qr(vh[:k, :], pivoting=True)
    return pivots[:k]

def pfa_selector(A, k, debug = False):
    """Select ``k`` column indices of ``A`` with Principal Feature Analysis (PFA).

    The columns of ``A`` are standardised, a PCA is fitted, and the per-feature
    loading vectors (rows of the transposed component matrix) are clustered
    into ``k`` groups with k-means. For each cluster the feature whose loading
    vector is closest to the cluster centre is kept.

    Parameters
    ----------
    A : array-like of shape (n_samples, n_features)
    k : int
        Number of features (k-means clusters) to select.
    debug : bool, optional
        When true, print the number of PCA components that were used.

    Returns
    -------
    list
        One column index per cluster, in cluster-index order.

    Notes
    -----
    No ``random_state`` is passed to ``KMeans``, so repeatability depends on
    whatever seeding the caller has done beforehand.
    """
    class PFA(object):
        """Holds the PFA configuration and, after ``fit``, the chosen indices."""

        def __init__(self, n_features, q=0.5):
            # `q` is forwarded unchanged to PCA as `n_components`.
            self.q = q
            self.n_features = n_features

        def fit(self, X):
            """Compute ``indices_`` and ``n_components_`` from ``X``; returns self."""
            if not self.q:
                # A falsy q (None / 0) falls back to one component per column.
                self.q = X.shape[1]

            sc = StandardScaler()
            X = sc.fit_transform(X)

            pca = PCA(n_components=self.q).fit(X)
            self.n_components_ = pca.n_components_
            # Row i of A_q is the loading vector of feature i.
            A_q = pca.components_.T

            kmeans = KMeans(n_clusters=self.n_features).fit(A_q)
            clusters = kmeans.predict(A_q)
            cluster_centers = kmeans.cluster_centers_

            self.indices_ = []
            for cluster_idx in range(self.n_features):
                indices_in_cluster = np.where(clusters==cluster_idx)[0]
                points_in_cluster = A_q[indices_in_cluster, :]
                centroid = cluster_centers[cluster_idx]
                distances = np.linalg.norm(points_in_cluster - centroid, axis=1)
                # Keep the feature nearest to this cluster's centre.
                optimal_index = indices_in_cluster[np.argmin(distances)]
                self.indices_.append(optimal_index)
            return self

    pfa = PFA(n_features = k)
    pfa.fit(A)
    if debug:
        # Message previously read "PFW", a typo for the method name.
        print('Performed PFA with q=', pfa.n_components_)
    column_indices = pfa.indices_
    return column_indices

def pfa_transform(A, B, k, debug = False):
    """Reduce ``A[0]`` and ``B[0]`` to the ``k`` columns chosen by ``pfa_selector``.

    Only element 0 of ``A`` and of ``B`` is read. The columns are selected on
    ``A[0]`` and the same column indices are then applied to ``B[0]``.

    Returns a 2-tuple ``(A[0][:, indices], B[0][:, indices])``.
    """
    reference = A[0]
    selected = pfa_selector(reference, k, debug)
    reduced_a = reference[:, selected]
    reduced_b = B[0][:, selected]
    return reduced_a, reduced_b

In [3]:
#--------------------------------------------------------------------------------------------------------------------------------
def ETree(p_train_feature,p_train_label,p_test_feature,p_test_label,p_seed):
    """Fit a 50-tree ExtraTreesClassifier and print its train and test accuracy.

    Each accuracy is printed twice: first from the classifier's own ``score``
    and then from ``accuracy_score`` applied to its predictions. Nothing is
    returned.
    """
    forest = ExtraTreesClassifier(n_estimators=50, random_state=p_seed)
    forest.fit(p_train_feature, p_train_label)

    # Training split first, then the testing split — same report for both.
    for caption, features, labels in (
        ('Training accuracy：', p_train_feature, p_train_label),
        ('Testing accuracy：', p_test_feature, p_test_label),
    ):
        print(caption, forest.score(features, np.array(labels)))
        print(caption, accuracy_score(np.array(labels), forest.predict(features)))

In [4]:
#--------------------------------------------------------------------------------------------------------------------------------
def write_to_csv(p_data,p_path):
    """Append ``p_data`` as header-less, index-less rows to the CSV at ``p_path``.

    Parameters
    ----------
    p_data : anything accepted by ``pandas.DataFrame``
        Data to append.
    p_path : str or path-like
        Destination file. It is opened in append mode, so repeated calls add
        rows. Missing parent directories are created first, so a fresh
        checkout without e.g. a ``./log`` folder does not fail.
    """
    # Only filesystem paths have a parent directory to create; anything else
    # (e.g. an open buffer) is handed to pandas untouched.
    if isinstance(p_path, str) or hasattr(p_path, '__fspath__'):
        parent_dir = os.path.dirname(p_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    dataframe = pd.DataFrame(p_data)
    dataframe.to_csv(p_path, mode='a',header=False,index=False,sep=',')

# 2. Loading data

In [5]:
# Load the 'X' (features) and 'Y' (labels) entries from the MATLAB file.
data_path = "./Dataset/Prostate_GE.mat"
Data = scipy.io.loadmat(data_path)

data_arr = Data['X']
# First column of 'Y' minus one (presumably 1-based class ids -> 0-based; verify).
label_arr = Data['Y'][:, 0] - 1

# NOTE(review): the scaler is fitted on every sample before the split below,
# so the held-out rows influence the scaling — confirm this is intended.
# `Data` is rebound here from the loaded dict to the scaled array.
Data = MinMaxScaler(feature_range=(0, 1)).fit_transform(data_arr)

# 80/20 split, seeded for repeatability.
C_train_x, C_test_x, C_train_y, C_test_y = train_test_split(
    Data, label_arr, test_size=0.2, random_state=seed
)

print('Shape of C_train_x:', C_train_x.shape)
print('Shape of C_train_y:', C_train_y.shape)
print('Shape of C_test_x:', C_test_x.shape)
print('Shape of C_test_y:', C_test_y.shape)

Shape of C_train_x: (81, 5966)
Shape of C_train_y: (81,)
Shape of C_test_x: (21, 5966)
Shape of C_test_y: (21,)


In [6]:
# Number of features to keep; passed as `k` to pfa_transform and used in the
# timing-log file name.
# NOTE(review): the name is misspelled ("feture") but is referenced by later cells.
key_feture_number=64

# 3. Model

In [7]:
# pfa_transform only reads element 0 of each tuple.
train=(C_train_x,C_train_x)
test=(C_test_x,C_test_x)

# time.clock() was deprecated in Python 3.3 and removed in 3.8;
# time.perf_counter() gives elapsed wall-clock time with the best available
# resolution. (Use time.process_time() instead if CPU time is what should be logged.)
start = time.perf_counter()

C_train_selected_x, C_test_selected_x = pfa_transform(train, test,  key_feture_number)

time_cost=time.perf_counter() - start

# Append the elapsed seconds to the per-k timing log.
write_to_csv(np.array([time_cost]),"./log/PFA_time"+str(key_feture_number)+".csv")

  after removing the cwd from sys.path.
  


# 4. Classifying

### Extra Trees

In [8]:
# Baseline: classify on the full, unselected feature matrices.
train_feature, train_label = C_train_x, C_train_y
test_feature, test_label = C_test_x, C_test_y

print('Shape of train_feature:', train_feature.shape)
print('Shape of train_label:', train_label.shape)
print('Shape of test_feature:', test_feature.shape)
print('Shape of test_label:', test_label.shape)

p_seed = seed
ETree(train_feature, train_label, test_feature, test_label, p_seed)

Shape of train_feature: (81, 5966)
Shape of train_label: (81,)
Shape of test_feature: (21, 5966)
Shape of test_label: (21,)
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.9523809523809523
Testing accuracy： 0.9523809523809523


In [9]:
# Same classifier, now restricted to the PFA-selected columns.
train_feature, train_label = C_train_selected_x, C_train_y
test_feature, test_label = C_test_selected_x, C_test_y

print('Shape of train_feature:', train_feature.shape)
print('Shape of train_label:', train_label.shape)
print('Shape of test_feature:', test_feature.shape)
print('Shape of test_label:', test_label.shape)

p_seed = seed
ETree(train_feature, train_label, test_feature, test_label, p_seed)

Shape of train_feature: (81, 64)
Shape of train_label: (81,)
Shape of test_feature: (21, 64)
Shape of test_label: (21,)
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.9047619047619048
Testing accuracy： 0.9047619047619048


# 5. Reconstruction loss

In [10]:
from sklearn.linear_model import LinearRegression

def mse_check(train, test):
    """Fit a linear regression on ``train`` and return its mean squared error on ``test``.

    Both arguments are ``(inputs, targets)`` pairs: the model is fitted with
    ``train[0]`` as inputs and ``train[1]`` as targets, then the squared
    difference between its prediction for ``test[0]`` and ``test[1]`` is
    averaged over all elements.
    """
    regressor = LinearRegression(n_jobs = -1)
    regressor.fit(train[0], train[1])
    residual = regressor.predict(test[0]) - test[1]
    squared_error = residual ** 2
    return squared_error.mean()

In [11]:
# Pair the selected columns (regression inputs) with the full feature matrix
# (regression targets); mse_check fits on the train pair and scores on the
# test pair.
train_feature_tuple=(C_train_selected_x,C_train_x)
test_feature_tuple=(C_test_selected_x,C_test_x)

# Mean squared error of reconstructing all features from the selected ones.
reconstruction_loss=mse_check(train_feature_tuple, test_feature_tuple)
print(reconstruction_loss)

0.18045226674357112
