# 1. Import libraries

In [1]:
#----------------------------Reproducible----------------------------------------------------------------------------------------
import numpy as np
import random as rn
import os

# Base seed for the notebook; reused below when shuffling the data set.
seed=0
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed NumPy's global generator and the standard-library `random` module.
np.random.seed(seed)
rn.seed(seed)

#----------------------------Reproducible----------------------------------------------------------------------------------------

# Set before keras is imported below; presumably silences TensorFlow's C++ log output - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

#--------------------------------------------------------------------------------------------------------------------------------
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
matplotlib.style.use('ggplot')

import random
import scipy.sparse as sparse
import scipy.io

from keras.utils import to_categorical
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from skfeature.utility import construct_W
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.linalg import qr
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import time
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
from skimage import io
from PIL import Image
from sklearn.model_selection import train_test_split
import scipy.sparse as sparse
from keras.datasets import fashion_mnist

#--------------------------------------------------------------------------------------------------------------------------------
#Import our self-defined methods
import sys
sys.path.append(r"../Defined")
import Functions as F

Using TensorFlow backend.


In [2]:
def column_subset_selector(A, k):
    """Choose ``k`` column indices of ``A`` by pivoted QR on its leading right singular vectors.

    The columns are rescaled by their root-mean-square value (``n - 1``
    normalisation), the scaled matrix is decomposed with an SVD, and a
    column-pivoted QR factorisation of the leading ``k`` right singular
    vectors ranks the columns; the first ``k`` pivots are returned.

    Parameters
    ----------
    A : array-like, 2-D
        Data matrix whose columns are the candidates.
    k : int
        Number of columns requested.  It is lowered, with a printed notice,
        when it exceeds the numerical rank of ``A`` (singular values below
        ``1e-6`` count as zero).

    Returns
    -------
    numpy.ndarray
        Integer indices of the selected columns, at most ``k`` of them.
    """
    eps = 1e-6
    A = np.asarray(A)

    # Root-mean-square scale of every column.
    scale = np.sqrt(np.sum(np.square(A), axis=0) / (A.shape[0] - 1))
    # An all-zero column has scale 0; dividing by it would fill the column with
    # NaN and break the SVD, so such columns are left as zeros instead.
    scale[scale == 0] = 1.0
    A_scaled = A / scale

    # Only the singular values of the unscaled matrix are needed for the rank
    # estimate, so skip computing the singular vectors.
    singular_values = np.linalg.svd(A, compute_uv=False)
    rank = int(np.count_nonzero(singular_values >= eps))
    if k > rank:
        k = rank
        print("k was reduced to match the rank of A")
    if k <= 0:
        return np.empty(0, dtype=int)

    # np.linalg.svd returns V^H: its ROWS are the right singular vectors, so the
    # leading k vectors are vh[:k, :] (a k x n_columns matrix).
    _, _, vh = np.linalg.svd(A_scaled, full_matrices=False)
    # Column pivoting orders the columns of that matrix by importance.
    _, _, pivots = qr(vh[:k, :], pivoting=True)
    return pivots[:k]

def pfa_selector(A, k, debug = False, random_state = None):
    """Pick column indices of ``A`` with Principal Feature Analysis (PFA).

    The columns are standardised, a PCA is fitted, and the rows of the
    transposed component matrix (one row per original column) are grouped
    with k-means.  For every cluster, the column whose row lies closest to
    the cluster centre is kept.

    Parameters
    ----------
    A : 2-D array
        Data matrix; its columns are the candidates.
    k : int
        Number of k-means clusters, i.e. the number of columns requested.
    debug : bool, optional
        When true, print the number of PCA components that were kept.
    random_state : optional
        Forwarded to ``KMeans(random_state=...)``.  The default ``None``
        leaves the clustering seeded exactly as before this parameter existed.

    Returns
    -------
    list
        Selected column indices in cluster order.  A cluster that ends up
        with no members contributes no index, so fewer than ``k`` indices
        can be returned in that case.
    """
    class PFA(object):
        """Minimal PFA estimator: PCA loadings clustered with k-means."""

        def __init__(self, n_features, q=0.5, random_state=None):
            # q is forwarded to PCA(n_components=...); n_features is the cluster count.
            self.q = q
            self.n_features = n_features
            self.random_state = random_state

        def fit(self, X):
            # A falsy q means "keep as many components as X has columns".
            if not self.q:
                self.q = X.shape[1]

            X = StandardScaler().fit_transform(X)

            pca = PCA(n_components=self.q).fit(X)
            self.n_components_ = pca.n_components_
            # One row per original column: its loadings on the kept components.
            A_q = pca.components_.T

            kmeans = KMeans(n_clusters=self.n_features,
                            random_state=self.random_state).fit(A_q)
            clusters = kmeans.predict(A_q)
            cluster_centers = kmeans.cluster_centers_

            self.indices_ = []
            for cluster_idx in range(self.n_features):
                indices_in_cluster = np.where(clusters == cluster_idx)[0]
                if indices_in_cluster.size == 0:
                    # np.argmin raises on an empty array; a cluster without
                    # members simply yields no column.
                    continue
                points_in_cluster = A_q[indices_in_cluster, :]
                centroid = cluster_centers[cluster_idx]
                distances = np.linalg.norm(points_in_cluster - centroid, axis=1)
                # Keep the member closest to the centre as the representative.
                self.indices_.append(indices_in_cluster[np.argmin(distances)])
            return self

    pfa = PFA(n_features = k, random_state = random_state).fit(A)
    if debug:
        print('Performed PFA with q=', pfa.n_components_)
    return pfa.indices_

def pfa_transform(A, B, k, debug = False):
    """Restrict two data sets to the columns PFA selects on the first one.

    Only element ``0`` of ``A`` and of ``B`` is used.  The selection is
    computed by ``pfa_selector`` on ``A[0]`` and the same column indices are
    then applied to both ``A[0]`` and ``B[0]``.

    Returns a pair ``(A[0][:, indices], B[0][:, indices])``.
    """
    fit_matrix = A[0]
    selected_columns = pfa_selector(fit_matrix, k, debug)
    reduced_fit = fit_matrix[:, selected_columns]
    reduced_other = B[0][:, selected_columns]
    return reduced_fit, reduced_other

# 2. Loading data

In [3]:
# Number of shuffled samples kept for the experiment.
num_data_used = 10000

# Stack the train and test splits returned by fashion_mnist.load_data() into one
# pool, flatten every 28x28 image to a 784-vector and rescale pixels by 1/255.
(x_train_, y_train_), (x_test_, y_test_) = fashion_mnist.load_data()
x_data = np.concatenate((x_train_, x_test_), axis=0)
x_data = x_data.reshape(70000, 28 * 28).astype('float32') / 255.0
y_data = np.concatenate((y_train_, y_test_), axis=0)

# Draw one reproducible permutation of the sample indices.
np.random.seed(seed)
x_data_num, _ = x_data.shape
index = np.arange(x_data_num)
np.random.shuffle(index)

# Keep the first `num_data_used` shuffled samples and their labels.
# NOTE(review): despite its name, label_arr_onehot holds the labels exactly as
# loaded; no one-hot encoding is applied in this cell.
kept_index = index[0:num_data_used]
data_arr = x_data[kept_index]
label_arr_onehot = y_data[kept_index]

In [4]:
# Number of features PFA is asked to select in this run.
key_feture_number=25

# 3. Calculation

In [5]:
#--------------------------------------------------------------------------------------------------------------------------------
def write_to_csv(p_data,p_path):
    """Append ``p_data`` as rows to the CSV file at ``p_path``.

    The data is wrapped in a ``pandas.DataFrame`` and written in append mode
    without header or index, using ``,`` as separator.  When ``p_path`` is a
    filesystem path whose parent directory does not exist yet, the directory
    is created first so that a fresh checkout does not fail on the first write.

    Parameters
    ----------
    p_data : anything accepted by ``pandas.DataFrame``
        Values to append.
    p_path : str or path-like (other targets are passed to pandas untouched)
        Destination file.
    """
    if isinstance(p_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(p_path))
        # A bare file name has no directory component to create.
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
    pd.DataFrame(p_data).to_csv(p_path, mode='a', header=False, index=False, sep=',')

#--------------------------------------------------------------------------------------------------------------------------------       
def mse_check(train, test):
    """Fit a linear regression on ``train`` and return its mean squared error on ``test``.

    Both arguments are indexable pairs: element ``0`` holds the regression
    inputs and element ``1`` the targets.  The model is created with
    ``n_jobs=-1``.
    """
    regressor = LinearRegression(n_jobs = -1)
    regressor.fit(train[0], train[1])
    residuals = regressor.predict(test[0]) - test[1]
    squared_error = residuals ** 2
    return squared_error.mean()
 
#--------------------------------------------------------------------------------------------------------------------------------       
def cal(p_data_arr,
        p_label_arr_onehot,
        p_key_feture_number,
        p_seed):
    """Run one PFA feature-selection experiment and log its metrics.

    Steps:
      1. split the data 80/20 with ``p_seed``;
      2. time ``pfa_transform`` on the training part and append the elapsed
         seconds to ``./log<p_key_feture_number>/PFA_time.csv``;
      3. evaluate ``F.ETree`` on all features and on the selected features;
      4. measure how well the selected features linearly reconstruct all
         features (``mse_check``);
      5. append the five metrics to ``./log<p_key_feture_number>/PFA_results.csv``.

    Parameters
    ----------
    p_data_arr : 2-D array of samples.
    p_label_arr_onehot : labels aligned with ``p_data_arr``.
    p_key_feture_number : int
        Number of features to select; also names the log directory.
    p_seed : seed used for the split and for the global random generators.

    Returns
    -------
    tuple
        ``(orig_train_acc, orig_test_acc, selec_train_acc, selec_test_acc,
        reconstruction_loss)``.
    """
    # random.seed() rejects NumPy integer scalars on Python 3.11+, so convert
    # seeds such as those produced by np.arange into plain ints.
    if isinstance(p_seed, np.integer):
        p_seed = int(p_seed)

    # Build the log location from the argument rather than from the notebook
    # global, so a call with a different feature count logs to its own folder.
    log_dir = "./log" + str(p_key_feture_number)
    os.makedirs(log_dir, exist_ok=True)

    C_train_x, C_test_x, C_train_y, C_test_y = train_test_split(
        p_data_arr, p_label_arr_onehot, test_size=0.2, random_state=p_seed)

    os.environ['PYTHONHASHSEED'] = str(p_seed)
    np.random.seed(p_seed)
    rn.seed(p_seed)

    # pfa_transform only reads element 0 of each pair, so the inputs are
    # duplicated into the target slot.
    train = (C_train_x, C_train_x)
    test = (C_test_x, C_test_x)

    # Only the feature selection itself is timed.
    t_start = time.time()
    C_train_selected_x, C_test_selected_x = pfa_transform(train, test, p_key_feture_number)
    t_used = time.time() - t_start
    write_to_csv(np.array([t_used]), log_dir + "/PFA_time.csv")

    # Classification on original features.
    # NOTE(review): F.ETree is defined outside this file; it is used here as
    # returning (train accuracy, test accuracy) - the meaning of the trailing 0
    # argument should be confirmed against its definition.
    orig_train_acc, orig_test_acc = F.ETree(C_train_x, C_train_y, C_test_x, C_test_y, 0)

    # Classification on selected features.
    selec_train_acc, selec_test_acc = F.ETree(
        C_train_selected_x, C_train_y, C_test_selected_x, C_test_y, 0)

    # Linear reconstruction of all features from the selected ones.
    reconstruction_loss = mse_check((C_train_selected_x, C_train_x),
                                    (C_test_selected_x, C_test_x))

    results = np.array([orig_train_acc, orig_test_acc,
                        selec_train_acc, selec_test_acc,
                        reconstruction_loss])
    write_to_csv(results.reshape(1, len(results)), log_dir + "/PFA_results.csv")

    return orig_train_acc, orig_test_acc, selec_train_acc, selec_test_acc, reconstruction_loss

In [6]:
# Bind the notebook-level data and setting to the argument names passed to cal() below.
p_data_arr=data_arr
p_label_arr_onehot=label_arr_onehot
p_key_feture_number=key_feture_number

In [7]:
# Repeat the experiment for seeds 0..4.
# Plain Python ints are used because cal() passes the seed to random.seed(),
# which rejects NumPy integer scalars (as yielded by np.arange) on Python 3.11+.
for p_seed in range(5):
    orig_train_acc, orig_test_acc, selec_train_acc, selec_test_acc, reconstruction_loss = cal(
        p_data_arr,
        p_label_arr_onehot,
        p_key_feture_number,
        p_seed)

Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.8525
Testing accuracy： 0.8525
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.777
Testing accuracy： 0.777
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.8435
Testing accuracy： 0.8435
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.7665
Testing accuracy： 0.7665
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.839
Testing accuracy： 0.839
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.747
Testing accuracy： 0.747
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.848
Testing accuracy： 0.848
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.7595
Testing accuracy： 0.7595
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.8555
Testing accuracy： 0.8555
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.78
Testing accuracy： 0.78
