# 1. Import libraries

In [1]:
#----------------------------Reproducible----------------------------------------------------------------------------------------
import numpy as np
import random as rn
import os

seed=0
os.environ['PYTHONHASHSEED'] = str(seed)

np.random.seed(seed)
rn.seed(seed)

#----------------------------Reproducible----------------------------------------------------------------------------------------

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

#--------------------------------------------------------------------------------------------------------------------------------
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
matplotlib.style.use('ggplot')

import random
import scipy.sparse as sparse
import scipy.io

from keras.utils import to_categorical
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import scipy.io
from skfeature.function.sparse_learning_based import NDFS
from skfeature.utility import construct_W
from skfeature.utility.sparse_learning import feature_ranking
import os
from skimage import io
from PIL import Image
import time
import pandas as pd

In [2]:
#--------------------------------------------------------------------------------------------------------------------------------
def ETree(p_train_feature,p_train_label,p_test_feature,p_test_label,p_seed):
    clf = ExtraTreesClassifier(n_estimators=50, random_state=p_seed)
    
    # Training
    clf.fit(p_train_feature, p_train_label)
    
    # Training accuracy
    print('Training accuracy：',clf.score(p_train_feature, np.array(p_train_label)))
    print('Training accuracy：',accuracy_score(np.array(p_train_label),clf.predict(p_train_feature)))
    #print('Training accuracy：',np.sum(clf.predict(p_train_feature)==np.array(p_train_label))/p_train_label.shape[0])

    # Testing accuracy
    print('Testing accuracy：',clf.score(p_test_feature, np.array(p_test_label)))
    print('Testing accuracy：',accuracy_score(np.array(p_test_label),clf.predict(p_test_feature)))
    #print('Testing accuracy：',np.sum(clf.predict(p_test_feature)==np.array(p_test_label))/p_test_label.shape[0])

In [3]:
#--------------------------------------------------------------------------------------------------------------------------------
def write_to_csv(p_data,p_path):
    dataframe = pd.DataFrame(p_data)
    dataframe.to_csv(p_path, mode='a',header=False,index=False,sep=',')

# 2. Loading data

In [4]:
dataset_path='./Dataset/COIL-20/'

samples={}
for dirpath, dirnames, filenames in os.walk(dataset_path):
    #print(dirpath)
    #print(dirnames)
    #print(filenames)
    dirnames.sort()
    filenames.sort()
    for filename in [f for f in filenames if f.endswith(".png") and not f.find('checkpoint')>0]:
        full_path = os.path.join(dirpath, filename)
        file_identifier=filename.split('__')[0][3:]
        if file_identifier not in samples.keys():
            samples[file_identifier] = []
        # Direct read
        #image = io.imread(full_path)
        # Resize read
        image_=Image.open(full_path).resize((20, 20),Image.ANTIALIAS)
        image=np.asarray(image_)
        samples[file_identifier].append(image)
        
#plt.imshow(samples['1'][0].reshape(20,20))

In [5]:
data_arr_list=[]
label_arr_list=[]
for key_i in samples.keys():
    key_i_for_label=[int(key_i)-1]
    data_arr_list.append(np.array(samples[key_i]))
    label_arr_list.append(np.array(72*key_i_for_label))
    
data_arr=np.concatenate(data_arr_list).reshape(1440, 20*20).astype('float32') / 255.
label_arr_onehot=np.concatenate(label_arr_list)#to_categorical(np.concatenate(label_arr_list))

In [6]:
C_train_x,C_test_x,C_train_y,C_test_y= train_test_split(data_arr,label_arr_onehot,test_size=0.2,random_state=seed)
x_train,x_validate,y_train_onehot,y_validate_onehot= train_test_split(C_train_x,C_train_y,test_size=0.1,random_state=seed)
x_test=C_test_x
y_test_onehot=C_test_y

print('Shape of x_train: ' + str(x_train.shape)) 
print('Shape of x_validate: ' + str(x_validate.shape)) 
print('Shape of x_test: ' + str(x_test.shape))
print('Shape of y_train: ' + str(y_train_onehot.shape))
print('Shape of y_validate: ' + str(y_validate_onehot.shape))
print('Shape of y_test: ' + str(y_test_onehot.shape))

print('Shape of C_train_x: ' + str(C_train_x.shape)) 
print('Shape of C_train_y: ' + str(C_train_y.shape)) 
print('Shape of C_test_x: ' + str(C_test_x.shape)) 
print('Shape of C_test_y: ' + str(C_test_y.shape)) 

Shape of x_train: (1036, 400)
Shape of x_validate: (116, 400)
Shape of x_test: (288, 400)
Shape of y_train: (1036,)
Shape of y_validate: (116,)
Shape of y_test: (288,)
Shape of C_train_x: (1152, 400)
Shape of C_train_y: (1152,)
Shape of C_test_x: (288, 400)
Shape of C_test_y: (288,)


In [7]:
key_feture_number=50

# 3. Classifying 1

### Extra Trees

In [8]:
train_feature=C_train_x
train_label=C_train_y
test_feature=C_test_x
test_label=C_test_y

print('Shape of train_feature: ' + str(train_feature.shape)) 
print('Shape of train_label: ' + str(train_label.shape)) 
print('Shape of test_feature: ' + str(test_feature.shape)) 
print('Shape of test_label: ' + str(test_label.shape)) 

p_seed=seed
ETree(train_feature,train_label,test_feature,test_label,p_seed)

Shape of train_feature: (1152, 400)
Shape of train_label: (1152,)
Shape of test_feature: (288, 400)
Shape of test_label: (288,)
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 1.0
Testing accuracy： 1.0


In [10]:
num_cluster=len(np.unique(C_test_y))

# 4. Model

In [11]:
start = time.clock()

# construct affinity matrix
kwargs_W =  {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1}

train_W = construct_W.construct_W(train_feature, **kwargs_W)

# obtain the scores of features, and sort the feature scores in an ascending order according to the feature scores
train_score = NDFS.ndfs(train_feature, W=train_W,n_clusters=num_cluster)

train_idx = feature_ranking(train_score)

# obtain the dataset on the selected features
train_selected_x = train_feature[:, train_idx[0:key_feture_number]]
print("train_selected_x",train_selected_x.shape)


test_W = construct_W.construct_W(test_feature, **kwargs_W)

# obtain the scores of features, and sort the feature scores in an ascending order according to the feature scores
test_score = NDFS.ndfs(test_feature, W=test_W,n_clusters=num_cluster)

test_idx = feature_ranking(test_score)

# obtain the dataset on the selected features
test_selected_x = test_feature[:, test_idx[0:key_feture_number]]
print("test_selected_x",test_selected_x.shape)



time_cost=time.clock() - start

write_to_csv(np.array([time_cost]),"./log/NDFS_time"+str(key_feture_number)+".csv")

  """Entry point for launching an IPython kernel.


train_selected_x (1152, 50)




test_selected_x (288, 50)




# 5. Classifying 2

### Extra Trees

In [12]:
train_feature=train_selected_x
train_label=C_train_y

test_feature=test_selected_x
test_label=C_test_y

print('Shape of train_feature: ' + str(train_feature.shape)) 
print('Shape of train_label: ' + str(train_label.shape)) 
print('Shape of test_feature: ' + str(test_feature.shape)) 
print('Shape of test_label: ' + str(test_label.shape)) 

p_seed=seed
ETree(train_feature,train_label,test_feature,test_label,p_seed)

Shape of train_feature: (1152, 50)
Shape of train_label: (1152,)
Shape of test_feature: (288, 50)
Shape of test_label: (288,)
Training accuracy： 1.0
Training accuracy： 1.0
Testing accuracy： 0.21180555555555555
Testing accuracy： 0.21180555555555555


# 6. Reconstruction loss

In [13]:
from sklearn.linear_model import LinearRegression

def mse_check(train, test):
    LR = LinearRegression(n_jobs = -1)
    LR.fit(train[0], train[1])
    MSELR = ((LR.predict(test[0]) - test[1]) ** 2).mean()
    return MSELR

In [14]:
train_feature_tuple=(train_selected_x,C_train_x)
test_feature_tuple=(test_selected_x,C_test_x)

reconstruction_loss=mse_check(train_feature_tuple, test_feature_tuple)
print(reconstruction_loss)

0.13383722
