In [16]:
import numpy as np
from sklearn import svm
from sklearn.linear_model import SGDClassifier
import pandas as pd
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
import random
from scipy.io import mmread
import math
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine
import os
from matplotlib import gridspec
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import chi2,f_classif,mutual_info_classif,mutual_info_regression
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from scipy.stats import chisquare
from scipy.stats import pearsonr
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from time import sleep
from parfor import parfor

def text_create(path, name, msg):
    full_path = path + "/" + name + '.txt'
    file = open(full_path, 'w')
    file.write(str(msg))


def SVM(X, y):
    model=SGDClassifier(loss="hinge", penalty="l2", max_iter=1000000)
    model.fit(X, y)
    return model


def get_error(model, X, y):
    y_pred = model.predict(X)
    return mean_squared_error(y_pred, y)

def select_samples_w(X, y, num_samples):
    classes=np.unique(y)
    model = SVM(X, y)
    coef_=model.coef_
    w_padded = np.hstack((coef_, np.zeros((coef_.shape[0], 1))))
    y_pred = model.predict(X)
    sv = [i for i in range(len(y)) if y[i] != y_pred[i]]
    if len(sv)<num_samples:
        indices =sv
    else:
        @parfor(classes,bar=False)
        def c(i):
            #sleep(1)
            sv_class = list(set(list(np.where(y_train == i)[0])) & set(sv))
            return random.choice(sv_class)
        indices = c+random.sample(sv, num_samples-len(classes))
    return indices, w_padded, model

def select_feature(X, y, feature_list, w_padded):
    @parfor(range(X.shape[1]),bar=False)
    def angles(i):
        X_local = X[:, feature_list + [i]]
        w_new = SVM(X_local, y).coef_
        cos=cosine_similarity(w_padded, w_new)
        angle = 0
        for j in range(w_padded.shape[0]):
            angle = angle + math.acos(cos[j,j])
        return angle
    indices = sorted(range(X.shape[1]), key=lambda i: angles[i], reverse=True)
    return [i for i in indices if i not in feature_list][0]

def SVM_active_feature_selection(X_train, y_train, X_test, y_test, num_features, num_samples):
    feature_selected = []
    num_samples_list = []
    train_errors = []
    test_errors = []
    train_scores = []
    test_scores = []
    
    shuffle = np.arange(X_train.shape[0])
    np.random.shuffle(shuffle)
    samples = shuffle[:num_samples]
    X_global = X_train[samples, :]
    y_global = y_train[samples]
    samples_global=samples
    num_samples_list.append(len(samples_global))
    
    @parfor(range(X_global.shape[1]),bar=False)
    def scores(i):
        model=SVM(X_global[:,i].reshape(-1, 1),y_global)
        return model.score(X_global[:,i].reshape(-1, 1),y_global)  # R^2 for regression and mean accuracy for classificarion
    
    new_feature = sorted(range(X_global.shape[1]), key=lambda i: scores[i], reverse=True)[0]
    feature_selected.append(new_feature)
    
    for i in range(num_features - 1):
        t=Timer()
        t.start()
        
        X_measured_train = X_train[:,feature_selected]
        X_measured_test = X_test[:,feature_selected]
        
        samples, w_padded,model = select_samples_w(X_measured_train, y_train, num_samples)
        samples_global = list(set().union(samples_global, samples))
        num_samples_list.append(len(samples_global))
        
        train_error = get_error(model, X_measured_train, y_train)
        test_error = get_error(model, X_measured_test, y_test)
        train_score = model.score(X_measured_train, y_train)
        test_score = model.score(X_measured_test, y_test)
        train_errors.append(train_error)
        test_errors.append(test_error)
        train_scores.append(train_score)
        test_scores.append(test_score)
        print("feature " + str(i) + ' : ' + str(len(samples_global)) + ' samples')
        print('training error=' + str(train_error) + ' test error=' + str(test_error))
        print('training accuracy=' + str(train_score) + ' test accuracy=' + str(test_score))
            
        new_feature=select_feature(X_train[samples], y_train[samples],feature_selected,w_padded)
        feature_selected.append(new_feature)
        t.stop()
    
    X_measured_train = X_train[:,feature_selected]
    X_measured_test = X_test[:,feature_selected]
    model=SVM(X_measured_train,y_train)
    train_error = get_error(model, X_measured_train, y_train)
    test_error = get_error(model, X_measured_test, y_test)
    train_score = model.score(X_measured_train, y_train)
    test_score = model.score(X_measured_test, y_test)
    train_errors.append(train_error)
    test_errors.append(test_error)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print("feature " + str(i))
    print('training error=' + str(train_error) + ' test error=' + str(test_error))
    print('training accuracy=' + str(train_score) + ' test accuracy=' + str(test_score))
    
    return feature_selected, num_samples_list, train_errors, test_errors, train_scores, test_scores

# Timer

In [11]:
import time

class TimerError(Exception):
     """A custom exception used to report errors in use of Timer class"""
        
class Timer:
    def __init__(self):
        self._start_time = None
        
    def start(self):
        """Start a new timer"""
        if self._start_time is not None:
            raise TimerError(f"Timer is running. Use .stop() to stop it")
        
        self._start_time = time.perf_counter()

    def stop(self):
        """Stop the timer, and report the elapsed time"""
        if self._start_time is None:
            raise TimerError(f"Timer is not running. Use .start() to start it")
        
        elapsed_time = time.perf_counter() - self._start_time
        self._start_time = None
        print(f"Elapsed time: {elapsed_time:0.4f} seconds")

# Load Data as Dense Matrix

In [2]:
import pandas as pd
data_df = pd.read_csv('./PBMCnorm_final.csv')
label_df=pd.read_csv('./labels_final.csv',header = None)
gene_df=pd.read_csv('./genes_final.csv')
raw_data_data=data_df.values
target=label_df.values.reshape((raw_data_data.shape[1],)).astype(np.double)
gene=gene_df.values.reshape((raw_data_data.shape[0],))
data_tt = raw_data_data[~(raw_data_data==0).all(1)]
gene_tt=gene[~(raw_data_data==0).all(1)]
data_t = data_tt[~(np.std(data_tt, axis=1) == 0)]
genes_final=gene_tt[~(np.std(data_tt, axis=1) == 0)]
data_tmp =  np.transpose(data_t)
data = normalize(data_tmp,axis = 0)
print(np.shape(data),np.shape(target),np.shape(genes_final))

idx = np.arange(np.shape(data)[0])
random.shuffle(idx)
X_train = data[idx[:int(np.shape(data)[0]*4/5)],:]
y_train = target[idx[:int(np.shape(data)[0]*4/5)]]
X_test = data[idx[int(np.shape(data)[0]*4/5):],:]
y_test = target[idx[int(np.shape(data)[0]*4/5):]]
print(np.shape(X_train))
print(np.shape(X_test))

<class 'numpy.matrix'>
(796138, 5000)
(199035, 5000)
33
(995173, 5000) (995173,)


# Define Parameters and Work Space

In [14]:
folder='results'

num_features = 100
num_samples=200

path=folder+'/test1'+str(num_features)+'_'+str(num_samples)
try:
    os.mkdir(folder)
except OSError:
    print ("Creation of the directory %s failed" % folder)
else:
    print ("Successfully created the directory %s " % folder)
try:
    os.mkdir(path)
except OSError:
    print ("Creation of the directory %s failed" % path)
else:
    print ("Successfully created the directory %s " % path)

Successfully created the directory results 
Successfully created the directory results/test1100_500 


# Select Genes

In [17]:
feature_selected, num_samples_list, train_errors,test_errors,train_scores,test_scores= SVM_active_feature_selection(
    X_train,y_train,X_test,y_test,num_features=100,num_samples=500)

text_create(path,'feature_selected',feature_selected)
text_create(path,'error',train_errors+test_errors)
text_create(path,'accuracy',train_scores+test_scores)
text_create(path,'num_samples_list',num_samples_list)

feature 0
training error=81.00064184852374 test error=81.03931469339564
training accuracy=0.17675453250567114 test accuracy=0.1773959353882483
Elapsed time: 52.2394 seconds
feature 1
training error=49.073688229929985 test error=49.07606199914588
training accuracy=0.203537577656135 test accuracy=0.20246690280603913
Elapsed time: 50.0567 seconds
feature 2
training error=59.43555514245018 test error=59.24089732961539
training accuracy=0.15968337147579942 test accuracy=0.16069535508830105
Elapsed time: 52.2165 seconds
feature 3
training error=52.667395602269956 test error=52.567442912050645
training accuracy=0.140728114975042 test accuracy=0.14000050242419676
Elapsed time: 54.5294 seconds
feature 4
training error=144.1414101575355 test error=144.6238500766197
training accuracy=0.20428242339895847 test accuracy=0.20445147838319894
Elapsed time: 54.6885 seconds
feature 5
training error=221.5144723653437 test error=220.35161654985305
training accuracy=0.23572923287168807 test accuracy=0.23600

Elapsed time: 99.0537 seconds
feature 48
training error=27.18567886471943 test error=27.405602029793755
training accuracy=0.6909681487380328 test accuracy=0.6907428341748939
Elapsed time: 97.7498 seconds
feature 49
training error=26.332429804883073 test error=26.328535182254377
training accuracy=0.6906729737809274 test accuracy=0.6903911372371694
Elapsed time: 97.3840 seconds
feature 50
training error=27.440693698831108 test error=27.391835606802825
training accuracy=0.6867854567926666 test accuracy=0.6875624890094707
Elapsed time: 97.2834 seconds
feature 51
training error=27.921859024440487 test error=28.047237922978372
training accuracy=0.6920219861380816 test accuracy=0.6917275856005225
Elapsed time: 98.2362 seconds
feature 52
training error=28.087999819126836 test error=28.2909890220313
training accuracy=0.6856650480193133 test accuracy=0.6850101740899842
Elapsed time: 99.3091 seconds
feature 53
training error=27.606178828293586 test error=27.789690255482704
training accuracy=0.692

Elapsed time: 119.2151 seconds
feature 95
training error=22.91064614426142 test error=22.931157836561407
training accuracy=0.7437806008506063 test accuracy=0.7427889567161554
Elapsed time: 113.0524 seconds
feature 96
training error=20.5921372927809 test error=20.744004823272288
training accuracy=0.7542925975145013 test accuracy=0.752706810359987
Elapsed time: 113.5250 seconds
feature 97
training error=21.937064930954183 test error=22.062074509508378
training accuracy=0.7471769969527896 test accuracy=0.746074810962896
Elapsed time: 112.9806 seconds
feature 98
training error=20.87840299043633 test error=21.018253071067903
training accuracy=0.754301389960032 test accuracy=0.7526515436983445
Elapsed time: 113.6246 seconds
feature 98
training error=20.309164994008576 test error=20.31223151706986
training accuracy=0.7542410989049637 test accuracy=0.7529329012485241
