In [1]:
import numpy as np
import requests
import time

from keras.optimizers import *
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

from pyspark import SQLContext, SparkContext
from pyspark import SparkConf

from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.linalg import Vectors

from distkeras.trainers import *
from distkeras.predictors import *
from distkeras.transformers import *
from distkeras.evaluators import *
from distkeras.utils import *

from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [2]:
# Load the dataset and labels
x=np.load('NMCx.npy')
y=np.load('NMCy.npy')

In [3]:
conf = SparkConf()
sc = SparkContext(conf = conf)
spark = SQLContext(sc)

In [4]:
# Calculate the rank of each feature
R=[]
for h in range(x.shape[1]):
    kmeans = KMeans(init='k-means++', n_clusters=np.unique(y).shape[0], n_init=10)
    ff=kmeans.fit_predict(x[:,h].reshape(-1,1))
    r=metrics.homogeneity_score(y,ff) #Use the homogeneity score as a rank of the feature
    R.append(r)

In [5]:
#Arrange feature accroding to thier ranks
Rnk=np.argsort(np.array(R))

In [6]:
#Initiate the cross-validation splitter
kfolds=StratifiedKFold(n_splits=5,shuffle=True)

In [7]:
#Per each set of ranks, use cross-validation to calculate accuracy.
smr=[]
for j in range(Rnk.shape[0]):
    fd=x[:,Rnk[j:]]
    pp=0
    lpa=np.zeros((0,2))
    for train,test in kfolds.split(fd,y):
        dff = map(lambda x: (int(float(x[-1])), Vectors.dense(x[:-1])),np.hstack((fd[train],y[train].reshape(-1,1))))
        TrD = spark.createDataFrame(dff,schema=["label", "features"])
        dff = map(lambda x: (int(float(x[-1])), Vectors.dense(x[:-1])),np.hstack((fd[test],y[test].reshape(-1,1))))
        TsD = spark.createDataFrame(dff,schema=["label", "features"])
        model = Sequential()
        model.add(Dense(128,input_dim=fd.shape[1],activation='relu',use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(64,activation='relu',use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(32,activation='relu',use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(np.unique(y).shape[0],activation='softmax',use_bias=True)) #The number of neurons is equal to the number of classes
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
        trainer = SingleTrainer(keras_model=model, worker_optimizer='adam', loss='categorical_crossentropy', num_epoch=1000)
        trained_model = trainer.train(TrD)
        st = time.time()
        predictor = ModelPredictor(keras_model=trained_model)
        et=time.time()-st
        ff=predictor.predict(TsD)
        ts=np.array(map(lambda x: x[0],ff.select('prediction').collect())).reshape(-1,1)
        pp=pp+metrics.accuracy_score(y[test].reshape(-1,1),(ts>=0.5).astype(int))
        lpa=np.vstack((lpa,np.hstack((y[test].reshape(-1,1),ts))))
    pp=pp/kfolds.n_splits
    np.savetxt('F%d.csv'%j,lpa,delimiter=',')
    smr.append([j, pp, et*1000000/x.shape[0]]) #Calculate the time required to predict a label per each object in uS.

In [8]:
smr

[[78.0, 0.9956, 0.6387],
 [77.0, 0.9953, 0.6424],
 [76.0, 0.9954, 0.663],
 [75.0, 0.9956, 0.7043],
 [74.0, 0.9945, 0.7152],
 [73.0, 0.9951, 0.7234],
 [72.0, 0.9956, 0.7346],
 [71.0, 0.9953, 0.7978],
 [70.0, 0.9947, 0.7822],
 [69.0, 0.9949, 0.7952],
 [68.0, 0.9953, 0.8467],
 [67.0, 0.9956, 0.8581],
 [66.0, 0.9955, 0.8746],
 [65.0, 0.9955, 0.9055],
 [64.0, 0.9954, 0.9267],
 [63.0, 0.9954, 0.9494],
 [62.0, 0.9945, 0.9463],
 [61.0, 0.9941, 0.9796],
 [60.0, 0.9938, 0.9927],
 [59.0, 0.9921, 1.0174],
 [58.0, 0.9935, 1.0407],
 [57.0, 0.9912, 1.043],
 [56.0, 0.9922, 1.0773],
 [55.0, 0.9927, 1.0942],
 [54.0, 0.9933, 1.1311],
 [53.0, 0.9911, 1.1382],
 [52.0, 0.9909, 1.1768],
 [51.0, 0.9921, 1.2052],
 [50.0, 0.9906, 1.2159],
 [49.0, 0.992, 1.2283],
 [48.0, 0.9908, 1.2602],
 [47.0, 0.9912, 1.2706],
 [46.0, 0.9893, 1.3038],
 [45.0, 0.9903, 1.3261],
 [44.0, 0.9909, 1.3498],
 [43.0, 0.9911, 1.3707],
 [42.0, 0.9903, 1.3947],
 [41.0, 0.9899, 1.4268],
 [40.0, 0.9905, 1.446],
 [39.0, 0.9909, 1.7069],
 [38