In [1]:
import numpy as np
import requests
import time

from tensorflow.keras.optimizers import *
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation

from pyspark import SQLContext, SparkContext
from pyspark import SparkConf

from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.linalg import Vectors


from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [2]:
# Load the dataset and labels
x=np.load('Bx.npy')
y=np.load('By.npy')

In [3]:
conf = SparkConf()
sc = SparkContext(conf = conf)
spark = SQLContext(sc)

In [4]:
# Calculate the rank of each feature
R=[]
for h in range(x.shape[1]):
    kmeans = KMeans(init='k-means++', n_clusters=np.unique(y).shape[0], n_init=10)
    ff=kmeans.fit_predict(x[:,h].reshape(-1,1))
    r=metrics.homogeneity_score(y,ff) #Use the homogeneity score as a rank of the feature
    R.append(r)

In [5]:
#Arrange feature accroding to thier ranks
Rnk=np.argsort(np.array(R))

In [6]:
#Initiate the cross-validation splitter
kfolds=StratifiedKFold(n_splits=5,shuffle=True)

In [7]:
#Per each set of ranks, use cross-validation to calculate accuracy.
smr=[]
for j in range(Rnk.shape[0]):
    fd=x[:,Rnk[j:]]
    pp=0
    lpa=np.zeros((0,2))
    for train,test in kfolds.split(fd,y):
        dff = map(lambda x: (int(float(x[-1])), Vectors.dense(x[:-1])),np.hstack((fd[train],y[train].reshape(-1,1))))
        TrD = spark.createDataFrame(dff,schema=["label", "features"])
        dff = map(lambda x: (int(float(x[-1])), Vectors.dense(x[:-1])),np.hstack((fd[test],y[test].reshape(-1,1))))
        TsD = spark.createDataFrame(dff,schema=["label", "features"])
        model = Sequential()
        model.add(Dense(128,input_dim=fd.shape[1],activation='relu',use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(64,activation='relu',use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(32,activation='relu',use_bias=True))
        model.add(Dropout(0.5))
        model.add(Dense(1,activation='sigmoid',use_bias=True)) #The number of neurons is equal to the number of classes
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        trainer = SingleTrainer(keras_model=model, worker_optimizer='adam', loss='binary_crossentropy', num_epoch=1000)
        trained_model = trainer.train(TrD)
        st = time.time()
        predictor = ModelPredictor(keras_model=trained_model)
        et=time.time()-st
        ff=predictor.predict(TsD)
        ts=np.array(map(lambda x: x[0],ff.select('prediction').collect())).reshape(-1,1)
        pp=pp+metrics.accuracy_score(y[test].reshape(-1,1),(ts>=0.5).astype(int))
        lpa=np.vstack((lpa,np.hstack((y[test].reshape(-1,1),ts))))
    pp=pp/kfolds.n_splits
    np.savetxt('F%d.csv'%j,lpa,delimiter=',')
    smr.append([j, pp, et*1000000/x.shape[0]]) #Calculate the time required to predict a label per each object in uS.

In [8]:
smr

[[43.0, 0.991675, 1.433292376],
 [42.0, 0.991898, 1.353315116],
 [41.0, 0.991903, 1.35947327],
 [40.0, 0.991466, 1.359475238],
 [39.0, 0.991416, 1.353318265],
 [38.0, 0.991899, 1.377923716],
 [37.0, 0.991851, 1.375533602],
 [36.0, 0.991544, 1.322565291],
 [35.0, 0.991434, 1.414835237],
 [34.0, 0.991404, 1.345726674],
 [33.0, 0.991377, 1.33066711],
 [32.0, 0.99169, 1.353318265],
 [31.0, 0.991562, 1.356685526],
 [30.0, 0.991196, 1.304102247],
 [29.0, 0.991627, 1.316402019],
 [28.0, 0.991339, 1.377897732],
 [27.0, 0.991492, 1.353291494],
 [26.0, 0.991355, 1.341018493],
 [25.0, 0.991169, 1.334840655],
 [24.0, 0.991322, 1.43329277],
 [23.0, 0.99138, 1.291778459],
 [22.0, 0.991193, 1.297958266],
 [21.0, 0.991334, 1.316420129],
 [20.0, 0.990726, 1.285655738],
 [19.0, 0.990726, 1.310253314],
 [18.0, 0.990866, 1.297949605],
 [17.0, 0.990612, 1.285645108],
 [16.0, 0.99086, 1.310232448],
 [15.0, 0.990418, 1.273341792],
 [14.0, 0.990551, 1.261048713],
 [13.0, 0.989806, 1.304109727],
 [12.0, 0.9897