## Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 

## Load Dataset

In [163]:
data = pd.read_csv('data/uci-secom.csv')

In [169]:
data.fillna(0, inplace = True)

In [204]:
data

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,581,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.00,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,0.0000,0.5005,0.0118,0.0035,2.3630,0.0000,0.0000,0.0000,0.0000,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.0060,208.2045,-1
2,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,...,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602,1
3,2008-07-19 14:43:00,2988.72,2479.90,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,...,73.8432,0.4990,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432,-1
4,2008-07-19 15:22:00,3032.24,2502.87,2233.3667,1326.5200,1.5334,100.0,100.3967,0.1235,1.5031,...,0.0000,0.4800,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2008-10-16 15:13:00,2899.41,2464.36,2179.7333,3085.3781,1.4843,100.0,82.2467,0.1248,1.3424,...,203.1720,0.4988,0.0143,0.0039,2.8669,0.0068,0.0138,0.0047,203.1720,-1
1563,2008-10-16 20:49:00,3052.31,2522.55,2198.5667,1124.6595,0.8763,100.0,98.4689,0.1205,1.4333,...,0.0000,0.4975,0.0131,0.0036,2.6238,0.0068,0.0138,0.0047,203.1720,-1
1564,2008-10-17 05:26:00,2978.81,2379.78,2206.3000,1110.4967,0.8236,100.0,99.4122,0.1208,0.0000,...,43.5231,0.4987,0.0153,0.0041,3.0590,0.0197,0.0086,0.0025,43.5231,-1
1565,2008-10-17 06:01:00,2894.92,2532.01,2177.0333,1183.7287,1.5726,100.0,98.7978,0.1213,1.4622,...,93.4941,0.5004,0.0178,0.0038,3.5662,0.0262,0.0245,0.0075,93.4941,-1


## Define Target and Feature Variables

In [171]:
#feature variables
X = data.drop(['Time','Pass/Fail'], axis=1).values
X

#target variable
y = data['Pass/Fail'].values
y

array([-1, -1,  1, ..., -1, -1, -1], dtype=int64)

## Train-Test Split

In [172]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

## Logistic Regression

In [207]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression()
lr.fit(X_train, y_train)
LogisticRegression()
y_pred = lr.predict(X_test)

accuracy_score(y_pred, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8439490445859873

## DFO Algorithm

In [173]:
import numpy as np

# FITNESS FUNCTION (SPHERE FUNCTION)
def f(x): # x IS A VECTOR REPRESENTING ONE FLY
    sum = 0.0
    for i in range(len(x)):
        sum = sum + np.power(x[i],2)
    return sum

N = X_train.shape[0] # POPULATION SIZE
D = X_train.shape[1] # DIMENSIONALITY 
delta = 0.001 # DISTURBANCE THRESHOLD 
maxIterations = 500 # ITERATIONS ALLOWED
lowerB = [-5.12]*D # LOWER BOUND (IN ALL DIMENSIONS)
upperB = [ 5.12]*D # UPPER BOUND (IN ALL DIMENSIONS)

# INITIALISATION PHASE
X = X_train # EMPTY FLIES ARRAY OF SIZE: (N,D)
fitness = [None]*N  # EMPTY FITNESS ARRAY OF SIZE N

# INITIALISE FLIES WITHIN BOUNDS
# for i in range(N):
#     for d in range(D):
#         X[i,d] = np.random.uniform(lowerB[d], upperB[d])

# MAIN DFO LOOP
for itr in range (maxIterations):
    for i in range(N): # EVALUATION
        fitness[i] = f(X_train[i,])
    s = np.argmin(fitness) # FIND BEST FLY

    if (itr%10 == 0): # PRINT BEST FLY EVERY 100 ITERATIONS
        print ("Iteration:", itr, "\tBest fly index:", s, 
               "\tFitness value:", fitness[s])

    # TAKE EACH FLY INDIVIDUALLY 
    for i in range(N): 
        if i == s: continue # ELITIST STRATEGY

        # FIND BEST NEIGHBOUR
        left = (i-1)%N
        right = (i+1)%N
        bNeighbour = right if fitness[right]<fitness[left] else left

        for d in range(D): # UPDATE EACH DIMENSION SEPARATELY 
            if (np.random.rand() < delta):
                X[i,d] = np.random.uniform(lowerB[d], upperB[d])
                continue;

            u = np.random.rand()
            X[i,d] = X[bNeighbour,d] + u*(X[s,d] - X[i,d])

            # OUT OF BOUND CONTROL
            if X[i,d] < lowerB[d] or X[i,d] > upperB[d]:
                X[i,d] = np.random.uniform(lowerB[d], upperB[d])

for i in range(N): fitness[i] = f(X[i,]) # EVALUATION
s = np.argmin(fitness) # FIND BEST FLY

print("\nFinal best fitness:\t", fitness[s])
print("\nBest fly position:\n",  X[s,])

Iteration: 0 	Best fly index: 570 	Fitness value: 84029662.46509002
Iteration: 10 	Best fly index: 885 	Fitness value: 937.5749514211522
Iteration: 20 	Best fly index: 100 	Fitness value: 724.9841479574267
Iteration: 30 	Best fly index: 165 	Fitness value: 619.1169444534988
Iteration: 40 	Best fly index: 1248 	Fitness value: 548.9838430395946
Iteration: 50 	Best fly index: 818 	Fitness value: 469.6050402203556
Iteration: 60 	Best fly index: 427 	Fitness value: 415.13820611775503
Iteration: 70 	Best fly index: 1121 	Fitness value: 372.63764333089574
Iteration: 80 	Best fly index: 440 	Fitness value: 331.02869440377003
Iteration: 90 	Best fly index: 557 	Fitness value: 298.9062331036329
Iteration: 100 	Best fly index: 165 	Fitness value: 275.24930837046725
Iteration: 110 	Best fly index: 906 	Fitness value: 253.066062504036
Iteration: 120 	Best fly index: 651 	Fitness value: 228.96245437629406
Iteration: 130 	Best fly index: 1107 	Fitness value: 209.83450159272675
Iteration: 140 	Best fl

In [190]:
X[s,].shape

(590,)

In [192]:
dim = 590

In [193]:
def binary_conversion(X, thres, N, dim):
    Xbin = np.zeros([N, dim], dtype='int')
    for i in range(N):
        for d in range(dim):
            if X[i,d] > thres:
                Xbin[i,d] = 1
            else:
                Xbin[i,d] = 0
    
    return Xbin

In [194]:
def Fun(xtrain, ytrain, x):
    # parameters
    cost = 0
    alpha = 0.99
    beta = 1 - alpha
    # original feature size
    max_feat = len(x)
    # Number of selected features
    num_feat = np.sum(x == 1)
    # Solve if no feature selected
    if num_feat == 0:
        cost = 1
    
    return cost


In [195]:
N=590
fit   = np.zeros([N, 1], dtype='float')
Xgb   = np.zeros([1, dim], dtype='float')
fitG  = float('inf')
Xpb   = np.zeros([N, dim], dtype='float')
fitP  = float('inf') * np.ones([N, 1], dtype='float')
curve = np.zeros([1, maxIterations], dtype='float') 
t     = 0

In [196]:
fit.shape

(590, 1)

In [197]:
while t < maxIterations:
    # Binary conversion
    Xbin = binary_conversion(X, thres, N, dim)

    # Fitness
    for i in range(N):
        fit[i,0] = Fun(X_train, y_train, Xbin[i,:])
        if fit[i,0] < fitP[i,0]:
            Xpb[i,:]  = X[i,:]
            fitP[i,0] = fit[i,0]
        if fitP[i,0] < fitG:
            Xgb[0,:]  = Xpb[i,:]
            fitG      = fitP[i,0]
    t += 1

In [198]:
# Best feature subset
Gbin       = binary_conversion(Xgb, thres, 1, dim) 
Gbin       = Gbin.reshape(dim)
pos        = np.asarray(range(0, dim))    
sel_index  = pos[Gbin == 1]
num_feat   = len(sel_index)
# Create dictionary
pso_data = {'sf': sel_index, 'nf': num_feat}

In [199]:
print(Gbin)
print(pos)
print(sel_index)
print(num_feat)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 

In [201]:
pso_data['sf']

array([127, 179, 296, 299, 345, 411, 444, 447, 476, 501])

In [202]:
new_data = data.iloc[:,pso_data['sf']]

In [203]:
new_data

Unnamed: 0,126,178,295,298,344,410,443,446,475,500
0,3.353,0.0,398.3185,0.0373,31.8843,2.9329,0.6288,1.4324,2.3394,0.0000
1,2.771,0.0,26.5879,0.0502,41.7080,3.2858,0.8123,1.5683,2.1627,0.0000
2,3.094,0.0,329.6406,0.0800,24.7959,3.7696,0.6263,1.4698,3.1842,0.0000
3,2.480,0.0,157.0889,0.0285,29.0339,4.9881,0.5183,1.3141,3.4234,711.6418
4,3.027,0.0,128.0296,0.0755,32.0537,4.9184,0.6076,1.2524,4.5375,0.0000
...,...,...,...,...,...,...,...,...,...,...
1562,2.454,0.0,149.1755,0.0520,21.0569,6.2897,0.6004,0.9927,2.2192,0.0000
1563,2.783,0.0,43.8368,0.0249,51.7258,8.7134,0.9383,1.0330,4.4892,874.5098
1564,0.000,0.0,236.9079,0.0452,15.8426,5.6308,0.7563,1.3602,4.2035,0.0000
1565,2.658,0.0,129.7832,0.0444,17.4598,2.6303,0.5680,1.0130,2.2567,433.3952


## Logistic Regression after Optimisation

In [209]:
#feature variables
X = new_data.values
X

#target variable
y = data['Pass/Fail'].values
y

# sc = StandardScaler()
# X = sc.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

print("Shape of X_train:", X.shape)
print("Shape of y_train:", y.shape)

lr = LogisticRegression()
lr.fit(X_train, y_train)
LogisticRegression()
y_pred = lr.predict(X_test)

accuracy_score(y_pred, y_test)

Shape of X_train: (1567, 10)
Shape of y_train: (1567,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9140127388535032