In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold

In [2]:
# Load the data and set up the parameter grids, CV splitters and classifiers
# shared by the selection cells below.

# NOTE(review): absolute, machine-specific path -- change DATA_PATH to run this
# notebook on another machine.
DATA_PATH = r"D:\Applying job Berlin\Smart steel test\task\task_data.csv"

df = pd.read_csv(DATA_PATH)
X = df.iloc[:, 2:]  # features: every column from the third one onwards
y = df.iloc[:, 1]   # target: the second column (column 0 is not used)

# Define grid for three algorithms: Support vector machine, Random forest and K-Neighbor
param_grid_SVC = [{'C': [0.1, 0.5, 0.7, 1, 1.5], 'kernel': ['rbf']}]
param_grid_RF = [{'max_depth': [2, 3, 4], 'n_estimators': [5, 10, 20]}]
param_grid_KNN = [{'n_neighbors': [5, 7, 9, 11]}]


# Define cross validation for the grid search and for computing the accuracy.
# No random_state is given, so the shuffled folds are not reproducible between runs.
inner_cv = KFold(n_splits=5, shuffle=True)  # used by GridSearchCV
outer_cv = KFold(n_splits=5, shuffle=True)  # used by cross_val_score

# Define three algorithms
Class_SVM = svm.SVC(gamma='scale')
Class_RandomForest = RandomForestClassifier()
Class_KNN = KNeighborsClassifier()


def _make_grid_search(estimator, param_grid):
    """Build a GridSearchCV over ``param_grid`` using ``inner_cv``.

    The ``iid`` argument was deprecated in scikit-learn 0.22 and removed in 0.24.
    It is still passed where the installed version accepts it (keeping the
    original behaviour there) and dropped otherwise; from 0.22 on the scores are
    always aggregated as ``iid=False`` did, so both branches are equivalent.
    """
    try:
        return GridSearchCV(estimator, param_grid, cv=inner_cv, iid=False)
    except TypeError:
        # Raised by versions whose constructor no longer knows ``iid``.
        return GridSearchCV(estimator, param_grid, cv=inner_cv)


Class_SVM_Gridsearch = _make_grid_search(Class_SVM, param_grid_SVC)
Class_RandomForest_GridSearch = _make_grid_search(Class_RandomForest, param_grid_RF)
Class_KNN_GridSearch = _make_grid_search(Class_KNN, param_grid_KNN)

In [9]:
# Rank the sensors by forward selection. The printed list is sorted by ascending
# rank points, i.e. from the most to the least important sensor.
#
# Forward algorithm, repeated m times for three classifiers with the points summed up:
# Step 1: Start with an empty list of selected sensors. Use the grid search to find
#         the best parameters for each classifier (on all sensors in the first round,
#         on the sensors selected so far in the later rounds).
# Step 2: Add each remaining sensor in turn to the selected ones and compute the
#         score using k-fold cross validation.
#         The sensor whose addition gives the minimum score is treated as the least
#         significant one: move it from the remaining set to the selected list.
# Step 3: Return to step 1 until only one sensor remains; it is appended last.
#
# Suffixes 1 / 2 / 3 denote the K-Neighbors / Random forest / SVM track.


# Initialization of the accumulated rank points per sensor
order1_forward = {}
order2_forward = {}
order3_forward = {}

for i in range(10):
    order1_forward['sensor' + str(i)] = 0
    order2_forward['sensor' + str(i)] = 0
    order3_forward['sensor' + str(i)] = 0

m = 20  # number of iterations

for k in range(m):

    # Sensors in the order in which they were added during this iteration
    List1_forward = []
    List2_forward = []
    List3_forward = []

    # Sensors selected so far (start empty)
    X_forward1 = pd.DataFrame()
    X_forward2 = pd.DataFrame()
    X_forward3 = pd.DataFrame()

    # Sensors still available for selection
    X_remain1 = X.copy()
    X_remain2 = X.copy()
    X_remain3 = X.copy()

    for i in range(9):

        # Running minimum of the candidate scores. It starts at +inf so the first
        # candidate is always accepted; starting at 1 would select nothing when
        # every candidate reaches a perfect score of 1.0 and would leave a stale
        # sensor name from the previous round behind.
        min_score1 = np.inf
        min_score2 = np.inf
        min_score3 = np.inf

        if i == 0:
            # Nothing is selected yet, so tune on the full sensor set.
            Class_KNN_GridSearch.fit(X, y)
            Class_RandomForest_GridSearch.fit(X, y)
            Class_SVM_Gridsearch.fit(X, y)
        else:
            Class_KNN_GridSearch.fit(X_forward1, y)
            Class_RandomForest_GridSearch.fit(X_forward2, y)
            Class_SVM_Gridsearch.fit(X_forward3, y)

        # The tuned parameters are fixed for this round, so the estimators are
        # built once; cross_val_score fits clones and leaves them untouched.
        knn = KNeighborsClassifier(
            n_neighbors=Class_KNN_GridSearch.best_params_['n_neighbors'])
        forest = RandomForestClassifier(
            max_depth=Class_RandomForest_GridSearch.best_params_['max_depth'],
            n_estimators=Class_RandomForest_GridSearch.best_params_['n_estimators'])
        svc = svm.SVC(gamma='scale', C=Class_SVM_Gridsearch.best_params_['C'])

        for j in range(10 - i):
            # Candidate feature sets: selected sensors plus the j-th remaining one
            X_forward1_trial = X_forward1.copy()
            X_forward1_trial[X_remain1.columns[j]] = X_remain1[X_remain1.columns[j]]

            X_forward2_trial = X_forward2.copy()
            X_forward2_trial[X_remain2.columns[j]] = X_remain2[X_remain2.columns[j]]

            X_forward3_trial = X_forward3.copy()
            X_forward3_trial[X_remain3.columns[j]] = X_remain3[X_remain3.columns[j]]

            a1 = np.mean(cross_val_score(knn, X_forward1_trial, y, cv=outer_cv))
            a2 = np.mean(cross_val_score(forest, X_forward2_trial, y, cv=outer_cv))
            a3 = np.mean(cross_val_score(svc, X_forward3_trial, y, cv=outer_cv))

            if a1 < min_score1:
                min_score1 = a1
                name1 = X_remain1.columns[j]

            if a2 < min_score2:
                min_score2 = a2
                name2 = X_remain2.columns[j]

            if a3 < min_score3:
                min_score3 = a3
                name3 = X_remain3.columns[j]

        # Move the chosen sensor from the remaining set to the selected set
        X_forward1[name1] = X_remain1[name1]
        X_forward2[name2] = X_remain2[name2]
        X_forward3[name3] = X_remain3[name3]

        X_remain1 = X_remain1.drop(name1, axis=1)
        X_remain2 = X_remain2.drop(name2, axis=1)
        X_remain3 = X_remain3.drop(name3, axis=1)

        List1_forward.append(name1)
        List2_forward.append(name2)
        List3_forward.append(name3)

    # The single sensor left over is appended last
    List1_forward.append(X_remain1.columns[0])
    List2_forward.append(X_remain2.columns[0])
    List3_forward.append(X_remain3.columns[0])

    # The sensor added first gets 9 points, the one left over gets 0
    for l in range(10):
        order1_forward[List1_forward[l]] += 9 - l
        order2_forward[List2_forward[l]] += 9 - l
        order3_forward[List3_forward[l]] += 9 - l

# Sum the points of the three classifiers per sensor
order_forward = {}
for i in range(10):
    sensor = 'sensor' + str(i)
    order_forward[sensor] = (order1_forward[sensor] + order2_forward[sensor]
                             + order3_forward[sensor])

# sorted() returns lists of (sensor, points) pairs in ascending order of points
sorted_order1 = sorted(order1_forward.items(), key=lambda x: x[1])
sorted_order2 = sorted(order2_forward.items(), key=lambda x: x[1])
sorted_order3 = sorted(order3_forward.items(), key=lambda x: x[1])
order_forward = sorted(order_forward.items(), key=lambda x: x[1])

print('Ranking sensors with forward selection: ', order_forward)

Ranking sensors with forward selection:  [('sensor6', 28), ('sensor8', 56), ('sensor4', 108), ('sensor2', 197), ('sensor0', 224), ('sensor1', 318), ('sensor3', 346), ('sensor5', 411), ('sensor9', 472), ('sensor7', 540)]


In [10]:
# Rank the sensors by backward selection. The printed list is sorted by ascending
# rank points, i.e. from the most to the least important sensor.
#
# Backward algorithm, repeated m times for three classifiers with the points summed up:
# Step 1: Start with all sensors in the list. Use the grid search to find the best
#         parameters for each classifier on the sensors currently in the list.
# Step 2: Eliminate each sensor of the list in turn and compute the score using
#         k-fold cross validation.
#         The sensor whose elimination gives the minimum score is ranked better
#         than the others in the list; remove it from the list.
# Step 3: Return to step 1 until only one sensor is left; it is appended last.
#
# Suffixes 1 / 2 / 3 denote the K-Neighbors / Random forest / SVM track.

# Accumulated rank points per sensor
order1_back = {}
order2_back = {}
order3_back = {}

for i in range(10):
    order1_back['sensor' + str(i)] = 0
    order2_back['sensor' + str(i)] = 0
    order3_back['sensor' + str(i)] = 0

m = 20  # number of iterations

for k in range(m):

    # Sensors in the order in which they were removed during this iteration
    List1_back = []
    List2_back = []
    List3_back = []

    # Sensors still in the list (start with all of them)
    X_back1 = X.copy()
    X_back2 = X.copy()
    X_back3 = X.copy()

    for i in range(9):

        # Running minimum of the candidate scores. It starts at +inf so the first
        # candidate is always accepted; starting at 1 would select nothing when
        # every candidate reaches a perfect score of 1.0, and a stale sensor name
        # (from the previous round or from another cell) would be dropped instead.
        min_score1 = np.inf
        min_score2 = np.inf
        min_score3 = np.inf

        Class_KNN_GridSearch.fit(X_back1, y)
        Class_RandomForest_GridSearch.fit(X_back2, y)
        Class_SVM_Gridsearch.fit(X_back3, y)

        # The tuned parameters are fixed for this round, so the estimators are
        # built once; cross_val_score fits clones and leaves them untouched.
        knn = KNeighborsClassifier(
            n_neighbors=Class_KNN_GridSearch.best_params_['n_neighbors'])
        forest = RandomForestClassifier(
            max_depth=Class_RandomForest_GridSearch.best_params_['max_depth'],
            n_estimators=Class_RandomForest_GridSearch.best_params_['n_estimators'])
        svc = svm.SVC(gamma='scale', C=Class_SVM_Gridsearch.best_params_['C'])

        for j in range(10 - i):

            # Candidate feature sets: the current list without its j-th sensor
            data1 = X_back1.drop([X_back1.columns[j]], axis=1)
            data2 = X_back2.drop([X_back2.columns[j]], axis=1)
            data3 = X_back3.drop([X_back3.columns[j]], axis=1)

            a1 = np.mean(cross_val_score(knn, data1, y, cv=outer_cv))
            a2 = np.mean(cross_val_score(forest, data2, y, cv=outer_cv))
            a3 = np.mean(cross_val_score(svc, data3, y, cv=outer_cv))

            if a1 < min_score1:
                min_score1 = a1
                name1 = X_back1.columns[j]
            if a2 < min_score2:
                min_score2 = a2
                name2 = X_back2.columns[j]
            if a3 < min_score3:
                min_score3 = a3
                name3 = X_back3.columns[j]

        # Remove the chosen sensor from the list
        X_back1 = X_back1.drop([name1], axis=1)
        X_back2 = X_back2.drop([name2], axis=1)
        X_back3 = X_back3.drop([name3], axis=1)

        List1_back.append(name1)
        List2_back.append(name2)
        List3_back.append(name3)

    # The single sensor left over is appended last
    List1_back.append(X_back1.columns[0])
    List2_back.append(X_back2.columns[0])
    List3_back.append(X_back3.columns[0])

    # The sensor removed first gets 0 points, the one left over gets 9
    for l in range(10):
        order1_back[List1_back[l]] += l
        order2_back[List2_back[l]] += l
        order3_back[List3_back[l]] += l


# Sum the points of the three classifiers per sensor
order_back = {}
for i in range(10):
    sensor = 'sensor' + str(i)
    order_back[sensor] = (order1_back[sensor] + order2_back[sensor]
                          + order3_back[sensor])

# After sorting, each result is a list of (sensor, points) pairs
sorted_order1_back = sorted(order1_back.items(), key=lambda x: x[1])
sorted_order2_back = sorted(order2_back.items(), key=lambda x: x[1])
sorted_order3_back = sorted(order3_back.items(), key=lambda x: x[1])
order_back = sorted(order_back.items(), key=lambda x: x[1])


print('Ranking sensors with backward selection: ', order_back)

Ranking sensors with backward selection:  [('sensor6', 23), ('sensor8', 70), ('sensor4', 111), ('sensor2', 212), ('sensor0', 237), ('sensor1', 308), ('sensor3', 337), ('sensor5', 400), ('sensor9', 470), ('sensor7', 532)]
