In [16]:
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from project_utils import runModel, split_and_scale
from sklearn import feature_selection
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
import time
np.random.seed(42)

In [17]:
#Set data for binary classification

df = pd.read_csv('npf_train.csv')
df = df.drop(columns=['date', 'id', 'partlybad'])
df['class2'] = (df['class4'] == 'nonevent').astype(int)
X = df.drop(columns=['class4', 'class2'])
#X = X[["CO2168.mean", "CO242.mean", "CO2504.mean"]]
y = df['class2']



In [18]:
#Define your model parametrs here and add them to the list

rf = RandomForestClassifier(n_estimators=50, random_state=42, class_weight={1: 0.55, 0:0.45}, min_samples_leaf=5, min_samples_split=5)
lr = LogisticRegressionCV(max_iter=10)
nb = GaussianNB()
svm = SVC(probability=True)
knn = KNeighborsClassifier(n_neighbors=5)
#xcols = xdf.columns

models = [rf, lr, nb, svm, knn]

In [19]:
def splitAndScaleDate(X, y, split=0.33):
    scaler = StandardScaler()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split)
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.fit_transform(X_test)
    return X_train, X_test, y_train, y_test

In [20]:
def runModel(m, X_, y_):
    time_start = time.time()
    classifier_pipeline = make_pipeline(StandardScaler(), m)
    
    cv = KFold(n_splits=10, random_state=0, shuffle=False)
    
    sfs1 = SFS(classifier_pipeline, 
        k_features=(1, 100), 
        forward=True, 
        scoring='accuracy',
        cv=cv,
        n_jobs=-1
    )
    sfs1.fit(X_,y_)
    time_end = time.time()
    time_run = time_end - time_start
    print('best combination (ACC: %.3f): %s\n' % (sfs1.k_score_, sfs1.k_feature_idx_), time_run)
    
    d = {"model": m, "score": sfs1.k_score_, "combination": sfs1.k_feature_idx_}
    return d

In [21]:
time_start = time.time()
#X_train, X_test, y_train, y_test = splitAndScaleDate(X, y)
scores = [runModel(m, X.iloc[:, 0:100], y) for m in models]
results = pd.DataFrame(scores)
time_end = time.time()
print(time_end - time_start)
results.head()



best combination (ACC: 0.889): (3, 12, 20, 23, 29, 31, 34, 40, 43, 53, 69, 70, 77, 80, 83, 92, 96)
 626.5888030529022
best combination (ACC: 0.882): (0, 2, 4, 5, 6, 9, 10, 11, 12, 17, 18, 20, 25, 31, 32, 33, 39, 42, 47, 48, 49, 50, 52, 53, 54, 56, 57, 61, 62, 63, 64, 67, 69, 74, 75, 76, 79, 85, 89, 91, 94, 98)
 848.2930130958557
best combination (ACC: 0.865): (0, 4, 10, 11, 12, 16, 18, 48, 49, 51, 53, 54, 55, 57, 58, 69, 70, 84, 86, 98, 99)
 18.885116815567017
best combination (ACC: 0.898): (10, 12, 15, 17, 21, 24, 50, 67, 68, 70, 71, 80, 98, 99)
 172.72309827804565
best combination (ACC: 0.898): (16, 70, 71, 80, 82, 98)
 35.30685496330261
1701.8022410869598


Unnamed: 0,model,score,combination
0,"RandomForestClassifier(class_weight={0: 0.45, ...",0.888792,"(3, 12, 20, 23, 29, 31, 34, 40, 43, 53, 69, 70..."
1,LogisticRegressionCV(max_iter=10),0.882271,"(0, 2, 4, 5, 6, 9, 10, 11, 12, 17, 18, 20, 25,..."
2,GaussianNB(),0.864879,"(0, 4, 10, 11, 12, 16, 18, 48, 49, 51, 53, 54,..."
3,SVC(probability=True),0.897585,"(10, 12, 15, 17, 21, 24, 50, 67, 68, 70, 71, 8..."
4,KNeighborsClassifier(),0.897536,"(16, 70, 71, 80, 82, 98)"


### Time
* 20 -> 38s
* 30 -> 85s
* 40 -> 155s best score: 0.856
* 50 -> 244s best score: 0.853
* 60 -> 358s best score: 0.856
* 70 -> 488s best score: 0.880
* 80 -> 644s best score: 0.884
* 90 -> 834s best score: 0.893

In [22]:
# 101
#max = 0
#best_shape = 0
#best_model = ""
#for i in range(1, 101):
#    #print(df_.iloc[:, 0:i].shape)
#    #print(i)
#    X_train, X_test, y_train, y_test = splitAndScaleDate(df_.iloc[:, 0:i], y)
#    scores = [runModel(m, X_train, X_test, y_train, y_test) for m in models]
#    results = pd.DataFrame(scores)
#    test_scores = results['test score'].to_numpy()
#    
#    if (np.amax(test_scores) > max):
#        max = np.amax(test_scores)
#        best_shape = df_.iloc[:, 0:i].shape
#        
#print(max)
#print(best_shape)

In [152]:
print(df_.iloc[:, 0:101].shape)
X_train, X_test, y_train, y_test = splitAndScaleDate(df_.iloc[:, 0:101], y)
scores = [runModel(m, X_train, X_test, y_train, y_test) for m in models]
results = pd.DataFrame(scores)
results.head()

(458, 100)


Unnamed: 0,model,train score,test score
0,"(DecisionTreeClassifier(max_features='auto', m...",0.944444,0.855263
1,LogisticRegressionCV(max_iter=1000),0.918301,0.815789
2,GaussianNB(),0.810458,0.763158
3,SVC(probability=True),0.921569,0.835526
4,KNeighborsClassifier(),0.911765,0.815789


In [168]:
classifier_pipeline = make_pipeline(StandardScaler(), svm)
cv = KFold(n_splits=10, random_state=0, shuffle=False)



In [227]:
sfs1 = SFS(classifier_pipeline, 
           k_features=(1, 20), 
           forward=True, 
           scoring='accuracy',
           cv=cv
          )

In [229]:
time_start = time.time()
sfs1.fit(df_.iloc[:, 0:[10, 12, 15, 17, 21, 24, 50, 67, 68, 70, 71, 80, 98, 99]],y)
time_end = time.time()
print(time_end - time_start)

17.101253986358643


In [217]:
sfs1.subsets_

{1: {'feature_idx': (0,),
  'cv_scores': array([0.47826087, 0.7173913 , 0.63043478, 0.67391304, 0.45652174,
         0.67391304, 0.58695652, 0.41304348, 0.73333333, 0.48888889]),
  'avg_score': 0.5852657004830918,
  'feature_names': ('CO2168.mean',)},
 2: {'feature_idx': (0, 3),
  'cv_scores': array([0.5       , 0.69565217, 0.67391304, 0.65217391, 0.47826087,
         0.7173913 , 0.63043478, 0.63043478, 0.8       , 0.6       ]),
  'avg_score': 0.6378260869565217,
  'feature_names': ('CO2168.mean', 'CO2336.std')},
 3: {'feature_idx': (0, 2, 3),
  'cv_scores': array([0.52173913, 0.7173913 , 0.67391304, 0.63043478, 0.45652174,
         0.73913043, 0.65217391, 0.58695652, 0.77777778, 0.6       ]),
  'avg_score': 0.6356038647342995,
  'feature_names': ('CO2168.mean', 'CO2336.mean', 'CO2336.std')},
 4: {'feature_idx': (0, 1, 2, 3),
  'cv_scores': array([0.5       , 0.67391304, 0.67391304, 0.65217391, 0.47826087,
         0.67391304, 0.58695652, 0.63043478, 0.77777778, 0.6       ]),
  'avg_sc

In [199]:
print('best combination (ACC: %.3f): %s\n' % (sfs1.k_score_, sfs1.k_feature_idx_))

best combination (ACC: 0.898): (10, 12, 15, 17, 21, 24, 50, 67, 68, 70, 71, 80, 98, 99)



In [24]:
x_t = df.iloc[:, [10, 12, 15, 17, 21, 24, 50, 67, 68, 70, 71, 80, 98, 99]]

In [33]:
s = SVC(probability=True)
X_train, X_test, y_train, y_test = splitAndScaleDate(x_t, y)

In [34]:
s.fit (X_train, y_train)

SVC(probability=True)

In [35]:
s.score(X_train, y_train)

0.8921568627450981

In [36]:
s.score(X_test, y_test)

0.875