In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
np.random.seed(1000)

In [2]:
#reading the data
file = "../tables/scores_answers.csv"

data = pd.read_csv(file,index_col=['noc','workshop.number'])
data.sort_index(inplace=True)
data.loc[data.share == 'remain constant','share'] = 'constant'
data.dropna(inplace=True)

In [3]:
#splitting up x and y
x = data.drop(['absolute','share','Unnamed: 0','noc_code'],axis=1) #making x data frame
x['work_num'] = x.index.get_level_values(1) #making workshop number a variable as well as an index
x.drop_duplicates(inplace=True)
x = np.round(x).astype(int)#round x to make discrete

#creating y variables
y_abs = pd.DataFrame(data['absolute']).pivot_table(index = ['noc','workshop.number'], columns = 'absolute', aggfunc = len).fillna(0)
y_abs['sum'] = y_abs.sum(axis = 1)
y_abs['not_increase'] = y_abs['fewer'] + y_abs['same']
y_abs.loc[:,y_abs.columns!='sum'] = y_abs.loc[:,y_abs.columns!='sum'].divide(y_abs['sum'],axis=0)
y_abs['y'] = y_abs[['fewer','more','same']].idxmax(axis=1)
y_abs['binned_y'] = y_abs[['more','not_increase']].idxmax(axis=1)

y_share = pd.DataFrame(data['share']).pivot_table(index = ['noc','workshop.number'], columns = 'share', aggfunc = len).fillna(0)
y_share['sum'] = y_share.sum(axis = 1)
y_share['not_increase'] = y_share['decrease'] + y_share['constant']
y_share.loc[:,y_share.columns!='sum'] = y_share.loc[:,y_share.columns!='sum'].divide(y_share['sum'],axis=0)
y_share['y'] = y_share[['constant','decrease','increase']].idxmax(axis=1)
y_share['binned_y'] = y_share[['increase','not_increase']].idxmax(axis=1)

In [4]:
#splitting up into random training and validation sets. We have decided to just use the whole data set with kfold validation 
#because of how little data there is. Going to leave this code here in case we change our mind
train_set = np.random.choice(x.index,size=int(len(x.index)*0.8),replace=False)
test_set = np.logical_not(x.index.isin(train_set))

x_train = x.loc[train_set]
x_test = x.loc[test_set]

#full y set for k-fold
y = [y_abs['y'],y_share['y'],
     y_abs['binned_y'],y_share['binned_y'],
     y_abs[['more','same']],y_share[['increase','constant']],
     y_abs['more'],y_share['increase']]

#training and validation y sets if we want to us them
y_train = np.empty(8,dtype = pd.Series)
y_test = np.empty(8,dtype = pd.Series)

for i in range(8):
    y_train[i] = y[i].loc[train_set]
    y_test[i] = y[i].loc[test_set]

ok lets try some models
our y var has 8 binary types that can be combined so there are 8 distinct versions of the y variable. The types are
 - absolute vs share (which answer from the expert we use)
 - binned vs not (whether we make use tertiary answers or bin into binary)
 - categories vs distribution (whether we use regression forests to try and fit answer proportions. Also another   possibillity
is using the probabilities that come from counting tree votes)

TOD0 - make a specific model specific parameter grid for every model? Would do this by looking for edge cases in
chosen paramters and adjusting accordingly 
- try to figure out how to use these premades to make a model that is fitted based on the proportion of trees voting for an outcome

old param grids:
param_grid= {'max_features': np.asarray(range(9,25,2)),#number of features a tree selects
    'n_estimators':[100,150,250,275,300],#number of trees
    'min_samples_leaf': [1,2,4,8],#minimum number of data points can be used to make a leaf at the end of a tree
    'min_samples_split': [5,10,15],#min number of data points to split a branch
    'criterion':['gini','entropy']}#metric to order features for each treet

param_grid_regres= {'max_features': np.asarray(range(9,25,2)),#number of features a tree selects
    'n_estimators':[100,150,250,275,300],#number of trees
    'min_samples_leaf': [1,2,4,8],#minimum number of data points can be used to make a leaf at the end of a tree
    'min_samples_split': [5,10,15],#min number of data points to split a branch
    'criterion':['mse','mae']}#metric to order features for each tree
    
then...

tailored_param_grids = [{'n_estimators':[300,350,400],'min_samples_split': [15,18,21]},
                        {'n_estimators':[125,150,175],'min_samples_split': [8,10,12], 'min_samples_leaf': [8,10,12]},
                        {'n_estimators':[300,350,400],'min_samples_split': [8,10,12]},
                        {'n_estimators':[300,350,400],'min_samples_split': [8,10,12]},
                        {'n_estimators':[300,350,400],'min_samples_split': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [15,18,21], 'min_samples_leaf': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5], 'min_samples_leaf': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5]}]
                        
                        
                        tailored_rf = [RandomForestClassifier(criterion = 'gini',max_features=21,min_samples_leaf=4,min_samples_split=18),
               RandomForestClassifier(criterion = 'gini',max_features=23,min_samples_leaf=8,min_samples_split=12),
               RandomForestClassifier(criterion = 'gini',max_features=15,min_samples_leaf=1,min_samples_split=10),
               RandomForestClassifier(criterion = 'gini',max_features=11,min_samples_leaf=4,min_samples_split=12),
               RandomForestRegressor(criterion='mse',max_features=13),
               RandomForestRegressor(criterion='mse',max_features=11),
               RandomForestRegressor(criterion='mse',max_features=11,min_samples_leaf=2),
               RandomForestRegressor(criterion='mse',max_features=15,min_samples_leaf=1)]

tailored_param_grids = [{'n_estimators':[300,325]},
                        {'n_estimators':[150,175,200]},
                        {'n_estimators':[400,450,500]},
                        {'n_estimators':[400,450,500],'min_samples_split': [8,10,12]},
                        {'n_estimators':[350,375],'min_samples_split': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [15,18,21], 'min_samples_leaf': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5], 'min_samples_leaf': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5]}]

In [None]:
tailored_rf = [RandomForestClassifier(criterion = 'gini',max_features=21,min_samples_leaf=4,min_samples_split=18,n_estimators=300,n_jobs=-1),
               RandomForestClassifier(criterion = 'gini',max_features=23,min_samples_leaf=8,min_samples_split=12,n_estimators=300,n_jobs=-1),
               RandomForestClassifier(criterion = 'gini',max_features=15,min_samples_leaf=1,min_samples_split=10,n_estimators=300,n_jobs=-1),
               RandomForestClassifier(criterion = 'gini',max_features=11,min_samples_leaf=4,min_samples_split=10,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=10, min_samples_split=10,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=10, min_samples_split=15,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_split=10,min_samples_leaf=2,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=2, min_samples_split=5,n_estimators=300,n_jobs=-1)]

param_grid = [{'max_features':[11,23,120]},{"max_depth":np.append(np.asarray(range(10,110,10)),None)}]

                        
selected_models = np.empty(8,dtype = RandomForestClassifier)
scores = np.zeros(8)
features = np.empty(8, dtype = np.ndarray)

for i in range(8):
    if i<4:
        search = GridSearchCV(tailored_rf[i],param_grid,cv=5,n_jobs=-1,iid=False)
    else:
        search = GridSearchCV(tailored_rf[i],param_grid,scoring=make_scorer(mean_absolute_error),cv=5,n_jobs=-1,iid=False)
    search.fit(x,y[i])
    selected_models[i] = search.best_estimator_ 
    scores[i] = search.best_score_
    features[i] = selected_models[i].feature_importances_    

In [None]:
tailored_rf = [RandomForestClassifier(criterion = 'gini',max_features=21,min_samples_leaf=4,min_samples_split=18,n_estimators=300,n_jobs=-1),
               RandomForestClassifier(criterion = 'gini',max_features=23,min_samples_leaf=8,min_samples_split=12,n_estimators=300,n_jobs=-1),
               RandomForestClassifier(criterion = 'gini',max_features=15,min_samples_leaf=1,min_samples_split=10,n_estimators=300,n_jobs=-1),
               RandomForestClassifier(criterion = 'gini',max_features=11,min_samples_leaf=4,min_samples_split=10,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=10, min_samples_split=10,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=10, min_samples_split=15,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_split=10,min_samples_leaf=2,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=2, min_samples_split=5,n_estimators=300,n_jobs=-1)]

selected_models = np.empty(8,dtype = RandomForestClassifier)
scores = np.zeros(8)
features = np.empty(8, dtype = np.ndarray)

for i in range(8):
    rf = tailored_rf[i]
    rf.fit(x,y[i])
    if i<4:
        scores[i] = np.mean(cross_val_score(rf,x,y[i],cv=5,n_jobs=-1))
    else:
        scores[i] = np.mean(cross_val_score(rf,x,y[i],scoring=make_scorer(mean_squared_error),cv=5,n_jobs=-1))
    features[i] = rf.feature_importances_    

In [None]:
scores

In [None]:
selected_models

In [None]:
feature_grid = pd.DataFrame(np.stack(features, axis=0 ),columns=x.columns)

In [None]:
feature_grid.iloc[0,:].sort_values( ascending = False)[0:10]

In [None]:
feature_grid.iloc[1,:].sort_values( ascending = False)[0:10]

In [None]:
feature_grid.iloc[2,:].sort_values( ascending = False)[0:10]

In [None]:
feature_grid.iloc[3,:].sort_values( ascending = False)[0:10]

In [None]:
feature_grid.iloc[4,:].sort_values( ascending = False)[0:10]

In [None]:
feature_grid.iloc[5,:].sort_values( ascending = False)[0:10]

In [None]:
feature_grid.iloc[6,:].sort_values( ascending = False)[0:10]

In [None]:
feature_grid.iloc[7,:].sort_values( ascending = False)[0:10]

make pipe in which features are trimmed by a Recursive Feature Evaluation and then inputed into a random forest. Grid search
will be used to select the number of features to be inittially trimed and then max_features for the random forest. This may
well be overkill but it is interesting to see if there is improvement in a tree never seeing certain features. 

currently setting params 
on what we learned in the earlier grid search, but it might be good to grid search in this context
as well

In [None]:
rfe_random_forests = [
    RFE(RandomForestClassifier(criterion = 'gini',max_features=21,min_samples_leaf=4,min_samples_split=18,n_estimators=300,n_jobs=-1)),
    RFE(RandomForestClassifier(criterion = 'gini',max_features=23,min_samples_leaf=8,min_samples_split=12,n_estimators=300,n_jobs=-1)),
    RFE(RandomForestClassifier(criterion = 'gini',max_features=15,min_samples_leaf=1,min_samples_split=10,n_estimators=300,n_jobs=-1)),
    RFE(RandomForestClassifier(criterion = 'gini',max_features=11,min_samples_leaf=4,min_samples_split=10,n_estimators=300,n_jobs=-1)),
]

param_grid = {'n_features_to_select': np.asarray(range(30,120,10))}

selected_models = np.empty(4,dtype = Pipeline)
scores = np.zeros(4)
features = np.empty(4, dtype = np.ndarray)

for i in range(4): 
    search = GridSearchCV(rfe_random_forests[i],param_grid,cv=5,n_jobs=-1,iid=False)
    search.fit(x,y[i])
    selected_models[i] = search.best_estimator_ 
    scores[i] = search.best_score_
    features[i] = selected_models[i].get_support()

In [None]:
scores

In [None]:
selected_models

In [5]:
x_array = np.asarray(x)

In [7]:
rf =[ #share_bin,share_cont,abs_bin_cont,share_bin_cont
RandomForestClassifier(criterion = 'gini',class_weight='balanced',max_features='auto',min_samples_leaf=4,min_samples_split=10, n_estimators=2000,n_jobs=-1),
RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=10, min_samples_split=15,n_estimators=2000,n_jobs=-1),
RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=2, min_samples_split=5,n_estimators=2000,n_jobs=-1),
]

sfs_rf = [
SFS(estimator = rf[0],k_features=(30,90),forward=False,floating=True,scoring='accuracy',n_jobs=-1,verbose=1),
SFS(estimator = rf[2],k_features=(30,90),forward=False,floating=True,scoring=make_scorer(mean_squared_error),n_jobs=-1,verbose=1)
]

In [7]:
#ran july 29th
sfs_rf[0].fit(x_array,y[3])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 121 out of 121 | elapsed:  4.5min finished
Features: 120/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  4.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   20.2s finished
Features: 119/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 119 out of 119 | elapsed:  4.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   21.7s finished
Features: 118/30[Parallel(n_jobs=-1)]: Usi

[Parallel(n_jobs=-1)]: Done 101 out of 101 | elapsed:  3.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:   34.3s remaining:   34.3s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   58.7s finished
Features: 100/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:   34.6s remaining:   26.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   59.4s finished
Features: 99/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  99 out of  99 | elapsed:  3.8min finished
[Parallel(n_jobs=-1)

[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  3.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.6min finished
Features: 80/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  41 out of  41 | elapsed:  1.6min finished
Features: 79/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  79 out of  79 | elapsed:  2.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 out of  42 |

[Parallel(n_jobs=-1)]: Done  61 out of  61 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.2min finished
Features: 60/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  61 out of  61 | elapsed:  2.2min finished
Features: 59/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  59 out of  59 | elapsed:  2.2min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 co

[Parallel(n_jobs=-1)]: Done  41 out of  41 | elapsed:  1.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.8min finished
Features: 40/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  3.1min finished
Features: 39/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  39 out of  39 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  82 out of  82 |

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=4,
            min_samples_split=10, min_weight_fraction_leaf=0.0,
            n_estimators=2000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
             floating=True, forward=False, k_features=(30, 90), n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='accuracy', verbose=1)

In [8]:
with open('SFS_auto.txt', 'w') as f:
    f.write(str(sfs_rf[0].k_score_))
    f.write(str(sfs_rf[0].k_feature_idx_))

In [8]:
#running july 30th
sfs_rf[1].fit(x_array,y[7])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 121 out of 121 | elapsed:  5.0min finished
Features: 120/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  4.9min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   18.0s finished
Features: 119/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 119 out of 119 | elapsed:  4.8min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:   20.6s finished
Features: 118/30[Parallel(n_jobs=-1)]: Usi

[Parallel(n_jobs=-1)]: Done 101 out of 101 | elapsed:  4.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  20 | elapsed:   36.6s remaining:   36.6s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   57.7s finished
Features: 100/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  4.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:   36.6s remaining:   27.5s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   58.2s finished
Features: 99/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  99 out of  99 | elapsed:  4.0min finished
[Parallel(n_jobs=-1)

[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:  4.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 out of  37 | elapsed:  1.7min finished
Features: 83/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  83 out of  83 | elapsed:  4.3min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 out of  38 | elapsed:  2.2min finished
Features: 82/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  82 out of  82 | elapsed:  4.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  39 out of  39 | elapsed:  2.1min finished
Features: 81/30[Parallel(n_jobs=-1)

[Parallel(n_jobs=-1)]: Done  53 out of  53 | elapsed:  2.3min finished
Features: 67/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  67 out of  67 | elapsed:  3.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  3.0min finished
Features: 66/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  66 out of  66 | elapsed:  3.6min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed:  2.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 co

[Parallel(n_jobs=-1)]: Done  59 out of  59 | elapsed:  3.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  62 out of  62 | elapsed:  3.1min finished
Features: 58/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  58 out of  58 | elapsed:  2.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:  2.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  62 out of  62 | elapsed:  2.7min finished
Features: 58/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 co

[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:  3.6min finished
Features: 50/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  71 out of  71 | elapsed:  2.8min finished
Features: 49/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  49 out of  49 | elapsed:  2.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  3.5min finished
Features: 48/30[Parallel(n_jobs=-1)]: Using backend LokyBac

[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  84 out of  84 | elapsed:  3.3min finished
Features: 36/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  2.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  85 out of  85 | elapsed:  3.4min finished
Features: 35/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  86 out of  86 | elapsed:  3.7min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      |

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=5,
           min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
             floating=True, forward=False, k_features=(30, 90), n_jobs=-1,
             pre_dispatch='2*n_jobs',
             scoring=make_scorer(mean_squared_error), verbose=1)

In [9]:
with open('SFS_share_binned.txt', 'w') as f:
    f.write(str(sfs_rf[3].k_score_))
    f.write(str(sfs_rf[3].k_feature_idx_))

IndexError: list index out of range

In [31]:
sfs_rf[1].fit(x_array,y[5])

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 121 out of 121 | elapsed:  1.0min finished
Features: 120/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 120 out of 120 | elapsed:  1.0min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.8s finished
Features: 119/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 119 out of 119 | elapsed:   59.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    5.3s finished
Features: 118/30[Parallel(n_jobs=-1)]: Usi

[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed:   51.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  19 | elapsed:    7.8s remaining:   10.7s
[Parallel(n_jobs=-1)]: Done  19 out of  19 | elapsed:   12.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  18 | elapsed:    7.5s remaining:   15.0s
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:   12.5s finished
Features: 102/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed:   41.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 out of  19 | elapsed:    5.8s remaining:    7.9s
[Parallel(n_jobs=-1)]: Do

[Parallel(n_jobs=-1)]: Done  93 out of  93 | elapsed:   33.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 out of  28 | elapsed:   10.9s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  28 out of  28 | elapsed:   11.0s finished
Features: 92/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done  92 out of  92 | elapsed:   34.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 out of  29 | elapsed:   11.0s finished
Features: 91/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  91 out of  91 | elapsed:   33.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Do

[Parallel(n_jobs=-1)]: Done  45 out of  45 | elapsed:   22.9s finished
Features: 75/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:   37.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 out of  46 | elapsed:   22.9s finished
Features: 74/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done  74 out of  74 | elapsed:   29.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  47 out of  47 | elapsed:   17.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 out of  46 | elapsed:   17.0s finished
Features: 74/30[Parallel(n_jobs=-1)

Features: 60/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   22.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done  61 out of  61 | elapsed:   22.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   22.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  59 out of  59 | elapsed:   21.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.4s
[Paral

[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done  59 out of  59 | elapsed:   28.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done  62 out of  62 | elapsed:   29.4s finished
Features: 58/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done  58 out of  58 | elapsed:   28.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.2s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed:   30.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done  62 out of  62 | elapsed:   30.3s finished
Features: 58

[Parallel(n_jobs=-1)]: Done  44 out of  44 | elapsed:   22.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done  77 out of  77 | elapsed:   32.7s finished
Features: 43/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  43 out of  43 | elapsed:   15.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  78 out of  78 | elapsed:   27.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done  77 out of  77 | elapsed:   27.5s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   10.

[Parallel(n_jobs=-1)]: Done  86 out of  86 | elapsed:   42.9s finished
Features: 34/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 out of  34 | elapsed:   19.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done  87 out of  87 | elapsed:   43.5s finished
Features: 33/30[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:   19.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.3s
[Parallel(n_jobs=-1)]: Done  88 out of  88 | elapsed:   43.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.7s
[Parallel(n_jobs=-1)]: Done  87 out of  87 |

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=10, min_samples_split=15,
           min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
             floating=True, forward=False, k_features=(30, 90), n_jobs=-1,
             pre_dispatch='2*n_jobs',
             scoring=make_scorer(mean_squared_error), verbose=1)

In [32]:
with open('SFS.txt', 'w') as f:
    f.write(str(sfs_rf[0].k_score_))
    f.write(str(sfs_rf[0].k_feature_idx_))
    f.write(str(sfs_rf[1].k_score_))
    f.write(str(sfs_rf[1].k_feature_idx_))

In [34]:
sfs_rf[0].k_score_

0.8234927536231883

In [38]:
len(sfs_rf[0].k_feature_idx_)

SyntaxError: unexpected EOF while parsing (<ipython-input-38-35faa4fff774>, line 1)

In [39]:
sfs_rf[1].k_score_

0.05864193844235257

In [40]:
len(sfs_rf[1].k_feature_idx_)

32

In [None]:
with open('SFS_auto.txt', 'w') as f:
    f.write(str(sfs_rf[0].k_score_))
    f.write(str(sfs_rf[0].k_feature_idx_))