In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
np.random.seed(1000)

In [3]:
#reading the data
file = "C:/Users/USERID/Documents/GitHub/Employment2030/Employment2030/tables/scores_answers.csv"

data = pd.read_csv(file,index_col=['noc','workshop.number'])
data.sort_index(inplace=True)
data.loc[data.share == 'remain constant','share'] = 'constant'
data.dropna(inplace=True)

In [4]:
#splitting up x and y
x = data.drop(['absolute','share','Unnamed: 0','noc_code'],axis=1) #making x data frame
x['work_num'] = x.index.get_level_values(1) #making workshop number a variable as well as an index
x.drop_duplicates(inplace=True)
x = np.round(x).astype(int)#round x to make discrete

#creating y variables
y_abs = pd.DataFrame(data['absolute']).pivot_table(index = ['noc','workshop.number'], columns = 'absolute', aggfunc = len).fillna(0)
y_abs['sum'] = y_abs.sum(axis = 1)
y_abs['not_increase'] = y_abs['fewer'] + y_abs['same']
y_abs.loc[:,y_abs.columns!='sum'] = y_abs.loc[:,y_abs.columns!='sum'].divide(y_abs['sum'],axis=0)
y_abs['y'] = y_abs[['fewer','more','same']].idxmax(axis=1)
y_abs['binned_y'] = y_abs[['more','not_increase']].idxmax(axis=1)

y_share = pd.DataFrame(data['share']).pivot_table(index = ['noc','workshop.number'], columns = 'share', aggfunc = len).fillna(0)
y_share['sum'] = y_share.sum(axis = 1)
y_share['not_increase'] = y_share['decrease'] + y_share['constant']
y_share.loc[:,y_share.columns!='sum'] = y_share.loc[:,y_share.columns!='sum'].divide(y_share['sum'],axis=0)
y_share['y'] = y_share[['constant','decrease','increase']].idxmax(axis=1)
y_share['binned_y'] = y_share[['increase','not_increase']].idxmax(axis=1)

In [33]:
#splitting up into random training and validation sets. We have decided to just use the whole data set with kfold validation 
#because of how little data there is. Going to leave this code here in case we change our mind
train_set = np.random.choice(x.index,size=int(len(x.index)*0.8),replace=False)
test_set = np.logical_not(x.index.isin(train_set))

x_train = x.loc[train_set]
x_test = x.loc[test_set]

#full y set for k-fold
y = [y_abs['y'],y_share['y'],
     y_abs['binned_y'],y_share['binned_y'],
     y_abs[['more','same']],y_share[['increase','constant']],
     y_abs['more'],y_share['increase']]

#training and validation y sets if we want to us them
y_train = np.empty(8,dtype = pd.Series)
y_test = np.empty(8,dtype = pd.Series)

for i in range(8):
    y_train[i] = y[i].loc[train_set]
    y_test[i] = y[i].loc[test_set]

ok lets try some models
our y var has 8 binary types that can be combined so there are 8 distinct versions of the y variable. The types are
 - absolute vs share (which answer from the expert we use)
 - binned vs not (whether we make use tertiary answers or bin into binary)
 - categories vs distribution (whether we use regression forests to try and fit answer proportions. Also another   possibillity
is using the probabilities that come from counting tree votes)

TOD0 - make a specific model specific parameter grid for every model? Would do this by looking for edge cases in
chosen paramters and adjusting accordingly 
- try to figure out how to use these premades to make a model that is fitted based on the proportion of trees voting for an outcome

old param grids:
param_grid= {'max_features': np.asarray(range(9,25,2)),#number of features a tree selects
    'n_estimators':[100,150,250,275,300],#number of trees
    'min_samples_leaf': [1,2,4,8],#minimum number of data points can be used to make a leaf at the end of a tree
    'min_samples_split': [5,10,15],#min number of data points to split a branch
    'criterion':['gini','entropy']}#metric to order features for each treet

param_grid_regres= {'max_features': np.asarray(range(9,25,2)),#number of features a tree selects
    'n_estimators':[100,150,250,275,300],#number of trees
    'min_samples_leaf': [1,2,4,8],#minimum number of data points can be used to make a leaf at the end of a tree
    'min_samples_split': [5,10,15],#min number of data points to split a branch
    'criterion':['mse','mae']}#metric to order features for each tree
    
then...

tailored_param_grids = [{'n_estimators':[300,350,400],'min_samples_split': [15,18,21]},
                        {'n_estimators':[125,150,175],'min_samples_split': [8,10,12], 'min_samples_leaf': [8,10,12]},
                        {'n_estimators':[300,350,400],'min_samples_split': [8,10,12]},
                        {'n_estimators':[300,350,400],'min_samples_split': [8,10,12]},
                        {'n_estimators':[300,350,400],'min_samples_split': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [15,18,21], 'min_samples_leaf': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5], 'min_samples_leaf': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5]}]
                        
                        
                        tailored_rf = [RandomForestClassifier(criterion = 'gini',max_features=21,min_samples_leaf=4,min_samples_split=18),
               RandomForestClassifier(criterion = 'gini',max_features=23,min_samples_leaf=8,min_samples_split=12),
               RandomForestClassifier(criterion = 'gini',max_features=15,min_samples_leaf=1,min_samples_split=10),
               RandomForestClassifier(criterion = 'gini',max_features=11,min_samples_leaf=4,min_samples_split=12),
               RandomForestRegressor(criterion='mse',max_features=13),
               RandomForestRegressor(criterion='mse',max_features=11),
               RandomForestRegressor(criterion='mse',max_features=11,min_samples_leaf=2),
               RandomForestRegressor(criterion='mse',max_features=15,min_samples_leaf=1)]

tailored_param_grids = [{'n_estimators':[300,325]},
                        {'n_estimators':[150,175,200]},
                        {'n_estimators':[400,450,500]},
                        {'n_estimators':[400,450,500],'min_samples_split': [8,10,12]},
                        {'n_estimators':[350,375],'min_samples_split': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [15,18,21], 'min_samples_leaf': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5], 'min_samples_leaf': [8,10,12]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5]},
                        {'n_estimators':[50,75,100],'min_samples_split': [2,3,5]}]

In [6]:
tailored_rf = [RandomForestClassifier(criterion = 'gini',max_features=21,min_samples_leaf=4,min_samples_split=18,n_estimators=300,n_jobs=-1),
               RandomForestClassifier(criterion = 'gini',max_features=23,min_samples_leaf=8,min_samples_split=12,n_estimators=300,n_jobs=-1),
               RandomForestClassifier(criterion = 'gini',max_features=15,min_samples_leaf=1,min_samples_split=10,n_estimators=300,n_jobs=-1),
               RandomForestClassifier(criterion = 'gini',max_features=11,min_samples_leaf=4,min_samples_split=10,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=10, min_samples_split=10,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=10, min_samples_split=15,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_split=10,min_samples_leaf=2,n_estimators=300,n_jobs=-1),
               RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=2, min_samples_split=5,n_estimators=300,n_jobs=-1)]

param_grid = [{'max_features':[11,23,120]},{"max_depth":np.append(np.asarray(range(10,110,10)),None)}]

                        
selected_models = np.empty(8,dtype = RandomForestClassifier)
scores = np.zeros(8)
features = np.empty(8, dtype = np.ndarray)

for i in range(8):
    if i<4:
        search = GridSearchCV(tailored_rf[i],param_grid,cv=5,n_jobs=-1,iid=False)
    else:
        search = GridSearchCV(tailored_rf[i],param_grid,scoring=make_scorer(mean_squared_error),cv=5,n_jobs=-1,iid=False)
    search.fit(x,y[i])
    selected_models[i] = search.best_estimator_ 
    scores[i] = search.best_score_
    features[i] = selected_models[i].feature_importances_    

In [44]:
scores

array([0.68984058, 0.49957971, 0.70743478, 0.77415942, 0.04297254,
       0.04481727, 0.06117511, 0.05732341])

In [45]:
selected_models

array([RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=21, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=18,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=23, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=8, min_samples_split=12,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=70, max_features=15, max_leaf_nodes=None,
 

In [32]:
pd.DataFrame(np.stack(features, axis=0 ),columns=x.columns).sort_values(by = 7, axis=1, ascending = False)

Unnamed: 0,value.Fluency of Ideas,work_num,value.Computers and Electronics,value.Memorization,value.Persuasion,value.Service Orientation,value.Transportation,value.Law and Government,value.Systems Evaluation,value.Technology Design,...,value.Night Vision,value.Response Orientation,value.English Language,value.Extent Flexibility,value.Originality,value.Far Vision,value.Active Listening,value.Oral Expression,value.Dynamic Flexibility,value.Selective Attention
0,0.091563,0.008616,0.006647,0.018168,0.111272,0.050141,0.009139,0.006777,0.053372,0.006285,...,0.000321,0.001886,0.003978,0.000547,0.038836,0.000102,0.005465,0.0,0.0,0.0
1,0.121404,0.007217,0.026108,0.010753,0.065919,0.060765,0.008313,0.02831,0.032267,0.016459,...,0.001462,0.001001,0.001147,0.001427,0.029868,2.9e-05,0.003462,0.0,0.0,0.0
2,0.034254,0.00733,0.048138,0.047906,0.029107,0.021205,0.011227,0.015505,0.021778,0.028965,...,0.003004,0.002994,0.006162,0.002291,0.016772,0.000752,0.001416,0.002666,0.002088,0.000193
3,0.006565,0.020614,0.194886,0.036824,0.009372,0.04777,0.028785,0.006572,0.005519,0.234203,...,0.004839,0.001386,0.000195,0.000118,0.000352,0.000661,0.001532,0.0,0.0,0.0
4,0.356364,0.004029,0.044305,0.0417,0.027282,0.097577,0.004563,0.028291,0.021154,0.005811,...,0.000396,0.0,0.0,0.000106,0.002083,0.0,0.0,0.0,0.0,0.0
5,0.331554,0.003989,0.046607,0.045984,0.030751,0.097194,0.003537,0.039819,0.022231,0.004422,...,0.000919,5e-06,0.000277,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.236842,0.048945,0.0393,0.037068,0.058409,0.050751,0.022238,0.016565,0.020357,0.010448,...,0.000431,0.001036,0.00139,0.000165,0.006199,0.000289,0.000287,0.000198,0.003882,0.0
7,0.146743,0.079557,0.077546,0.077271,0.071789,0.034595,0.030352,0.029625,0.018572,0.018495,...,0.000553,0.000539,0.000404,0.000365,0.00035,0.00031,0.000209,4e-05,0.0,0.0


make pipe in which features are trimmed by a Recursive Feature Evaluation and then inputed into a random forest. Grid search
will be used to select the number of features to be inittially trimed and then max_features for the random forest. This may
well be overkill but it is interesting to see if there is improvement in a tree never seeing certain features. 

currently setting params 
on what we learned in the earlier grid search, but it might be good to grid search in this context
as well

In [52]:
tailored_rf = [
    RFE(RandomForestClassifier(criterion = 'gini',max_features=21,min_samples_leaf=4,min_samples_split=18,n_estimators=300,n_jobs=-1)),
    RFE(RandomForestClassifier(criterion = 'gini',max_features=23,min_samples_leaf=8,min_samples_split=12,n_estimators=300,n_jobs=-1)),
    RFE(RandomForestClassifier(criterion = 'gini',max_features=15,min_samples_leaf=1,min_samples_split=10,n_estimators=300,n_jobs=-1)),
    RFE(RandomForestClassifier(criterion = 'gini',max_features=11,min_samples_leaf=4,min_samples_split=10,n_estimators=300,n_jobs=-1)),
    RFE(RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=10, min_samples_split=10,n_estimators=300,n_jobs=-1)),
    RFE(RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=10, min_samples_split=15,n_estimators=300,n_jobs=-1)),
    RFE(RandomForestRegressor(criterion='mse',max_features=None,min_samples_split=10,min_samples_leaf=2,n_estimators=300,n_jobs=-1)),
    RFE(RandomForestRegressor(criterion='mse',max_features=None,min_samples_leaf=2, min_samples_split=5,n_estimators=300,n_jobs=-1))]

param_grid = {'n_features_to_select': np.asarray(range(30,120,10))}

selected_models = np.empty(4,dtype = Pipeline)
scores = np.zeros(4)
features = np.empty(4, dtype = np.ndarray)

for i in range(4): 
    search = GridSearchCV(rfe_random_forests[i],param_grid,cv=5,n_jobs=-1,iid=False)
    search.fit(x,y[i])
    selected_models[i] = search.best_estimator_ 
    scores[i] = search.best_score_
    features[i] = selected_models[i].get_support()

In [33]:
rf = RandomForestClassifier(criterion = 'gini',max_features=11,min_samples_leaf=4,min_samples_split=10, n_estimators=300,n_jobs=-1)

sfs_rf = SFS(estimator = rf,k_features=60,forward=False,floating=True,scoring='accuracy',n_jobs=-1)
sfs_rf.fit(x,y[3])

SequentialFeatureSelector(clone_estimator=True, cv=5,
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=11, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=300, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
             floating=True, forward=False, k_features=60, n_jobs=-1,
             pre_dispatch='2*n_jobs', scoring='accuracy', verbose=0)

In [19]:
#forward sequential with 300 estimators
print(sfs_rf.k_score_)
print(sfs_rf.k_feature_names_)

0.8234927536231883
('value.Reading Comprehension', 'value.Active Listening', 'value.Writing', 'value.Speaking', 'value.Mathematics Skill', 'value.Science', 'value.Critical Thinking', 'value.Active Learning', 'value.Learning Strategies', 'value.Monitoring', 'value.Social Perceptiveness', 'value.Coordination', 'value.Persuasion', 'value.Negotiation', 'value.Instructing', 'value.Service Orientation', 'value.Complex Problem Solving', 'value.Operations Analysis', 'value.Technology Design', 'value.Equipment Selection', 'value.Installation', 'value.Operation Monitoring', 'value.Operation and Control', 'value.Equipment Maintenance', 'value.Troubleshooting', 'value.Repairing', 'value.Judgment and Decision Making', 'value.Systems Analysis', 'value.Systems Evaluation', 'value.Time Management', 'value.Management of Financial Resources', 'value.Management of Material Resources', 'value.Oral Comprehension', 'value.Written Comprehension', 'value.Oral Expression', 'value.Written Expression', 'value.Fl

In [21]:
#backward floating with 50 estimators
print(sfs_rf.k_score_)
print(sfs_rf.k_feature_names_)

0.8234927536231883
('value.Active Listening', 'value.Writing', 'value.Speaking', 'value.Mathematics Skill', 'value.Science', 'value.Critical Thinking', 'value.Active Learning', 'value.Learning Strategies', 'value.Social Perceptiveness', 'value.Coordination', 'value.Negotiation', 'value.Instructing', 'value.Complex Problem Solving', 'value.Technology Design', 'value.Equipment Selection', 'value.Installation', 'value.Programming', 'value.Operation Monitoring', 'value.Operation and Control', 'value.Equipment Maintenance', 'value.Troubleshooting', 'value.Repairing', 'value.Judgment and Decision Making', 'value.Systems Evaluation', 'value.Time Management', 'value.Management of Financial Resources', 'value.Management of Personnel Resources', 'value.Oral Comprehension', 'value.Written Comprehension', 'value.Oral Expression', 'value.Written Expression', 'value.Fluency of Ideas', 'value.Originality', 'value.Problem Sensitivity', 'value.Deductive Reasoning', 'value.Information Ordering', 'value.

In [34]:
#backward floating with 300 estimators
print(sfs_rf.k_score_)
print(sfs_rf.k_feature_names_)

0.8314927536231883
('value.Reading Comprehension', 'value.Active Listening', 'value.Writing', 'value.Speaking', 'value.Mathematics Skill', 'value.Science', 'value.Critical Thinking', 'value.Active Learning', 'value.Learning Strategies', 'value.Social Perceptiveness', 'value.Coordination', 'value.Persuasion', 'value.Negotiation', 'value.Instructing', 'value.Complex Problem Solving', 'value.Operations Analysis', 'value.Technology Design', 'value.Equipment Selection', 'value.Installation', 'value.Programming', 'value.Operation Monitoring', 'value.Operation and Control', 'value.Equipment Maintenance', 'value.Troubleshooting', 'value.Repairing', 'value.Quality Control Analysis', 'value.Judgment and Decision Making', 'value.Systems Analysis', 'value.Systems Evaluation', 'value.Management of Financial Resources', 'value.Management of Material Resources', 'value.Management of Personnel Resources', 'value.Oral Comprehension', 'value.Written Comprehension', 'value.Oral Expression', 'value.Writte