In [1]:
def tree_paths(tree):
    """Enumerate every root-to-leaf path of a decision tree.

    The tree is walked depth-first with an explicit stack; the right child
    of a split is visited before the left child.

    Parameters
    ----------
    tree : object
        Structure exposing ``children_left``, ``children_right`` and
        ``value``, each indexable by node id (presumably the ``tree_``
        attribute of a fitted scikit-learn tree -- confirm with callers).
        Only ``value[node][0][0]`` and ``value[node][0][1]`` are read.

    Returns
    -------
    (list, list)
        ``(paths, probs)``.  ``paths[k]`` lists the node ids from the root
        to the k-th leaf reached.  ``probs[k]`` holds, for each node on that
        path, ``value[node][0][0] / (value[node][0][0] + value[node][0][1])``
        rounded to three decimals.
    """
    left_child = tree.children_left
    right_child = tree.children_right
    node_values = tree.value

    all_paths = []
    all_probs = []
    current_path = []
    current_probs = []
    pending = [(0, -1)]  # (node id, depth of its parent); the root sits at depth 0

    while pending:
        node, parent_depth = pending.pop()
        depth = parent_depth + 1

        # Backtrack: drop everything below the depth at which this node sits,
        # so only its ancestors remain on the running path.
        del current_path[depth:]
        del current_probs[depth:]

        current_path.append(node)
        first = node_values[node][0][0]
        second = node_values[node][0][1]
        current_probs.append(round(first / (first + second), 3))

        if left_child[node] == right_child[node]:
            # Identical child ids mean there is no split here: record the path.
            all_paths.append(list(current_path))
            all_probs.append(list(current_probs))
        else:
            # Left is pushed first so that the right branch is explored first.
            pending.append((left_child[node], depth))
            pending.append((right_child[node], depth))

    return all_paths, all_probs

In [2]:
def forest_paths(model):
    """Collect the decision paths of every tree in a fitted ensemble.

    Parameters
    ----------
    model : object
        Fitted ensemble exposing ``estimators_``; every estimator must
        provide a ``tree_`` attribute accepted by ``tree_paths`` and carrying
        ``children_right``, ``threshold`` and ``feature`` arrays.

    Returns
    -------
    (list of pandas.DataFrame, float)
        One frame per tree with one row per root-to-leaf path and columns

        * ``features``   -- feature index tested at each split on the path,
        * ``thresholds`` -- split threshold at each split, kept as-is when
          the path continues into the right child and negated when it
          continues into the left child,
        * ``path_probs`` -- per-node probabilities returned by ``tree_paths``.

        The second element is the mean over trees of each tree's
        (unweighted) mean leaf probability.
    """
    forest_attributes = []
    baseline_means = []

    for estimator in model.estimators_:
        fitted_tree = estimator.tree_
        children_right = fitted_tree.children_right
        all_thresholds = fitted_tree.threshold
        all_features = fitted_tree.feature

        paths, path_probs = tree_paths(fitted_tree)

        # The last probability on every path belongs to its leaf.
        baseline_means.append(np.mean([probs[-1] for probs in path_probs]))

        path_thresholds = []
        path_features = []
        for path in paths:
            thresholds = []
            features = []
            for parent, child in zip(path[:-1], path[1:]):
                # Branch direction is folded into the sign of the threshold.
                # NOTE(review): this is only unambiguous for strictly positive
                # thresholds; a zero or negative threshold loses/inverts the
                # direction -- confirm the feature scaling guarantees this.
                if child == children_right[parent]:
                    thresholds.append(all_thresholds[parent])
                else:
                    thresholds.append(-all_thresholds[parent])
                features.append(all_features[parent])
            path_thresholds.append(thresholds)
            path_features.append(features)

        # Three rows of per-path lists, transposed into one row per path.
        tree_attributes = pd.DataFrame([path_features, path_thresholds, path_probs]).T
        tree_attributes.columns = ['features','thresholds','path_probs']
        forest_attributes.append(tree_attributes.copy())

    return forest_attributes, np.mean(baseline_means)

In [3]:
def init_influence_list(features,conditionals,product):
    """Build an empty accumulator keyed by (feature, conditional) pairs.

    Parameters
    ----------
    features, conditionals : iterable
        Labels for the first and second index level.
    product : bool
        If truthy, the index is the cartesian product of the two inputs;
        otherwise the inputs are paired element by element.

    Returns
    -------
    pandas.Series
        Series with a two-level ``MultiIndex`` whose every value is its own
        (initially empty) list.
    """
    make_index = pd.MultiIndex.from_product if product else pd.MultiIndex.from_arrays
    pair_index = make_index([features, conditionals])

    # A comprehension gives each entry a distinct list object to append to.
    empty_slots = [[] for _ in range(len(pair_index))]

    return pd.Series(empty_slots, index=pair_index)

In [4]:
def get_influences(feature_combos,model):
    """Aggregate per-split probability changes for (feature, conditional) pairs.

    For every path of every tree, each split on a tracked feature produces
    an "influence": the relative change of the path probability between the
    split node and the next node, multiplied by the sign of the stored
    threshold (``forest_paths`` negates the threshold for left branches).
    The influence is appended to every entry of ``feature_combos`` selected
    by ``.loc[current_feature, previous]``, where ``previous`` holds the
    features met earlier on the path plus ``-1``.

    Parameters
    ----------
    feature_combos : pandas.Series
        Two-level-indexed Series of lists, e.g. from ``init_influence_list``.
        It is MUTATED IN PLACE: calling this twice with the same Series
        accumulates influences from both calls.
    model : object
        Fitted ensemble passed to ``forest_paths``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``feature_combos``.  For every non-empty entry:
        ``pos_influence`` (mean of influences > 0), ``neg_influence`` (mean
        of influences <= 0), ``pct_pos`` (share of influences > 0) and
        ``path count`` (number of influences).  A mean over no values is
        NaN; entries without influences stay NaN.
    """
    forest_attributes, _baseline = forest_paths(model)  # baseline is not used here

    # Loop-invariant: the set of features we accumulate influences for.
    tracked_features = feature_combos.index.get_level_values(0)

    for tree_frame in forest_attributes:
        for _, path in tree_frame.iterrows():
            features = path['features']
            thresholds = path['thresholds']
            probs = path['path_probs']
            # NOTE(review): `features` has one entry per split, i.e. one fewer
            # than `probs`, so this range leaves out the last split of every
            # path (the one leading into the leaf) -- confirm this is intended.
            for step in range(len(features)-1):
                current_feature = features[step]
                if current_feature not in tracked_features:
                    continue
                # features already seen on this path, plus the -1 sentinel
                previous = np.append(features[0:step],-1)
                current_prob = probs[step]
                next_prob = probs[step+1]
                # NOTE(review): a probability rounded to 0 makes this inf/nan.
                pct_change = (next_prob-current_prob)/current_prob
                influence = np.sign(thresholds[step])*pct_change
                # add to all relevant combos
                for combo in feature_combos.loc[current_feature,previous]:
                    combo.append(influence)

    influences_df = pd.DataFrame(index=feature_combos.index)

    # `.items()` replaces `.iteritems()`, which was removed in pandas 2.0.
    for index, influences in feature_combos.items():
        if not influences:
            continue
        influences = np.asarray(influences)
        positive = influences[influences>0]
        non_positive = influences[influences<=0]
        # Guard the means so an empty side yields NaN without the
        # "Mean of empty slice" RuntimeWarning.
        influences_df.loc[index,'pos_influence'] = positive.mean() if positive.size else np.nan
        influences_df.loc[index,'neg_influence'] = non_positive.mean() if non_positive.size else np.nan
        influences_df.loc[index,'pct_pos'] = positive.size/len(influences)
        influences_df.loc[index,'path count'] = len(influences)
    return influences_df

In [69]:
def feature_name_index(table, columns=None):
    """Relabel a two-level positional feature index with column names.

    Both index levels of ``table`` are treated as integer positions and
    translated through ``columns``.  Positions follow normal negative
    indexing, so ``-1`` becomes the LAST column name.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Object with a two-level index of integer positions.  Its index is
        replaced IN PLACE; calling this again on the relabelled object will
        fail because the index then holds names, not positions.
    columns : sequence of labels, optional
        Column labels to translate positions with.  Pass the columns of the
        frame the model was actually trained on.  Defaults to the
        notebook-level ``x.columns`` for backward compatibility.

    Returns
    -------
    The same ``table`` object, with its index replaced.
    """
    if columns is None:
        # Legacy behaviour: fall back to the global training frame.
        columns = x.columns
    else:
        columns = pd.Index(columns)

    table.index = pd.MultiIndex.from_arrays([
        columns.take(table.index.get_level_values(0)),
        columns.take(table.index.get_level_values(1)),
    ])

    return table

In [5]:
%run utils_rf
from sklearn import tree

In [6]:
# Read the training tables (data_proccess is presumably provided by utils_rf).
file = "../../tables/noc_answers.csv"
x, x_agg, y, y_agg, x_noclvl, y_noclvl = data_proccess(file, True)

# Remove the six work_num_* columns from the feature matrix, in place.
x.drop(
    columns=[
        'work_num_1',
        'work_num_2',
        'work_num_3',
        'work_num_4',
        'work_num_5',
        'work_num_6',
    ],
    inplace=True,
)

# Keep only the distinct noc codes, used to cut test nocs out of the main noc table.
train_nocs = (
    pd.read_csv(file, usecols=['noc_code'])
    .drop_duplicates()
)

In [7]:
# Fit a forest on the full feature matrix; hyper-parameters come from init_params('cat').
# NOTE(review): the estimator repr printed below shows random_state=None, so
# the fitted trees (and every table derived from them) change between runs.
rf = RandomForestClassifier(**init_params('cat'))

rf.fit(x,y['increase'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=8, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:

# Hand-picked feature subset (presumably the outcome of a sequential floating
# forward selection run done elsewhere -- confirm).
sffs_prob_features = [
    'value.Learning Strategies',
    'value.Monitoring',
    'value.Persuasion',
    'value.Service Orientation',
    'value.Management of Material Resources',
    'value.Information Ordering',
    'value.Memorization',
    'value.Flexibility of Closure',
    'value.Time Sharing',
    'value.Finger Dexterity',
    'value.Far Vision',
    'value.Computers and Electronics',
    'value.Chemistry',
    'value.Biology',
]

# Feature matrix restricted to those columns, in the order listed above.
x_sffs = x[sffs_prob_features]

In [11]:
# Second forest, fitted only on the x_sffs column subset. Feature indices inside
# its trees therefore refer to positions in x_sffs.columns, not x.columns.
rf_sffs = RandomForestClassifier(**init_params('cat'))
rf_sffs.fit(x_sffs,y['increase'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=8, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
# Importance of every column of x according to the full forest.
rf_features = pd.Series(rf.feature_importances_, index=x.columns)

# Keep the 20 highest-ranked features ...
selected_features = rf_features.sort_values(ascending=False).iloc[:20]

# ... and translate them to positions in x.columns. The positions (and the
# names re-derived from them) come out in column order, not importance order.
selected_features_ix = np.flatnonzero(x.columns.isin(selected_features.index))
selected_features = x.columns[selected_features_ix]

In [9]:
# Accumulators for the top-ranked features conditioned on every feature of x
# plus the -1 sentinel that get_influences appends to each "previous" list.
# The conditional range is derived from x instead of a hard-coded 120.
all_combos = init_influence_list(selected_features_ix, range(-1, x.shape[1]), True)
selected_combos = get_influences(all_combos, rf)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [73]:
# rf_sffs was fitted on x_sffs, so its trees only contain feature positions
# 0 .. x_sffs.shape[1]-1. Size both levels from x_sffs: conditionals beyond that
# range (previously up to 119) can never be reached and only add empty rows.
all_sffs_combos = init_influence_list(
    range(x_sffs.shape[1]),
    range(-1, x_sffs.shape[1]),
    True,
)
selected_sffs_combos = get_influences(all_sffs_combos, rf_sffs)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [75]:
# NOTE(review): feature_name_index looks names up in x.columns, but the
# positions in this table come from rf_sffs, which was fitted on x_sffs.
# The resulting labels therefore likely name the wrong features (the table
# printed further down shows names that are not in sffs_prob_features);
# the positions should be translated with x_sffs.columns instead.
selected_sffs_combos = feature_name_index(selected_sffs_combos)

In [63]:
# One accumulator per feature of x, conditioned only on the -1 sentinel, i.e.
# every split on the feature counts regardless of what preceded it on the path.
# The feature range is derived from x instead of a hard-coded 120.
all_independents = init_influence_list(range(x.shape[1]), [-1], True)
independent_influences = get_influences(all_independents, rf)

In [70]:
# Swap positional feature ids for names from x.columns. The second level only
# holds the -1 sentinel here, which x.columns[-1] turns into the last column
# name. The relabelling happens in place, so re-running this cell on the
# already relabelled table is expected to fail.
independent_influences = feature_name_index(independent_influences)

In [72]:
# Rank features by their share of positive influences.
independent_influences.sort_values(by='pct_pos', ascending=False)

Unnamed: 0,Unnamed: 1,pos_influence,neg_influence,pct_pos,path count
value.Memorization,value.Transportation,0.303741,-0.139704,0.989091,1375.0
value.Fluency of Ideas,value.Transportation,0.406025,-0.128103,0.980210,1617.0
value.Systems Evaluation,value.Transportation,0.341401,-0.116726,0.970628,1464.0
value.Instructing,value.Transportation,0.319096,-0.145940,0.962264,901.0
value.Persuasion,value.Transportation,0.379083,-0.153903,0.961930,1865.0
value.Service Orientation,value.Transportation,0.298379,-0.160304,0.960057,1402.0
value.Fine Arts,value.Transportation,0.149403,-0.064652,0.959016,366.0
value.Originality,value.Transportation,0.347700,-0.156877,0.956120,1299.0
value.Number Facility,value.Transportation,0.344997,-0.070851,0.948421,950.0
value.Technology Design,value.Transportation,0.185255,-0.174036,0.942718,1030.0


In [62]:
# Summary for combos backed by more than 200 influences, restricted to the
# second-level label 'value.Transportation' (the label the -1 sentinel gets
# when positions are translated through x.columns).
# NOTE(review): selected_combos is indexed by name here although no visible
# cell relabels it -- this depends on state from a cell that is not shown.
pd.DataFrame({
    label: selected_combos.loc[selected_combos['path count'] > 200, column][:, 'value.Transportation']
    for label, column in [
        ('pct pos', 'pct_pos'),
        ('Pos Influence', 'pos_influence'),
        ('Neg Influence', 'neg_influence'),
    ]
}).sort_values('pct pos', ascending=False)
         

Unnamed: 0,pct pos,Pos Influence,Neg Influence
value.Memorization,0.989091,0.303741,-0.139704
value.Fluency of Ideas,0.98021,0.406025,-0.128103
value.Systems Evaluation,0.970628,0.341401,-0.116726
value.Instructing,0.962264,0.319096,-0.14594
value.Persuasion,0.96193,0.379083,-0.153903
value.Service Orientation,0.960057,0.298379,-0.160304
value.Originality,0.95612,0.3477,-0.156877
value.Number Facility,0.948421,0.344997,-0.070851
value.Technology Design,0.942718,0.185255,-0.174036
value.Systems Analysis,0.939141,0.322796,-0.169622


In [76]:
# Same summary for the forest fitted on the x_sffs subset: combos backed by
# more than 200 influences, second-level label 'value.Transportation'.
# NOTE(review): this table was relabelled through x.columns although rf_sffs
# was fitted on x_sffs, so the feature names shown are likely wrong -- confirm.
pd.DataFrame({
    label: selected_sffs_combos.loc[selected_sffs_combos['path count'] > 200, column][:, 'value.Transportation']
    for label, column in [
        ('pct pos', 'pct_pos'),
        ('Pos Influence', 'pos_influence'),
        ('Neg Influence', 'neg_influence'),
    ]
}).sort_values('pct pos', ascending=False)

Unnamed: 0,pct pos,Pos Influence,Neg Influence
value.Critical Thinking,0.970207,0.291593,-0.101478
value.Writing,0.922587,0.33957,-0.10123
value.Speaking,0.91093,0.282302,-0.154833
value.Mathematics Skill,0.867931,0.105588,-0.056547
value.Reading Comprehension,0.799568,0.224773,-0.151341
value.Coordination,0.725109,0.226631,-0.430163
value.Monitoring,0.707041,0.167189,-0.188349
value.Persuasion,0.700778,0.217126,-0.164128
value.Social Perceptiveness,0.699394,0.169688,-0.076319
value.Science,0.560571,0.193423,-0.131132


Non Structural Sufficiency Test