In [5]:
def tree_paths(tree):
    
    children_left = tree.children_left
    children_right = tree.children_right
    values = tree.value
    
    tree_paths = []
    tree_probs = []
    path = []
    path_probs = []
    stack = [(0, -1)]  # seed is the root node id and its parent depth
    while len(stack) > 0:
        node_id, parent_depth = stack.pop()    
               
        while len(path)>parent_depth+1:
            path.pop()
            path_probs.pop()
        path.append(node_id)
        true = values[node_id][0][0].copy()
        false = values[node_id][0][1].copy()
        path_probs.append(round(true/(true+false),3))
        
        # If we have a test node
        if (children_left[node_id] != children_right[node_id]):
            stack.append((children_left[node_id], parent_depth + 1))
            stack.append((children_right[node_id], parent_depth + 1))
        else:
            tree_paths.append(path.copy())
            tree_probs.append(path_probs.copy())
            
    return tree_paths, tree_probs

def forest_paths(model):    
    forest_attributes = []
    baseline_means = []
    
    for treeEst in model.estimators_:
        path_thresholds = []
        path_features = []
        path_probs = []
        
        children_left = treeEst.tree_.children_left
        children_right = treeEst.tree_.children_right
        all_thresholds = treeEst.tree_.threshold
        all_features = treeEst.tree_.feature

        paths, path_probs = tree_paths(treeEst.tree_)
        
        leaf_probs = []
        for prob_list in path_probs:
                leaf_probs.append(prob_list[len(prob_list)-1])
        baseline_means.append(np.mean(leaf_probs))

        for i in range(len(paths)):
            thresholds = []
            features = []
            for j in range(len(paths[i])-1):
                if paths[i][j+1] == children_right[paths[i][j]]:
                    thresholds.append(all_thresholds[paths[i][j]])
                else:
                    thresholds.append(-all_thresholds[paths[i][j]])
                features.append(all_features[paths[i][j]])
            path_thresholds.append(thresholds.copy())
            path_features.append(features.copy())
    
        tree_atributes = pd.DataFrame([path_features, path_thresholds, path_probs]).T
        tree_atributes.columns = ['features','thresholds','path_probs']
        forest_attributes.append(tree_atributes.copy())
    return forest_attributes, np.mean(baseline_means)

def init_influence_list(features,conditionals,product):
    if product:
        combo_index = pd.MultiIndex.from_product([features,conditionals])
    else:
        combo_index = pd.MultiIndex.from_arrays([features,conditionals])
    
    infl_lists = []
    for i in range(combo_index.to_series().shape[0]):
        infl_lists.append([])
    
    influences = pd.Series(infl_lists,index=combo_index)
    
    return influences        

def get_influences(feature_combos,model):
    forest_attributes, baseline = forest_paths(model)
    
    for tree_frame in forest_attributes:
        for index, path in tree_frame.iterrows():
            previous = ['blank']
            for step in range(len(path['features'])-1):
                #what features we have seen and the feature we are on
                current_feature = path['features'][step]
                direction = np.sign(path['thresholds'][step])
                if current_feature in feature_combos.index.get_level_values(0):
                    #make relevant calculations
                    current_prob = path['path_probs'][step]
                    next_prob = path['path_probs'][step+1]
                    pct_change = (next_prob-current_prob)/current_prob
                    influence = direction*pct_change
                    #add to all relevant combos
                    for combo in feature_combos.loc[current_feature,previous]:
                        combo.append(influence)
                previous.append(current_feature*direction)
                
    influences_df = pd.DataFrame(index=feature_combos.index)
    
    for index, influences in feature_combos.iteritems():
        if influences:
            influences = np.asarray(influences)
            influences_df.loc[index,'pos_influence'] = np.mean(influences[influences>0])
            influences_df.loc[index,'neg_influence'] = np.mean(influences[influences<=0])
            influences_df.loc[index,'pct_pos'] = np.sum(influences>0)/len(influences)
            influences_df.loc[index,'occurance count'] = len(influences)
    return influences_df

def feature_name_index(table,features):
    
    conditional_labels = []#I know there is a better way to do this
    for condition in table.index.get_level_values(1):
        if condition == 'blank':
            conditional_labels.append(condition)
        elif condition >=0:
            conditional_labels.append("high "+features[condition])
        else:
            conditional_labels.append("low "+features[-1*condition])
    
    table_index = pd.MultiIndex.from_arrays([
        features[table.index.get_level_values(0)],conditional_labels])

    table.index = table_index
    
    return table

%run ../utils_rf

#read training data
file = "../../tables/model_input/noc_answers.csv"
x, x_agg, y, y_agg, x_noclvl, y_noclvl = data_proccess(file,True)
x.drop(['work_num_1','work_num_2','work_num_3','work_num_4','work_num_5','work_num_6'],axis=1,inplace=True)

#grab just the noc codes to cut out test nocs from main noc table
train_nocs = pd.read_csv(file,usecols=['noc_code']).drop_duplicates()

In [7]:
#single time run
rf = RandomForestClassifier(**init_params('cat'))
rf.fit(x,y['increase'])

#making combo sets to check
conditionals = list(range(120))+list(range(-119,0))+['blank']
all_combos = init_influence_list(range(120),conditionals,True)

#running the analysis
influences = feature_name_index(get_influences(all_combos,rf),x.columns)

#pulling out the non conditional
non_cond_inf = influences.xs('blank',level=1)
non_cond_inf.to_csv("../../tables/feature_analysis_output/1run_non_conditional_influences.csv")

#getting portions instead of path counts
influences = pd.merge(influences.reset_index(),
      non_cond_inf['occurance count'].reset_index(),
      left_on=['level_0'],
      right_on=['index'],
      how='inner').set_index(['level_0','level_1']).drop('index',axis=1)

influences['occurance pct'] = influences['occurance count_x']/influences['occurance count_y']

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [8]:
#we are looking for consistency for our main outputs so we run the process 10 times
#we for pairs we take a running average
#for trying to find foundational skills we ask that they be above 0.95 positivie influence in all runs
top10set = []
for i in range(10):
    rf = RandomForestClassifier(**init_params('cat')).fit(x,y['increase'])
    conditionals = list(range(120))+list(range(-119,0))+['blank']
    all_combos = init_influence_list(range(120),conditionals,True)
    current_infl = feature_name_index(get_influences(all_combos,rf),x.columns).fillna(0)
    
    #tring to find our consistent foundational skills
    non_cond_info = current_infl.xs('blank',level=1)
    top10set.append(non_cond_info.loc[non_cond_info['pct_pos']>0.95].copy())
    
    pairs = current_infl[
    np.logical_not(
        np.in1d(current_infl.index.get_level_values(1), 'blank')
    )].copy()
    
    current_sig_pairs = pairs.loc[np.logical_or(pairs['pct_pos']>0.95,pairs['pct_pos']<0.05)].copy()
    current_sig_pairs['count']=1
    if i==0:
        sig_pairs = current_sig_pairs
    else:
        common_idx = sig_pairs.index.intersection(current_sig_pairs.index) #grab the pair we have seen
        for_update = sig_pairs.loc[common_idx,sig_pairs.columns != 'count'].copy()#make lists of old and new info
        update_with = current_sig_pairs.loc[common_idx,sig_pairs.columns != 'count'].copy()
        new_pairs = current_sig_pairs.loc[~current_sig_pairs.index.isin(common_idx)].copy()#new rows
        counts = sig_pairs.loc[common_idx,'count'].copy()
        updated  = (for_update.mul(counts,axis=0).add(update_with,axis=0)).div(counts+1,axis=0)
        
        sig_pairs.loc[common_idx,sig_pairs.columns != 'count'] = updated.copy()
        sig_pairs.loc[common_idx,'count'] = sig_pairs.loc[common_idx,'count']+1
        sig_pairs = pd.concat([sig_pairs,new_pairs])

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [9]:
fullset = []
for bestset in top10set:
    fullset.append(bestset.index.get_values())
    
cons_set = np.unique(np.concatenate(fullset),return_counts =True)

In [10]:
cons_set[]

SyntaxError: invalid syntax (<ipython-input-10-b245c73c4f81>, line 1)

In [None]:
#define signficance markers based on how many times we ran everything
sig_pairs['significance'] = 'not sig'
sig_pairs.loc[sig_pairs['count']>i/4,'significance'] = '*'
sig_pairs.loc[sig_pairs['count']>2*i/4,'significance'] = '**'
sig_pairs.loc[sig_pairs['count']>3*i/4,'significance'] = '***'
sig_pairs = sig_pairs.loc[sig_pairs['significance']!='not sig']

In [None]:
#name index levels
sig_pairs.index.set_names(['main','conditional'],inplace=True)

In [None]:
sig_pairs.to_csv('../../tables/feature_analysis_output/sig_pairs.csv')