In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import mutual_info_classif, chi2, f_regression, mutual_info_regression, f_classif, SelectKBest
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error, mean_absolute_error, log_loss
from sklearn.metrics import roc_curve, roc_auc_score, auc
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt

# Seed NumPy's legacy global random number generator once, at the top of the notebook.
np.random.seed(1000)

*Processing Data*

The function below reads the data file and creates the following tables:
    1. the x table - workshop number and rounded SAk scores
    2. aggregated x table - noc and sak level
    3. y with individual expert answers, a vector for binning on increase, decrease and no bin
    4. aggregated y - vector for binned on increase, decrease and  non-binned

In [3]:
def data_proccess(file):
    """Read the answers CSV and build the feature (x) and target (y) tables.

    Parameters
    ----------
    file : anything accepted by ``pd.read_csv``
        Must provide the columns ``noc`` and ``workshop.number`` (used as a
        two-level index) as well as ``share``, ``absolute``, ``Unnamed: 0``
        and ``noc_code``.

    Returns
    -------
    x : DataFrame
        Every column except ``absolute``, ``share``, ``Unnamed: 0`` and
        ``noc_code``, plus ``work_num`` (a copy of index level 1), rounded
        and cast to int.
    x_agg : DataFrame
        ``x`` after ``drop_duplicates()``.
    y : DataFrame
        ``non_binned`` is the raw ``share`` label; ``increase`` is ``share``
        with 'constant' replaced by 'decrease'; ``decrease`` is ``share``
        with 'constant' replaced by 'increase'.
    y_agg : DataFrame
        Per (noc, workshop.number) fraction of rows with each ``share``
        value, plus a ``sum`` column holding the row count.
    x_noclvl : DataFrame
        ``x_agg`` without ``work_num`` and without index level 1,
        de-duplicated again.
    y_noclvl : DataFrame
        Same as ``y_agg`` but with the counts summed over index level 0
        before being turned into fractions.
    """
    data = pd.read_csv(file,index_col=['noc','workshop.number'])
    data.sort_index(inplace=True)
    # Normalise the label spelling so only 'constant' is used downstream.
    data.loc[data.share == 'remain constant','share'] = 'constant'
        
    x = data.drop(['absolute','share','Unnamed: 0','noc_code'],axis=1) #making x data frame
    x['work_num'] = x.index.get_level_values(1) #making workshop number a variable as well as an index
    x = np.round(x).astype(int)#round x to make discrete
    
    # NOTE(review): drop_duplicates compares column values only (the index is
    # ignored), so two index entries with identical rows collapse into one —
    # confirm this is intended.
    x_agg = x.drop_duplicates()
    
    x_noclvl = x_agg.drop('work_num',axis=1).droplevel(1).drop_duplicates()
    
    # Two binary re-codings of the three-way label: 'constant' is folded into
    # 'decrease' for the 'increase' target and into 'increase' for the
    # 'decrease' target.
    y = pd.DataFrame({'non_binned': data['share'],
              'increase': data['share'].str.replace('constant','decrease'),
              'decrease': data['share'].str.replace('constant','increase')})

    # Count rows per (noc, workshop.number) and share value; missing combinations become 0.
    y_agg = pd.DataFrame(data['share']).pivot_table(index = ['noc','workshop.number'], columns = 'share', aggfunc = len).fillna(0)
    y_agg['sum'] = y_agg.sum(axis = 1)
    # Must run while y_agg still holds raw counts (i.e. before the division below).
    y_noclvl = y_agg.groupby(level=0).sum()
    # Turn the counts into fractions of the row total, leaving 'sum' itself untouched.
    y_agg.loc[:,y_agg.columns!='sum'] = y_agg.loc[:,y_agg.columns!='sum'].divide(y_agg['sum'],axis=0)
    y_noclvl.loc[:,y_noclvl.columns!='sum'] = y_noclvl.loc[:,y_noclvl.columns!='sum'].divide(y_noclvl['sum'],axis=0)
    
    
    
    return x, x_agg, y, y_agg, x_noclvl, y_noclvl

In [4]:
def init_params(model_type):
    """Return the default random-forest keyword arguments for a model type.

    Parameters
    ----------
    model_type : str
        'cat' for the classifier settings, 'reg' for the regressor settings.

    Returns
    -------
    dict
        A new dict on every call, suitable for ``RandomForest*(**params)``.

    Raises
    ------
    ValueError
        If ``model_type`` is neither 'cat' nor 'reg' (previously this fell
        through to an unbound local variable).
    """
    # NOTE(review): 'auto' and 'mse' are the spellings used by older
    # scikit-learn releases — confirm against the installed version.
    if model_type == 'cat':
        return {
         'criterion': 'gini',
         'max_features': 'auto',
         'min_samples_leaf': 8,
         'min_samples_split': 5,
         'n_estimators': 1000,
         'n_jobs':-1
        }
    if model_type == 'reg':
        return {
         'criterion': 'mse',
         'max_features': None,
         'min_samples_leaf': 1,
         'min_samples_split': 15,
         'n_estimators': 250,
         'n_jobs':-1
        }
    raise ValueError("model_type must be 'cat' or 'reg', got %r" % (model_type,))

In [5]:
def run_k_fold(x,y,params,index,binned,model_type):
    """Produce out-of-fold random-forest predictions, one row per index group.

    Parameters
    ----------
    x : DataFrame or array-like
        Feature matrix; wrapped in a DataFrame so it can be sliced with iloc.
    y : Series or DataFrame
        Targets, positionally aligned with ``x`` (sliced with iloc).
    params : dict
        Keyword arguments for the random-forest constructor.
    index : Index
        One label per row of ``x``; predictions are grouped by it.
    binned : bool
        True -> one prediction column; False -> three prediction columns.
    model_type : str
        'reg'        : RandomForestRegressor, 10 shuffled folds, ``predict``.
        'pred_probs' : RandomForestClassifier, 5 unshuffled folds,
                       ``predict_proba`` (column 1 only when ``binned``).
        'tree_port'  : RandomForestClassifier, 5 unshuffled folds, mean of
                       the individual trees' ``predict`` output.

    Returns
    -------
    DataFrame
        The first prediction row of every distinct value of ``index``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the three values above (previously
        such a call fitted every fold and returned all-zero predictions).
    """
    if model_type not in ('reg', 'pred_probs', 'tree_port'):
        raise ValueError(
            "model_type must be 'reg', 'pred_probs' or 'tree_port', got %r" % (model_type,))

    x = pd.DataFrame(x)

    # Build only the estimator that is actually used.
    if model_type == 'reg':
        rf = RandomForestRegressor(**params)
        kf = KFold(n_splits=10,shuffle=True)
    else:
        rf = RandomForestClassifier(**params)
        kf = KFold(n_splits=5,shuffle=False)

    if binned:
        pred = np.zeros(x.shape[0])
    else:
        pred = np.zeros((x.shape[0],3))

    for train_index, test_index in kf.split(x):
        x_train, x_test = x.iloc[train_index], x.iloc[test_index]
        rf.fit(x_train,y.iloc[train_index])

        if model_type == 'reg':
            pred[test_index] = rf.predict(x_test)
        elif model_type == 'pred_probs':
            proba = rf.predict_proba(x_test)
            pred[test_index] = proba[:,1] if binned else proba
        else:
            # 'tree_port': average the per-tree predictions for each test row.
            # NOTE(review): this yields one value per row, so it presumably
            # only fits the single-column (binned) layout — confirm.
            tree_pred = np.zeros((len(rf.estimators_),len(test_index)))
            for tree_no, tree in enumerate(rf.estimators_):
                tree_pred[tree_no] = tree.predict(x_test)
            pred[test_index] = tree_pred.mean(axis=0)

    # Rows sharing an index label are collapsed to the first prediction.
    return pd.DataFrame(pred,index=index).groupby(index).first()

In [6]:
def k_fold_feature_importance(x,y,model_type):
    """Average random-forest feature importances over the training folds.

    Parameters
    ----------
    x : DataFrame
        Feature matrix (sliced with iloc).
    y : Series or DataFrame
        Targets, positionally aligned with ``x``.
    model_type : str
        'reg' uses a RandomForestRegressor with 5 shuffled folds; any other
        value accepted by ``init_params`` uses a RandomForestClassifier with
        10 unshuffled folds.

    Returns
    -------
    ndarray of shape (x.shape[1],)
        Mean of ``feature_importances_`` across all folds.
    """
    if model_type == 'reg':
        rf = RandomForestRegressor(**init_params(model_type))
        kf = KFold(n_splits=5,shuffle=True)
    else:
        rf = RandomForestClassifier(**init_params(model_type))
        kf = KFold(n_splits=10,shuffle=False)

    # One column per fold. The buffer used to be fixed at 5 columns and only
    # column 0 was ever written, so the "mean" was the last fold divided by 5.
    feature_imp = np.zeros((x.shape[1],kf.get_n_splits()))

    for fold, (train_index, _) in enumerate(kf.split(x)):
        rf.fit(x.iloc[train_index],y.iloc[train_index])
        feature_imp[:,fold] = rf.feature_importances_

    return feature_imp.mean(axis=1)

In [7]:
def param_search(x,y,model_type):
    """Grid-search random-forest hyper-parameters with 5-fold cross-validation.

    Parameters
    ----------
    x, y
        Training features and targets, passed straight to ``GridSearchCV.fit``.
    model_type : str
        'reg' searches a RandomForestRegressor scored by
        'neg_mean_squared_error'; 'cat' searches a RandomForestClassifier
        scored by 'neg_log_loss' and additionally varies ``criterion``.

    Returns
    -------
    (dict, dict)
        ``search.best_params_`` and ``search.cv_results_``.

    Raises
    ------
    ValueError
        If ``model_type`` is neither 'reg' nor 'cat' (previously this failed
        later with an unbound local variable).
    """
    if model_type not in ('reg', 'cat'):
        raise ValueError("model_type must be 'reg' or 'cat', got %r" % (model_type,))

    param_grid= {'n_estimators':[100,150,250,275,300,600,1000],#number of trees 
             'min_samples_leaf': [1,2,4,8],#minimum number of data points can be used to make a leaf at the end of a tree 
             'min_samples_split': [5,10,15]#min number of data points to split a branch 
             }

    if model_type == 'reg':
        rf = RandomForestRegressor(**init_params(model_type))
        scoring = 'neg_mean_squared_error'
    else:
        rf = RandomForestClassifier(**init_params(model_type))
        param_grid['criterion'] = ['gini','entropy']
        scoring = 'neg_log_loss'

    # NOTE(review): newer scikit-learn releases no longer accept `iid`;
    # drop the argument when upgrading — confirm against the installed version.
    search = GridSearchCV(rf,param_grid,scoring=scoring,cv=5,n_jobs=-1,iid=False)
    search.fit(x,y)

    return search.best_params_, search.cv_results_ 

In [8]:
def basic_feature_selection(x,y,model_type,k):
    """Keep the ``k`` features with the highest mutual information with ``y``.

    Parameters
    ----------
    x, y
        Features and target, passed to ``SelectKBest.fit_transform``.
    model_type : str
        'class' scores with ``mutual_info_classif``; 'reg' scores with
        ``mutual_info_regression``.
    k : int or 'all'
        Number of features to keep.

    Returns
    -------
    The transformed feature matrix returned by ``fit_transform``.

    Raises
    ------
    ValueError
        If ``model_type`` is neither 'class' nor 'reg' (previously the
        function silently returned None).
    """
    if model_type == 'class':
        score_func = mutual_info_classif
    elif model_type == 'reg':
        score_func = mutual_info_regression
    else:
        raise ValueError("model_type must be 'class' or 'reg', got %r" % (model_type,))

    # `k` is passed by keyword: newer scikit-learn makes it keyword-only.
    return SelectKBest(score_func,k=k).fit_transform(x,y)


In [9]:
#Feature scores for a number of different measures
def different_feature_rankings(x, x_agg, y, y_agg):
    """Score every feature with several univariate measures.

    Classification measures (mutual information, chi-squared, ANOVA F) are
    computed from ``x`` against ``y['increase']``; regression measures
    (mutual information, F) are computed from ``x_agg`` against
    ``y_agg['increase']``.

    Parameters
    ----------
    x, x_agg : DataFrame
        Feature tables; they must have the same columns in the same order
        because all scores are labelled with ``x.columns``.
    y, y_agg : DataFrame
        Target tables, each containing an 'increase' column.

    Returns
    -------
    DataFrame
        One row per feature and one column per measure ('mi_class', 'chi2',
        'f_class', 'mi_reg', 'f_reg'), sorted by 'mi_reg' descending.
    """
    def _scores(score_func, features, target):
        # k='all' keeps every feature; only the fitted scores are needed.
        return SelectKBest(score_func,k='all').fit(features,target).scores_

    y_class = y['increase']
    y_reg = y_agg['increase']

    feature_scores = pd.DataFrame({'mi_class': _scores(mutual_info_classif,x,y_class),
                 'chi2': _scores(chi2,x,y_class),
                 'f_class': _scores(f_classif,x,y_class),
                 'mi_reg': _scores(mutual_info_regression,x_agg,y_reg),
                 'f_reg': _scores(f_regression,x_agg,y_reg)},index=x.columns)

    # sort_values returns a new frame, so it has to be returned to be of any use.
    return feature_scores.sort_values('mi_reg',ascending = False)

In [12]:
def scores_by_k(x, x_agg, y, y_agg, max_k=120):
    """Mean absolute error of the binned 'increase' models for k = 1..max_k features.

    For every k, the k best features are selected, out-of-fold predictions
    are produced with ``run_k_fold`` (regression on the aggregated table,
    class probabilities on the individual table) and compared with
    ``y_agg['increase']``.

    Parameters
    ----------
    x, x_agg : DataFrame
        Individual and aggregated feature tables.
    y, y_agg : DataFrame
        Individual and aggregated target tables with an 'increase' column.
    max_k : int, default 120
        Largest number of features to try (the value that used to be
        hard-coded).

    Returns
    -------
    (Series, Series)
        Regression and classification mean absolute errors, indexed by k.
    """
    n_groups = len(x_agg.index)  # one prediction per row of the aggregated table
    k_values = range(1,max_k+1)
    reg_scores = np.zeros((max_k,n_groups))
    class_scores = np.zeros((max_k,n_groups))

    for k in k_values:
        reg_scores[k-1] = run_k_fold(basic_feature_selection(x_agg,y_agg['increase'],'reg',k),
                            y_agg['increase'],
                            init_params('reg'),
                            x_agg.index,
                            True,'reg').iloc[:,0].values
        class_scores[k-1] = run_k_fold(basic_feature_selection(x,y['increase'],'class',k),
                            y['increase'],
                            init_params('cat'),
                            x.index,
                            True,'pred_probs').iloc[:,0].values

    # Transpose to groups x k so the true share can be subtracted row-wise.
    r_scores = abs(pd.DataFrame(reg_scores,index=k_values,columns=x_agg.index).T.subtract(y_agg['increase'],axis=0)).mean(axis=0)
    c_scores = abs(pd.DataFrame(class_scores,index=k_values,columns=x_agg.index).T.subtract(y_agg['increase'],axis=0)).mean()

    return r_scores, c_scores

In [13]:
def run_models(x, x_agg, y, y_agg,binned,increase,k_reg,k_class):
    """Run the regression and classification forests and collect their predictions.

    Binned mode: the target column is 'increase' or 'decrease' depending on
    ``increase``. ``k_reg`` / ``k_class`` features are selected for the
    regression / classification models, and three prediction columns are
    returned ('regression', 'pred_prob', 'tree_portions'). For the
    'decrease' target the two classifier outputs are reported as
    ``1 - prediction``.

    Non-binned mode: all features are used, the regression is fitted on the
    'constant', 'decrease' and 'increase' columns of ``y_agg``, and the
    classifier on ``y['non_binned']``; six prediction columns are returned.

    The result is indexed by ``x_agg.index``.
    """
    if binned:
        target = 'increase' if increase else 'decrease'

        #set number of features to run with
        x_agg_cut = basic_feature_selection(x_agg,y_agg[target],'reg',k_reg)
        x_cut = basic_feature_selection(x,y[target],'class',k_class)

        #run the models in a k-fold framework (same order: reg, probabilities, tree portions)
        reg_pred = run_k_fold(x_agg_cut,y_agg[target],init_params('reg'),x_agg.index,binned,'reg')
        prob_pred = run_k_fold(x_cut,y[target],init_params('cat'),x.index,binned,'pred_probs')
        if not increase:
            prob_pred = 1 - prob_pred
        tree_pred = run_k_fold(x_cut,y[target],init_params('cat'),x.index,binned,'tree_port')
        if not increase:
            tree_pred = 1 - tree_pred

        pred = pd.concat([reg_pred,prob_pred,tree_pred],axis=1)
        column_names = ['regression','pred_prob','tree_portions']

    else:
        y_agg = y_agg[['constant','decrease','increase']]
        y = y['non_binned']
        # the 'tree_port' variant is not run in the non-binned case
        reg_pred = run_k_fold(x_agg,y_agg,init_params('reg'),x_agg.index,binned,'reg')
        prob_pred = run_k_fold(x,y,init_params('cat'),x.index,binned,'pred_probs')

        pred = pd.concat([reg_pred,prob_pred],axis=1)
        column_names = ['regression_con','regression_dec','regression_inc',
                        'prob_con','prob_dec','prob_inc']

    pred.set_index(x_agg.index,inplace = True)
    pred.columns = column_names

    return pred

In [14]:
def confusion_matrix(pred,truth,binned):
    """Cross-tabulate true against predicted classes.

    Parameters
    ----------
    pred, truth
        Binned: one-dimensional scores (e.g. Series); a value >= 0.5 counts
        as 'increase' and a value < 0.5 as 'decrease'.
        Non-binned: DataFrames with three columns. The columns of ``pred``
        are interpreted, in order, as 'constant', 'decrease', 'increase';
        ``truth`` must already use those column names. The class of a row is
        the column holding its largest value.
    binned : bool
        Selects the 2x2 (True) or 3x3 (False) layout.

    Returns
    -------
    DataFrame
        Counts with rows ``true_*`` and columns ``pred_*``.

    Notes
    -----
    ``pred`` is no longer modified: the column relabelling is applied to a
    copy instead of the caller's DataFrame.
    """
    def _count(true_mask, pred_mask):
        # Number of rows where both conditions hold.
        return sum(np.logical_and(true_mask,pred_mask))

    if binned:
        true_up, true_down = truth>=0.5, truth<0.5
        pred_up, pred_down = pred>=0.5, pred<0.5
        return pd.DataFrame(
         [[_count(true_up,pred_up), _count(true_up,pred_down)],
          [_count(true_down,pred_up), _count(true_down,pred_down)]]
        ,columns=['pred_increase','pred_decrease'],index=['true_increase','true_decrease'])

    # Relabel a copy so the caller's frame keeps its own column names.
    pred = pred.copy()
    pred.columns = ['constant','decrease','increase']

    # The winning class per row only needs to be computed once.
    true_label = truth.idxmax(axis=1)
    pred_label = pred.idxmax(axis=1)

    classes = ['increase','constant','decrease']
    counts = [[_count(true_label==true_cls,pred_label==pred_cls) for pred_cls in classes]
              for true_cls in classes]

    return pd.DataFrame(counts,
        columns=['pred_increase','pred_constant','pred_decrease'],index=['true_increase','true_constant','true_decrease'])

In [15]:
def run_sfs(x,y,model_type):
    """Fit a floating forward sequential feature selector around a random forest.

    'reg' wraps a RandomForestRegressor, searches between 5 and 110 features
    and scores with 'neg_mean_squared_error'. Any other ``model_type`` wraps
    a RandomForestClassifier, searches between 1 and 20 features and scores
    with a probability-based scorer built from ``custom_MSE``.

    Returns the fitted selector.
    """
    # Settings common to both selectors.
    shared = dict(forward=True,
                  floating=True,
                  verbose=2,
                  cv=5,
                  n_jobs=-1)

    if model_type == 'reg':
        estimator = RandomForestRegressor(**init_params(model_type))
        selector = SFS(estimator,
                       k_features=(5,110),
                       scoring='neg_mean_squared_error',
                       **shared)
    else:
        estimator = RandomForestClassifier(**init_params(model_type))
        # NOTE(review): custom_MSE is not defined in the part of the notebook
        # visible here — confirm it exists before taking this branch.
        scorer = make_scorer(custom_MSE,greater_is_better=False,needs_proba=True)
        selector = SFS(estimator,
                       k_features=(1,20),
                       scoring=scorer,
                       **shared)

    selector.fit(x,y)

    return selector

In [16]:
def custom_MAE(y_true, y_pred):
    """Mean absolute error against the per-group share of 'increase' labels.

    The labels are grouped by the first two index levels; every group's
    fraction of 'increase' labels is repeated once per member of the group
    and compared with ``y_pred``.

    Parameters
    ----------
    y_true : Series
        Labels indexed by (at least) a two-level index.
        NOTE(review): the repetition assumes rows of the same group are
        contiguous and in sorted group order — confirm for the caller.
    y_pred : array-like
        One prediction per row of ``y_true``.

    Returns
    -------
    float
        ``mean_absolute_error`` between the repeated group shares and ``y_pred``.
    """
    totals = y_true.groupby(level=[0,1]).count()
    increase_count = y_true[y_true=='increase'].groupby(level=[0,1]).count()
    # Groups without any 'increase' label are missing from increase_count -> share 0.
    # The reindex keeps the shares in the same order as `totals`.
    y_true_agg = increase_count.divide(totals).fillna(0).reindex(totals.index)

    # Work on the underlying arrays: `series[0]` on a multi-level index is
    # ambiguous between label and positional lookup. A single np.repeat also
    # replaces the element-by-element concatenation loop.
    y_true_agg_rep = np.repeat(y_true_agg.values,totals.values)

    return mean_absolute_error(y_true_agg_rep,y_pred)