# Velocity - Feature Selection and RFE


In [1]:
%matplotlib inline

In [2]:
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.figsize': (15, 6)})
import os, sys
import numpy as np
import pandas as pd
from giuh_helpers import tic, toc

# Load saved data from csv file

In [3]:
# Read the regressors table, using the site number column as the row index
df = pd.read_csv('data/velocity_regressors3.csv', index_col='site_no')
# Turn the index into strings left-padded with zeros to at least 8 characters
site_ids = df.index.astype(str)
df.index = site_ids.str.zfill(8)

In [4]:
# Stop here if any cell of the table is missing
assert not df.isnull().values.any()

In [5]:
# Using machine learning
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from sklearn import linear_model
from sklearn import ensemble

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold #KFold is changed
from sklearn.model_selection import GridSearchCV

# Single seed shared by the train/test splitter, the CV folds and the tree ensembles
randomseed = 28
# 10 shuffled folds, reproducible through the shared seed
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=randomseed,
)

In [6]:
data = df
response = 'VELOCITY'
# Target column vs. every other column; X stays a DataFrame so that feature
# subsets can later be picked by column name
y = data[response]
X = data.drop(response, axis=1)
# Hold out 20% of the rows as the test part
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=randomseed,
)

## 1. Model Selection using Recursive Feature Elimination (RFE)
The RFE algorithm selects features by recursively considering smaller and smaller sets of features.

In [7]:
from sklearn.feature_selection import RFE
from collections import OrderedDict


In [8]:
def k_fold_val(grid, sel_features):
    ''' Train an estimator on a feature subset and score it on the hold-out split.

        Despite the name, no k-fold loop runs here: the estimator is fitted once
        on the module-level training split and scored once on the test split.

        grid: estimator exposing fit(X, y) and score(X, y)
        sel_features: column labels used to subset X_train and X_test
        Return: the value of grid.score on the test split
    '''
    train_cols = X_train[sel_features]
    grid.fit(train_cols, y_train)
    holdout_cols = X_test[sel_features]
    return grid.score(holdout_cols, y_test)

In [9]:
# Use RFE to rank the features, then read off the selected subset of every size
est = ensemble.ExtraTreesRegressor(min_samples_split=2, n_estimators=175, random_state=randomseed)

# With step=1 RFE removes exactly one feature per round, and the estimator is
# seeded, so the elimination path does not depend on the target size. A single
# fit down to one feature therefore yields every nested subset: the survivor
# has rank 1, the last feature eliminated has rank 2, and so on, so the
# k-feature subset is the set of columns with ranking_ <= k. This replaces one
# full RFE run per subset size with a single run.
rfe = RFE(estimator=est, n_features_to_select=1, step=1)
rfe.fit(X_train, y_train)

feat_list_dict = OrderedDict()
for i in range(1, 20):
    # Column order of the training frame is preserved; once i exceeds the
    # number of columns every column is selected.
    selected = [name for name, rank in zip(X_train.columns, rfe.ranking_) if rank <= i]
    print(selected)
    # Index by number of features and save the selected features
    feat_list_dict[len(selected)] = selected

['SLOPE']
['WSAREA', 'SLOPE']
['WSAREA', 'TMAX', 'SLOPE']
['WSAREA', 'TMAX', 'SLOPE', 'PET']
['WSAREA', 'TMAX', 'TMEAN', 'SLOPE', 'PET']
['WSAREA', 'TMAX', 'TMEAN', 'SLOPE', 'PET', 'ELEV']
['WSAREA', 'PRECIP', 'TMAX', 'TMEAN', 'SLOPE', 'PET', 'ELEV']
['WSAREA', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'SLOPE', 'PET', 'ELEV']
['WSAREA', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'SLOPE', 'PET', 'ELEV']
['WSAREA', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'HYDRCON', 'SLOPE', 'PET', 'ELEV']
['WSAREA', 'OW', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'HYDRCON', 'SLOPE', 'PET', 'ELEV']
['WSAREA', 'OW', 'CLAY', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'HYDRCON', 'SLOPE', 'PET', 'ELEV']
['WSAREA', 'OW', 'RD', 'CLAY', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'HYDRCON', 'SLOPE', 'PET', 'ELEV']
['WSAREA', 'OW', 'RD', 'CLAY', 'WTDEP', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'HYDRCON', 'SLOPE', 'PET', 'ELEV']
['WSAREA', 'OW', 'RD', 'CLAY', 'WTDEP', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'HYD

In [10]:
# Tabular copy of the RFE selections: one row per subset size, one column per
# selected feature (shorter subsets are padded with missing values)
feat_list_df = pd.DataFrame.from_dict(dict(feat_list_dict), orient='index')
# Column that receives the test score of each feature combination obtained from RFE
feat_list_df['test_score'] = np.nan

# Refit a fresh model on each feature combination and record its test score,
# keyed by the comma-joined feature names
res = OrderedDict()
for n_features, sel_features in feat_list_dict.items():
    # A new estimator per combination so no fitted state carries over.
    # NOTE: 250 trees are used here, whereas the RFE step used 175.
    est = ensemble.ExtraTreesRegressor(min_samples_split=2, n_estimators=250, random_state=randomseed)
    ts = k_fold_val(est, sel_features)
    print(sel_features, ts)
    # Fill the score column declared above (it was previously left as NaN)
    feat_list_df.loc[n_features, 'test_score'] = ts
    features = ', '.join(sel_features)
    res[features] = ts

['SLOPE'] 0.1039647402222883
['WSAREA', 'SLOPE'] 0.42836357105388473
['WSAREA', 'TMAX', 'SLOPE'] 0.611217637391287
['WSAREA', 'TMAX', 'SLOPE', 'PET'] 0.6416649030445327
['WSAREA', 'TMAX', 'TMEAN', 'SLOPE', 'PET'] 0.6528440086823092
['WSAREA', 'TMAX', 'TMEAN', 'SLOPE', 'PET', 'ELEV'] 0.6511060421933927
['WSAREA', 'PRECIP', 'TMAX', 'TMEAN', 'SLOPE', 'PET', 'ELEV'] 0.6632168931810839
['WSAREA', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'SLOPE', 'PET', 'ELEV'] 0.6717963119884874
['WSAREA', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'SLOPE', 'PET', 'ELEV'] 0.6671002249440346
['WSAREA', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'HYDRCON', 'SLOPE', 'PET', 'ELEV'] 0.6592141831828656
['WSAREA', 'OW', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'HYDRCON', 'SLOPE', 'PET', 'ELEV'] 0.6629485497682179
['WSAREA', 'OW', 'CLAY', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'HYDRCON', 'SLOPE', 'PET', 'ELEV'] 0.6602790568835009
['WSAREA', 'OW', 'RD', 'CLAY', 'PERM', 'PRECIP', 'TMAX', 'TMEAN', 'TMIN', 'HYDRCON', '

In [11]:
# Convert the ordered scores into a plain dict
res = dict(res)
# One row per feature combination, labelled by the comma-joined feature names
res_df = pd.DataFrame(list(res.values()), index=list(res.keys()), columns=['score'])
res_df

Unnamed: 0,score
SLOPE,0.103965
"WSAREA, SLOPE",0.428364
"WSAREA, TMAX, SLOPE",0.611218
"WSAREA, TMAX, SLOPE, PET",0.641665
"WSAREA, TMAX, TMEAN, SLOPE, PET",0.652844
"WSAREA, TMAX, TMEAN, SLOPE, PET, ELEV",0.651106
"WSAREA, PRECIP, TMAX, TMEAN, SLOPE, PET, ELEV",0.663217
"WSAREA, PRECIP, TMAX, TMEAN, TMIN, SLOPE, PET, ELEV",0.671796
"WSAREA, PERM, PRECIP, TMAX, TMEAN, TMIN, SLOPE, PET, ELEV",0.6671
"WSAREA, PERM, PRECIP, TMAX, TMEAN, TMIN, HYDRCON, SLOPE, PET, ELEV",0.659214


In [12]:
# Looks like this is the result that was finally used for the paper
import pickle

# Persist the {feature combination: test score} mapping. The context manager
# closes the file even if pickling raises. Pass protocol=2 to pickle.dump if
# the file must be readable from Python 2.
with open('results/rfe_dict.pkl', 'wb') as fp:
    pickle.dump(res, fp)