# Feature selection

## regression tree

Build a decision tree (the code below uses a `DecisionTreeClassifier`) to see which features are chosen for the first splits.

It is advisable to run PCA beforehand.

#### get data

In [8]:
import sys
import yaml
import tqdm

sys.path.append('/home/joel/projects/driftlon/analysis')
sys.path.append('/home/joel/projects/driftlon')
from correlation import *
from analysis_utils import get_data_for_keys, get_common_keys

In [4]:
# Load the pickled feature records (X) and raw targets (Y).
# Read-only binary mode is all that loading needs, and the context managers
# close the file handles instead of leaving them open.
# NOTE: unpickling can execute arbitrary code; only load files produced by
# this project.
with open('../X.pkl', 'rb') as x_file:
    X = pickle.load(x_file)
with open('../Y.pkl', 'rb') as y_file:
    Y = pickle.load(y_file)

# Binary target: 1 where the raw target is positive, 0 otherwise.
y = [int(y_ > 0) for y_ in Y]

In [5]:
# Sanity check: the features, raw targets and binary targets should all have
# the same number of entries.
len(X), len(Y), len(y)

(87303, 87303, 87303)

In [6]:
# Location of the YAML file that names the non-numeric fields.
non_numerical_fields_path = './non_numeric_fields.yaml'

# Parse the file with PyYAML's basic loader; the open stream is handed to
# the parser directly.
with open(non_numerical_fields_path, 'r') as fields_file:
    non_numerical_fields = yaml.load(fields_file, Loader=yaml.BaseLoader)

In [9]:
# Keep the keys returned by get_common_keys(X) that are not listed as
# non-numerical, then fetch the data for exactly those keys.
common_keys = [
    key
    for key in get_common_keys(X)
    if key not in non_numerical_fields
]
data_for_keys = get_data_for_keys(common_keys, X)

In [10]:
# Transpose the frame built from data_for_keys and label its columns with the
# common keys, giving one column per key.
all_data = pd.DataFrame(data_for_keys).transpose().set_axis(common_keys, axis=1)

# A key counts as quantitative when its value in the row labelled 0 is exactly
# a built-in int (an exact type check, so subclasses such as bool are excluded).
quant_indices = [
    key
    for key, value in zip(common_keys, all_data.loc[0, :])
    if type(value) == int
]

# Restrict to the quantitative columns and coerce each one to a numeric dtype.
# NOTE(review): converted_data is not referenced again in the visible part of
# the notebook; the models below are fitted on all_data.
data = pd.DataFrame(all_data[quant_indices])
converted_data = data.apply(pd.to_numeric)

#### build tree

In [None]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, recall_score

In [None]:
# Fit trees with max_depth 1 and 2 on the first 10000 samples and score each
# of them on all remaining samples.
for i in range(2):
    tree_clf = DecisionTreeClassifier(max_depth=i + 1).fit(all_data[:10000], y[:10000])

    # Evaluate on everything that was not used for fitting.
    pred = tree_clf.predict(all_data[10000:])
    acc = accuracy_score(y[10000:], pred)
    rec = recall_score(y[10000:], pred)

    # Printed as: loop index (max_depth - 1), accuracy, recall.
    print(i, acc, rec)

#### visualize tree

In [None]:
# Export the most recently fitted tree as a Graphviz .dot file, using the
# common keys as feature labels and 'pro' / 'pleb' as class labels.
export_graphviz(
    tree_clf,
    out_file="plots/driftlon_tree.dot",
    feature_names=common_keys,
    class_names=['pro', 'pleb'],
    filled=True,
    rounded=True,
)

In [None]:
# Shell escape: run the Graphviz `dot` command to render the exported .dot
# file as a PNG next to it.
! dot -Tpng "plots/driftlon_tree.dot" -o plots/driftlon_tree.png

#### load data from decision tree experiment and plot

In [None]:
import pandas as pd

In [None]:
# Load the recorded decision-tree experiment results from the CSV file in the
# current directory.
decision_tree_raw = pd.read_csv('./decision_tree.csv')

In [None]:
# Relabel the four CSV columns and keep only the two metric columns.
# The third column is labelled 'accuracy' instead of 'precision': the
# decision-tree experiment in this notebook records accuracy_score and
# recall_score, and the result is named *_acc_rec.
# NOTE(review): confirm against how decision_tree.csv was written.
decision_tree_acc_rec = decision_tree_raw.set_axis(
    ['epoch', 'epoch', 'accuracy', 'recall'], axis='columns'
)[['accuracy', 'recall']]

In [None]:
# Line plot of the rows labelled up to and including 10 (.loc slices by label,
# so the end point is inclusive).
decision_tree_acc_rec.loc[:10].plot.line()

## random forest

Regression random forest

get feature importance

In [None]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
import random

In [None]:
# Pair every feature row with its raw target so that both are shuffled
# together.
data_with_target = list(zip(all_data.to_numpy(), Y))
random.shuffle(data_with_target)

# The first 40000 shuffled pairs form the training set, the rest the test set;
# zip(*pairs) splits the pairs back into a tuple of rows and a tuple of targets.
x_train, y_train = zip(*data_with_target[:40000])
x_test, y_test = zip(*data_with_target[40000:])

In [None]:
# Both regressors use 5000 bootstrapped trees, all available cores
# (n_jobs=-1) and out-of-bag scoring.
rf_reg = RandomForestRegressor(
    n_estimators=5000,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
)
ef_reg = ExtraTreesRegressor(
    n_estimators=5000,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
)

In [None]:
%%time
# Fit each regressor on the training split, then print its out-of-bag score
# followed by its score on the test split.
for model in (rf_reg, ef_reg):
    model.fit(x_train, y_train)
    print(model.oob_score_)
    print(model.score(x_test, y_test))

In [None]:
# Pair every feature key with its importance, for the random forest and the
# extra-trees regressor respectively.
ran_importance = list(zip(common_keys, rf_reg.feature_importances_))
et_importance = list(zip(common_keys, ef_reg.feature_importances_))

In [None]:
# Random-forest importances, highest score first.
sorted(ran_importance, key=lambda pair: pair[1])[::-1]

In [None]:
# Extra-trees importances, highest score first.
sorted(et_importance, key=lambda pair: pair[1])[::-1]

classification random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import random

In [None]:
# Pair every feature row with its binary target so that both are shuffled
# together.
data_with_target = list(zip(all_data.to_numpy(), y))
random.shuffle(data_with_target)

# The first 40000 shuffled pairs form the training set, the rest the test set;
# zip(*pairs) splits the pairs back into a tuple of rows and a tuple of targets.
x_train, y_train = zip(*data_with_target[:40000])
x_test, y_test = zip(*data_with_target[40000:])

In [None]:
# Both classifiers use 5000 bootstrapped trees, all available cores
# (n_jobs=-1) and out-of-bag scoring.
rf_clf = RandomForestClassifier(
    n_estimators=5000,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
)
ef_clf = ExtraTreesClassifier(
    n_estimators=5000,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
)

In [None]:
%%time
for reg in (rf_reg, ef_reg):
    reg.fit(x_train, y_train)
    print(reg.oob_score_)
    print(reg.score(x_test, y_test))

In [None]:
ran_importance = []
et_importance = []

for name, score in zip(common_keys, rf_reg.feature_importances_):
    ran_importance.append((name, score))
    
for name, score in zip(common_keys, ef_reg.feature_importances_):
    et_importance.append((name, score))

In [None]:
# Random-forest importances, highest score first.
sorted(ran_importance, key=lambda pair: pair[1])[::-1]

In [None]:
# Extra-trees importances, highest score first.
sorted(et_importance, key=lambda pair: pair[1])[::-1]