# Final Feature selection

In [None]:
#imports
import json
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from BorutaShap import BorutaShap
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, accuracy_score, precision_score
from sklearn.model_selection import train_test_split


In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "Set2" palette for the plots drawn below.
sns.set_palette("Set2")

In [None]:
# Load the preliminary cleaned dataset; the first CSV column becomes the row index.
# Forward slashes are used so the relative path resolves on POSIX systems as well as
# on Windows (a backslash path is treated as a single literal file name on POSIX).
df = pd.read_csv("../data/processed/prelim_clean2.csv", index_col=[0])

## Boruta

In [None]:
# No model is passed to BorutaShap, so (per the original note) its default Random Forest
# is used; classification=True declares this a classification problem.
Feature_Selector = BorutaShap(classification=True,
                              importance_measure='shap')

# Every column except 'target' is a candidate feature; 'target' is the label.
Feature_Selector.fit(
    X=df.drop(columns=['target']),
    y=df['target'],
    n_trials=100,
    random_state=1,
)

In [None]:
# Presumably resolves features that Boruta left as "tentative" after the trials above
# into accepted/rejected — confirm against the BorutaShap documentation.
Feature_Selector.TentativeRoughFix()

In [None]:
# Plot the selector's results for the accepted features only, using a log-scaled y axis.
Feature_Selector.plot(which_features="accepted",y_scale='log')


### check if our base model improves 

In [None]:
# Start from the features Boruta kept, minus two columns that are excluded by hand.
# NOTE(review): the reason for excluding NACCNMRI and DEMUN is not recorded in this notebook.
# A non-mutating drop is used so the frame returned by Subset() is never modified in place.
subset = Feature_Selector.Subset().drop(columns=['NACCNMRI', 'DEMUN'])
X_train, X_test, y_train, y_test = train_test_split(subset, df['target'], test_size=0.35, random_state=1)

# Seed the forest like the split and the Boruta run, so the printed metrics are
# reproducible and comparable between the feature sets evaluated in this notebook.
model_feat1 = RandomForestClassifier(random_state=1)
#train model
model_feat1.fit(X_train, y_train)
y_pred = model_feat1.predict(X_test)
# Accuracy, precision and recall on the held-out 35% (accuracy reuses y_pred instead of
# predicting a second time through .score()).
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))



In [None]:
### add genetics

In [None]:
# Disabled experiment, kept for reference: add the NACCNE4S and NACCAPOE columns to
# `subset` and repeat the same random-forest evaluation. Nothing in this cell executes.
# subset['NACCNE4S']=df['NACCNE4S']
# subset['NACCAPOE']=df['NACCAPOE']
# X_train, X_test, y_train, y_test = train_test_split(subset, df['target'], test_size=0.35, random_state=1)

# model_feat1 = RandomForestClassifier()
# #train model
# model_feat1.fit(X_train, y_train)
# y_pred = model_feat1.predict(X_test)
# print(model_feat1.score(X_test, y_test))
# print(precision_score(y_test,y_pred))
# print(recall_score(y_test,y_pred))



## Find which tests (forms) these features belong to, and add the remaining variables from those tests for engineering

In [None]:
# Data dictionary: the cells below use its 'VariableName' and 'Form' columns.
# Forward slashes keep the relative path valid on POSIX systems as well as on Windows.
dd = pd.read_csv("../docs/rdd_datadictionary_uds.csv")
# JSON mapping keyed by form code (looked up below as form_key[<Form value>]).
with open('../data/processed/forms.txt') as json_file:
    form_key = json.load(json_file)

In [None]:
# 'Form' value of every data-dictionary row whose variable is one of the selected features.
forms_set = dd.loc[dd['VariableName'].isin(subset.columns), 'Form']
# Translate each distinct form code through form_key and display the result.
equiv = [form_key[form_code] for form_code in forms_set.unique().tolist()]
equiv

In [None]:
# Forms whose variables are pulled in for feature engineering. The list is fixed by hand
# here instead of being derived from the forms found above:
#u_forms=forms_set.unique().tolist()
u_forms = ['b4', 'b7', 'c1c2', 'c1', 'c2']
# Names of all data-dictionary variables that belong to one of those forms.
other_vars = dd.loc[dd['Form'].isin(u_forms), 'VariableName']

In [None]:
# Broad feature set: every column of df that belongs to the chosen forms, plus the
# Boruta-selected columns in `subset`.
broad_subset = pd.concat([df[df.columns.intersection(other_vars)], subset], axis=1, sort=False)
# A selected feature that also belongs to one of the chosen forms would otherwise appear
# twice after the column-wise concat; keep only the first occurrence of each column name.
broad_subset = broad_subset.loc[:, ~broad_subset.columns.duplicated()]
# Preview only the first rows rather than rendering the whole frame.
broad_subset.head()

### let's try the forest on this broad feature list

In [None]:
# Same 65/35 split and seed as the earlier evaluation, now on the broad feature list.
X_train, X_test, y_train, y_test = train_test_split(broad_subset, df['target'], test_size=0.35, random_state=1)

# Seed the forest so the metrics are reproducible and comparable with the other
# feature sets evaluated in this notebook.
model_feat1 = RandomForestClassifier(random_state=1)
#train model
model_feat1.fit(X_train, y_train)
y_pred = model_feat1.predict(X_test)
# Accuracy, precision and recall on the held-out set (accuracy reuses y_pred instead of
# predicting a second time through .score()).
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))



### save broad and subset features

In [None]:
# Persist the column labels of both feature sets. Context managers make sure each file
# is flushed and closed as soon as the dump finishes.
with open('../models/01final_features_broad.sav', 'wb') as broad_file:
    pickle.dump(broad_subset.columns, broad_file)
with open('../models/01final_features_res.sav', 'wb') as res_file:
    pickle.dump(subset.columns, res_file)


# Feature engineering + shap

In [None]:
#import
# Importing modules to create our layers and model.
from keras.layers import Input, Dense
from keras.models import Model
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale every column of the broad feature set to the 0-1 range.
# NOTE(review): `normalized` is not referenced by any later cell visible in this
# notebook, and the scaler is fitted on all rows (before any train/test split).
scaler = MinMaxScaler().fit(broad_subset)
normalized = scaler.transform(broad_subset)

In [None]:
# The reconstruction has the same width as the input: one unit per column of `subset`.
output_dim = subset.shape[1]
# Width of the hidden (encoding) layer. Making it narrower than the input forces the
# network to compress; a width equal to the number of columns would mean a compression
# factor of 1, i.e. no compression. Here it is 75% of the input width.
encoding_dim = round(0.75 * output_dim)
# Despite its name, `input_dim` is the Keras input tensor, not an integer.
input_dim = Input(shape=(output_dim,))
# Single ReLU encoding layer followed by a sigmoid reconstruction layer.
encoded = Dense(encoding_dim, activation='relu')(input_dim)
decoded = Dense(output_dim, activation='sigmoid')(encoded)

In [None]:
# Full autoencoder: input -> encoding -> reconstruction.
autoencoder = Model(inputs=input_dim, outputs=decoded)
# Encoder half only (input -> encoding); it shares its layers with the autoencoder and
# exposes the intermediate, compressed representation.
encoder = Model(inputs=input_dim, outputs=encoded)
# Adam optimizer with binary cross-entropy as the reconstruction loss
# (mean_squared_error was noted as the alternative).
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(subset, df['target'], test_size=0.35, random_state=1)

# The autoencoder ends in a sigmoid layer and is trained with binary cross-entropy, so
# its reconstruction targets must lie in [0, 1]. Scale the features here, fitting the
# scaler on the training rows only so no information from the test rows leaks in.
ae_scaler = MinMaxScaler()
X_train = ae_scaler.fit_transform(X_train)
# Test values outside the range seen in training are clipped back into [0, 1].
X_test = np.clip(ae_scaler.transform(X_test), 0, 1)

# Train the network to reconstruct its own (scaled) input; the held-out rows are used
# for the validation loss only.
autoencoder.fit(X_train, X_train,
                epochs=550,
                batch_size=60,
                shuffle=True,
                validation_data=(X_test, X_test))

### fit the  forest model

In [None]:
# Compressed representations of the training rows and of the held-out rows.
encoded_X_train = encoder.predict(X_train)
encoded_X_val = encoder.predict(X_test)

# Seed the forest so the metrics are reproducible and comparable with the earlier
# evaluations in this notebook.
model_feat1 = RandomForestClassifier(random_state=1)
#train model
model_feat1.fit(encoded_X_train, y_train)
y_pred = model_feat1.predict(encoded_X_val)
# Accuracy, precision and recall on the held-out set (accuracy reuses y_pred instead of
# predicting a second time through .score()).
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))



Not successful.