In [None]:
from src.utils.train_utils import *
from src.utils.test_utils import *
from src.utils.feature_analyzer import *
from src.utils.plot_utils import *

### Import the data

In [None]:
# --- Input data (labeled synthetic train / test splits) ---
train_df_path = '../../data/synth_data_train_labeled.csv'
test_df_path = '../../data/synth_data_test_labeled.csv'

# --- Output locations ---
# Directory searched for the versioned ONNX model files.
target_model_dir = '../../model'
# Directory searched for the versioned feature-importance pickles.
fi_dir = 'feature_importance'

In [None]:
# Columns that must not be fed to the model as features; 'checked' is the target.
cols_to_drop = ['checked', 'Ja', 'Nee']

# Read the training split and pass it through the project helper `add_checked`.
train_df = pd.read_csv(train_df_path).pipe(add_checked)

y_train = train_df['checked']
# NOTE: X_train still contains the columns listed in `cols_to_drop` at this
# point; they are only removed from X_prime in a later cell, after the
# (optional) feature-importance evaluation that needs the 'checked' target.
X_train = train_df.astype(np.float32)

In [None]:
# Read the test split and pass it through the project helper `add_checked`.
test_df = pd.read_csv(test_df_path).pipe(add_checked)

# Target column for evaluation.
y_test = test_df['checked']
# Features only: remove the non-feature columns, then cast to float32.
X_test = test_df.drop(columns=cols_to_drop).astype(np.float32)

### Manipulate the data

In [None]:
# Manipulate the data to reduce/increase bias
def data_manipulator(X, y):
    """Return a manipulated copy of the features together with the labels.

    Every column selected by ``filter_features`` is overwritten with its own
    mean, making it constant across rows and thereby effectively 'removing'
    it from the model.

    Args:
        X: Feature DataFrame. It is NOT modified; the manipulation is applied
            to a copy.
        y: Labels belonging to ``X``. Returned unchanged.

    Returns:
        Tuple ``(X_manipulated, y)``.
    """
    ########INSERT DATA MANIPULATION CODE HERE##########

    # Work on a copy so the caller's frame (e.g. X_train) keeps its original
    # values and re-running the cell always starts from unmodified data.
    X = X.copy()

    problem_features = filter_features(X.columns)

    for feature in problem_features:
        # Average values of problematic features to effectively 'remove' them from the model
        X[feature] = X[feature].mean()
    ####################################################
    return X, y

# Apply the manipulation to the training data; X_prime / y_prime are what the
# rest of the notebook analyses and trains on.
X_prime, y_prime = data_manipulator(X=X_train, y=y_train)

# Preview the manipulated features.
X_prime.head()

### Evaluate feature importances

In [None]:
fa = FeatureAnalyzer()

# Resolve the versioned feature-importance file inside `fi_dir`.
latest_ver_name = get_versioned_name(
    fi_dir, 'fi_v', '.pkl',
    create_new=False,  # set to true to create new version
)
latest_ver_path = '/'.join([fi_dir, latest_ver_name])

# Uncomment the next line if you don't have a saved feature-importance .pkl file yet:
# fa.evaluate_importance(dataframe=X_prime, target='checked', add_drop=['Ja', 'Nee'], filepath=latest_ver_path)

In [None]:
# Remove the non-feature columns ('checked' target plus 'Ja'/'Nee') before the
# features are analysed and used for training.
# `errors='ignore'` drops whichever of them are still present, so:
#   * re-running this cell is a harmless no-op, and
#   * a partially-present set of these columns can no longer slip through
#     into the training features (the previous all-or-nothing check kept
#     every column if even one of them was missing).
X_prime = X_prime.drop(columns=cols_to_drop, errors='ignore')

print(f'Loading feature importance data from {latest_ver_name}')

print(latest_ver_path)

In [None]:
# Load the importances from the versioned file resolved earlier
# (`latest_ver_path`) instead of a hard-coded 'fi_v0.pkl', so that a newly
# created version is actually the one that gets loaded and plotted.
fa.load_importance(filepath=latest_ver_path)
# Plot the importances, labelled with the feature column names; min_val=0.01
# is passed through to the project helper unchanged.
fa.plot_importance(column_names=X_prime.columns, min_val=0.01)

In [None]:
# Feature name -> importance mapping from the analyzer.
fad = fa.feature_importance_as_dict(column_names=X_prime.columns, normalize=True)

# Order by importance (highest first) and keep the ten best entries.
ranked = sorted(fad.items(), key=lambda pair: pair[1], reverse=True)
sorted_fi = dict(ranked[:10])

print("Top 10 most important features according to permutation importance:")
for rank, (feature, importance) in enumerate(sorted_fi.items(), start=1):
    print(f'{rank}: {feature} - {importance:.4f}')

### Visualize modified data distributions

In [None]:
# Pass the manipulated features and the top-10 importance dict through the
# project helper `translate_cols`; it returns a frame and a matching mapping.
# NOTE(review): the `_en` suffix suggests the column names are translated to
# English — confirm against `translate_cols`.
X_en, top_features_en = translate_cols(X_prime, sorted_fi)
# Display the translated top-feature mapping as the cell output.
top_features_en

In [None]:
# One distribution plot per top feature, using the manipulated data.
for feature in top_features_en.keys():
    feature_values = X_en[feature]
    plot_distribution(feature_values, feature)

### Train model on modified data

In [None]:
# Resolve the versioned ONNX file name inside the model directory.
model_name = get_versioned_name(
    target_model_dir, 'gboost1_v', '.onnx',
    create_new=False,  # set to true to create new version
)
model_path = '/'.join([target_model_dir, model_name])

# Hand the manipulated training data and the target path to the project's `run` helper.
run(X_prime, y_prime, model_path)

In [None]:
# Open an inference session on the ONNX file at `model_path`.
new_session = rt.InferenceSession(model_path)

# Run inference on the test features; the input is fed under the name 'X'
# as a float32 array.
onnx_inputs = {'X': X_test.values.astype(np.float32)}
y_pred_onnx = new_session.run(None, onnx_inputs)

# The first returned output is compared against the true test labels.
predicted = y_pred_onnx[0]
accuracy_onnx_model = accuracy_score(y_test, predicted)
print('Accuracy of the ONNX model: ', accuracy_onnx_model)