# Feature Importance Determination

In [4]:
from src.utils.feature_analyzer import *
from src.utils.preprocessing import *
from src.utils.other_utils import *
from src.utils.test_utils import *

## Determine Important Features

In [5]:
# Seed constant for the notebook.
# NOTE(review): the cells visible in this section pass literal seeds instead of reading it — confirm it is still needed.
RANDOM_STATE = 42

# Read the labelled training CSV and hand the frame to add_checked in one chain
# (add_checked comes from the src.utils star imports; see there for what it adds).
df = pd.read_csv("../../data/synth_data_train_labeled.csv").pipe(add_checked)
df.head()

Unnamed: 0,adres_aantal_brp_adres,adres_aantal_verschillende_wijken,adres_aantal_verzendadres,adres_aantal_woonadres_handmatig,adres_dagen_op_adres,adres_recentst_onderdeel_rdam,adres_recentste_buurt_groot_ijsselmonde,adres_recentste_buurt_nieuwe_westen,adres_recentste_buurt_other,adres_recentste_buurt_oude_noorden,...,typering_hist_ind,typering_hist_sector_zorg,typering_ind,typering_indicatie_geheime_gegevens,typering_other,typering_transport__logistiek___tuinbouw,typering_zorg__schoonmaak___welzijn,Ja,Nee,checked
0,4,4,1,1,6345,1,0,0,1,0,...,1,0,1,0,0,0,0,0.550907,0.449093,0
1,1,1,1,0,17546,1,0,0,0,0,...,1,1,1,0,0,0,0,0.411258,0.588742,0
2,5,2,0,0,4052,1,0,0,0,0,...,1,0,0,0,0,0,0,0.690235,0.309765,0
3,2,1,0,0,16335,1,0,0,0,0,...,1,0,1,0,1,0,0,0.710641,0.289359,1
4,3,1,0,0,20872,1,0,0,0,0,...,1,0,1,0,1,0,0,0.394054,0.605946,0


In [6]:
# Checking for Null values: percentage of missing entries per column
df.isna().sum().div(len(df)).mul(100)

adres_aantal_brp_adres                      0.0
adres_aantal_verschillende_wijken           0.0
adres_aantal_verzendadres                   0.0
adres_aantal_woonadres_handmatig            0.0
adres_dagen_op_adres                        0.0
                                           ... 
typering_transport__logistiek___tuinbouw    0.0
typering_zorg__schoonmaak___welzijn         0.0
Ja                                          0.0
Nee                                         0.0
checked                                     0.0
Length: 318, dtype: float64

In [7]:
# Analyzer object (provided by the src.utils star imports); the following cells use it to
# load, plot and export the feature importances.
fa = FeatureAnalyzer()

In [None]:
# Copy of the training frame with 'persoon_leeftijd_bij_onderzoek' set to 0 in every row.
# In the cells visible here it is only referenced by the commented-out evaluate_importance call.
df_modified = df.assign(persoon_leeftijd_bij_onderzoek=0)

In [None]:
# To recompute the importances instead of loading them (e.g. when fi_v1.pkl does not exist yet), uncomment:
# fa.evaluate_importance(dataframe=df_modified, target='checked', add_drop=['Ja', 'Nee'], filename="fi_v1.pkl")
# NOTE(review): load_importance presumably unpickles this file — only load files from a trusted source.
fa.load_importance(filepath='fi_v1.pkl')

In [None]:
# Feature matrix: every column except the target and the 'Ja'/'Nee' columns.
X = df.drop(columns=['checked', 'Ja', 'Nee'])
# Target vector.
y = df['checked']
# Plot the loaded importances labelled with the feature-matrix column names
# (min_val is forwarded to the helper; presumably a display threshold — confirm in FeatureAnalyzer).
fa.plot_importance(column_names=X.columns, min_val=0.1)

In [None]:
# Export the loaded importances as a dict; later cells sort its items by value, so it maps
# feature name -> score. normalize=True is forwarded to the helper
# (presumably rescales the scores — confirm in FeatureAnalyzer).
fad = fa.feature_importance_as_dict(column_names=X.columns, normalize=True)
print(fad)

In [None]:
# Load the data-description table (read with latin1 encoding).
# Later cells read its 'Relative importance' and 'Feature (nl)' columns.
description = pd.read_csv("../../data/data_description.csv", encoding='latin1')
description.head()

In [None]:
# Min-max scale the importances from the description table (overwrites the column in place),
# then keep only the rows whose scaled importance exceeds 0.1.
scaler = MinMaxScaler()
description['Relative importance'] = scaler.fit_transform(description[['Relative importance']])
filtered_description = description.loc[description['Relative importance'] > 0.1]

# Bar chart of the retained importances, labelled with the 'Feature (nl)' names.
plt.figure(figsize=(10, 8))
ax = filtered_description['Relative importance'].plot.bar()
ax.set_xticklabels(filtered_description['Feature (nl)'], rotation=90)
ax.set(
    title='Relative Importance of Features',
    xlabel='Features',
    ylabel='Relative Importance',
)
plt.tight_layout()
plt.show()

In [None]:
# Number of top-ranked features compared on each side.
TOP = 10

# Rank the dict of importances from highest to lowest score.
sorted_fad = dict(sorted(fad.items(), key=lambda item: item[1], reverse=True))
sorted_feature_names = list(sorted_fad)[:TOP]
sorted_importances_mean = list(sorted_fad.values())[:TOP]

# Rank the description table the same way and take its first TOP feature names.
filtered_description = filtered_description.sort_values(by='Relative importance', ascending=False)
filtered_description_sorted = filtered_description['Feature (nl)'].head(TOP)

most_important_features_new = set(sorted_feature_names)
most_important_features_original = set(filtered_description_sorted)
# Uncomment to inspect both sets:
# print("Most important features according to RF:")
# print(most_important_features_new)
# print("Most important features according to original data:")
# print(most_important_features_original)

# Share of the description table's top features that also appear in the top features of `fad`.
intersection = most_important_features_new & most_important_features_original
percentage_intersection = len(intersection) / len(most_important_features_original) * 100
print(f"Similarity between original importance and RF: {percentage_intersection:.2f}%")

In [None]:
def get_important_features(top, fad, filtered_description=None):
    """Return the names of the `top` highest-scoring features in `fad`.

    Args:
        top: Number of feature names to keep. It is applied as a slice bound, so a
            value larger than ``len(fad)`` simply returns every feature name.
        fad: Dict mapping feature name -> importance score.
        filtered_description: Unused. Accepted (and now optional) so existing
            three-argument calls keep working.

    Returns:
        A set with the selected feature names. Ties are resolved in favour of the
        feature that appears first in `fad`.
    """
    # The previous version also computed the overlap with `filtered_description` and then
    # discarded it; that dead code raised ZeroDivisionError whenever the description frame
    # was empty, so it has been removed.
    ranked = sorted(fad.items(), key=lambda item: item[1], reverse=True)
    return {name for name, _score in ranked[:top]}

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

In [None]:
# Candidate numbers of top-ranked features to compare.
feature_sizes = [10, 20, 40, 80, 160]
# Filled by the loop at the end of this cell: one held-out accuracy per entry of feature_sizes.
accuracies = []

def train(X, y, test_size=0.25, random_state=42):
    """Fit a small gradient-boosting pipeline and return its held-out accuracy.

    Args:
        X: Feature matrix accepted by scikit-learn's ``train_test_split``.
        y: Target labels aligned with `X`.
        test_size: Fraction of the rows held out for evaluation (default 0.25,
            the value that was previously hard-coded).
        random_state: Seed for the train/test split (default 42, the value that
            was previously hard-coded).

    Returns:
        Accuracy of the fitted pipeline on the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Pipeline: VarianceThreshold with its default settings, then a boosted ensemble of depth-1 trees.
    selector = VarianceThreshold()
    classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    pipeline = Pipeline(steps=[('feature selection', selector), ('classification', classifier)])

    # Fit on the training split only.
    pipeline.fit(X_train, y_train)

    # Score on the rows that were held out.
    y_pred = pipeline.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Baseline: accuracy when training on the full feature matrix.
original_accuracy = train(X, y)
print(f'Original Accuracy: {original_accuracy}')

# Train once per candidate size on the data produced by DataProcessor for the
# top-`size` features, and record each held-out accuracy in `accuracies`.
for size in feature_sizes:
    features = get_important_features(size, fad, filtered_description)
    
    # NOTE(review): DataProcessor is an opaque helper here; presumably remove_features=True
    # restricts the data to `features` — confirm in src.utils.preprocessing.
    preprocessor = DataProcessor(X, y, features)
    X_processed, y_processed = preprocessor.preprocess(remove_features=True)
    
    print(f'Processed data dimensions: {X_processed.shape}')
    
    processed_accuracy = train(X_processed, y_processed)
    accuracies.append(processed_accuracy)
    
# Pick the size with the highest accuracy (the first one wins on ties).
# NOTE(review): the choice is made on the same held-out split used to report the accuracy,
# so the printed best accuracy is an optimistic estimate.
best_number_of_features = feature_sizes[accuracies.index(max(accuracies))]
print(f'Best performing number of features: {best_number_of_features}')
print(f'Accuracy: {max(accuracies)}')

# Nested Cross Validation

In [None]:
# Rebuild the processed dataset with the feature count stored in best_number_of_features.
features = get_important_features(best_number_of_features, fad, filtered_description)
preprocessor = DataProcessor(X, y, features)
X_processed, y_processed = preprocessor.preprocess(remove_features=True)

In [None]:
# NOTE(review): `selector` is not passed to nested_cv in the cells visible here — confirm it is still needed.
selector = VarianceThreshold()
# Estimator passed to nested_cv together with the `parameters` grid.
classifier = GradientBoostingClassifier(random_state=0)

In [None]:
# Define hyperparameter grid
parameters = [
    {
        'max_depth': [1, 3, 5],
        'learning_rate': [0.01, 0.1, 1.0],
        'n_estimators': [100, 250, 500],
    }
]

In [None]:
# Nested cross-validation on the full feature matrix; nested_cv returns a pair that is
# unpacked as (mean, std) and printed as the accuracy with its spread in parentheses.
mean, std = nested_cv(X, y, classifier, parameters)
print('Accuracy of original model: %.3f (%.3f)' % (mean, std))

In [None]:
# Same nested cross-validation, this time on the processed (feature-selected) data,
# using the same estimator and hyperparameter grid.
mean_processed, std_processed = nested_cv(X_processed, y_processed, classifier, parameters)
print('Accuracy of processed model: %.3f (%.3f)' % (mean_processed, std_processed))

## Evaluation of the Best Model

In [None]:
import pandas as pd

In [8]:
# NOTE(review): this rebinds `df`, replacing the training frame loaded at the top of the notebook.
# The saved output of this cell is a FileNotFoundError for this path, so the experiment CSV
# must exist before the following cells can run against it.
df = pd.read_csv('../../data/Experiment_persoon_geslacht_vrouw/0.csv')
df.head(10)

FileNotFoundError: [Errno 2] No such file or directory: '../../data/Experiment_persoon_geslacht_vrouw/0.csv'

In [12]:
def filter_features(features, keywords):
    """Return, in input order, the features that contain at least one of the keywords.

    Args:
        features: Iterable of feature names.
        keywords: Iterable of substrings to look for in each name.

    Returns:
        List of the matching feature names.
    """
    matches = []
    for name in features:
        for needle in keywords:
            if needle in name:
                # One hit is enough; stop checking the remaining keywords.
                matches.append(name)
                break
    return matches

In [13]:
# Feature matrix: every column except the target and the 'Ja'/'Nee' columns.
X = df.drop(columns=['checked', 'Ja', 'Nee'])
# Target vector.
y = df['checked']

In [17]:
# Keep only the columns whose name contains one of these substrings,
# then show how many matched and which ones.
filtered_features = filter_features(X.columns, ["adres", "taal", "geslacht", "leeftijd", "persoonlijke"])
print(len(filtered_features))
print(filtered_features)

69
['adres_aantal_brp_adres', 'adres_aantal_verschillende_wijken', 'adres_aantal_verzendadres', 'adres_aantal_woonadres_handmatig', 'adres_dagen_op_adres', 'adres_recentst_onderdeel_rdam', 'adres_recentste_buurt_groot_ijsselmonde', 'adres_recentste_buurt_nieuwe_westen', 'adres_recentste_buurt_other', 'adres_recentste_buurt_oude_noorden', 'adres_recentste_buurt_vreewijk', 'adres_recentste_plaats_other', 'adres_recentste_plaats_rotterdam', 'adres_recentste_wijk_charlois', 'adres_recentste_wijk_delfshaven', 'adres_recentste_wijk_feijenoord', 'adres_recentste_wijk_ijsselmonde', 'adres_recentste_wijk_kralingen_c', 'adres_recentste_wijk_noord', 'adres_recentste_wijk_other', 'adres_recentste_wijk_prins_alexa', 'adres_recentste_wijk_stadscentru', 'adres_unieke_wijk_ratio', 'afspraak_afgelopen_jaar_monitoring_insp__wet_taaleis_na_12_mnd_n_a_v__taa04_____geen_maatregel', 'afspraak_afgelopen_jaar_ontheffing_taaleis', 'afspraak_verzenden_beschikking_i_v_m__niet_voldoen_aan_wet_taaleis', 'belemmeri