In [1]:
import pickle as pk

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from scipy.stats import chi2_contingency
from sklearn import feature_selection
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score,ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,precision_recall_curve
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,cross_val_predict
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Load the two raw CSV exports from the working directory.
# The second file is read as Latin-1 instead of pandas' default encoding.
data_prof = pd.read_csv(filepath_or_buffer='Data_Arbre.csv')
data_debase = pd.read_csv(
    filepath_or_buffer='Patrimoine_Arbore.csv',
    encoding='latin1',
)

In [None]:
# Columns kept for modelling: the target (fk_arb_etat) followed by the candidate features.
cols = ["fk_arb_etat","tronc_diam","haut_tot","fk_stadedev","age_estim", "clc_quartier", "feuillage", "fk_port", "fk_nomtech"]

# Work on an explicit copy: the edits below must not act on a slice of data_prof
# (that is what triggers pandas' SettingWithCopyWarning and makes the result ambiguous).
data = data_prof[cols].copy()

# Discard the rows whose state is not one of the two classes being modelled.
index = data[data["fk_arb_etat"].isin(["SUPPRIMÉ", "ABATTU", "EN PLACE", "REMPLACÉ"])].index
data = data.drop(index=index)

# Binary target: 1 = "Essouché", 0 = "Non essouché".
state_codes = {"Essouché": 1, "Non essouché": 0}

# Fail early with a readable message if a label other than the two above is left;
# otherwise the integer cast below would fail with a far less helpful error.
unexpected_states = set(data["fk_arb_etat"].unique()) - set(state_codes)
if unexpected_states:
    raise ValueError(f"Unexpected fk_arb_etat values: {unexpected_states}")

data["fk_arb_etat"] = data["fk_arb_etat"].map(state_codes).astype(int)

In [None]:
# Encoding
# (pandas' get_dummies would produce a one-hot encoding instead.)
# Each listed text column is replaced by the numeric codes produced by OrdinalEncoder.
# NOTE(review): the encoder is fitted on the whole frame, i.e. before the
# train/test split performed further down — confirm this is acceptable.
lst_col = ['fk_stadedev', 'clc_quartier', 'feuillage', 'fk_port', 'fk_nomtech']
encoder = OrdinalEncoder()

temp = data[lst_col]
encoder.fit(temp)

data[lst_col] = encoder.transform(temp)

In [None]:
# Separate the target column from the predictors.
Y_data = data["fk_arb_etat"]
X_data = data.drop("fk_arb_etat", axis=1)

# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_data,
    Y_data,
    test_size=0.2,
    stratify=Y_data,
    random_state=42,
)

In [None]:
# Forest trained on every predictor; it is only used afterwards to read
# the feature importances.
forest_feature = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)
forest_feature.fit(X_train, Y_train)

In [None]:
# Importances learned by the forest, one value per training column.
importances = forest_feature.feature_importances_

# Take the feature names from the frame the forest was actually fitted on rather
# than from a hand-maintained list: the i-th importance always belongs to the
# i-th column of X_train, so the labels cannot drift out of sync.
cols_lst_feature = list(X_train.columns)

feature_names = X_train.columns
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

In [None]:
# Order the table from the most to the least important feature, then show it.
feature_importance_df = feature_importance_df.sort_values(
    by=['Importance'],
    ascending=[False],
)
print(feature_importance_df)

In [None]:
# Number of features kept for the reduced models.
N_TOP_FEATURES = 5

# Keep the first N_TOP_FEATURES rows of the importance table.
# This relies on feature_importance_df already being sorted by descending
# importance; .head() makes the positional selection explicit.
top_features = feature_importance_df['Feature'].head(N_TOP_FEATURES).to_numpy()

# Restrict both splits to the retained columns.
X_train_selected = X_train[top_features]
X_test_selected = X_test[top_features]

In [None]:
# Same forest configuration as before, this time fitted on the selected
# features only.
featured_model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)
featured_model.fit(X_train_selected, Y_train)

In [None]:
# Out-of-fold predictions on the training split (3-fold cross-validation):
# every row is predicted by a model that did not see it during fitting.
classes = cross_val_predict(featured_model, X_train_selected, Y_train, cv=3, n_jobs=-1)

# normalize='true' divides each row by the number of samples of that true class,
# so the diagonal holds the per-class recall.
matrice = confusion_matrix(Y_train, classes, normalize='true')

print(matrice)

# Explicit figure/axes with a title and axis labels so the plot can be read on its own.
# Class 0 = "Non essouché", class 1 = "Essouché".
fig, ax = plt.subplots()
ax.matshow(matrice, cmap=plt.cm.gray)
ax.set(
    title="Cross-validated confusion matrix (selected features)",
    xlabel="Predicted class",
    ylabel="True class",
)
plt.show()

In [None]:
# Oversample the minority class of the training split with SMOTE.
# With the default sampling_strategy the classes come out balanced.
# n_jobs is deliberately not passed: imbalanced-learn deprecated that argument
# in 0.10 and later removed it; it only parallelised the neighbour search, so
# the resampled data is unchanged.
# NOTE(review): SMOTE interpolates between rows; if ordinal-encoded categorical
# columns are among the selected features the synthetic codes may be
# non-integer — consider SMOTENC.
sm = SMOTE(random_state=42)
X_smote, Y_smote = sm.fit_resample(X_train_selected, Y_train)

In [None]:
# Forest fitted on the SMOTE-resampled training data.
smoted_model = RandomForestClassifier(
    n_estimators=300,
    random_state=42,
    n_jobs=-1,
)
smoted_model.fit(X_smote, Y_smote)

In [None]:
# Cross-validate the SMOTE + forest combination without leaking data.
# Cross-validating directly on the already resampled data would put synthetic
# points (interpolated from training rows) into the validation folds and score
# the model on an artificial class balance, which inflates the metrics.
# Wrapping the sampler and the classifier in an imbalanced-learn Pipeline makes
# SMOTE run on the training part of each fold only; the validation part keeps
# the original, untouched rows. cross_val_predict clones the pipeline, so the
# already fitted sm / smoted_model objects are not modified.
smote_cv_pipeline = Pipeline([("smote", sm), ("forest", smoted_model)])
classes = cross_val_predict(smote_cv_pipeline, X_train_selected, Y_train, cv=3, n_jobs=-1)

# Row-normalised confusion matrix: the diagonal holds the per-class recall.
matrice = confusion_matrix(Y_train, classes, normalize='true')

print(matrice)

# Class 0 = "Non essouché", class 1 = "Essouché".
fig, ax = plt.subplots()
ax.matshow(matrice, cmap=plt.cm.gray)
ax.set(
    title="Cross-validated confusion matrix (SMOTE inside each fold)",
    xlabel="Predicted class",
    ylabel="True class",
)
plt.show()

In [None]:
# Accuracy of the SMOTE-trained forest on the held-out test split,
# printed with four decimal places.
y_predicted = smoted_model.predict(X_test_selected)
score = accuracy_score(y_true=Y_test, y_pred=y_predicted)

print(f"{score:.4f}")

In [None]:
# Row-normalised confusion matrix on the test split (each row sums to 1).
confusion_matrix(y_true=Y_test, y_pred=y_predicted, normalize='true')