In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression, ElasticNet
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from boruta import BorutaPy
from scipy.stats import pearsonr, spearmanr, kendalltau


In [2]:
#Preprocessing
data = pd.read_csv(r'\Users\yazda\Desktop\AyeAye2024\Aye-Aye-Project\data\LEK\LEK_data_grouped.csv', delimiter=';')
data.head(10)
data.columns
data['nr.month.low.supply'].fillna(data['nr.month.low.supply'].median(), inplace=True)
columns_to_cast = [
    'Forest.cover2018',
    'Forest.cover1990',
    'Forestloss_total',
    'Forestloss_relative',
]
for column in columns_to_cast:
    data[column] = data[column].str.replace(',', '.').astype('float64')
# age columns removed 
selected_columns_dm = data[['Clove.benefit.score','nr.month.low.supply','Region', 'Village', 'Gender',
       'schooling.years', 'Ethnic.group', 'Migration.Experience',
       'Farms.Cloves', 'encountered.binary',
       'ID.vis.Aye-aye', 'ID.nom', 'Knowledge.non.benefit.score.max.6',
       'DK.score', 'Diet.Insects', 'Knowledge.signs', 'Remoteness',
       'village.size', 'School', 'Forest.cover2018', 'Forest.cover1990',
       'Forestloss_total', 'Forestloss_relative', 'Farms.Paddy.Rice',
       'Farms.Tavy.Rice', 'Extracts.Medical.plants', 'FES.provisioning',
       'FES.regulating', 'FES.cultural', 'FES.supporting', 'ID.vis.Indri',
       'ID.vis.Microcebus', 'Aye.aye.protected']]
## Removed 'Region', 'Village', 'Gender', 'Age', 'schooling.years', 'Ethnic.group', 'Migration.Experience'
selected_columns_ndm = data[['Clove.benefit.score','nr.month.low.supply','Farms.Cloves', 'encountered.binary',
       'ID.vis.Aye-aye', 'ID.nom', 'Knowledge.non.benefit.score.max.6',
       'DK.score', 'Diet.Insects', 'Knowledge.signs', 'Remoteness',
       'village.size', 'School', 'Forest.cover2018', 'Forest.cover1990',
       'Forestloss_total', 'Forestloss_relative', 'Farms.Paddy.Rice',
       'Farms.Tavy.Rice', 'Extracts.Medical.plants', 'FES.provisioning',
       'FES.regulating', 'FES.cultural', 'FES.supporting', 'ID.vis.Indri',
       'ID.vis.Microcebus', 'Aye.aye.protected']]
selected_columns_ndm_noforest = data[['Clove.benefit.score','nr.month.low.supply','Farms.Cloves', 'encountered.binary',
       'ID.vis.Aye-aye', 'ID.nom', 'Knowledge.non.benefit.score.max.6',
       'DK.score', 'Diet.Insects', 'Knowledge.signs', 'Remoteness',
       'village.size', 'School',
       'Farms.Paddy.Rice',
       'Farms.Tavy.Rice', 'Extracts.Medical.plants', 'FES.provisioning',
       'FES.regulating', 'FES.cultural', 'FES.supporting', 'ID.vis.Indri',
       'ID.vis.Microcebus', 'Aye.aye.protected']]
selected_columns_ndm_noforest_ndk = data[['Clove.benefit.score','nr.month.low.supply','Farms.Cloves', 'encountered.binary',
       'ID.vis.Aye-aye', 'ID.nom', 'Knowledge.non.benefit.score.max.6',
       'Diet.Insects', 'Knowledge.signs', 'Remoteness',
       'village.size', 'School',
       'Farms.Paddy.Rice',
       'Farms.Tavy.Rice', 'Extracts.Medical.plants', 'FES.provisioning',
       'FES.regulating', 'FES.cultural', 'FES.supporting', 'ID.vis.Indri',
       'ID.vis.Microcebus', 'Aye.aye.protected']]

In [3]:
column_dtypes = selected_columns_dm.dtypes
print(column_dtypes)

Clove.benefit.score                    int64
nr.month.low.supply                  float64
Region                                 int64
Village                                int64
Gender                                 int64
schooling.years                        int64
Ethnic.group                           int64
Migration.Experience                   int64
Farms.Cloves                           int64
encountered.binary                     int64
ID.vis.Aye-aye                         int64
ID.nom                                 int64
Knowledge.non.benefit.score.max.6      int64
DK.score                               int64
Diet.Insects                           int64
Knowledge.signs                        int64
Remoteness                             int64
village.size                           int64
School                                 int64
Forest.cover2018                     float64
Forest.cover1990                     float64
Forestloss_total                     float64
Forestloss

In [4]:
for column in selected_columns_dm.columns:
    unique_values = selected_columns_dm[column].unique()
    print(f"Unique values in '{column}':\n{unique_values}\n")

Unique values in 'Clove.benefit.score':
[1 0]

Unique values in 'nr.month.low.supply':
[ 8.  6.  3.  5.  1.  2.  7.  4. 10.  9. 12. 11.  0.]

Unique values in 'Region':
[0 1 2 3]

Unique values in 'Village':
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]

Unique values in 'Gender':
[1 0]

Unique values in 'schooling.years':
[ 0 10  2  9  5  4  6  8 11  1  3 12  7]

Unique values in 'Ethnic.group':
[1 3 2]

Unique values in 'Migration.Experience':
[0 1]

Unique values in 'Farms.Cloves':
[1 0]

Unique values in 'encountered.binary':
[0 1]

Unique values in 'ID.vis.Aye-aye':
[0 1]

Unique values in 'ID.nom':
[0 1]

Unique values in 'Knowledge.non.benefit.score.max.6':
[4 5 3 6 0 1 2]

Unique values in 'DK.score':
[ 2  0  1  3 10  4  7  9  5  8 14 11  6 13 12]

Unique values in 'Diet.Insects':
[1 0]

Unique values in 'Knowledge.signs':
[0 1]

Unique values in 'Remoteness':
[1 5 4 3 2]

Unique values in 'village.size':
[4 2 1 5 3]

Unique values in 'School':
[1 0]

Unique 

In [5]:

def feature_selection_and_correlation(d, target_col, n_features=7):
    # Data Preparation    
    X = d.drop(columns=[target_col])
    y = d[target_col]
    
    # Apply StandardScaler
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    
    results = {}
    
    Boruta Algorithm
    rf_boruta = RandomForestRegressor(n_jobs=-1, max_depth=5)
    boruta = BorutaPy(rf_boruta, n_estimators='auto', verbose=0, random_state=42)
    boruta.fit(X_scaled.values, y.values)
    boruta_features = X_scaled.columns[boruta.support_].tolist()
    results['boruta'] = boruta_features[:min(n_features, len(boruta_features))]
    
    # XGBoost
    xgb = XGBRegressor(n_estimators=100)
    xgb.fit(X_scaled, y)
    xgb_importance = pd.DataFrame({'feature': X_scaled.columns, 'importance': xgb.feature_importances_})
    xgb_importance = xgb_importance.sort_values('importance', ascending=False)
    results['xgboost'] = xgb_importance['feature'].tolist()[:min(n_features, len(X_scaled.columns))]
    
    # Random Forest
    rf = RandomForestRegressor(n_estimators=100)
    rf.fit(X_scaled, y)
    rf_importance = pd.DataFrame({'feature': X_scaled.columns, 'importance': rf.feature_importances_})
    rf_importance = rf_importance.sort_values('importance', ascending=False)
    results['random_forest'] = rf_importance['feature'].tolist()[:min(n_features, len(X_scaled.columns))]
    
    # Logistic Regression
    lr = LogisticRegression(penalty='l1', solver='liblinear')
    lr.fit(X_scaled, y)
    lr_importance = pd.DataFrame({'feature': X_scaled.columns, 'importance': np.abs(lr.coef_[0])})
    lr_importance = lr_importance.sort_values('importance', ascending=False)
    results['logistic_regression'] = lr_importance['feature'].tolist()[:min(n_features, len(X_scaled.columns))]
    
    # Elastic Net
    en = ElasticNet(alpha=1, l1_ratio=0.5)
    en.fit(X_scaled, y)
    en_importance = pd.DataFrame({'feature': X_scaled.columns, 'importance': np.abs(en.coef_)})
    en_importance = en_importance.sort_values('importance', ascending=False)
    results['elastic_net'] = en_importance['feature'].tolist()[:min(n_features, len(X_scaled.columns))]
    
    # Calculate correlations
    correlations = {}
    for method, features in results.items():
        pearson = [pearsonr(X_scaled[feature], y)[0] for feature in features]
        spearman = [spearmanr(X_scaled[feature], y)[0] for feature in features]
        kendall = [kendalltau(X_scaled[feature], y)[0] for feature in features]
        correlations[method] = pd.DataFrame({
            'Feature': features,
            'Pearson': pearson,
            'Spearman': spearman,
            'Kendall': kendall
        }).sort_values(by='Pearson', ascending=False)
    
    return {'selected_features': results, 'correlations': correlations}

In [15]:
result_DM = feature_selection_and_correlation(selected_columns_dm, 'Clove.benefit.score')
print(result_DM['selected_features'])
print(result_DM['correlations'])

{'xgboost': ['DK.score', 'Diet.Insects', 'Forestloss_total', 'Ethnic.group', 'Forestloss_relative', 'Gender', 'Farms.Paddy.Rice'], 'random_forest': ['DK.score', 'schooling.years', 'nr.month.low.supply', 'Knowledge.non.benefit.score.max.6', 'Diet.Insects', 'Village', 'Gender'], 'logistic_regression': ['DK.score', 'Gender', 'Village', 'Diet.Insects', 'Remoteness', 'schooling.years', 'Ethnic.group'], 'elastic_net': ['nr.month.low.supply', 'Region', 'ID.vis.Microcebus', 'ID.vis.Indri', 'FES.supporting', 'FES.cultural', 'FES.regulating']}
{'xgboost':                Feature   Pearson  Spearman   Kendall
1         Diet.Insects  0.388766  0.388766  0.388766
5               Gender  0.341434  0.341434  0.341434
2     Forestloss_total  0.098168  0.086480  0.072315
6     Farms.Paddy.Rice  0.098073  0.098073  0.098073
4  Forestloss_relative  0.014440  0.005523  0.004618
3         Ethnic.group -0.133697 -0.128074 -0.126196
0             DK.score -0.432866 -0.450332 -0.385017, 'random_forest':       

In [16]:
result_NDM = feature_selection_and_correlation(selected_columns_ndm, 'Clove.benefit.score')
print(result_NDM['selected_features'])
print(result_NDM['correlations'])

{'xgboost': ['DK.score', 'Diet.Insects', 'Farms.Cloves', 'ID.vis.Microcebus', 'Knowledge.signs', 'village.size', 'Farms.Tavy.Rice'], 'random_forest': ['DK.score', 'nr.month.low.supply', 'Knowledge.non.benefit.score.max.6', 'Diet.Insects', 'Forestloss_total', 'Forest.cover1990', 'Forestloss_relative'], 'logistic_regression': ['DK.score', 'Diet.Insects', 'ID.vis.Microcebus', 'Remoteness', 'Extracts.Medical.plants', 'ID.vis.Aye-aye', 'Farms.Cloves'], 'elastic_net': ['nr.month.low.supply', 'Farms.Cloves', 'ID.vis.Microcebus', 'ID.vis.Indri', 'FES.supporting', 'FES.cultural', 'FES.regulating']}
{'xgboost':              Feature   Pearson  Spearman   Kendall
1       Diet.Insects  0.388766  0.388766  0.388766
3  ID.vis.Microcebus  0.265376  0.265376  0.265376
2       Farms.Cloves  0.140299  0.140299  0.140299
4    Knowledge.signs  0.136833  0.136833  0.136833
5       village.size  0.070499  0.073070  0.066058
6    Farms.Tavy.Rice  0.063320  0.063320  0.063320
0           DK.score -0.432866 -0.

In [17]:
result_NDM_nf = feature_selection_and_correlation(selected_columns_ndm_noforest, 'Clove.benefit.score')
print(result_NDM_nf['selected_features'])
print(result_NDM_nf['correlations'])

{'xgboost': ['DK.score', 'Farms.Cloves', 'Diet.Insects', 'FES.cultural', 'ID.vis.Microcebus', 'Extracts.Medical.plants', 'village.size'], 'random_forest': ['DK.score', 'nr.month.low.supply', 'Knowledge.non.benefit.score.max.6', 'Diet.Insects', 'village.size', 'Remoteness', 'ID.vis.Indri'], 'logistic_regression': ['DK.score', 'Diet.Insects', 'ID.vis.Microcebus', 'Extracts.Medical.plants', 'ID.vis.Aye-aye', 'Farms.Cloves', 'ID.vis.Indri'], 'elastic_net': ['nr.month.low.supply', 'Farms.Cloves', 'ID.vis.Microcebus', 'ID.vis.Indri', 'FES.supporting', 'FES.cultural', 'FES.regulating']}
{'xgboost':                    Feature   Pearson  Spearman   Kendall
2             Diet.Insects  0.388766  0.388766  0.388766
4        ID.vis.Microcebus  0.265376  0.265376  0.265376
5  Extracts.Medical.plants  0.184300  0.184300  0.184300
1             Farms.Cloves  0.140299  0.140299  0.140299
6             village.size  0.070499  0.073070  0.066058
3             FES.cultural  0.062553  0.062553  0.062553
0 

In [18]:
result_NDM_nf_ndk = feature_selection_and_correlation(selected_columns_ndm_noforest_ndk, 'Clove.benefit.score')
print(result_NDM_nf_ndk['selected_features'])
print(result_NDM_nf_ndk['correlations'])

{'xgboost': ['Diet.Insects', 'FES.supporting', 'Farms.Cloves', 'School', 'ID.vis.Microcebus', 'Knowledge.signs', 'ID.nom'], 'random_forest': ['Knowledge.non.benefit.score.max.6', 'nr.month.low.supply', 'Diet.Insects', 'village.size', 'Remoteness', 'ID.vis.Microcebus', 'Aye.aye.protected'], 'logistic_regression': ['Diet.Insects', 'Knowledge.non.benefit.score.max.6', 'ID.vis.Microcebus', 'Extracts.Medical.plants', 'ID.vis.Aye-aye', 'Aye.aye.protected', 'ID.vis.Indri'], 'elastic_net': ['nr.month.low.supply', 'Farms.Paddy.Rice', 'ID.vis.Microcebus', 'ID.vis.Indri', 'FES.supporting', 'FES.cultural', 'FES.regulating']}
{'xgboost':              Feature   Pearson  Spearman   Kendall
0       Diet.Insects  0.388766  0.388766  0.388766
4  ID.vis.Microcebus  0.265376  0.265376  0.265376
6             ID.nom  0.147828  0.147828  0.147828
2       Farms.Cloves  0.140299  0.140299  0.140299
5    Knowledge.signs  0.136833  0.136833  0.136833
3             School  0.130893  0.130893  0.130893
1     FES.