In [38]:

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

import warnings
warnings.filterwarnings('ignore')


In [39]:
#  Loading storage data
def load_data():
    df = pd.read_csv("data3.csv")
    #df = pd.read_csv("data4.csv")
    # Renaming columns
    column_mapping = {col: col.strip().lower().replace('&', 'and') for col in df.columns}
    return df.rename(columns=column_mapping)

In [40]:
#seperating the company name and number(year) from the column name "Company"
def split_company_name_and_year(df):
    df[['company', 'year']] = df['company'].str.extract(r'^(.*?)-(\d{4})$')
#extract year is after "-"


In [41]:
# Advanced preprocessing
def advanced_preprocessing(df):
    df = df[df['esg_score'].notna()].copy()
    df['year'] = pd.to_numeric(df['year'], errors='coerce')
    
    # Feature engineering of different year
    df['years_active'] = df.groupby('company')['year'].transform(lambda x: x - x.min() + 1)
    df['year_diff'] = df.groupby('company')['year'].transform(lambda x: x.diff().fillna(1))
    
    # Auto fill missing values
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    for col in num_cols:
        df[col] = df.groupby('company')[col].transform(
            lambda x: x.fillna(x.mean() if x.mean() > 0 else 0)
        )
    
    return df

In [42]:
# preapre data
df = load_data()
split_company_name_and_year(df)
processed_df = advanced_preprocessing(df)

# features engineering
X = processed_df.drop(columns=['esg_score', 'company', 'year'])
y = processed_df['esg_score']
groups = processed_df['company']


In [43]:
#calculate the average of each column 
def calculate_average(df):
    return df.mean()

# create more random data based on the average of the each column 
def create_random_data(df, num_samples=1000):
    averages = calculate_average(df)
    random_data = pd.DataFrame(np.random.normal(loc=averages, scale=0.1, size=(num_samples, len(averages))), columns=df.columns)
    return random_data



In [44]:
# Define the model
models = {
   
'XGBoost1': make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    XGBRegressor(
        n_estimators=1000,
        max_depth=2,
        learning_rate=0.1,
        random_state=42
    )
),

}


In [45]:
# Cross-validation
gkf = GroupKFold(n_splits=3)
print("Model evaluation：")
for name, model in models.items():
    mae_scores = []
    for train_idx, test_idx in gkf.split(X, y, groups):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        mae_scores.append(mean_absolute_error(y_test, preds))
    
    print(f"{name:15} | MAE: {np.mean(mae_scores):.2f} ± {np.std(mae_scores):.2f}")

Model evaluation：
XGBoost1        | MAE: 1.54 ± 0.02


In [46]:
# analysis of feature importance
print("\nXGBoost Top 5：")
rf_model = models['XGBoost1'].steps[2][1]
importances = pd.Series(rf_model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(5))



XGBoost Top 5：
certification                    0.635102
water_consumption_intensities    0.048207
women_in_the_management_team     0.042299
total_waste_generated            0.038474
energy_consumption_intensitie    0.036471
dtype: float32
