In [29]:

# Imports
import sys
sys.executable
import numpy as np
import requests # for downloading webpages
from bs4 import BeautifulSoup  # for parsing HTML
import pandas as pd # for storing and handling datasets
import time # for adding delays between requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.inspection import permutation_importance
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from yellowbrick.cluster import KElbowVisualizer


In [5]:
df_score = pd.read_pickle("sustainability_scores.pkl")

In [6]:
df_ml = df_score.copy()

In [7]:
df_ml.info

<bound method DataFrame.info of         Id      Brand                                  Product_Name     Price  \
0        1        H&M              Jacquard-knit merino wool jumper   € 79.99   
1        2        H&M                               Oversize Jumper   € 24.99   
2        3       Zara                              Soft Knit Jumper   € 25.95   
3        4       Zara                     Cashmere Oversize Sweater  € 139.00   
4        5  Patagonia  Women's Recycled Wool-Blend Crewneck Sweater  € 160.00   
...    ...        ...                                           ...       ...   
1001  1002       Zara                               Jacket Slim Fit   € 83.95   
1002  1003  Patagonia                               Jumper Textured  € 113.09   
1003  1004  Patagonia                             Jumper Asymmetric  € 108.94   
1004  1005  Patagonia                                 Jumper Ribbed  € 224.17   
1005  1006        H&M                              Jacket Oversized   € 27.97

In [8]:
df_ml.dtypes

Id                             int64
Brand                         object
Product_Name                  object
Price                         object
Category                      object
Subcategory                   object
Score_100                    float64
S_final                      float64
S_env                        float64
Score_env_burden             float64
Certification_Total          float64
Material_CO2_norm            float64
Material_Water_norm          float64
Material_Energy_norm         float64
Material_Chemical_norm       float64
Care_CO2_norm                float64
Care_Water_norm              float64
Care_Energy_norm             float64
Origin_Grid_norm             float64
Origin_Transport_norm        float64
Origin_Manufacturing_norm    float64
dtype: object

In [9]:
df_ml["Id"].duplicated().sum()

np.int64(0)

In [14]:
df_ml.isna().sum().sum()

np.int64(0)

In [13]:
df_ml.head()

Unnamed: 0,Id,Brand,Product_Name,Price,Category,Subcategory,Score_100,S_final,S_env,Score_env_burden,...,Material_CO2_norm,Material_Water_norm,Material_Energy_norm,Material_Chemical_norm,Care_CO2_norm,Care_Water_norm,Care_Energy_norm,Origin_Grid_norm,Origin_Transport_norm,Origin_Manufacturing_norm
0,1,H&M,Jacquard-knit merino wool jumper,79.99,Woman,Jumper,76.0,0.760372,0.610372,0.389628,...,0.442509,0.294938,0.779978,0.72885,0.0,0.0,0.0,0.65,0.4,0.6
1,2,H&M,Oversize Jumper,24.99,Woman,Jumper,44.0,0.437647,0.437647,0.562353,...,0.181185,0.024195,0.377338,0.390817,1.0,1.0,1.0,0.65,0.4,0.6
2,3,Zara,Soft Knit Jumper,25.95,Woman,Jumper,71.0,0.711008,0.461008,0.538992,...,0.133275,0.025011,0.264026,0.317607,1.0,1.0,1.0,0.65,0.4,0.6
3,4,Zara,Cashmere Oversize Sweater,139.0,Woman,Jumper,50.0,0.499217,0.499217,0.500783,...,1.0,0.25783,1.0,1.0,0.0,0.0,0.0,0.55,0.45,0.75
4,5,Patagonia,Women's Recycled Wool-Blend Crewneck Sweater,160.0,Woman,sweater,100.0,1.0,0.865,0.135,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.45,0.35,0.55


In [10]:
df_ml.isna().sum().sort_values(ascending=False).head(10)

Id                  0
Brand               0
Product_Name        0
Price               0
Category            0
Subcategory         0
Score_100           0
S_final             0
S_env               0
Score_env_burden    0
dtype: int64

In [11]:
df_ml["Price"] = (
    df_ml["Price"]
    .str.replace("€", "", regex=False)
    .str.strip()
)

df_ml["Price"] = pd.to_numeric(df_ml["Price"], errors="coerce")

In [12]:
df_ml["Price"].dtype

dtype('float64')

In [16]:
list(df_ml.columns)

['Id',
 'Brand',
 'Product_Name',
 'Price',
 'Category',
 'Subcategory',
 'Score_100',
 'S_final',
 'S_env',
 'Score_env_burden',
 'Certification_Total',
 'Material_CO2_norm',
 'Material_Water_norm',
 'Material_Energy_norm',
 'Material_Chemical_norm',
 'Care_CO2_norm',
 'Care_Water_norm',
 'Care_Energy_norm',
 'Origin_Grid_norm',
 'Origin_Transport_norm',
 'Origin_Manufacturing_norm']

In [23]:
cols_to_drop = [
    "Id",
    "Product_Name",
    
    # Composite sustainability scores → leakage
    "Score_100",
    "S_final",
    "S_env",
    "Score_env_burden",
    
    # Post-purchase impacts → not price drivers
    #"Care_CO2_norm",
    #"Care_Water_norm",
    #"Care_Energy_norm"
]

df_ml = df_ml.drop(columns=cols_to_drop, errors="ignore")

In [25]:
print("Rows:", df_ml.shape[0])
print("Columns:", df_ml.shape[1])


Rows: 1006
Columns: 21


In [26]:
list(df_ml.columns)


['Price',
 'Certification_Total',
 'Material_CO2_norm',
 'Material_Water_norm',
 'Material_Energy_norm',
 'Material_Chemical_norm',
 'Care_CO2_norm',
 'Care_Water_norm',
 'Care_Energy_norm',
 'Origin_Grid_norm',
 'Origin_Transport_norm',
 'Origin_Manufacturing_norm',
 'Brand_Patagonia',
 'Brand_Penneys',
 'Brand_Zara',
 'Category_Woman',
 "Category_Woman's Knitwear",
 'Subcategory_Jumper',
 'Subcategory_Sweater',
 'Subcategory_Tshirt',
 'Subcategory_sweater']