# Importing Libraries

In [67]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Loading & Preprocessing the data

In [68]:
# LOAD DATA

# Forward slashes are understood by Windows as well as POSIX systems, so the
# notebook is not tied to a single operating system.
df = pd.read_csv("Backend/Ecopack_dataset.csv")


# HANDLE MISSING VALUES

# Numeric columns: fill gaps with the per-column median.
# NOTE(review): the medians/modes are computed on the full dataset, before the
# train/test split performed later in this notebook, so held-out rows influence
# the fill values. Consider imputing inside the model pipelines instead.
num_cols = df.select_dtypes(include=np.number).columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Non-numeric columns: fill gaps with the most frequent value.
cat_cols = df.select_dtypes(exclude=np.number).columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() returns an empty Series when the column has no non-null values;
    # indexing it would raise, so such a column is left as it is.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])


In [69]:

# FEATURES & TARGETS

# Material properties used as inputs by both models.
X = df.loc[:, [
    "strength",
    "weight_capacity",
    "recyclability_percentage",
    "biodegradability_score",
]]

# One target column per model: cost and CO2 score.
y_cost, y_co2 = df["cost"], df["co2_score"]

# TRAIN-TEST SPLIT

In [70]:

# TRAIN-TEST SPLIT

# Both targets are split with identical settings (20% held out, seed 42).
split_options = {"test_size": 0.2, "random_state": 42}

# Split for the cost model.
X_train, X_test, y_cost_train, y_cost_test = train_test_split(X, y_cost, **split_options)

# Split for the CO2 model.
X_train2, X_test2, y_co2_train, y_co2_test = train_test_split(X, y_co2, **split_options)


# COST MODEL PIPELINE

In [71]:

# COST MODEL PIPELINE  (SCALER + MODEL TOGETHER)

# The scaler is a pipeline step, so no separate scaler object has to be
# fitted, stored or applied by hand.
cost_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
cost_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", cost_regressor),
])

# The pipeline is fitted on the raw (unscaled) training features.
cost_pipeline.fit(X_train, y_cost_train)

# Predictions for the held-out rows.
y_cost_pred = cost_pipeline.predict(X_test)

# Error metrics on the held-out rows.
cost_mae = mean_absolute_error(y_cost_test, y_cost_pred)
cost_rmse = np.sqrt(mean_squared_error(y_cost_test, y_cost_pred))
cost_r2 = r2_score(y_cost_test, y_cost_pred)

print("---- COST MODEL ----")
print("MAE:", cost_mae)
print("RMSE:", cost_rmse)
print("R2:", cost_r2)



---- COST MODEL ----
MAE: 1.0517948717948717
RMSE: 1.2932480807217919
R2: 0.7201476127612763


# CO2 MODEL PIPELINE

In [72]:

# CO2 MODEL PIPELINE (SCALER + MODEL TOGETHER)

# This pipeline carries its own scaler step, independent of the cost pipeline.
co2_regressor = XGBRegressor(n_estimators=90, learning_rate=0.1, random_state=42)
co2_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", co2_regressor),
])

# The pipeline is fitted on the raw (unscaled) training features.
co2_pipeline.fit(X_train2, y_co2_train)

# Predictions for the held-out rows.
y_co2_pred = co2_pipeline.predict(X_test2)

# Error metrics on the held-out rows.
co2_mae = mean_absolute_error(y_co2_test, y_co2_pred)
co2_rmse = np.sqrt(mean_squared_error(y_co2_test, y_co2_pred))
co2_r2 = r2_score(y_co2_test, y_co2_pred)

print("\n---- CO2 MODEL ----")
print("MAE:", co2_mae)
print("RMSE:", co2_rmse)
print("R2:", co2_r2)




---- CO2 MODEL ----
MAE: 0.40066081285476685
RMSE: 0.8979882485400303
R2: 0.7263479232788086


XGBoost is a boosting algorithm (not bagging like RF).

It builds trees sequentially where each next tree focuses on correcting errors of previous trees.

Benefits:

1. High accuracy on tabular data
2. Handles complex patterns well
3. Strong performance in real-world ML projects
4. Often better than RF when tuned properly

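`learning_rate=0.1`
Controls the step size, i.e. how strongly each new tree's correction is scaled before it is added to the ensemble. A smaller learning rate makes learning slower (more trees are usually needed) but often generalizes better.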

# SAVE PIPELINES

In [73]:
# SAVE PIPELINES (NO SEPARATE SCALER ANYMORE)

# Each file holds a complete pipeline (scaler step + model step).
# The paths use forward slashes: inside a normal (non-raw) string literal,
# "\m" and "\c" are invalid escape sequences, and backslash separators only
# work on Windows. Forward slashes are accepted on Windows and POSIX alike.
joblib.dump(cost_pipeline, "Backend/models/cost_model2.pkl")
joblib.dump(co2_pipeline, "Backend/models/co2_model2.pkl")

print("Pipelines saved successfully")


Pipelines saved successfully


# Suitability Score

In [75]:
# Creating Suitability Score as it is not present in dataset

# Min-max normalization (0 to 1)
def minmax(series):
    """Rescale a numeric Series linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        The rescaled values, on the same index as the input. When every
        non-missing value is identical (zero range) the result is 0.0 for
        those rows instead of NaN from a division by zero. Missing values
        stay missing.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # Constant column: (x - min) is already 0 everywhere, so skip the
        # 0 / 0 division that would turn the whole column into NaN.
        return (series - lowest).astype(float)
    return (series - lowest) / value_range

# Rescale the three properties that feed the suitability score.
for norm_col, raw_col in (
    ("strength_norm", "strength"),
    ("recy_norm", "recyclability_percentage"),
    ("biodeg_norm", "biodegradability_score"),
):
    df[norm_col] = minmax(df[raw_col])

# Weighted blend: 40% strength, 30% recyclability, 30% biodegradability.
strength_part = 0.40 * df["strength_norm"]
recy_part = 0.30 * df["recy_norm"]
biodeg_part = 0.30 * df["biodeg_norm"]
df["suitability_score"] = strength_part + recy_part + biodeg_part



In [76]:

# Lower cost and lower CO2 are better, so each normalized target is flipped
# into a "goodness" score where a higher value means a better material.

# Step 1: rescale both targets.
for raw_col, norm_col in (("cost", "cost_norm"), ("co2_score", "co2_norm")):
    df[norm_col] = minmax(df[raw_col])

# Step 2: invert them, so the lowest cost / lowest CO2 gets the highest score.
for norm_col, score_col in (
    ("cost_norm", "cost_score"),
    ("co2_norm", "co2_score_final"),
):
    df[score_col] = 1 - df[norm_col]


In [77]:
# Final Score for Ranking
# Weighted blend: 40% cost score, 40% CO2 score, 20% suitability score.

cost_part = 0.40 * df["cost_score"]
co2_part = 0.40 * df["co2_score_final"]
suitability_part = 0.20 * df["suitability_score"]

df["final_score"] = cost_part + co2_part + suitability_part

# Ranking materials

In [78]:

# Sort descending so the material with the highest final score comes first.
ranked_materials = df.sort_values(by="final_score", ascending=False)

# Columns shown in the printed summary.
summary_columns = [
    "material_name",
    "cost",
    "co2_score",
    "suitability_score",
    "final_score",
]

print("===== TOP 10 MATERIALS =====")
print(ranked_materials[summary_columns].head(10))

===== TOP 10 MATERIALS =====
                material_name  cost  co2_score  suitability_score  final_score
3      Tissue Paper Packaging     1          2           0.539423     0.907885
42          Eco Cushion Paper     2          2           0.553846     0.866325
6        Areca Leaf Packaging     3          2           0.740000     0.859111
60      Ultra Thin Paper Wrap     1          3           0.478846     0.829103
17                Molded Pulp     5          2           1.000000     0.822222
48       Palm Fiber Composite     4          2           0.767692     0.820205
58   Compostable Cushion Wrap     3          2           0.530769     0.817265
27        Palm Leaf Packaging     4          2           0.730769     0.812821
55  Cellulose Reinforced Film     3          2           0.507692     0.812650
24                Kraft Paper     3          3           0.739423     0.792329


# Saving full ranking to CSV

In [79]:
# Write every ranked row to disk; the DataFrame index is not written.
ranked_materials.to_csv(
    path_or_buf="Material_Ranking_Output.csv",
    index=False,
)
print("\n Full ranking saved as: Material_Ranking_Output.csv")


 Full ranking saved as: Material_Ranking_Output.csv
