<a href="https://colab.research.google.com/github/vikasrkarjigi/shell-fuel-blend/blob/main/ShellAI_FuelBlend_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Basline Code**

In [11]:
# Basic Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.multioutput import MultiOutputRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt

# Load data directly from GitHub
base_url = "https://raw.githubusercontent.com/vikasrkarjigi/shell-fuel-blend/refs/heads/main/"

train = pd.read_csv(base_url + "train.csv")
test = pd.read_csv(base_url + "test.csv")
sample_submission = pd.read_csv(base_url + "sample_solution.csv")

print("Train shape:", train.shape)
print("Test shape:", test.shape)
train.head()

Train shape: (2000, 65)
Test shape: (500, 56)


Unnamed: 0,Component1_fraction,Component2_fraction,Component3_fraction,Component4_fraction,Component5_fraction,Component1_Property1,Component2_Property1,Component3_Property1,Component4_Property1,Component5_Property1,...,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,0.21,0.0,0.42,0.25,0.12,-0.021782,1.981251,0.020036,0.140315,1.032029,...,0.489143,0.607589,0.32167,-1.236055,1.601132,1.384662,0.30585,0.19346,0.580374,-0.762738
1,0.02,0.33,0.19,0.46,0.0,-0.224339,1.148036,-1.10784,0.149533,-0.354,...,-1.257481,-1.475283,-0.437385,-1.402911,0.147941,-1.143244,-0.439171,-1.379041,-1.280989,-0.503625
2,0.08,0.08,0.18,0.5,0.16,0.457763,0.242591,-0.922492,0.908213,0.972003,...,1.784349,0.450467,0.622687,1.375614,-0.42879,1.161616,0.601289,0.87295,0.66,2.024576
3,0.25,0.42,0.0,0.07,0.26,-0.577734,-0.930826,0.815284,0.447514,0.455717,...,-0.066422,0.48373,-1.865442,-0.046295,-0.16382,-0.209693,-1.840566,0.300293,-0.351336,-1.551914
4,0.26,0.16,0.08,0.5,0.0,0.120415,0.666268,-0.626934,2.725357,0.392259,...,-0.118913,-1.172398,0.301785,-1.787407,-0.493361,-0.528049,0.286344,-0.265192,0.430513,0.735073


In [12]:
# Separate input features and target blend properties
X_train = train.iloc[:, :55]  # First 55 columns: 5 compositions + 50 properties
y_train = train.iloc[:, 55:]  # Last 10 columns: BlendProperty1 to BlendProperty10
X_test = test.iloc[:, :55]    # Test input features

# Column check
print("Features shape:", X_train.shape)
print("Targets shape:", y_train.shape)

Features shape: (2000, 55)
Targets shape: (2000, 10)


In [15]:
# Split the data (80% train, 20% validation)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

print(X_tr.shape)
print(y_tr.shape)

print(X_val.shape)
print(y_val.shape)

(1600, 55)
(1600, 10)
(400, 55)
(400, 10)


In [16]:
# Baseline LightGBM model
model = MultiOutputRegressor(LGBMRegressor(random_state=42, n_estimators=300))
model.fit(X_tr, y_tr)


# Predict on validation part
val_preds = model.predict(X_val)

# Preview predictions
pd.DataFrame(val_preds, columns=sample_submission.columns[1:]).head()


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001674 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000878 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start training from score -0.004643
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000890 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12981
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 55
[LightGBM] [Info] Start t

Unnamed: 0,BlendProperty1,BlendProperty2,BlendProperty3,BlendProperty4,BlendProperty5,BlendProperty6,BlendProperty7,BlendProperty8,BlendProperty9,BlendProperty10
0,0.442591,0.774901,1.045879,0.212403,2.792571,-0.585939,0.939622,-0.110494,-0.497752,1.262021
1,-1.229278,-0.468017,0.567296,-1.277004,0.112452,-1.262086,0.425898,-0.494212,-0.831855,-0.177001
2,1.096769,0.702759,0.931839,0.438659,-0.385268,0.273795,0.775263,0.84814,0.302906,0.713217
3,0.511137,1.317039,-0.730157,0.134925,-0.638638,1.081774,-0.66462,0.069466,1.600993,-0.753556
4,-0.796053,-2.032464,-0.432822,-0.778949,-0.113958,-1.813897,-0.426938,-1.174153,-1.504054,0.266076


In [17]:
# Compute MAPE
mape_score = mean_absolute_percentage_error(y_val, val_preds)
print(f"Overall Validation MAPE: {mape_score:.4f}")

Overall Validation MAPE: 1.3563


In [18]:
# View MAPE for each individual blend property
for i, col in enumerate(y_val.columns):
    score = mean_absolute_percentage_error(y_val.iloc[:, i], val_preds[:, i])
    print(f"{col}: {score:.4f}")

BlendProperty1: 2.6594
BlendProperty2: 1.8677
BlendProperty3: 0.8588
BlendProperty4: 0.8632
BlendProperty5: 0.2760
BlendProperty6: 1.2324
BlendProperty7: 0.7532
BlendProperty8: 1.6506
BlendProperty9: 2.8752
BlendProperty10: 0.5270


# **Next** **Step** **-** **Feature** **Engineering**

In [20]:
# Make copies before adding features (this removes the warning)
X_train = X_train.copy()
test = test.copy()

# Now safely add the weighted features
for i in range(1, 6):  # Component1 to Component5
    for j in range(1, 11):  # Property1 to Property10
        prop_col = f"Component{i}_Property{j}"
        frac_col = f"Component{i}_fraction"
        new_col = f"{prop_col}_weighted"

        X_train[new_col] = X_train[prop_col] * X_train[frac_col]
        test[new_col] = test[prop_col] * test[frac_col]


In [None]:
# 80:20 split
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Retrain with more estimators and slight tuning
model = MultiOutputRegressor(LGBMRegressor(random_state=42, n_estimators=1000, learning_rate=0.05))
model.fit(X_tr, y_tr)

# Predict and evaluate
val_preds = model.predict(X_val)
mape_score = mean_absolute_percentage_error(y_val, val_preds)
print(f"Overall Validation MAPE after feature engineering: {mape_score:.4f}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001762 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25727
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 105
[LightGBM] [Info] Start training from score -0.007867
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001852 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25727
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 105
[LightGBM] [Info] Start training from score -0.004643
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001870 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 25727
[LightGBM] [Info] Number of data points in the train set: 1600, number of used features: 105
[LightGBM] [Info] Star