# Amini Soil Prediction Challenge

#### Load required packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load datasets
train_df = pd.read_csv('Train.csv')
train_gap_df = pd.read_csv('Gap_Train.csv')
test_df = pd.read_csv('Test.csv')
test_gap_df = pd.read_csv('Gap_Test.csv')

### Check the shpe of test and train data

In [None]:
train_df.shape, train_gap_df.shape

((7744, 44), (85184, 5))

In [None]:
# Pivot train_gap_df so each PID has one row and each nutrient has its own column
gap_wide = train_gap_df.pivot(index="PID", columns="Nutrient", values="Gap")

# Rename columns to make them clear as gap targets
gap_wide.columns = [f"Gap_{col}" for col in gap_wide.columns]

# Reset index so PID is a column again (not the index)
gap_wide = gap_wide.reset_index()

# Merge gap values into train_df using PID
merged_train_df = train_df.merge(gap_wide, on="PID", how="left")

# Preview the result
merged_train_df.head()


Unnamed: 0,site,PID,lon,lat,pH,alb,bio1,bio12,bio15,bio7,...,Gap_Ca,Gap_Cu,Gap_Fe,Gap_K,Gap_Mg,Gap_Mn,Gap_N,Gap_P,Gap_S,Gap_Zn
0,site_id_bIEHwl,ID_I5RGjv,70.603761,46.173798,7.75,176,248,920,108,190,...,-19931.6,-8.5016,-218.784,-377.24,-6737.2,-247.8,-3696.0,39.0072,-4.5272,-1.9944
1,site_id_nGvnKc,ID_8jWzJ5,70.590479,46.078924,7.1,181,250,1080,113,191,...,-3575.2,-12.9328,-291.648,-407.04,-706.4,-1242.96,-4156.0,4.432,-46.976,-7.4128
2,site_id_nGvnKc,ID_UgzkN8,70.582553,46.04882,6.95,188,250,1109,111,191,...,-5506.8,-3.4208,-223.164,-388.92,-996.48,-189.4,-10120.0,-23.656,-20.12,-5.294
3,site_id_nGvnKc,ID_DLLHM9,70.573267,46.02191,7.83,174,250,1149,112,191,...,-19701.6,-8.9168,-241.624,-542.96,-2120.24,-215.68,-6708.0,-78.104,-32.104,-14.104
4,site_id_7SA9rO,ID_d009mj,70.58533,46.204336,8.07,188,250,869,114,191,...,-20980.4,-8.4658,-197.684,-205.4,-3309.6,-425.74,-2588.4,37.14,-12.7676,-1.173


In [None]:
for col in gap_wide.columns:
  print(col)

PID
Gap_B
Gap_Ca
Gap_Cu
Gap_Fe
Gap_K
Gap_Mg
Gap_Mn
Gap_N
Gap_P
Gap_S
Gap_Zn


In [None]:
list(gap_wide.columns)

['PID',
 'Gap_B',
 'Gap_Ca',
 'Gap_Cu',
 'Gap_Fe',
 'Gap_K',
 'Gap_Mg',
 'Gap_Mn',
 'Gap_N',
 'Gap_P',
 'Gap_S',
 'Gap_Zn']

In [None]:
merged_train_df.shape

(7744, 55)

In [None]:
# Merge the gap data into the train dataset on PID
merged_train_df = train_df.merge(gap_wide, on='PID', how='left')

In [None]:
merged_train_df.head()

Unnamed: 0,site,PID,lon,lat,pH,alb,bio1,bio12,bio15,bio7,...,Gap_Ca,Gap_Cu,Gap_Fe,Gap_K,Gap_Mg,Gap_Mn,Gap_N,Gap_P,Gap_S,Gap_Zn
0,site_id_bIEHwl,ID_I5RGjv,70.603761,46.173798,7.75,176,248,920,108,190,...,-19931.6,-8.5016,-218.784,-377.24,-6737.2,-247.8,-3696.0,39.0072,-4.5272,-1.9944
1,site_id_nGvnKc,ID_8jWzJ5,70.590479,46.078924,7.1,181,250,1080,113,191,...,-3575.2,-12.9328,-291.648,-407.04,-706.4,-1242.96,-4156.0,4.432,-46.976,-7.4128
2,site_id_nGvnKc,ID_UgzkN8,70.582553,46.04882,6.95,188,250,1109,111,191,...,-5506.8,-3.4208,-223.164,-388.92,-996.48,-189.4,-10120.0,-23.656,-20.12,-5.294
3,site_id_nGvnKc,ID_DLLHM9,70.573267,46.02191,7.83,174,250,1149,112,191,...,-19701.6,-8.9168,-241.624,-542.96,-2120.24,-215.68,-6708.0,-78.104,-32.104,-14.104
4,site_id_7SA9rO,ID_d009mj,70.58533,46.204336,8.07,188,250,869,114,191,...,-20980.4,-8.4658,-197.684,-205.4,-3309.6,-425.74,-2588.4,37.14,-12.7676,-1.173


In [None]:
merged_train_df.shape

(7744, 55)

In [None]:
for col in merged_train_df.columns:
    print(col)

site
PID
lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity
N
P
K
Ca
Mg
S
Fe
Mn
Zn
Cu
B
Gap_B
Gap_Ca
Gap_Cu
Gap_Fe
Gap_K
Gap_Mg
Gap_Mn
Gap_N
Gap_P
Gap_S
Gap_Zn


In [None]:
merged_train_df.shape

(7744, 55)

#### Feature Selection

In [None]:
# Drop non-feature columns
drop_cols = ['site', 'PID'] + [col for col in train_df.columns if col.startswith('Gap_')] + ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'Fe', 'Mn', 'Zn', 'Cu', 'B']
new_train_data = train_df.drop(columns=drop_cols)


In [None]:
# Drop non-feature columns
drop_cols = ['site'] + [col for col in train_df.columns if col.startswith('Gap_')] + ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'Fe', 'Mn', 'Zn', 'Cu', 'B']
new_train_data = train_df.drop(columns=drop_cols)

In [None]:
for col in new_train_data.columns:
  print(col)

PID
lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity


In [None]:
new_train_data.shape

(7744, 32)

In [None]:
55-24

31

In [None]:
missing_counts = new_train_data.isnull().sum()
print(missing_counts)

PID            0
lon            0
lat            0
pH             0
alb            0
bio1           0
bio12          0
bio15          0
bio7           0
bp             0
cec20          0
dows           0
ecec20         5
hp20           5
ls             0
lstd           0
lstn           0
mb1            0
mb2            0
mb3            0
mb7            0
mdem           0
para           0
parv           0
ph20           0
slope          0
snd20          0
soc20          0
tim            0
wp             0
xhp20          5
BulkDensity    4
dtype: int64


In [None]:
for col in new_train_data.columns:
    n_missing = new_train_data[col].isnull().sum()
    print(f"{col}: {n_missing}")

PID: 0
lon: 0
lat: 0
pH: 0
alb: 0
bio1: 0
bio12: 0
bio15: 0
bio7: 0
bp: 0
cec20: 0
dows: 0
ecec20: 5
hp20: 5
ls: 0
lstd: 0
lstn: 0
mb1: 0
mb2: 0
mb3: 0
mb7: 0
mdem: 0
para: 0
parv: 0
ph20: 0
slope: 0
snd20: 0
soc20: 0
tim: 0
wp: 0
xhp20: 5
BulkDensity: 4


In [None]:
from sklearn.impute import SimpleImputer

# Check missing counts
print("Missing before imputation:")
print(new_train_data.isnull().sum().sort_values(ascending=False).head(10))

# Impute ecec20, hp20, xhp20, BulkDensity with median
imputer = SimpleImputer(strategy='median')
cols_to_impute = ['ecec20', 'hp20', 'xhp20', 'BulkDensity']
new_train_data[cols_to_impute] = imputer.fit_transform(new_train_data[cols_to_impute])

# Verify no more missing
print("\nMissing after imputation:")
print(new_train_data[cols_to_impute].isnull().sum())

Missing before imputation:
ecec20         5
hp20           5
xhp20          5
BulkDensity    4
alb            0
bio1           0
PID            0
lon            0
bio15          0
bio12          0
dtype: int64

Missing after imputation:
ecec20         0
hp20           0
xhp20          0
BulkDensity    0
dtype: int64


In [None]:
sum_missing = new_train_data.isnull().sum()
print(sum_missing)

PID            0
lon            0
lat            0
pH             0
alb            0
bio1           0
bio12          0
bio15          0
bio7           0
bp             0
cec20          0
dows           0
ecec20         0
hp20           0
ls             0
lstd           0
lstn           0
mb1            0
mb2            0
mb3            0
mb7            0
mdem           0
para           0
parv           0
ph20           0
slope          0
snd20          0
soc20          0
tim            0
wp             0
xhp20          0
BulkDensity    0
dtype: int64


In [None]:
list(new_train_data.columns)

['PID',
 'lon',
 'lat',
 'pH',
 'alb',
 'bio1',
 'bio12',
 'bio15',
 'bio7',
 'bp',
 'cec20',
 'dows',
 'ecec20',
 'hp20',
 'ls',
 'lstd',
 'lstn',
 'mb1',
 'mb2',
 'mb3',
 'mb7',
 'mdem',
 'para',
 'parv',
 'ph20',
 'slope',
 'snd20',
 'soc20',
 'tim',
 'wp',
 'xhp20',
 'BulkDensity']

In [None]:
gap_cols = [col for col in merged_train_df.columns if col.startswith("Gap_")]
nutrient_cols = ["N", "P", "K", "Ca", "Mg", "S", "Fe", "Mn", "Zn", "Cu", "B"]
drop_for_X = ["site", "PID"] + gap_cols + nutrient_cols

y = merged_train_df[[
  'Gap_B',
 'Gap_Ca',
 'Gap_Cu',
 'Gap_Fe',
 'Gap_K',
 'Gap_Mg',
 'Gap_Mn',
 'Gap_N',
 'Gap_P',
 'Gap_S',
 'Gap_Zn'
 ]]
X = merged_train_df.drop(columns=drop_for_X)


In [None]:
for col in merged_train_df.columns:
  print(col)

site
PID
lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity
N
P
K
Ca
Mg
S
Fe
Mn
Zn
Cu
B
Gap_B
Gap_Ca
Gap_Cu
Gap_Fe
Gap_K
Gap_Mg
Gap_Mn
Gap_N
Gap_P
Gap_S
Gap_Zn


In [None]:
print("X columns (features):", X.columns.tolist())
print("X shape:", X.shape)
print("y columns (targets):", y.columns.tolist())
print("y shape:", y.shape)

X columns (features): ['lon', 'lat', 'pH', 'alb', 'bio1', 'bio12', 'bio15', 'bio7', 'bp', 'cec20', 'dows', 'ecec20', 'hp20', 'ls', 'lstd', 'lstn', 'mb1', 'mb2', 'mb3', 'mb7', 'mdem', 'para', 'parv', 'ph20', 'slope', 'snd20', 'soc20', 'tim', 'wp', 'xhp20', 'BulkDensity']
X shape: (7744, 31)
y columns (targets): ['Gap_B', 'Gap_Ca', 'Gap_Cu', 'Gap_Fe', 'Gap_K', 'Gap_Mg', 'Gap_Mn', 'Gap_N', 'Gap_P', 'Gap_S', 'Gap_Zn']
y shape: (7744, 11)


In [None]:
for col in X.columns:
  print(col)

lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity


## Split the data into Traning, testing and validation sets

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.20, random_state=42
)

print("Training X:", X_train.shape, "Validation X:", X_val.shape)
print("Training y:", y_train.shape, "Validation y:", y_val.shape)

Training X: (6195, 31) Validation X: (1549, 31)
Training y: (6195, 11) Validation y: (1549, 11)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

# Instantiate a base regressor (e.g. RandomForest)
base_rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

# Wrap it in MultiOutputRegressor
multi_rf = MultiOutputRegressor(base_rf)

# Fit on the training split
multi_rf.fit(X_train, y_train)

# Predict on validation split
y_pred = multi_rf.predict(X_val)


In [None]:
y_pred.shape

(1549, 11)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# After you’ve trained and predicted (y_pred is shape [n_val, 11])

for idx, nutrient in enumerate(y_train.columns):
    mse = mean_squared_error(
        y_val.iloc[:, idx],
        y_pred[:, idx]
    )
    rmse = np.sqrt(mse)
    print(f"{nutrient}:   RMSE = {rmse:.3f}")

# Overall RMSE across all nutrient gaps:
overall_mse = mean_squared_error(
    y_val.values.flatten(),
    y_pred.flatten()
)
overall_rmse = np.sqrt(overall_mse)
print(f"\nOverall RMSE (all gaps combined): {overall_rmse:.3f}")


Gap_B:   RMSE = 0.574
Gap_Ca:   RMSE = 3844.033
Gap_Cu:   RMSE = 12.753
Gap_Fe:   RMSE = 104.669
Gap_K:   RMSE = 505.897
Gap_Mg:   RMSE = 852.630
Gap_Mn:   RMSE = 135.136
Gap_N:   RMSE = 1222.015
Gap_P:   RMSE = 115.829
Gap_S:   RMSE = 41.526
Gap_Zn:   RMSE = 6.204

Overall RMSE (all gaps combined): 1253.989


#### Retrain on Full Data

In [None]:
# Pivot train_gap_df so each PID has one row and each nutrient has its own column
gap_wide_test = test_gap_df.pivot(index="PID", columns="Nutrient", values="Required")

# Rename columns to make them clear as gap targets
gap_wide_test.columns = [f"Req_{col}" for col in gap_wide_test.columns]

# Reset index so PID is a column again (not the index)
gap_wide_test = gap_wide_test.reset_index()

# Merge gap values into train_df using PID
merged_test_df = test_df.merge(gap_wide_test, on="PID", how="left")

# Preview the result
merged_test_df.head()

Unnamed: 0,site,PID,lon,lat,pH,alb,bio1,bio12,bio15,bio7,...,Req_Ca,Req_Cu,Req_Fe,Req_K,Req_Mg,Req_Mn,Req_N,Req_P,Req_S,Req_Zn
0,site_id_hgJpkz,ID_NGS9Bx,69.170794,44.522885,6.86,144,256,910,108,186,...,12.0,0.2,0.8,52.0,8.0,0.4,100.0,40.0,12.0,0.4
1,site_id_olmuI5,ID_YdVKXw,68.885265,44.741057,7.08,129,260,851,110,187,...,12.0,0.2,0.8,52.0,8.0,0.4,100.0,40.0,12.0,0.4
2,site_id_PTZdJz,ID_MZAlfE,68.97021,44.675777,6.5,142,259,901,109,187,...,12.0,0.2,0.8,52.0,8.0,0.4,100.0,40.0,12.0,0.4
3,site_id_DOTgr8,ID_GwCCMN,69.068751,44.647707,6.82,142,261,847,109,187,...,12.0,0.2,0.8,52.0,8.0,0.4,100.0,40.0,12.0,0.4
4,site_id_1rQNvy,ID_K8sowf,68.990002,44.577607,6.52,145,253,1109,110,186,...,12.0,0.2,0.8,52.0,8.0,0.4,100.0,40.0,12.0,0.4


In [None]:
for col in merged_test_df.columns:
  print(col)

site
PID
lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity
Req_B
Req_Ca
Req_Cu
Req_Fe
Req_K
Req_Mg
Req_Mn
Req_N
Req_P
Req_S
Req_Zn


In [None]:
# Drop non-feature columns
# List out the non‐feature columns in one flat list
drop_cols = [
    'site', 'PID',
    'Req_B', 'Req_Ca', 'Req_Cu', 'Req_Fe', 'Req_K',
    'Req_Mg', 'Req_Mn', 'Req_N', 'Req_P', 'Req_S', 'Req_Zn'
]

# Drop those columns from merged_test_df
new_test_data = merged_test_df.drop(columns=drop_cols)

# Verify
print("Remaining columns:", new_test_data.columns.tolist())
print("Shape after dropping:", new_test_data.shape)


Remaining columns: ['lon', 'lat', 'pH', 'alb', 'bio1', 'bio12', 'bio15', 'bio7', 'bp', 'cec20', 'dows', 'ecec20', 'hp20', 'ls', 'lstd', 'lstn', 'mb1', 'mb2', 'mb3', 'mb7', 'mdem', 'para', 'parv', 'ph20', 'slope', 'snd20', 'soc20', 'tim', 'wp', 'xhp20', 'BulkDensity']
Shape after dropping: (2418, 31)


In [None]:
new_test_data.shape

(2418, 31)

In [None]:
for col in new_test_data.columns:
  print(col)

lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity


In [None]:
list(new_test_data.columns)

['lon',
 'lat',
 'pH',
 'alb',
 'bio1',
 'bio12',
 'bio15',
 'bio7',
 'bp',
 'cec20',
 'dows',
 'ecec20',
 'hp20',
 'ls',
 'lstd',
 'lstn',
 'mb1',
 'mb2',
 'mb3',
 'mb7',
 'mdem',
 'para',
 'parv',
 'ph20',
 'slope',
 'snd20',
 'soc20',
 'tim',
 'wp',
 'xhp20',
 'BulkDensity']

In [None]:
X = new_test_data
y = merged_test_df[['Req_B',
  'Req_Ca',
  'Req_Cu',
  'Req_Fe',
  'Req_K',
  'Req_Mg',
  'Req_Mn',
  'Req_N',
  'Req_P',
  'Req_S',
  'Req_Zn']]

In [None]:
X.shape, y.shape

((2418, 31), (2418, 11))

In [None]:
preds_full = multi_rf.predict(X)
preds_full.shape

(2418, 11)

In [None]:
# Convert preds_full into a DataFrame

col_names = y.columns.tolist()

preds_df = pd.DataFrame(preds_full, columns=col_names)

print(preds_df.columns)
print(preds_df.head())

Index(['Req_B', 'Req_Ca', 'Req_Cu', 'Req_Fe', 'Req_K', 'Req_Mg', 'Req_Mn',
       'Req_N', 'Req_P', 'Req_S', 'Req_Zn'],
      dtype='object')
      Req_B      Req_Ca     Req_Cu     Req_Fe     Req_K     Req_Mg    Req_Mn  \
0 -0.461552 -14918.8634 -12.396686 -324.33250 -353.5204 -4095.6538 -375.4548   
1 -0.447146 -15870.1526 -11.921492 -280.61944 -340.4522 -5464.0810 -382.0058   
2 -0.495820 -13397.8820 -11.943414 -341.20460 -357.8598 -4387.0202 -389.7280   
3 -0.506064 -14010.3576 -12.158618 -353.23946 -353.1620 -4506.4320 -385.2876   
4 -0.423098 -11576.7152  -9.824004 -309.40296 -414.6404 -3579.7242 -357.8694   

      Req_N      Req_P      Req_S    Req_Zn  
0 -4667.632  -4.309384 -11.238414 -4.023982  
1 -3043.810  19.218436 -11.493892 -2.726818  
2 -4860.562  32.215716  -9.871214 -3.729908  
3 -5006.726  33.992968 -10.115650 -4.571270  
4 -4465.954  25.966968  -8.457940 -4.193122  


In [None]:
# Build a long‐format DataFrame of predictions
# all_preds = []
# for idx, nutrient in enumerate(y.columns):
#     df_sub = pd.DataFrame({
#         'PID': merged_test_df['PID'],
#         'Nutrient': nutrient.replace('Req_', ''),
#         'Gap': preds_full[:, idx]
#     })
#     all_preds.append(df_sub)

# SampleSubmission = pd.concat(all_preds, axis=0)
# SampleSubmission['PID'] = SampleSubmission['PID'].astype(str) + '_' + SampleSubmission['Nutrient']
# SampleSubmission = SampleSubmission[['PID', 'Gap']]
# SampleSubmission.to_csv('SampleSubmission.csv', index=False)
# print("✅ SampleSubmission.csv created (shape:", SampleSubmission.shape, ")")


#### Remote sensing data

In [None]:
MODIS_MOD16A2 = pd.read_csv('MODIS_MOD16A2_data.csv')
MODIS_MOD13Q1 = pd.read_csv("MODIS_MOD13Q1_data.csv")

In [None]:
MODIS_MOD16A2.shape, MODIS_MOD13Q1.shape

((935363, 6), (545563, 13))

In [None]:
for col in MODIS_MOD13Q1.columns:
  print(col)

EVI
NDVI
RelativeAzimuth
SolarZenith
ViewZenith
date
lat
lon
sur_refl_b01
sur_refl_b02
sur_refl_b03
sur_refl_b07
PID


In [None]:
for col in MODIS_MOD16A2.columns:
  print(col)

ET
PET
date
lat
lon
PID


In [None]:
# Create a copy of the DataFrames
mod09_df = MODIS_MOD16A2.copy()
mod13_df = MODIS_MOD13Q1.copy()

In [None]:
# Average over time coz we have multiple dates per PID
mod09_avg = mod09_df.groupby("PID")[["ET", "PET"]].mean().reset_index()
mod13_avg = mod13_df.groupby("PID")[[
    "EVI", "NDVI", "RelativeAzimuth", "SolarZenith", "ViewZenith",
    "sur_refl_b01", "sur_refl_b02", "sur_refl_b03", "sur_refl_b07"
]].mean().reset_index()

In [None]:
# Merge MODIS datasets on PID
modis_combined = pd.merge(mod09_avg, mod13_avg, on="PID", how="outer")

In [None]:
for col in modis_combined.columns:
  print(col)

PID
ET
PET
EVI
NDVI
RelativeAzimuth
SolarZenith
ViewZenith
sur_refl_b01
sur_refl_b02
sur_refl_b03
sur_refl_b07


In [None]:
final_data = pd.merge(merged_test_df, modis_combined, on="PID", how="left")


In [None]:
for col in final_data.columns:
  print(col)

site
PID
lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity
Req_B
Req_Ca
Req_Cu
Req_Fe
Req_K
Req_Mg
Req_Mn
Req_N
Req_P
Req_S
Req_Zn
ET
PET
EVI
NDVI
RelativeAzimuth
SolarZenith
ViewZenith
sur_refl_b01
sur_refl_b02
sur_refl_b03
sur_refl_b07


In [None]:
final_data = final_data.drop(columns=['PID', 'site'])

In [None]:
for col in final_data.columns:
  print(col)

lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity
Req_B
Req_Ca
Req_Cu
Req_Fe
Req_K
Req_Mg
Req_Mn
Req_N
Req_P
Req_S
Req_Zn
ET
PET
EVI
NDVI
RelativeAzimuth
SolarZenith
ViewZenith
sur_refl_b01
sur_refl_b02
sur_refl_b03
sur_refl_b07


In [None]:
y = final_data[['Req_B',
  'Req_Ca',
  'Req_Cu',
  'Req_Fe',
  'Req_K',
  'Req_Mg',
  'Req_Mn',
  'Req_N',
  'Req_P',
  'Req_S',
  'Req_Zn']]
X = final_data

In [None]:
for col in merged_test_df.columns:
  print(col)

site
PID
lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity
Req_B
Req_Ca
Req_Cu
Req_Fe
Req_K
Req_Mg
Req_Mn
Req_N
Req_P
Req_S
Req_Zn


In [None]:
X.shape, y.shape

((2418, 53), (2418, 11))

In [None]:
for col in merged_train_df.columns:
  print(col)

site
PID
lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity
N
P
K
Ca
Mg
S
Fe
Mn
Zn
Cu
B
Gap_B
Gap_Ca
Gap_Cu
Gap_Fe
Gap_K
Gap_Mg
Gap_Mn
Gap_N
Gap_P
Gap_S
Gap_Zn


In [None]:
X.shape, y.shape

((2418, 53), (2418, 11))

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

# Instantiate a base regressor (e.g. RandomForest)
base_rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

# Wrap it in MultiOutputRegressor
multi_rf = MultiOutputRegressor(base_rf)

# Fit on the training split
multi_rf.fit(X, y)

# Predict on validation split
y_pred = multi_rf.predict(X)

In [None]:
for col in X.columns:
  print(col)

lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity
Req_B
Req_Ca
Req_Cu
Req_Fe
Req_K
Req_Mg
Req_Mn
Req_N
Req_P
Req_S
Req_Zn
ET
PET
EVI
NDVI
RelativeAzimuth
SolarZenith
ViewZenith
sur_refl_b01
sur_refl_b02
sur_refl_b03
sur_refl_b07


In [None]:
y_pred.shape

(2418, 11)

In [None]:
from sklearn.metrics import mean_squared_error
import numpy as np

# After you’ve trained and predicted (y is shape [n_val, 11])

for idx, nutrient in enumerate(y.columns):
    mse = mean_squared_error(
        y.iloc[:, idx],
        y_pred[:, idx]
    )
    rmse = np.sqrt(mse)
    print(f"{nutrient}:   RMSE = {rmse:.3f}")

# Overall RMSE across all nutrient gaps:
overall_mse = mean_squared_error(
    y.values.flatten(),
    y_pred.flatten()
)
overall_rmse = np.sqrt(overall_mse)
print(f"\nOverall RMSE (all gaps combined): {overall_rmse:.3f}")

Req_B:   RMSE = 0.000
Req_Ca:   RMSE = 0.000
Req_Cu:   RMSE = 0.000
Req_Fe:   RMSE = 0.000
Req_K:   RMSE = 0.000
Req_Mg:   RMSE = 0.000
Req_Mn:   RMSE = 0.000
Req_N:   RMSE = 0.000
Req_P:   RMSE = 0.000
Req_S:   RMSE = 0.000
Req_Zn:   RMSE = 0.000

Overall RMSE (all gaps combined): 0.000


In [None]:
# Build a long‐format DataFrame of predictions
all_preds = []
for idx, nutrient in enumerate(y.columns):
     df_sub = pd.DataFrame({
         'PID': merged_test_df['PID'],
         'Nutrient': nutrient.replace('Req_', ''),
         'Gap': preds_full[:, idx]
     })
     all_preds.append(df_sub)

SampleSubmission = pd.concat(all_preds, axis=0)
SampleSubmission['PID'] = SampleSubmission['PID'].astype(str) + '_' + SampleSubmission['Nutrient']
SampleSubmission = SampleSubmission[['PID', 'Gap']]
SampleSubmission.to_csv('SampleSubmission.csv', index=False)
print("✅ SampleSubmission.csv created (shape:", SampleSubmission.shape, ")")

✅ SampleSubmission.csv created (shape: (26598, 2) )
