# Amini Soil Prediction Challenge

## Load required packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation and convergence warnings from the libraries imported above.
# Consider narrowing it to specific categories once the noisy sources are known.
warnings.filterwarnings('ignore')

In [3]:
# Read the training table and the long-format nutrient-gap table from the working directory
train_df, train_gap_df = map(pd.read_csv, ('Train.csv', 'Gap_Train.csv'))

In [6]:
# Reshape the long gap table to wide form: one row per PID, one "Gap_<nutrient>" column per nutrient
gap_wide = (
    train_gap_df
    .pivot(index="PID", columns="Nutrient", values="Gap")
    .add_prefix("Gap_")           # label every nutrient column as a gap target
    .rename_axis(columns=None)    # discard the leftover "Nutrient" label on the column axis
    .reset_index()                # turn PID back into an ordinary column
)

# Left join keeps every training row and attaches its gap values through PID
merged_train_df = pd.merge(train_df, gap_wide, how="left", on="PID")

# Show the first rows of the merged table
merged_train_df.head()


Unnamed: 0,site,PID,lon,lat,pH,alb,bio1,bio12,bio15,bio7,...,Gap_Ca,Gap_Cu,Gap_Fe,Gap_K,Gap_Mg,Gap_Mn,Gap_N,Gap_P,Gap_S,Gap_Zn
0,site_id_bIEHwl,ID_I5RGjv,70.603761,46.173798,7.75,176,248,920,108,190,...,-19931.6,-8.5016,-218.784,-377.24,-6737.2,-247.8,-3696.0,39.0072,-4.5272,-1.9944
1,site_id_nGvnKc,ID_8jWzJ5,70.590479,46.078924,7.1,181,250,1080,113,191,...,-3575.2,-12.9328,-291.648,-407.04,-706.4,-1242.96,-4156.0,4.432,-46.976,-7.4128
2,site_id_nGvnKc,ID_UgzkN8,70.582553,46.04882,6.95,188,250,1109,111,191,...,-5506.8,-3.4208,-223.164,-388.92,-996.48,-189.4,-10120.0,-23.656,-20.12,-5.294
3,site_id_nGvnKc,ID_DLLHM9,70.573267,46.02191,7.83,174,250,1149,112,191,...,-19701.6,-8.9168,-241.624,-542.96,-2120.24,-215.68,-6708.0,-78.104,-32.104,-14.104
4,site_id_7SA9rO,ID_d009mj,70.58533,46.204336,8.07,188,250,869,114,191,...,-20980.4,-8.4658,-197.684,-205.4,-3309.6,-425.74,-2588.4,37.14,-12.7676,-1.173


In [12]:
# List the columns of the wide gap table, one name per line
for column_name in list(gap_wide.columns):
    print(column_name)

PID
Gap_B
Gap_Ca
Gap_Cu
Gap_Fe
Gap_K
Gap_Mg
Gap_Mn
Gap_N
Gap_P
Gap_S
Gap_Zn


In [8]:
# Attach the wide gap columns to the training rows, matching on PID.
# This is the same left join that the pivot cell already performs, so re-running it
# simply rebuilds merged_train_df from train_df and gap_wide.
merged_train_df = pd.merge(train_df, gap_wide, how='left', on='PID')

In [9]:
# Preview the first five merged rows
merged_train_df.head(n=5)

Unnamed: 0,site,PID,lon,lat,pH,alb,bio1,bio12,bio15,bio7,...,Gap_Ca,Gap_Cu,Gap_Fe,Gap_K,Gap_Mg,Gap_Mn,Gap_N,Gap_P,Gap_S,Gap_Zn
0,site_id_bIEHwl,ID_I5RGjv,70.603761,46.173798,7.75,176,248,920,108,190,...,-19931.6,-8.5016,-218.784,-377.24,-6737.2,-247.8,-3696.0,39.0072,-4.5272,-1.9944
1,site_id_nGvnKc,ID_8jWzJ5,70.590479,46.078924,7.1,181,250,1080,113,191,...,-3575.2,-12.9328,-291.648,-407.04,-706.4,-1242.96,-4156.0,4.432,-46.976,-7.4128
2,site_id_nGvnKc,ID_UgzkN8,70.582553,46.04882,6.95,188,250,1109,111,191,...,-5506.8,-3.4208,-223.164,-388.92,-996.48,-189.4,-10120.0,-23.656,-20.12,-5.294
3,site_id_nGvnKc,ID_DLLHM9,70.573267,46.02191,7.83,174,250,1149,112,191,...,-19701.6,-8.9168,-241.624,-542.96,-2120.24,-215.68,-6708.0,-78.104,-32.104,-14.104
4,site_id_7SA9rO,ID_d009mj,70.58533,46.204336,8.07,188,250,869,114,191,...,-20980.4,-8.4658,-197.684,-205.4,-3309.6,-425.74,-2588.4,37.14,-12.7676,-1.173


In [11]:
# List every column of the merged training table, one name per line
for column_name in list(merged_train_df.columns):
    print(column_name)

site
PID
lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity
N
P
K
Ca
Mg
S
Fe
Mn
Zn
Cu
B
Gap_B
Gap_Ca
Gap_Cu
Gap_Fe
Gap_K
Gap_Mg
Gap_Mn
Gap_N
Gap_P
Gap_S
Gap_Zn


In [13]:
# Drop non-feature columns: identifiers, measured nutrient values, and Gap_* targets.
# The Gap_* columns exist only on merged_train_df (train_df never carries them), so
# both the prefix filter and the drop operate on the merged frame. The resulting
# feature columns are unchanged, and features_df has the same rows as merged_train_df.
id_cols = ['site', 'PID']
nutrient_cols = ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'Fe', 'Mn', 'Zn', 'Cu', 'B']
gap_cols = [col for col in merged_train_df.columns if col.startswith('Gap_')]
drop_cols = id_cols + gap_cols + nutrient_cols
features_df = merged_train_df.drop(columns=drop_cols)


In [16]:
# List the remaining feature columns, one name per line
for column_name in list(features_df.columns):
    print(column_name)

lon
lat
pH
alb
bio1
bio12
bio15
bio7
bp
cec20
dows
ecec20
hp20
ls
lstd
lstn
mb1
mb2
mb3
mb7
mdem
para
parv
ph20
slope
snd20
soc20
tim
wp
xhp20
BulkDensity


In [17]:
# Count missing entries in each feature column
null_mask = features_df.isna()
missing_counts = null_mask.sum()
print(missing_counts)

lon            0
lat            0
pH             0
alb            0
bio1           0
bio12          0
bio15          0
bio7           0
bp             0
cec20          0
dows           0
ecec20         5
hp20           5
ls             0
lstd           0
lstn           0
mb1            0
mb2            0
mb3            0
mb7            0
mdem           0
para           0
parv           0
ph20           0
slope          0
snd20          0
soc20          0
tim            0
wp             0
xhp20          5
BulkDensity    4
dtype: int64


In [18]:
# Report, one line per feature, how many entries are missing
for feature_name, null_total in features_df.isnull().sum().items():
    print(f"{feature_name}: {null_total}")

lon: 0
lat: 0
pH: 0
alb: 0
bio1: 0
bio12: 0
bio15: 0
bio7: 0
bp: 0
cec20: 0
dows: 0
ecec20: 5
hp20: 5
ls: 0
lstd: 0
lstn: 0
mb1: 0
mb2: 0
mb3: 0
mb7: 0
mdem: 0
para: 0
parv: 0
ph20: 0
slope: 0
snd20: 0
soc20: 0
tim: 0
wp: 0
xhp20: 5
BulkDensity: 4


In [19]:
# Keep only the rows that have a value in every column known to contain gaps
columns_with_gaps = ['ecec20', 'hp20', 'xhp20', 'BulkDensity']
features_dropped_rows = features_df.dropna(subset=columns_with_gaps)

# Report how many rows the filter removed
rows_before = len(features_df)
rows_after = len(features_dropped_rows)
print("Rows before:", rows_before, "Rows after dropping missing:", rows_after)

Rows before: 7744 Rows after dropping missing: 7735


In [21]:
# Re-count missing entries per column after the incomplete rows were removed
remaining_null_mask = features_dropped_rows.isna()
sum_missing = remaining_null_mask.sum()
print(sum_missing)

lon            0
lat            0
pH             0
alb            0
bio1           0
bio12          0
bio15          0
bio7           0
bp             0
cec20          0
dows           0
ecec20         0
hp20           0
ls             0
lstd           0
lstn           0
mb1            0
mb2            0
mb3            0
mb7            0
mdem           0
para           0
parv           0
ph20           0
slope          0
snd20          0
soc20          0
tim            0
wp             0
xhp20          0
BulkDensity    0
dtype: int64


In [22]:
# Display the final feature column names as a plain Python list
features_dropped_rows.columns.tolist()

['lon',
 'lat',
 'pH',
 'alb',
 'bio1',
 'bio12',
 'bio15',
 'bio7',
 'bp',
 'cec20',
 'dows',
 'ecec20',
 'hp20',
 'ls',
 'lstd',
 'lstn',
 'mb1',
 'mb2',
 'mb3',
 'mb7',
 'mdem',
 'para',
 'parv',
 'ph20',
 'slope',
 'snd20',
 'soc20',
 'tim',
 'wp',
 'xhp20',
 'BulkDensity']