# Soil Classification - Challenge 1

### Kaggle Leaderboard Rank - 18th

#### Team Leader: Yashodip More, Electrical Engineering, RC Patel Institute of Technology, Shirpur, Maharashtra – yashodipmore2004@gmail.com
#### Team Member: S.M. Sakthivel, AI & Data Science, Achariya College of Engineering Technology, Puducherry – s.m.sakthivelofficial@gmail.com
#### Team Member: Komal Kumavat, Electrical Engineering, RC Patel Institute of Technology, Shirpur, Maharashtra – komalkumavat025@gmail.com

## STEP 1: IMPORTS & SETUP

In [3]:

import os
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from torchvision import datasets, transforms, models
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm


## Imports, Device Setup & Paths

In [1]:
# Imports
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
from PIL import Image

# Reproducibility: seed the NumPy and PyTorch global RNGs before any model
# or DataLoader is created, so the stochastic parts of the run start from a
# fixed state. (GPU kernels may still be non-deterministic.)
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device setup: use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using:", device)

# Paths: every input is resolved relative to BASE_PATH.
BASE_PATH = '/kaggle/input/soil-classification/soil_classification-2025'
TRAIN_DIR = os.path.join(BASE_PATH, 'train')
TEST_DIR = os.path.join(BASE_PATH, 'test')
LABELS_CSV = os.path.join(BASE_PATH, 'train_labels.csv')
TEST_IDS_CSV = os.path.join(BASE_PATH, 'test_ids.csv')

Using: cuda


## Load Labels, Train/Val Split & Transformations

In [2]:
# Read the label table; the 'image' column duplicates 'image_id' because
# that is the column name the dataset class looks up.
df = pd.read_csv(LABELS_CSV)
df['image'] = df['image_id']

# Integer class ids follow the order in which soil types first appear in the CSV.
soil_types = df['soil_type'].unique()
label_mapping = dict(zip(soil_types, range(len(soil_types))))
inv_label_mapping = {class_id: soil_type for soil_type, class_id in label_mapping.items()}
df['label'] = df['soil_type'].map(label_mapping)

# Stratified 85/15 train/validation split with a fixed random state.
train_df, val_df = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
    stratify=df['label'],
)

# Transformations: only the training pipeline augments (flip + rotation);
# validation and test share the same deterministic resize/normalise steps.
image_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize([0.5] * 3, [0.5] * 3),
    ]),
    **{
        split_name: transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize([0.5] * 3, [0.5] * 3),
        ])
        for split_name in ('val', 'test')
    },
}

## Dataset, Dataloaders, Model Definition & Training Setup

In [3]:


#  Dataset
class SoilDataset(Dataset):
    """Map-style dataset that loads images listed in a DataFrame.

    Each row's ``'image'`` value is joined onto ``img_dir`` to locate the
    file. In training/validation mode the row's ``'label'`` value is returned
    alongside the image; in test mode the image id is returned instead.

    Args:
        dataframe: Table with an ``'image'`` column and, unless ``is_test``
            is true, a ``'label'`` column.
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the RGB PIL image.
        is_test: When true, ``__getitem__`` yields ``(image, image_id)``
            rather than ``(image, label)``.
    """

    def __init__(self, dataframe, img_dir, transform=None, is_test=False):
        self.df = dataframe
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the ``idx``-th image (by position) and its label or id."""
        # Positional lookup, done once, so a shuffled/split index is harmless.
        row = self.df.iloc[idx]
        image_id = row['image']
        img_path = os.path.join(self.img_dir, image_id)

        # The context manager closes the underlying file as soon as the RGB
        # copy has been made instead of leaving it to garbage collection.
        with Image.open(img_path) as source_image:
            image = source_image.convert('RGB')

        if self.transform:
            image = self.transform(image)

        if self.is_test:
            return image, image_id
        return image, row['label']

#  Dataloaders
# Only the training loader shuffles; validation keeps a fixed order.
BATCH_SIZE = 32
train_dataset = SoilDataset(train_df, TRAIN_DIR, transform=image_transforms['train'])
val_dataset = SoilDataset(val_df, TRAIN_DIR, transform=image_transforms['val'])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

#  Model
# ImageNet-pretrained ResNet-18. The `weights=` enum replaces the deprecated
# `pretrained=True` flag and selects the same IMAGENET1K_V1 checkpoint.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
# Swap the 1000-way ImageNet head for one output per soil type.
model.fc = nn.Linear(model.fc.in_features, len(label_mapping))
model = model.to(device)

#  Loss and Optimizer
# All parameters (backbone and new head) are fine-tuned with one learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)




Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 84.5MB/s]


## Training & Validation Loop

In [5]:
#  Training loop
EPOCHS = 10
num_classes = len(label_mapping)

for epoch in range(EPOCHS):
    # ---- optimisation pass over the training split ----
    model.train()
    train_loss = 0  # sum (not mean) of the per-batch losses for this epoch
    progress = tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}')
    for images, labels in progress:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # ---- validation pass: collect hard predictions without gradients ----
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for images, labels in val_loader:
            logits = model(images.to(device))
            val_preds.extend(logits.argmax(1).cpu().numpy())
            val_labels.extend(labels.numpy())

    # One-vs-rest F1 per class; the minimum across classes is reported as well.
    y_true = np.array(val_labels)
    y_pred = np.array(val_preds)
    f1_scores = [f1_score(y_true == class_id, y_pred == class_id) for class_id in range(num_classes)]

    print(f"Epoch {epoch+1} - Train Loss: {train_loss:.4f}, Min F1: {min(f1_scores):.4f}, F1s: {f1_scores}")

# Persist the final epoch's validation labels and predictions for later analysis
np.save('val_labels.npy', np.array(val_labels))
np.save('val_preds.npy', np.array(val_preds))


Epoch 1/10: 100%|██████████| 33/33 [00:15<00:00,  2.12it/s]


Epoch 1 - Train Loss: 5.3401, Min F1: 0.9333, F1s: [0.9625, 0.9333333333333333, 1.0, 0.9705882352941176]


Epoch 2/10: 100%|██████████| 33/33 [00:13<00:00,  2.41it/s]


Epoch 2 - Train Loss: 3.3736, Min F1: 0.9091, F1s: [0.954248366013072, 0.9090909090909091, 1.0, 0.9855072463768115]


Epoch 3/10: 100%|██████████| 33/33 [00:13<00:00,  2.47it/s]


Epoch 3 - Train Loss: 2.2536, Min F1: 0.9855, F1s: [0.9937106918238994, 1.0, 1.0, 0.9855072463768115]


Epoch 4/10: 100%|██████████| 33/33 [00:13<00:00,  2.44it/s]


Epoch 4 - Train Loss: 2.1742, Min F1: 0.9836, F1s: [0.9873417721518988, 0.9836065573770492, 1.0, 0.9855072463768115]


Epoch 5/10: 100%|██████████| 33/33 [00:13<00:00,  2.44it/s]


Epoch 5 - Train Loss: 1.5055, Min F1: 0.9565, F1s: [0.975, 0.983050847457627, 1.0, 0.9565217391304348]


Epoch 6/10: 100%|██████████| 33/33 [00:13<00:00,  2.38it/s]


Epoch 6 - Train Loss: 1.9789, Min F1: 0.9836, F1s: [0.9873417721518988, 0.9836065573770492, 1.0, 0.9855072463768115]


Epoch 7/10: 100%|██████████| 33/33 [00:13<00:00,  2.43it/s]


Epoch 7 - Train Loss: 1.3890, Min F1: 0.9855, F1s: [0.9937106918238994, 1.0, 1.0, 0.9855072463768115]


Epoch 8/10: 100%|██████████| 33/33 [00:13<00:00,  2.42it/s]


Epoch 8 - Train Loss: 1.1169, Min F1: 0.9831, F1s: [0.9875, 0.983050847457627, 1.0, 0.9855072463768115]


Epoch 9/10: 100%|██████████| 33/33 [00:13<00:00,  2.48it/s]


Epoch 9 - Train Loss: 2.2064, Min F1: 0.9836, F1s: [0.9873417721518988, 0.9836065573770492, 1.0, 0.9855072463768115]


Epoch 10/10: 100%|██████████| 33/33 [00:13<00:00,  2.47it/s]


Epoch 10 - Train Loss: 1.1174, Min F1: 0.9706, F1s: [0.9875, 1.0, 1.0, 0.9705882352941176]


In [7]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import json

# Load saved validation data
val_labels = np.load('val_labels.npy')
val_preds = np.load('val_preds.npy')

# If not already in memory, define your label mapping here (must match your training code!)
# label_mapping = {'Alluvial soil': 0, 'Black soil': 1, ...}
# inv_label_mapping = {v: k for k, v in label_mapping.items()}
class_ids = list(range(len(label_mapping)))
target_names = [inv_label_mapping[i] for i in class_ids]

# Compute metrics
# Passing `labels=` explicitly keeps the report rows and the confusion-matrix
# shape aligned with `target_names` even if a class happens to be absent from
# both the validation labels and the predictions.
report = classification_report(
    val_labels,
    val_preds,
    labels=class_ids,
    target_names=target_names,
    output_dict=True,
)
conf_matrix = confusion_matrix(val_labels, val_preds, labels=class_ids)
# Plain Python float so the value serialises to JSON regardless of the scalar type returned.
accuracy = float(accuracy_score(val_labels, val_preds))

# Export as CSV
pd.DataFrame(report).transpose().to_csv("ml_metrics_report.csv", index=True)
pd.DataFrame(conf_matrix, index=target_names, columns=target_names).to_csv("ml_confusion_matrix.csv")

# Export as JSON
with open("ml_metrics_report.json", "w") as f:
    json.dump({
        "classification_report": report,
        "confusion_matrix": conf_matrix.tolist(),
        "accuracy": accuracy,
    }, f, indent=4)

print(" ML metrics exported: 'ml_metrics_report.csv', 'ml_confusion_matrix.csv', 'ml_metrics_report.json'")

 ML metrics exported: 'ml_metrics_report.csv', 'ml_confusion_matrix.csv', 'ml_metrics_report.json'


## Test Set Prediction & Submission

In [6]:

#  Test prediction
# The test id table gets the same 'image' alias column the dataset class reads.
test_ids = pd.read_csv(TEST_IDS_CSV)
test_ids['image'] = test_ids['image_id']
test_dataset = SoilDataset(
    test_ids,
    TEST_DIR,
    transform=image_transforms['test'],
    is_test=True,
)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Inference only: eval mode and no gradient tracking.
model.eval()
test_preds, image_names = [], []

with torch.no_grad():
    for images, image_ids in test_loader:
        logits = model(images.to(device))
        test_preds.extend(logits.argmax(1).cpu().numpy())
        image_names.extend(image_ids)

#  Map predicted class ids back to soil type names
final_labels = [inv_label_mapping[class_id] for class_id in test_preds]
submission = pd.DataFrame({
    'image_id': image_names,
    'soil_type': final_labels,
})

submission.to_csv('submission.csv', index=False)
print("submission.csv saved!")


submission.csv saved!


In [7]:
import pandas as pd

# Re-read the submission file from disk and display it as a sanity check
submission = pd.read_csv('submission.csv')
submission.head(77)  # Show the first 77 predictions (you can change the number)


Unnamed: 0,image_id,soil_type
0,img_cdf80d6f.jpeg,Alluvial soil
1,img_c0142a80.jpg,Alluvial soil
2,img_91168fb0.jpg,Alluvial soil
3,img_9822190f.jpg,Alluvial soil
4,img_e5fc436c.jpeg,Alluvial soil
...,...,...
72,img_64d9cdbe.jpg,Clay soil
73,img_5e5ff453.jpg,Clay soil
74,img_2c4f84e3.jpg,Clay soil
75,img_0a40bbe2.jpg,Clay soil
