In [1]:
#STEP 1: IMPORTS & SETUP
import os
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from torchvision import datasets, transforms, models
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm


In [2]:
# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using:", device)

# Paths
BASE_PATH = '/kaggle/input/soil-classification/soil_classification-2025'
TRAIN_DIR = os.path.join(BASE_PATH, 'train')
TEST_DIR = os.path.join(BASE_PATH, 'test')
LABELS_CSV = os.path.join(BASE_PATH, 'train_labels.csv')
TEST_IDS_CSV = os.path.join(BASE_PATH, 'test_ids.csv')

Using: cuda


In [3]:
#  Load labels
df = pd.read_csv(LABELS_CSV)
df['image'] = df['image_id']
label_mapping = {label: idx for idx, label in enumerate(df['soil_type'].unique())}
inv_label_mapping = {v: k for k, v in label_mapping.items()}
df['label'] = df['soil_type'].map(label_mapping)

#  Train/Val split
train_df, val_df = train_test_split(df, test_size=0.15, stratify=df['label'], random_state=42)

#  Transformations
image_transforms = {
    'train': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ToTensor(),
        transforms.Normalize([0.5]*3, [0.5]*3)
    ]),
    'val': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5]*3, [0.5]*3)
    ]),
    'test': transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5]*3, [0.5]*3)
    ])
}

In [4]:


#  Dataset
class SoilDataset(Dataset):
    def __init__(self, dataframe, img_dir, transform=None, is_test=False):
        self.df = dataframe
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        image_id = self.df.iloc[idx]['image']
        img_path = os.path.join(self.img_dir, image_id)
        image = Image.open(img_path).convert('RGB')

        if self.transform:
            image = self.transform(image)

        if self.is_test:
            return image, image_id
        else:
            label = self.df.iloc[idx]['label']
            return image, label

#  Dataloaders
train_dataset = SoilDataset(train_df, TRAIN_DIR, transform=image_transforms['train'])
val_dataset = SoilDataset(val_df, TRAIN_DIR, transform=image_transforms['val'])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

#  Model
model = models.resnet18(pretrained=True)
model.fc = nn.Linear(model.fc.in_features, len(label_mapping))
model = model.to(device)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)




Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 167MB/s] 


In [5]:
#  Training loop
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    train_loss = 0
    for images, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}'):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validation
    model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            outputs = model(images)
            preds = outputs.argmax(1).cpu().numpy()
            val_preds.extend(preds)
            val_labels.extend(labels.numpy())

    f1_scores = []
    for i in range(len(label_mapping)):
        f1 = f1_score(np.array(val_labels) == i, np.array(val_preds) == i)
        f1_scores.append(f1)

    print(f"Epoch {epoch+1} - Train Loss: {train_loss:.4f}, Min F1: {min(f1_scores):.4f}, F1s: {f1_scores}")


Epoch 1/10: 100%|██████████| 33/33 [00:20<00:00,  1.60it/s]


Epoch 1 - Train Loss: 14.6895, Min F1: 0.8788, F1s: [0.9473684210526315, 0.8787878787878789, 1.0, 0.9428571428571428]


Epoch 2/10: 100%|██████████| 33/33 [00:12<00:00,  2.59it/s]


Epoch 2 - Train Loss: 3.8636, Min F1: 0.9123, F1s: [0.9693251533742331, 0.912280701754386, 1.0, 0.9705882352941176]


Epoch 3/10: 100%|██████████| 33/33 [00:12<00:00,  2.67it/s]


Epoch 3 - Train Loss: 2.5771, Min F1: 0.9474, F1s: [0.9693251533742331, 0.9473684210526316, 1.0, 0.9705882352941176]


Epoch 4/10: 100%|██████████| 33/33 [00:12<00:00,  2.62it/s]


Epoch 4 - Train Loss: 2.8722, Min F1: 0.9589, F1s: [0.974025974025974, 0.967741935483871, 0.9873417721518987, 0.958904109589041]


Epoch 5/10: 100%|██████████| 33/33 [00:12<00:00,  2.67it/s]


Epoch 5 - Train Loss: 2.4283, Min F1: 0.9831, F1s: [0.9937106918238994, 0.983050847457627, 1.0, 1.0]


Epoch 6/10: 100%|██████████| 33/33 [00:12<00:00,  2.55it/s]


Epoch 6 - Train Loss: 1.6562, Min F1: 0.9552, F1s: [0.9753086419753086, 0.983050847457627, 1.0, 0.955223880597015]


Epoch 7/10: 100%|██████████| 33/33 [00:12<00:00,  2.62it/s]


Epoch 7 - Train Loss: 1.0339, Min F1: 0.9831, F1s: [0.9937106918238994, 0.983050847457627, 1.0, 1.0]


Epoch 8/10: 100%|██████████| 33/33 [00:12<00:00,  2.59it/s]


Epoch 8 - Train Loss: 1.4037, Min F1: 0.9677, F1s: [0.9746835443037974, 0.967741935483871, 0.9743589743589743, 1.0]


Epoch 9/10: 100%|██████████| 33/33 [00:12<00:00,  2.63it/s]


Epoch 9 - Train Loss: 1.0665, Min F1: 1.0000, F1s: [1.0, 1.0, 1.0, 1.0]


Epoch 10/10: 100%|██████████| 33/33 [00:12<00:00,  2.60it/s]


Epoch 10 - Train Loss: 1.3993, Min F1: 0.9855, F1s: [0.9937106918238994, 1.0, 1.0, 0.9855072463768115]


In [6]:

#  Test prediction
test_ids = pd.read_csv(TEST_IDS_CSV)
test_ids['image'] = test_ids['image_id']
test_dataset = SoilDataset(test_ids, TEST_DIR, transform=image_transforms['test'], is_test=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model.eval()
test_preds = []
image_names = []

with torch.no_grad():
    for images, image_ids in test_loader:
        images = images.to(device)
        outputs = model(images)
        preds = outputs.argmax(1).cpu().numpy()
        test_preds.extend(preds)
        image_names.extend(image_ids)

#  Map back to soil type
final_labels = [inv_label_mapping[p] for p in test_preds]
submission = pd.DataFrame({
    'image_id': image_names,
    'soil_type': final_labels
})

submission.to_csv('submission.csv', index=False)
print("submission.csv saved!")


submission.csv saved!


In [7]:
import pandas as pd

# Load and display the submission file
submission = pd.read_csv('submission.csv')
submission.head(77)  # Show the first 10 predictions (you can change the number)


Unnamed: 0,image_id,soil_type
0,img_cdf80d6f.jpeg,Alluvial soil
1,img_c0142a80.jpg,Alluvial soil
2,img_91168fb0.jpg,Alluvial soil
3,img_9822190f.jpg,Alluvial soil
4,img_e5fc436c.jpeg,Alluvial soil
...,...,...
72,img_64d9cdbe.jpg,Clay soil
73,img_5e5ff453.jpg,Clay soil
74,img_2c4f84e3.jpg,Clay soil
75,img_0a40bbe2.jpg,Clay soil
