In [1]:
import os
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from tqdm import tqdm
import random
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import timm
import albumentations as A
from albumentations.pytorch import ToTensorV2

  check_for_updates()


In [2]:
# seed for reproducibility
def set_seed(seed=42):
    """Seed the random number generators used in this notebook.

    Args:
        seed: Integer passed to ``random``, NumPy and PyTorch (CPU and CUDA)
            seeding functions; its string form is also stored in the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seed for every generator: Python, NumPy, PyTorch CPU, PyTorch CUDA.
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed)
    for seed_fn in seeders:
        seed_fn(seed)

    # Set cuDNN's deterministic flag and switch its benchmark mode off.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

In [5]:
# test-time preprocessing (resize + normalize only; no random augmentation)
def get_test_transforms(img_size=384):
    """Build the preprocessing pipeline applied to every test image.

    Args:
        img_size: Target height and width passed to ``A.Resize``.

    Returns:
        An ``A.Compose`` that resizes, normalizes with the per-channel
        mean/std below, and finishes with ``ToTensorV2``.
    """
    # Per-channel normalization constants (these match the commonly used
    # ImageNet RGB statistics).
    channel_mean = [0.485, 0.456, 0.406]
    channel_std = [0.229, 0.224, 0.225]

    steps = [
        A.Resize(img_size, img_size),
        A.Normalize(mean=channel_mean, std=channel_std),
        ToTensorV2(),
    ]
    return A.Compose(steps)

In [6]:
# input test data structure
class SoilTestDataset(Dataset):
    """Dataset class for test data.

    Each item is a single image (no label), looked up through the
    ``image_id`` column of ``df``.

    Args:
        df: DataFrame with an ``image_id`` column holding file names that are
            joined onto ``img_dir``.
        img_dir: Directory containing the image files.
        transform: Optional callable invoked as ``transform(image=img)`` whose
            result is indexed with ``'image'`` (albumentations-style).
    """
    def __init__(self, df, img_dir, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Load, colour-convert and (optionally) transform the image at row ``idx``.

        Raises:
            FileNotFoundError: If the image file does not exist.
            OSError: If the file exists but OpenCV cannot decode it.
        """
        img_path = os.path.join(self.img_dir, self.df.iloc[idx]['image_id'])

        # cv2.imread does not raise on failure; it returns None, which would
        # otherwise surface as an opaque cvtColor assertion with no file name.
        if not os.path.isfile(img_path):
            raise FileNotFoundError(f"Test image not found: {img_path}")

        img = cv2.imread(img_path)
        if img is None:
            raise OSError(f"Could not decode image file: {img_path}")

        # OpenCV loads images as BGR; convert to RGB channel order.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        # Compare against None so a pipeline object that happens to be falsy
        # (e.g. an empty composition) is still applied.
        if self.transform is not None:
            augmented = self.transform(image=img)
            img = augmented['image']

        return img

In [7]:
# initializing model
class SoilClassifier(nn.Module):
    """timm backbone with its original head removed, plus dropout and a linear head.

    Args:
        model_name: Name handed to ``timm.create_model``. Names containing
            ``'efficientnet'`` have their ``classifier`` attribute replaced;
            every other name is assumed to expose its head as ``fc``.
        pretrained: Forwarded to ``timm.create_model``.
        num_classes: Output size of the new linear head.
    """
    def __init__(self, model_name='efficientnet_b3', pretrained=False, num_classes=4):
        super().__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)

        # Name of the backbone attribute that holds its final linear layer.
        head_attr = 'classifier' if 'efficientnet' in model_name else 'fc'

        # Remember the head's input width, then make the backbone emit raw features.
        n_features = getattr(self.model, head_attr).in_features
        setattr(self.model, head_attr, nn.Identity())

        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(n_features, num_classes)

    def forward(self, x):
        """Return class logits: backbone features -> dropout -> linear head."""
        return self.classifier(self.dropout(self.model(x)))

In [9]:
def make_predictions(model, dataloader, device):
    """Run ``model`` over every batch of ``dataloader`` and collect the results.

    Args:
        model: Module called as ``model(images)``; it is put into eval mode.
        dataloader: Iterable yielding batches of images (no labels).
        device: Device each batch is moved to before the forward pass.

    Returns:
        A ``(predictions, probabilities)`` pair of lists with one entry per
        sample: the index of the largest model output, and the softmax of the
        outputs over dimension 1 as a NumPy array.
    """
    model.eval()
    predictions, probabilities = [], []

    print("Making predictions...")
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Predicting"):
            logits = model(batch.to(device))

            # Probabilities and winning class index along the class dimension.
            class_probs = torch.softmax(logits, dim=1)
            class_ids = torch.max(logits, 1)[1]

            # Move to host memory and flatten the batch into per-sample entries.
            predictions.extend(class_ids.cpu().numpy())
            probabilities.extend(class_probs.cpu().numpy())

    return predictions, probabilities

In [10]:
def _print_confidence_stats(probabilities):
    """Print summary statistics of the top-class probability of each prediction.

    Args:
        probabilities: Sequence of per-sample probability vectors, as returned
            by ``make_predictions``.
    """
    print("\nPrediction Confidence Statistics:")

    # An empty result has no class axis to reduce over, and the percentages
    # below would divide by zero.
    if len(probabilities) == 0:
        print("No predictions were generated; skipping confidence statistics.")
        return

    probabilities_array = np.array(probabilities)
    max_probs = np.max(probabilities_array, axis=1)

    print(f"Mean confidence: {np.mean(max_probs):.4f}")
    print(f"Median confidence: {np.median(max_probs):.4f}")
    print(f"Min confidence: {np.min(max_probs):.4f}")
    print(f"Max confidence: {np.max(max_probs):.4f}")

    # Count predictions by confidence level
    high_conf = np.sum(max_probs > 0.9)
    med_conf = np.sum((max_probs > 0.7) & (max_probs <= 0.9))
    low_conf = np.sum(max_probs <= 0.7)

    print("\nConfidence distribution:")
    print(f"High confidence (>0.9): {high_conf} ({high_conf/len(max_probs)*100:.1f}%)")
    print(f"Medium confidence (0.7-0.9): {med_conf} ({med_conf/len(max_probs)*100:.1f}%)")
    print(f"Low confidence (<=0.7): {low_conf} ({low_conf/len(max_probs)*100:.1f}%)")


def main():
    """Run inference on the test set and write ``submission.csv``.

    Steps: seed the RNGs, pick the device, read the test CSV, build the model
    and load its weights from ``MODEL_PATH``, predict every test image, map
    class indices to soil-type names, write ``submission.csv`` to the current
    directory, and print confidence statistics.

    The input locations are hard-coded below; ``MODEL_PATH`` contains a
    placeholder directory and has to be edited before running.

    Raises:
        KeyError: If the test CSV has no ``image_id`` column.
    """
    set_seed(42)

    # Device agnostic code
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # File paths
    TEST_DIR = '/kaggle/input/soil-classification/soil_classification-2025/test'
    TEST_CSV = '/kaggle/input/soil-classification/soil_classification-2025/test_ids.csv'
    MODEL_PATH = '/kaggle/input/your-model-path/model.pth'

    # Configuration
    CONFIG = {
        'IMG_SIZE': 384,
        'BATCH_SIZE': 32,  # Increased batch size for faster inference
        'NUM_WORKERS': 4,
        'MODEL_NAME': 'efficientnet_b3',
        'NUM_CLASSES': 4,
        'DEVICE': device,
    }

    # Soil type mapping
    # NOTE(review): presumably this must match the label encoding used when
    # the checkpoint was trained; confirm against the training notebook.
    soil_types = {
        'Alluvial soil': 0,
        'Black Soil': 1,
        'Clay soil': 2,
        'Red soil': 3
    }

    # Reverse mapping for predictions
    idx_to_soil = {v: k for k, v in soil_types.items()}
    print("Soil type mapping:")
    for k, v in soil_types.items():
        print(f"{v}: {k}")

    # Load test data
    test_df = pd.read_csv(TEST_CSV)
    print(f"\nTest data shape: {test_df.shape}")
    print("Sample of test data:")
    print(test_df.head())

    # Fail early, before the model is loaded, instead of inside a DataLoader worker.
    if 'image_id' not in test_df.columns:
        raise KeyError(
            f"Column 'image_id' not found in {TEST_CSV}; columns are {list(test_df.columns)}"
        )

    # Load the pre-trained model
    print("\nLoading pre-trained model...")
    model = SoilClassifier(
        model_name=CONFIG['MODEL_NAME'],
        pretrained=False,
        num_classes=CONFIG['NUM_CLASSES']
    ).to(CONFIG['DEVICE'])

    # Load the trained weights
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # from a trusted source (consider weights_only=True where the installed
    # PyTorch version supports it).
    model.load_state_dict(torch.load(MODEL_PATH, map_location=CONFIG['DEVICE']))
    model.eval()
    print("Model loaded successfully!")

    # Create test dataset and dataloader
    test_dataset = SoilTestDataset(
        test_df,
        TEST_DIR,
        transform=get_test_transforms(CONFIG['IMG_SIZE'])
    )

    test_loader = DataLoader(
        test_dataset,
        batch_size=CONFIG['BATCH_SIZE'],
        shuffle=False,
        num_workers=CONFIG['NUM_WORKERS'],
        # Pinned host memory only helps host-to-GPU copies; on CPU-only runs
        # it is useless and makes the DataLoader emit a warning.
        pin_memory=(device.type == 'cuda')
    )

    print(f"\nTest dataset size: {len(test_dataset)}")
    print(f"Number of batches: {len(test_loader)}")

    # Make predictions
    predictions, probabilities = make_predictions(model, test_loader, CONFIG['DEVICE'])

    print(f"\nGenerated {len(predictions)} predictions")
    print("Prediction distribution:")
    pred_counts = np.bincount(predictions)
    for i, count in enumerate(pred_counts):
        # Classes that were never predicted are skipped.
        if count > 0:
            print(f"{idx_to_soil[i]}: {count} ({count/len(predictions)*100:.1f}%)")

    # Convert predictions to soil type names
    predicted_soil_types = [idx_to_soil[pred] for pred in predictions]

    # Create submission dataframe (copy so the loaded test frame is not mutated)
    submission_df = test_df.copy()
    submission_df['soil_type'] = predicted_soil_types

    print("\nSubmission dataframe:")
    print(submission_df.head())
    print(f"Submission shape: {submission_df.shape}")

    # Verify submission format
    print("\nVerifying submission format...")
    print("Required columns: ['image_id', 'soil_type']")
    print(f"Actual columns: {list(submission_df.columns)}")
    print("\nUnique soil types in submission:")
    print(submission_df['soil_type'].value_counts())

    # Check for any missing values
    if submission_df.isnull().sum().sum() > 0:
        print("\nWarning: Found missing values in submission!")
        print(submission_df.isnull().sum())
    else:
        print("\nNo missing values found. Submission looks good!")

    # Save submission file (only the two required columns, without the index)
    submission_file = 'submission.csv'
    submission_df[['image_id', 'soil_type']].to_csv(submission_file, index=False)

    print(f"\nSubmission file '{submission_file}' created successfully!")
    print(f"File size: {os.path.getsize(submission_file)} bytes")

    # Display first few rows of the final submission
    print("\nFinal submission (first 10 rows):")
    print(pd.read_csv(submission_file).head(10))

    # Display prediction confidence statistics
    _print_confidence_stats(probabilities)

    print("\n" + "="*50)
    print("INFERENCE COMPLETE!")
    print("submission.csv file created.")
    print("="*50)

In [None]:
# Run the inference pipeline when this code is executed directly (not imported).
if __name__ == "__main__":
    main()