# In [None]:
# 1. Imports
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import efficientnet_b3, EfficientNet_B3_Weights
from tqdm import tqdm

# 2. Read the training labels and map every soil type to an integer class id
train_df = pd.read_csv(
    "/kaggle/input/soil-classification/soil_classification-2025/train_labels.csv"
)

# The fitted encoder is kept so predictions can later be mapped back to the
# original soil_type strings.
le = LabelEncoder()
encoded_labels = le.fit_transform(train_df['soil_type'])
train_df['label'] = encoded_labels

# One output unit per distinct soil type seen in the training labels.
num_classes = len(le.classes_)

# 3. Image preprocessing shared by the train, validation and test datasets
# NOTE(review): presumably the per-channel statistics the pretrained backbone
# was trained with — confirm against the weights' documented preprocessing.
CHANNEL_MEAN = [0.485, 0.456, 0.406]
CHANNEL_STD = [0.229, 0.224, 0.225]

transform = transforms.Compose(
    [
        # Force every image to a fixed 300x300 size (aspect ratio is not kept).
        transforms.Resize(size=(300, 300)),
        transforms.ToTensor(),
        transforms.Normalize(mean=CHANNEL_MEAN, std=CHANNEL_STD),
    ]
)

# 4. Dataset class
class SoilDataset(Dataset):
    """Labelled image dataset driven by a DataFrame of file names and labels.

    Each item is an ``(image, label)`` pair. The image is read from
    ``img_dir`` using the row's ``image_id`` value, converted to RGB and, when
    a transform was supplied, passed through it. The label is the row's
    ``label`` value, returned exactly as stored in the frame.

    Args:
        df: Table with at least ``image_id`` and ``label`` columns. A
            re-indexed copy (0..len-1) is stored, so the integer positions
            handed to ``__getitem__`` match the row labels even when ``df`` is
            a shuffled subset of a larger frame.
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, df, img_dir, transform=None):
        self.df = df.reset_index(drop=True)
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows (samples) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at row ``idx`` and return ``(image, label)``."""
        img_id = self.df.loc[idx, 'image_id']
        label = self.df.loc[idx, 'label']
        img_path = os.path.join(self.img_dir, img_id)

        # Open inside a context manager so the file is closed deterministically
        # rather than left to Pillow internals; convert() returns an
        # independent copy, so the image stays usable after the file is closed.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        # Compare with None explicitly: a transform object may define its own
        # truthiness (e.g. via __len__) and must not be skipped because of it.
        if self.transform is not None:
            image = self.transform(image)

        return image, label

# 5. Train/Val split
# Both splits read their images from the same training directory.
train_img_dir = "/kaggle/input/soil-classification/soil_classification-2025/train"

# Hold out 20% of the rows for validation, stratified on the encoded label;
# the fixed random_state makes the split reproducible.
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    stratify=train_df['label'],
    random_state=42,
)

train_dataset = SoilDataset(df=train_data, img_dir=train_img_dir, transform=transform)
val_dataset = SoilDataset(df=val_data, img_dir=train_img_dir, transform=transform)

# Only the training loader shuffles; validation keeps the frame order.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

# 6. Load EfficientNet-B3 model
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = efficientnet_b3(weights=EfficientNet_B3_Weights.DEFAULT)

# Replace the layer at classifier[1] with a fresh linear head that takes the
# same number of input features and emits one logit per soil class.
head_in_features = model.classifier[1].in_features
model.classifier[1] = nn.Linear(head_in_features, num_classes)
model = model.to(device)

# 7. Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

# 8. Training loop
for epoch in range(5):  # change epochs as needed
    # ---- optimisation pass over the training split ----
    model.train()
    train_loss = 0
    train_batches = tqdm(train_loader, desc=f"Epoch {epoch+1} Training")
    for images, labels in train_batches:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(images), labels)
        loss.backward()
        optimizer.step()

        # Accumulate the per-batch loss as a plain Python float.
        train_loss += loss.item()

    # Reported value is the mean of the per-batch losses for this epoch.
    mean_train_loss = train_loss / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {mean_train_loss:.4f}")

    # ---- evaluation pass over the validation split (no gradients) ----
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)
            # Predicted class = index of the largest logit in each row.
            preds = model(images).argmax(dim=1)
            correct += int((preds == labels).sum())
            total += len(labels)
    print(f"Validation Accuracy: {correct/total:.4f}")

# 9. Prepare Test Dataset
# Table of test image ids; its row order is reused later for the submission.
test_ids_csv = "/kaggle/input/soil-classification/soil_classification-2025/test_ids.csv"
test_df = pd.read_csv(test_ids_csv)

class SoilTestDataset(Dataset):
    """Unlabelled image dataset used for inference.

    Each item is a single image: it is read from ``img_dir`` using the row's
    ``image_id`` value, converted to RGB and, when a transform was supplied,
    passed through it. No label is returned.

    Args:
        img_dir: Directory that contains the image files.
        df: Table with at least an ``image_id`` column. A re-indexed copy
            (0..len-1) is stored, so the integer positions handed to
            ``__getitem__`` match the row labels.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, img_dir, df, transform=None):
        self.img_dir = img_dir
        self.df = df.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        """Return the number of rows (images) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load and return the image at row ``idx``."""
        img_id = self.df.loc[idx, 'image_id']
        img_path = os.path.join(self.img_dir, img_id)

        # Open inside a context manager so the file is closed deterministically
        # rather than left to Pillow internals; convert() returns an
        # independent copy, so the image stays usable after the file is closed.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')

        # Compare with None explicitly: a transform object may define its own
        # truthiness (e.g. via __len__) and must not be skipped because of it.
        if self.transform is not None:
            image = self.transform(image)
        return image

# Test images get the same preprocessing as the training/validation images.
test_dataset = SoilTestDataset(
    img_dir="/kaggle/input/soil-classification/soil_classification-2025/test",
    df=test_df,
    transform=transform,
)
# No shuffling: predictions must come out in the same order as test_df rows.
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

# 10. Inference and create submission
model.eval()
predictions = []
with torch.no_grad():
    for images in tqdm(test_loader, desc="Predicting"):
        logits = model(images.to(device))
        # Index of the largest logit per row is the predicted class id.
        batch_preds = logits.argmax(dim=1)
        predictions.extend(batch_preds.cpu().numpy())

# Map the integer class ids back to the original soil_type strings.
predicted_labels = le.inverse_transform(predictions)

# One row per test image, in the same order as test_df.
submission = test_df[['image_id']].assign(soil_type=predicted_labels)

submission.to_csv("submission.csv", index=False)
print("Submission file saved.")


