In [None]:
# Mount Google Drive so the dataset and checkpoints under it are reachable
# from this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Verify GPU access.
# `torch.cuda.get_device_name(0)` raises when no CUDA device is present, so
# only query the device name after confirming that CUDA is available.
import torch

gpu_available = torch.cuda.is_available()
print(f"GPU Available: {gpu_available}")
if gpu_available:
    print(f"GPU Model: {torch.cuda.get_device_name(0)}")
else:
    print("GPU Model: none detected (running on CPU)")

In [None]:
# Import all important libraries
import copy
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, Subset, WeightedRandomSampler
from torchvision import transforms, models

In [None]:
# Per-channel normalization statistics shared by both pipelines (the standard
# ImageNet values used with torchvision's pretrained models).
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training pipeline: random crop/flip/colour augmentation, then tensor + normalize.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

# Validation/test pipeline: deterministic resize + centre crop, same normalization.
val_transform = transforms.Compose([
    transforms.Resize(size=256),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])


In [None]:
class SoilDataset(Dataset):
    """Labelled image dataset driven by a CSV file.

    The CSV is read with pandas and must provide an 'image_id' column (file
    name relative to ``root_dir``) and a 'soil_type' column (class name).
    Class names are sorted and mapped to consecutive integer indices.

    Args:
        root_dir: Directory that contains the image files.
        csv_file: Path to the CSV file with 'image_id' and 'soil_type' columns.
        transform: Optional callable applied to the loaded RGB PIL image.
    """

    def __init__(self, root_dir, csv_file, transform=None):
        self.root_dir = root_dir
        self.df = pd.read_csv(csv_file)
        self.transform = transform
        # Sorted so that the class -> index mapping is deterministic.
        self.classes = sorted(self.df['soil_type'].unique())
        self.class_to_idx = {cls: i for i, cls in enumerate(self.classes)}

    def __len__(self):
        """Return the number of rows in the label CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        ``image`` is the RGB image (after ``transform`` when one is set) and
        ``label`` is the integer class index of the row's 'soil_type'.
        """
        row = self.df.iloc[idx]
        img_name = os.path.join(self.root_dir, row['image_id'])
        image = Image.open(img_name).convert('RGB')
        label = self.class_to_idx[row['soil_type']]
        if self.transform:
            image = self.transform(image)
        # The return must sit outside the `if`: otherwise a dataset built
        # without a transform would yield None for every sample.
        return image, label

In [None]:
# Project folder on Drive: holds the training data and receives checkpoints
# and prediction files.
SAVE_DIR = '/content/drive/MyDrive/soil_classification'
os.makedirs(SAVE_DIR, exist_ok=True)

# Derive the data locations from SAVE_DIR so the base path is defined once.
full_dataset = SoilDataset(
    root_dir=os.path.join(SAVE_DIR, 'train'),
    csv_file=os.path.join(SAVE_DIR, 'train_labels.csv'),
    transform=train_transform
)

In [None]:
import numpy as np

# Class name of every sample, in CSV row order; used to stratify the split.
stratify_labels = full_dataset.df['soil_type'].values
# Positional indices 0..N-1 into the dataset, one per CSV row.
indices = np.arange(len(full_dataset))

In [None]:
# Stratified 80/20 split of the sample indices with a fixed seed, so class
# proportions are preserved and the split is reproducible.
split_indices = train_test_split(
    indices,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)
train_idx, val_idx = split_indices

In [None]:
# A Subset only stores indices and delegates item access to its parent
# dataset. If both subsets wrapped `full_dataset`, assigning the validation
# transform through `val_dataset.dataset` would also replace the training
# transform and silently disable augmentation. Validation therefore gets its
# own shallow copy of the dataset (the DataFrame is shared, the `transform`
# attribute is not).
full_dataset.transform = train_transform  # keeps this cell idempotent on re-run
train_dataset = Subset(full_dataset, train_idx)

val_base_dataset = copy.copy(full_dataset)
val_base_dataset.transform = val_transform
val_dataset = Subset(val_base_dataset, val_idx)

In [None]:
# Show which transform pipeline each subset's underlying dataset applies.
for caption, subset in (("Train transform:", train_dataset),
                        ("Val transform:", val_dataset)):
    print(caption, subset.dataset.transform)

In [None]:
from torch.utils.data import (
    Dataset,
    DataLoader,
    Subset,
    WeightedRandomSampler
)

In [None]:
# Inverse-frequency weight per class. `sort_index()` orders the counts by
# class name, which matches the sorted order behind `class_to_idx`.
class_counts = full_dataset.df['soil_type'].value_counts().sort_index()
counts_tensor = torch.tensor(class_counts.values, dtype=torch.float)
class_weights = 1. / counts_tensor

# One weight per training sample, looked up through the sample's class index.
train_labels = stratify_labels[train_idx]
train_class_ids = torch.tensor(
    [full_dataset.class_to_idx[name] for name in train_labels]
)
sample_weights = class_weights[train_class_ids]

# Draw as many samples per epoch as the training split holds, with
# replacement, so rarer classes are drawn more often.
sampler = WeightedRandomSampler(
    sample_weights,
    len(sample_weights),
    replacement=True,
)

In [None]:
BATCH_SIZE = 64

# Settings shared by the training and validation loaders.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2, pin_memory=True)

# Training batches are drawn through the weighted sampler (which replaces shuffling).
train_loader = DataLoader(train_dataset, sampler=sampler, **loader_kwargs)
# Validation is iterated in a fixed order.
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [None]:
from torchvision import models

# ResNet-50 with ImageNet weights. `pretrained=True` is deprecated in
# torchvision; IMAGENET1K_V1 is the checkpoint that flag used to select.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

# Replace the classification head so it outputs one logit per soil class.
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(full_dataset.classes))

# Use the first GPU when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Class-weighted cross-entropy; the weights must live on the model's device.
class_weights = class_weights.to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

# Adam over all model parameters.
optimizer = Adam(params=model.parameters(), lr=1e-4)
# Halve the learning rate when the monitored metric (to be maximised) has not
# improved for 3 consecutive scheduler steps.
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=3)

In [None]:
# This cell used to be a verbatim copy of the loss / optimizer / scheduler
# setup directly above. The duplicate rebuild was removed; run the preceding
# cell to (re)create `criterion`, `optimizer` and `scheduler`.


In [None]:
from tqdm import tqdm

NUM_EPOCHS = 25
# Explicit label list so the per-class F1 array always has one entry per
# class, in the same order as `full_dataset.classes`.
class_indices = list(range(len(full_dataset.classes)))
best_model_path = os.path.join(SAVE_DIR, 'best_model.pth')

# Start below any reachable score so the first epoch always writes a
# checkpoint, even while the worst class still has an F1 of 0.
best_f1 = -1.0
for epoch in range(NUM_EPOCHS):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}'):
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Losses are averaged over the number of batches.
    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    # Per-class F1; a class that is never predicted scores 0 without a warning.
    val_f1 = f1_score(
        all_labels,
        all_preds,
        labels=class_indices,
        average=None,
        zero_division=0,
    )
    # Model selection and LR scheduling both track the worst class.
    min_f1 = np.min(val_f1)

    scheduler.step(min_f1)

    if min_f1 > best_f1:
        best_f1 = min_f1
        torch.save(model.state_dict(), best_model_path)

    print(f'\nEpoch {epoch+1} Summary:')
    print(f'Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
    print('Class-wise F1 Scores:')
    for cls, score in zip(full_dataset.classes, val_f1):
        print(f'  {cls}: {score:.4f}')
    print(f'Minimum F1: {min_f1:.4f}')


In [None]:
class TestDataset(Dataset):
    """Unlabelled image dataset that yields ``(image, image_id)`` pairs."""

    def __init__(self, img_dir, csv_path, transform=None):
        """
        Args:
            img_dir (string): Directory that holds the test images
            csv_path (string): CSV file listing the images in an 'image_id' column
            transform (callable): Optional transform applied to each loaded image

        Raises:
            ValueError: If the CSV has no 'image_id' column
        """
        frame = pd.read_csv(csv_path)

        # Fail early when the required column is missing.
        if 'image_id' not in frame.columns:
            raise ValueError("CSV must contain 'image_id' column")

        self.img_dir = img_dir
        self.df = frame
        self.transform = transform

    def __len__(self):
        """Number of rows (images) listed in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the image at row ``idx`` as RGB and return it with its ID."""
        image_id = self.df.iloc[idx]['image_id']
        image = Image.open(os.path.join(self.img_dir, image_id)).convert('RGB')
        image = self.transform(image) if self.transform else image

        # The ID is returned alongside the image so predictions can be matched back.
        return image, image_id

In [None]:
# Locations of the test images and of the CSV listing their IDs
# (adjust to your own Drive layout).
TEST_DIR = '/content/drive/MyDrive/soil_classification/test'
TEST_CSV = '/content/drive/MyDrive/soil_classification/test_ids.csv'

# Test images go through the same deterministic transform as validation.
test_dataset = TestDataset(TEST_DIR, TEST_CSV, transform=val_transform)

# Fixed order so predictions line up with the IDs in the CSV.
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2
)

In [None]:
# Predict with the best checkpoint written during training rather than with
# whatever weights the final epoch left in memory.
best_model_path = os.path.join(SAVE_DIR, 'best_model.pth')
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path, map_location=device))
else:
    print(f'No checkpoint found at {best_model_path}; using current in-memory weights.')

model.eval()
predictions = []   # predicted class index per image
image_ids = []     # image IDs in the order they were predicted
all_outputs = []   # raw logits per image (one 1-D CPU tensor each)

with torch.no_grad():
    for images, ids in tqdm(test_loader):
        images = images.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, 1)

        # Keep everything needed later to build the results table.
        predictions.extend(preds.cpu().numpy())
        image_ids.extend(ids)
        all_outputs.extend(outputs.cpu())

In [None]:
# Confidence = softmax probability that the model assigned to its predicted class.
confidences = []
for logits, pred in zip(all_outputs, predictions):
    probabilities = torch.nn.functional.softmax(logits, dim=0)
    confidences.append(probabilities[pred].item())

# All three lists must describe the same images, one entry each.
assert len(image_ids) == len(predictions) == len(confidences), \
    f"Length mismatch: {len(image_ids)} ids, {len(predictions)} preds, {len(confidences)} confs"

# Map class indices back to class names and assemble the output table.
predicted_names = [full_dataset.classes[p] for p in predictions]
results = pd.DataFrame({
    'image_id': image_ids,
    'predicted_class': predicted_names,
    'confidence': confidences
})

# Write the predictions next to the checkpoints on Drive.
results_path = os.path.join(SAVE_DIR, 'test_predictions.csv')
results.to_csv(results_path, index=False)