### Data loading

In [11]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import rasterio

import matplotlib.pyplot as plt
from collections import Counter

# Load file paths
train_files = [line.strip() for line in open("/Users/user/Downloads/DLF/train_filenames.lst")]
test_files = [line.strip() for line in open("/Users/user/Downloads/DLF/test_filenames.lst")]

# Extract species names from file names
def extract_species(filename):
    parts = filename.split('_')
    return f"{parts[0]}_{parts[1]}"  # Join genus and species

train_species = [extract_species(f) for f in train_files]
test_species = [extract_species(f) for f in test_files]

class TreeSpeciesDataset(Dataset):
    def __init__(self, file_paths, transform=None):
        self.file_paths = file_paths
        self.transform = transform
        self.species_to_idx = self._create_label_mapping()
        
    def _create_label_mapping(self):
        return {species: idx for idx, species in 
            enumerate(sorted(set(extract_species(f) for f in self.file_paths)))}  # Added missing parenthesis

    
    def __len__(self):
        return len(self.file_paths)
    
    def __getitem__(self, idx):
        img_path = self.file_paths[idx]
        
        with rasterio.open(img_path) as src:
            img = src.read().astype(np.float32) / 65535.0  # Normalize here
            
        img_tensor = torch.from_numpy(img)
        
        if self.transform:
            img_tensor = self.transform(img_tensor)
            
        species = extract_species(img_path)
        label = self.species_to_idx[species]
        
        return img_tensor, label



In [12]:
import torch.nn as nn
import torch.nn.functional as F

class TreeSpeciesCNN(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        # Input shape: (12, 20, 20) - 12 bands, 20x20 pixels
        
        self.conv_layers = nn.Sequential(
            # Conv Block 1
            nn.Conv2d(12, 32, kernel_size=3, padding=1),  # Output: (32, 20, 20)
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2),  # Output: (32, 10, 10)
            
            # Conv Block 2
            nn.Conv2d(32, 64, kernel_size=3, padding=1),  # Output: (64, 10, 10)
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2),  # Output: (64, 5, 5)
            
            # Conv Block 3
            nn.Conv2d(64, 128, kernel_size=3, padding=1),  # Output: (128, 5, 5)
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1)  # Output: (128, 1, 1)
        )
        
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, num_classes)
        )
        
    def forward(self, x):
        x = self.conv_layers(x)
        return self.classifier(x)

In [13]:

from sklearn.model_selection import train_test_split
from torch.utils.data.sampler import WeightedRandomSampler
from sklearn.utils.class_weight import compute_class_weight

# Split with stratification to maintain class distribution
train_files, val_files = train_test_split(
    train_files, 
    test_size=0.2,
    stratify=[extract_species(f) for f in train_files],  # Maintain class balance
    random_state=42
)

# Create datasets
train_dataset = TreeSpeciesDataset(train_files)
val_dataset = TreeSpeciesDataset(val_files)
test_dataset = TreeSpeciesDataset(test_files)  # Keep this completely separate

# Create samplers and loaders
# For training: use weighted sampler
all_train_labels = [train_dataset.species_to_idx[extract_species(f)] for f in train_files]
class_weights = compute_class_weight('balanced', classes=np.unique(all_train_labels), y=all_train_labels)
weights = torch.tensor(class_weights, dtype=torch.float)

train_sampler = WeightedRandomSampler(
    weights=weights[all_train_labels],
    num_samples=len(train_dataset),
    replacement=True
)

# Data Loaders
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    sampler=train_sampler,
    num_workers=4
)

val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,  # Don't shuffle for validation
    num_workers=4
)

test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,  # Never shuffle test data
    num_workers=4
)

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = TreeSpeciesCNN(num_classes=len(train_dataset.species_to_idx)).to(device)
criterion = nn.CrossEntropyLoss(weight=weights.to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [8]:
# Test a single batch
sample_batch, sample_labels = next(iter(train_loader))
print("Batch shape:", sample_batch.shape)  # Should be (32, 12, 20, 20)
print("Label shape:", sample_labels.shape)  # Should be (32,)
model = TreeSpeciesCNN(num_classes=len(train_dataset.species_to_idx)).to(device)
outputs = model(sample_batch.to(device))
print("Output shape:", outputs.shape)  # Should be (32, num_classes)

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TreeSpeciesDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model, loss, and optimizer
model = TreeSpeciesCNN(num_classes=len(train_dataset.species_to_idx)).to(device)
criterion = nn.CrossEntropyLoss(weight=weights.to(device))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
best_val_acc = 0.0

for epoch in range(100):
    # Training phase
    model.train()
    train_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    # Validation phase
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    val_acc = 100 * correct / total
    print(f'Epoch {epoch+1}: '
          f'Train Loss: {train_loss/len(train_loader):.4f} | '
          f'Val Loss: {val_loss/len(val_loader):.4f} | '
          f'Val Acc: {val_acc:.2f}%')
    
    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model.pth')
        print('Saved best model!')

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/multiprocessing/spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'TreeSpeciesDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 