In [4]:
pip install --upgrade torch torchvision





[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
# ==============================================================================
# PHASE 2: SEMI-SUPERVISED LEARNING (FROM SCRATCH & SANITIZED)
# ==============================================================================
# This cell contains the complete, corrected workflow for Phase 2.
# It will:
# 1. Sanitize the data by ensuring all referenced image files exist.
# 2. Generate pseudo-labels for the unlabeled data using the Phase 1 model.
# 3. Combine the original and pseudo-labeled data.
# 4. Train a new, improved "student" model on the combined dataset.
# 5. Use the student model to generate the final predictions.

print("\n### RUNNING PHASE 2: SEMI-SUPERVISED LEARNING (FROM SCRATCH) ###")

# --------------------------------------------------------------------------
# Step 2.1: Re-establish Configuration and Helper Variables
# --------------------------------------------------------------------------
print("Step 2.1: Configuring paths and parameters...")

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import os
from PIL import Image
import time

# Paths and hyper-parameters for Phase 2 (kept in sync with Phase 1).
#
# BASE_PATH defaults to the original checkout location, but it can be
# redirected by setting the HV_AI_BASE_PATH environment variable before the
# kernel starts, so the notebook runs on another machine without editing code.
BASE_PATH = os.environ.get('HV_AI_BASE_PATH', r'C:\Users\harkp\Desktop\HV-AI-2025\HV-AI-2025')
DATA_DIR = os.path.join(BASE_PATH, 'labeled_data', 'images')
UNLABELED_DIR = os.path.join(BASE_PATH, 'unlabeled_data')
CSV_PATH = os.path.join(BASE_PATH, 'labeled_data', 'labeled_data.csv')
TEST_DIR = os.path.join(BASE_PATH, 'test_images')

# Checkpoints are resolved relative to the kernel's working directory.
MODEL_PATH_PHASE1 = 'phase1_baseline_model.pth'
MODEL_PATH_PHASE2 = 'phase2_student_model.pth'
OUTPUT_CSV_DIR = os.path.join(BASE_PATH, 'CSV jupyter notebook')
OUTPUT_CSV_PHASE2 = os.path.join(OUTPUT_CSV_DIR, 'phase2_predictions.csv')

# A pseudo-label is kept only when the teacher's top softmax probability
# is strictly greater than this value.
CONFIDENCE_THRESHOLD = 0.95
# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# We need the class mappings from the original dataset.
# The Phase 1 objects are reused only when *every* helper this cell relies on
# is already present in the kernel ('full_dataset', 'data_transforms' and
# 'TransformedDataset'). If any of them is missing, all of them are rebuilt so
# a partially initialised session does not fail later with a NameError.
try:
    idx_to_class = full_dataset.idx_to_class
    num_classes = len(full_dataset.class_names)
    # Probe the remaining names that later steps of this cell use.
    _phase1_helpers = (
        full_dataset.class_to_idx,
        data_transforms['train'],
        data_transforms['val'],
        TransformedDataset,
    )
    print("Using class info from existing 'full_dataset' object.")
except (NameError, AttributeError, KeyError):
    print("Recreating helper objects to get class info...")

    class LabeledDataset(Dataset):
        """Image dataset described by a CSV file.

        The first CSV column is read as the image file name (relative to
        ``img_dir``) and the second column as the class name. Class indices
        are assigned by sorting the unique class names.
        """

        def __init__(self, csv_path, img_dir, transform=None):
            self.labels_df = pd.read_csv(csv_path)
            self.img_dir = img_dir
            self.transform = transform
            self.class_names = sorted(self.labels_df.iloc[:, 1].unique())
            self.class_to_idx = {name: i for i, name in enumerate(self.class_names)}
            self.idx_to_class = {i: name for name, i in self.class_to_idx.items()}

        def __len__(self):
            """Number of rows in the label CSV."""
            return len(self.labels_df)

        def __getitem__(self, idx):
            """Return ``(image, class_index)`` for row ``idx``; the image is RGB."""
            img_name = self.labels_df.iloc[idx, 0]
            label_name = self.labels_df.iloc[idx, 1]
            img_path = os.path.join(self.img_dir, img_name)
            image = Image.open(img_path).convert('RGB')
            label_idx = self.class_to_idx[label_name]
            if self.transform:
                image = self.transform(image)
            return image, label_idx

    class TransformedDataset(Dataset):
        """Wrap a dataset/subset and apply ``transform`` to each sample's input."""

        def __init__(self, subset, transform):
            self.subset = subset
            self.transform = transform

        def __getitem__(self, index):
            """Return ``(transform(x), y)`` for the wrapped sample at ``index``."""
            x, y = self.subset[index]
            return self.transform(x), y

        def __len__(self):
            """Length of the wrapped dataset."""
            return len(self.subset)

    # 'train' adds light augmentation (flip + rotation); 'val' is deterministic.
    # Both resize to 224x224 and normalise with the mean/std values below.
    data_transforms = {
        'train': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.RandomHorizontalFlip(),
            transforms.RandomRotation(10),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
        'val': transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]),
    }
    # No transform here: the dataset is only used for its class mappings.
    full_dataset = LabeledDataset(csv_path=CSV_PATH, img_dir=DATA_DIR)
    idx_to_class = full_dataset.idx_to_class
    num_classes = len(full_dataset.class_names)

print(f"Test data directory: {TEST_DIR}")
print(f"Confidence threshold set to: {CONFIDENCE_THRESHOLD}")
print("-" * 50)

# --------------------------------------------------------------------------
# Step 2.2: Generate Pseudo-Labels
# --------------------------------------------------------------------------
print("Step 2.2: Loading Phase 1 model and generating pseudo-labels...")

# Rebuild the Phase 1 architecture (ResNet-18 with a num_classes-way head) and
# restore its weights; this model acts as the teacher.
teacher_model = models.resnet18(weights=None)
num_ftrs = teacher_model.fc.in_features
teacher_model.fc = nn.Linear(num_ftrs, num_classes)
# map_location remaps the checkpoint's tensors onto `device`, so a checkpoint
# saved on a GPU still loads when this session has fallen back to the CPU.
teacher_model.load_state_dict(torch.load(MODEL_PATH_PHASE1, map_location=device))
teacher_model = teacher_model.to(device)
teacher_model.eval()

softmax = nn.Softmax(dim=1)
pseudo_labels = []

# Sorted so the processing order (and therefore the row order of the resulting
# dataframe) does not depend on the directory order returned by the OS.
unlabeled_image_files = sorted(
    f for f in os.listdir(UNLABELED_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)
print(f"Found {len(unlabeled_image_files)} unlabeled images to process.")

# Inference only: a single no_grad context covers the whole loop.
with torch.no_grad():
    for filename in unlabeled_image_files:
        img_path = os.path.join(UNLABELED_DIR, filename)
        # Context manager closes the underlying file once the RGB copy exists.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        image_tensor = data_transforms['val'](image).unsqueeze(0).to(device)

        outputs = teacher_model(image_tensor)
        probabilities = softmax(outputs)
        confidence, predicted_idx = torch.max(probabilities, 1)

        # Keep only predictions the teacher is strictly more confident about
        # than the configured threshold.
        if confidence.item() > CONFIDENCE_THRESHOLD:
            predicted_class_name = idx_to_class[predicted_idx.item()]
            pseudo_labels.append({'img_name': filename, 'label': predicted_class_name})

print(f"Generated {len(pseudo_labels)} high-confidence pseudo-labels.")
# Explicit columns keep the schema stable even when no image passed the threshold.
pseudo_labels_df = pd.DataFrame(pseudo_labels, columns=['img_name', 'label'])
print("-" * 50)

# --------------------------------------------------------------------------
# Step 2.3: Create and **SANITIZE** Combined Dataset (BUG FIX)
# --------------------------------------------------------------------------
print("Step 2.3: Combining and SANITIZING original and pseudo-labeled data...")


def _present_on_disk(frame, directory):
    """Return a Series flagging, per row, whether frame['img_name'] exists inside `directory`."""
    return frame['img_name'].apply(lambda name: os.path.exists(os.path.join(directory, name)))


# Load the labeled CSV and give its first two columns the names used below.
_raw_labels_df = pd.read_csv(CSV_PATH)
original_labels_df = _raw_labels_df.rename(
    columns={
        _raw_labels_df.columns[0]: 'img_name',
        _raw_labels_df.columns[1]: 'label',
    }
)

# --- Labeled data: drop rows whose image file is missing from DATA_DIR ---
print(f"Original labeled dataframe size: {len(original_labels_df)}")
exists = _present_on_disk(original_labels_df, DATA_DIR)
# .copy() detaches the filtered frame from its parent before it is extended.
original_labels_df = original_labels_df[exists].copy()
print(f"Sanitized labeled dataframe size (found images): {len(original_labels_df)}")
original_labels_df = original_labels_df.assign(source_dir=DATA_DIR)

# --- Pseudo-labeled data: same check against UNLABELED_DIR (skipped if empty) ---
if not pseudo_labels_df.empty:
    print(f"Original pseudo-labels dataframe size: {len(pseudo_labels_df)}")
    exists_pseudo = _present_on_disk(pseudo_labels_df, UNLABELED_DIR)
    pseudo_labels_df = pseudo_labels_df[exists_pseudo].copy()
    print(f"Sanitized pseudo-labels dataframe size (found images): {len(pseudo_labels_df)}")
    pseudo_labels_df = pseudo_labels_df.assign(source_dir=UNLABELED_DIR)

# Stack both frames; each row carries the directory its image lives in.
combined_labels_df = pd.concat(
    (original_labels_df, pseudo_labels_df),
    ignore_index=True,
)

print(f"Final combined dataset size: {len(combined_labels_df)}")

class CombinedDataset(Dataset):
    """Dataset backed by a dataframe with 'img_name', 'label' and 'source_dir' columns.

    Each row names an image file, its class name, and the directory the file
    is read from. Class names are translated to integer indices through the
    ``class_to_idx`` mapping supplied by the caller.
    """

    def __init__(self, df, class_to_idx, transform=None):
        self.class_to_idx = class_to_idx
        self.transform = transform
        self.df = df

    def __len__(self):
        """Number of rows in the backing dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, class_index)`` for the row at position ``idx``.

        The image is opened from ``source_dir/img_name`` and converted to RGB;
        ``transform`` is applied to it when one was provided.
        """
        # Positional lookup, done once and reused for all three fields.
        row = self.df.iloc[idx]
        image = Image.open(os.path.join(row['source_dir'], row['img_name'])).convert('RGB')
        label_idx = self.class_to_idx[row['label']]
        if not self.transform:
            return image, label_idx
        return self.transform(image), label_idx

combined_full_dataset = CombinedDataset(combined_labels_df, class_to_idx=full_dataset.class_to_idx)

# Hold out validation samples from the human-labeled rows only. Pseudo-labels
# were produced by the teacher, so scoring the student on them would mostly
# measure agreement with the teacher and inflate validation accuracy (and the
# checkpoint selection that depends on it). Rows are identified by the
# directory recorded in 'source_dir'.
_is_human_labeled = (combined_labels_df['source_dir'] == DATA_DIR).tolist()
labeled_positions = [pos for pos, flag in enumerate(_is_human_labeled) if flag]
pseudo_positions = [pos for pos, flag in enumerate(_is_human_labeled) if not flag]

labeled_subset = torch.utils.data.Subset(combined_full_dataset, labeled_positions)
pseudo_subset = torch.utils.data.Subset(combined_full_dataset, pseudo_positions)

# 85/15 split of the labeled rows; a seeded generator makes the split
# reproducible across kernel restarts.
SPLIT_SEED = 42
val_size_2 = len(labeled_subset) - int(0.85 * len(labeled_subset))
labeled_train_subset, val_subset_2 = random_split(
    labeled_subset,
    [len(labeled_subset) - val_size_2, val_size_2],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Training data = remaining labeled rows + every pseudo-labeled row.
train_subset_2 = torch.utils.data.ConcatDataset([labeled_train_subset, pseudo_subset])
train_size_2 = len(train_subset_2)

# The underlying dataset yields untransformed images; apply augmentation to the
# training samples and the deterministic pipeline to the validation samples.
train_dataset_2 = TransformedDataset(train_subset_2, data_transforms['train'])
val_dataset_2 = TransformedDataset(val_subset_2, data_transforms['val'])

train_loader_2 = DataLoader(train_dataset_2, batch_size=32, shuffle=True)
val_loader_2 = DataLoader(val_dataset_2, batch_size=32, shuffle=False)
print("-" * 50)

# --------------------------------------------------------------------------
# Step 2.4: Fine-Tune the Student Model
# --------------------------------------------------------------------------
print("Step 2.4: Defining and fine-tuning the student model...")

# The student starts from the Phase 1 weights (same architecture as the teacher).
student_model = models.resnet18(weights=None)
student_model.fc = nn.Linear(student_model.fc.in_features, num_classes)
# map_location lets a GPU-saved checkpoint load when running on the CPU fallback.
student_model.load_state_dict(torch.load(MODEL_PATH_PHASE1, map_location=device))
student_model = student_model.to(device)

# A freshly constructed model has requires_grad=True on every parameter, so
# merely "enabling" layer4 would leave the whole network trainable. Freeze
# everything first, then re-enable only the last residual stage and the
# classification head so the requires_grad filter below is meaningful.
for param in student_model.parameters():
    param.requires_grad = False
for trainable_module in (student_model.layer4, student_model.fc):
    for param in trainable_module.parameters():
        param.requires_grad = True

# Only the unfrozen parameters are handed to the optimizer.
params_to_update = [param for param in student_model.parameters() if param.requires_grad]
optimizer_2 = optim.Adam(params_to_update, lr=0.0001)
criterion_2 = nn.CrossEntropyLoss()

print("Step 2.5: Starting student model training...")
start_time_2 = time.time()
num_epochs_2 = 10
# Start below any reachable accuracy so the first epoch always writes a
# checkpoint; with a 0.0 start and a strict '>' comparison, a run whose
# accuracy never rose above zero would leave no (or a stale) checkpoint file.
best_val_acc_2 = -1.0

for epoch in range(num_epochs_2):
    # ---- training pass ----
    student_model.train()
    running_loss = 0.0
    for inputs, labels in train_loader_2:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer_2.zero_grad()
        outputs = student_model(inputs)
        loss = criterion_2(outputs, labels)
        loss.backward()
        optimizer_2.step()
        # criterion returns the batch mean; weight by batch size so the epoch
        # figure is a per-sample average even with a smaller final batch.
        running_loss += loss.item() * inputs.size(0)
    epoch_loss = running_loss / len(train_loader_2.dataset)

    # ---- validation pass ----
    student_model.eval()
    val_loss = 0.0
    corrects = 0  # plain Python int, so it stays valid even with zero batches
    with torch.no_grad():
        for inputs, labels in val_loader_2:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = student_model(inputs)
            loss = criterion_2(outputs, labels)
            _, preds = torch.max(outputs, 1)
            val_loss += loss.item() * inputs.size(0)
            corrects += (preds == labels).sum().item()
    # Guard against an empty validation set instead of dividing by zero.
    num_val_samples = max(len(val_loader_2.dataset), 1)
    epoch_val_loss = val_loss / num_val_samples
    epoch_val_acc = corrects / num_val_samples

    print(f"Epoch {epoch+1}/{num_epochs_2} | Train Loss: {epoch_loss:.4f} | Val Loss: {epoch_val_loss:.4f} | Val Acc: {epoch_val_acc:.4f}")

    # Keep only the weights of the best-scoring epoch on disk.
    if epoch_val_acc > best_val_acc_2:
        best_val_acc_2 = epoch_val_acc
        torch.save(student_model.state_dict(), MODEL_PATH_PHASE2)
        print(f"  -> Validation accuracy improved. Saving student model to {MODEL_PATH_PHASE2}")

training_time_2 = time.time() - start_time_2
print(f"\nPhase 2 Training complete in {training_time_2 // 60:.0f}m {training_time_2 % 60:.0f}s")
print(f"Best student model validation accuracy: {best_val_acc_2:.4f}")
print("-" * 50)

# --------------------------------------------------------------------------
# Step 2.6: Generate Final Predictions with Student Model
# --------------------------------------------------------------------------
print("Step 2.6: Loading best student model and generating final predictions...")

# Rebuild the architecture and restore the best checkpoint written during training.
final_model = models.resnet18(weights=None)
final_model.fc = nn.Linear(final_model.fc.in_features, num_classes)
# map_location lets a GPU-saved checkpoint load when running on the CPU fallback.
final_model.load_state_dict(torch.load(MODEL_PATH_PHASE2, map_location=device))
final_model = final_model.to(device)
final_model.eval()

final_predictions = []
# Sorted so the row order of the output CSV does not depend on the directory
# order returned by the OS.
test_image_files = sorted(
    f for f in os.listdir(TEST_DIR) if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)
print(f"Found {len(test_image_files)} images in {TEST_DIR} to predict.")

# Inference only: a single no_grad context covers the whole loop.
with torch.no_grad():
    for filename in test_image_files:
        img_path = os.path.join(TEST_DIR, filename)
        # Context manager closes the underlying file once the RGB copy exists.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        image_tensor = data_transforms['val'](image).unsqueeze(0).to(device)

        outputs = final_model(image_tensor)
        # argmax over the class dimension; softmax is not needed for the label.
        _, predicted_idx = torch.max(outputs, 1)
        predicted_class_name = idx_to_class[predicted_idx.item()]

        final_predictions.append({'path': filename, 'predicted_label': predicted_class_name})

# Explicit columns so the CSV still has a header row when no test image was found.
final_pred_df = pd.DataFrame(final_predictions, columns=['path', 'predicted_label'])
os.makedirs(os.path.dirname(OUTPUT_CSV_PHASE2), exist_ok=True)
final_pred_df.to_csv(OUTPUT_CSV_PHASE2, index=False)
print(f"Final predictions saved to {OUTPUT_CSV_PHASE2}")

if not final_pred_df.empty:
    print("\n--- Final Prediction Results ---")
    print(final_pred_df.head())

print("\n### PHASE 2 COMPLETE ###")



### RUNNING PHASE 2: SEMI-SUPERVISED LEARNING (FROM SCRATCH) ###
Step 2.1: Configuring paths and parameters...
Using class info from existing 'full_dataset' object.
Test data directory: C:\Users\harkp\Desktop\HV-AI-2025\HV-AI-2025\test_images
Confidence threshold set to: 0.95
--------------------------------------------------
Step 2.2: Loading Phase 1 model and generating pseudo-labels...
Found 14800 unlabeled images to process.
Generated 797 high-confidence pseudo-labels.
--------------------------------------------------
Step 2.3: Combining and SANITIZING original and pseudo-labeled data...
Original labeled dataframe size: 779
Sanitized labeled dataframe size (found images): 775
Original pseudo-labels dataframe size: 797
Sanitized pseudo-labels dataframe size (found images): 797
Final combined dataset size: 1572
--------------------------------------------------
Step 2.4: Defining and fine-tuning the student model...
Step 2.5: Starting student model training...
Epoch 1/10 | Train Lo