In [None]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm

# Paths
# Everything lives under a single competition root; the image folders and the
# CSV manifests are derived from it.
DATA_DIR = '/kaggle/input/soil-classification-part-2/soil_competition-2025'
TRAIN_DIR, TEST_DIR = (os.path.join(DATA_DIR, split) for split in ('train', 'test'))
train_labels_csv, test_ids_csv = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('train_labels.csv', 'test_ids.csv')
)

# Load both manifests; each is later indexed by its 'image_id' column.
train_df, test_df = map(pd.read_csv, (train_labels_csv, test_ids_csv))

# Device and model
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained EfficientNet-B3 with its classifier replaced by an identity
# layer, so a forward pass returns whatever feeds the classifier instead of
# class logits.
model = models.efficientnet_b3(weights='IMAGENET1K_V1')
model.classifier = nn.Identity()
model.eval()      # inference mode for dropout / batch-norm layers
model.to(device)  # nn.Module.to moves parameters in place

# Transform
# Preprocessing applied to every image before it reaches the backbone.
# The normalisation statistics have to match the ones the ImageNet-pretrained
# weights were trained with; feeding differently-scaled inputs shifts the
# extracted features away from what the network expects.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

transform = transforms.Compose([
    transforms.Resize((300, 300)),  # fixed 300x300 input, aspect ratio not preserved
    transforms.ToTensor(),          # PIL image -> float tensor scaled to [0, 1]
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Feature extractor
def extract_features(img_path):
    """Return the backbone's output for a single image as a NumPy array.

    The image at ``img_path`` is opened, converted to RGB, run through the
    module-level ``transform`` and passed as a batch of one through the
    module-level ``model`` with gradient tracking disabled.

    Args:
        img_path: Path of the image file to embed.

    Returns:
        The model output moved to the CPU, with all singleton dimensions
        (including the batch dimension) squeezed out.
    """
    rgb_image = Image.open(img_path).convert("RGB")
    # Add a leading batch dimension: the model is called on a batch of one.
    batch = torch.unsqueeze(transform(rgb_image), 0).to(device)
    with torch.no_grad():
        embedding = model(batch)
    embedding = embedding.squeeze()
    return embedding.cpu().numpy()

# Step 1: Extract features for all soil training images
train_image_paths = [
    os.path.join(TRAIN_DIR, image_name) for image_name in train_df['image_id']
]

print("Extracting features from training images...")
# One feature vector per training image, stacked into a single array
# (one row per image).
train_features = np.array(
    [extract_features(image_path) for image_path in tqdm(train_image_paths)]
)

from sklearn.metrics.pairwise import cosine_similarity

# Step 2: Compute centroid of soil class
# The centroid is the element-wise mean of all training feature vectors.
soil_centroid = np.mean(train_features, axis=0)

# Step 3: Compute cosine similarity to centroid for thresholding
# cosine_similarity expects 2-D inputs, so the centroid is passed as one row.
similarities = cosine_similarity(train_features, soil_centroid.reshape(1, -1))
similarities = similarities.ravel()

# The cut-off is the 5th percentile of the training similarities, i.e. about
# 5% of the training images themselves fall below it. Tune as needed.
threshold = np.percentile(similarities, 5)

print(f"Similarity threshold = {threshold:.4f}")

# Step 4: Predict on test images
test_image_paths = [
    os.path.join(TEST_DIR, image_name) for image_name in test_df['image_id']
]

print("Classifying test images...")
pred_labels = []
for test_path in tqdm(test_image_paths):
    # cosine_similarity needs 2-D inputs; the result is a 1x1 matrix.
    embedding = extract_features(test_path).reshape(1, -1)
    score = cosine_similarity(embedding, soil_centroid.reshape(1, -1))[0, 0]
    # Label 1 when the image is at least as similar to the centroid as the
    # threshold, otherwise 0.
    pred_labels.append(int(score >= threshold))

# Save predictions
submission_path = "/kaggle/working/submission_pu_similarity.csv"

# Two-column table: the test image ids alongside their predicted labels,
# in the same order as test_df.
submission = test_df[['image_id']].assign(soil_type=pred_labels)
submission.to_csv(submission_path, index=False)  # no index column in the CSV

print(f"Submission saved to {submission_path}")
