In [9]:
import os
import cv2
import numpy as np
import pandas as pd
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import random
from tqdm import tqdm

# --- Configuration ---
# Dataset locations. The defaults are the original local paths; set the
# DOGS_VS_CATS_TRAIN / DOGS_VS_CATS_TEST environment variables to point the
# notebook at another copy of the data without editing this cell.
TRAIN_DATA_PATH = os.environ.get("DOGS_VS_CATS_TRAIN", r"D:\dogs-vs-cats\train\train")
TEST_DATA_PATH = os.environ.get("DOGS_VS_CATS_TEST", r"D:\dogs-vs-cats\test1\test1")

# --- Model Parameters ---
IMAGE_DIMENSIONS = (64, 64)  # target size handed to cv2.resize before HOG extraction
SAMPLE_SIZE = 4000           # upper bound on the number of training images sampled

# --- Reproducibility ---
# The training subset is drawn with the `random` module; seeding it here makes
# a fresh "Restart Kernel -> Run All" draw the same sample each time (given the
# same directory listing order).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [10]:
def load_and_preprocess_training_data(data_dir, image_dims, sample_size, seed=None):
    """
    Loads a random sample of training images, resizes them, converts to grayscale,
    and extracts HOG features.

    Args:
        data_dir: Directory holding the training images. Files are expected to
            be named ``<class>.<anything>.jpg`` where ``<class>`` is ``cat`` or
            ``dog`` (case-insensitive).
        image_dims: Target size passed to ``cv2.resize``.
        sample_size: Maximum number of images to sample; capped at the number
            of ``.jpg`` files found in ``data_dir``.
        seed: Optional seed for the sample draw. When given, the same files are
            selected on every call regardless of directory listing order. When
            ``None`` the global ``random`` state is used.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays. ``features`` holds one
        HOG vector per successfully processed image and ``labels`` holds 0 for
        cat and 1 for dog, in the same order. Files that cannot be read or that
        have no recognisable class prefix are skipped, so the arrays may be
        shorter than ``sample_size``.
    """
    class_labels = {'cat': 0, 'dog': 1}

    # Sort first: os.listdir order is arbitrary, so an unsorted population
    # would make even a seeded draw differ between machines or runs.
    all_filenames = sorted(
        f for f in os.listdir(data_dir) if f.lower().endswith('.jpg')
    )
    sample_size = min(len(all_filenames), sample_size)
    sampler = random if seed is None else random.Random(seed)
    sampled_filenames = sampler.sample(all_filenames, sample_size)

    print(f"Loading and processing {len(sampled_filenames)} training images...")

    features = []
    labels = []
    skipped = 0

    for filename in tqdm(sampled_filenames):
        # The class is the token before the first dot. Anything that is not
        # explicitly a cat or a dog is skipped instead of defaulting to "dog",
        # which would silently add mislabeled samples to the training set.
        label = class_labels.get(filename.split('.')[0].lower())
        if label is None:
            skipped += 1
            continue

        img_path = os.path.join(data_dir, filename)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread returns None for files it cannot decode.
            skipped += 1
            continue

        img_resized = cv2.resize(img, image_dims)
        hog_features = hog(img_resized, pixels_per_cell=(8, 8),
                           cells_per_block=(2, 2), visualize=False)

        features.append(hog_features)
        labels.append(label)

    if skipped:
        print(f"\nWarning: skipped {skipped} file(s) that were unreadable "
              "or had no 'cat'/'dog' prefix.")

    print("\nTraining data processing complete.")
    return np.array(features), np.array(labels)

In [11]:
# Only attempt to load data when the training directory is actually present
if os.path.isdir(TRAIN_DATA_PATH):
    # Sample the training images and turn each one into a HOG feature vector
    X, y = load_and_preprocess_training_data(
        TRAIN_DATA_PATH,
        IMAGE_DIMENSIONS,
        SAMPLE_SIZE,
    )
    print(f"\n✅ Feature matrix shape: {X.shape}")
    print(f"✅ Labels vector shape: {y.shape}")
else:
    print(f"❌ Error: The directory '{TRAIN_DATA_PATH}' does not exist.")
    print("Please check the path in Cell 1.")

Loading and processing 4000 training images...


100%|██████████████████████████████████████████████████████████████████████████████| 4000/4000 [01:03<00:00, 63.15it/s]



Training data processing complete.

✅ Feature matrix shape: (4000, 1764)
✅ Labels vector shape: (4000,)


In [12]:
# Refuse to train unless a non-empty feature matrix was produced earlier
if 'X' not in locals() or X.shape[0] <= 0:
    print("❌ ERROR: Data variable 'X' is empty or not defined.")
else:
    # Step 1: hold out 20% of the samples for validation, keeping the class balance
    X_train, X_val, y_train, y_val = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        stratify=y,
    )
    print(f"Data successfully split: {len(X_train)} training samples, {len(X_val)} validation samples.")

    # Step 2: fit a linear-kernel SVM on the training portion
    print("\n⏳ Training the SVM classifier... (This may take several minutes)")
    svm_model = SVC(kernel='linear', C=1.0, random_state=42, verbose=True).fit(X_train, y_train)
    print("\n✅ Training complete.")

    # Step 3: score the fitted model on the held-out portion
    print("\n📊 Evaluating model performance...")
    y_pred = svm_model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f"\nAccuracy on Validation Set: {accuracy * 100:.2f}%")
    print("\nClassification Report:")
    report = classification_report(y_val, y_pred, target_names=['Cat', 'Dog'])
    print(report)

Data successfully split: 3200 training samples, 800 validation samples.

⏳ Training the SVM classifier... (This may take several minutes)
[LibSVM]
✅ Training complete.

📊 Evaluating model performance...

Accuracy on Validation Set: 67.75%

Classification Report:
              precision    recall  f1-score   support

         Cat       0.69      0.65      0.67       406
         Dog       0.66      0.71      0.68       394

    accuracy                           0.68       800
   macro avg       0.68      0.68      0.68       800
weighted avg       0.68      0.68      0.68       800



In [14]:
def predict_on_test_data(model, data_dir, image_dims, batch_size=256):
    """
    Loads test images, preprocesses them, and uses the trained model to make predictions.

    Args:
        model: Fitted estimator exposing ``predict(X)`` for a 2-D feature array.
        data_dir: Directory holding test images named ``<number>.jpg``.
        image_dims: Target size passed to ``cv2.resize``; must match the size
            used when the model's training features were extracted.
        batch_size: Number of feature vectors handed to ``model.predict`` per
            call. Batching avoids one estimator call per image while keeping
            memory use bounded.

    Returns:
        A ``(image_ids, predictions)`` tuple of lists in ascending numeric id
        order. ``image_ids`` are the filename stems as strings. Images that
        cannot be decoded, and ``.jpg`` files whose stem is not an integer,
        are omitted from both lists.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    image_ids = []
    predictions = []

    # Pair every candidate file with its numeric id. A stray non-numeric
    # '.jpg' used to crash the int() sort key; now it is just ignored.
    numbered = []
    ignored = 0
    for f in os.listdir(data_dir):
        if not f.lower().endswith('.jpg'):
            continue
        try:
            numbered.append((int(f.split('.')[0]), f))
        except ValueError:
            ignored += 1
    numbered.sort()  # Sort by image number
    test_filenames = [f for _, f in numbered]

    print(f"\nStarting predictions on {len(test_filenames)} test images...")

    pending_ids = []
    pending_features = []

    def flush():
        """Predict the buffered feature vectors and record their results."""
        if pending_features:
            predictions.extend(model.predict(np.array(pending_features)))
            image_ids.extend(pending_ids)
            pending_ids.clear()
            pending_features.clear()

    unreadable = 0
    for filename in tqdm(test_filenames):
        img_path = os.path.join(data_dir, filename)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)

        if img is None:
            # cv2.imread returns None for files it cannot decode.
            unreadable += 1
            continue

        img_resized = cv2.resize(img, image_dims)
        hog_features = hog(img_resized, pixels_per_cell=(8, 8),
                           cells_per_block=(2, 2), visualize=False)

        pending_ids.append(filename.split('.')[0])
        pending_features.append(hog_features)
        if len(pending_features) >= batch_size:
            flush()

    flush()  # predict whatever is left in the final partial batch

    if ignored or unreadable:
        print(f"\nWarning: {ignored} file(s) with non-numeric names ignored, "
              f"{unreadable} unreadable image(s) omitted from the predictions.")

    print("\nTest prediction complete.")
    return image_ids, predictions

In [15]:
# Both a readable test directory and a trained model are required to continue
if not os.path.isdir(TEST_DATA_PATH):
    print(f"❌ Error: The test data directory '{TEST_DATA_PATH}' does not exist.")
elif 'svm_model' not in locals():
    print("❌ Error: 'svm_model' not found. Please successfully run the training cell first.")
else:
    # Classify every test image with the trained SVM
    ids, labels = predict_on_test_data(svm_model, TEST_DATA_PATH, IMAGE_DIMENSIONS)

    # Assemble the submission table: numeric ids, ascending order
    submission_df = (
        pd.DataFrame({'id': ids, 'label': labels})
        .assign(id=lambda frame: pd.to_numeric(frame['id']))
        .sort_values(by='id')
    )

    # Persist without the pandas index column
    submission_df.to_csv('submission.csv', index=False)

    print("\n✅ Submission file 'submission.csv' has been created successfully!")
    print("You can now upload this file to the Kaggle competition.")

    # Show the top of the table as a quick sanity check
    print("\nSubmission file preview:")
    print(submission_df.head())


Starting predictions on 12500 test images...


100%|████████████████████████████████████████████████████████████████████████████| 12500/12500 [03:28<00:00, 59.81it/s]


Test prediction complete.

✅ Submission file 'submission.csv' has been created successfully!
You can now upload this file to the Kaggle competition.

Submission file preview:
   id  label
0   1      0
1   2      0
2   3      0
3   4      0
4   5      1





In [17]:
import pandas as pd

# Relies on 'ids' and 'labels' having been produced by the prediction cell above.

# Stop with a helpful message when the prediction results are missing
if 'ids' not in locals() or 'labels' not in locals():
    print("❌ Error: Prediction data ('ids' and 'labels') not found.")
    print("Please make sure you have successfully run the prediction cell first.")
else:
    # Build the table, turn 'id' into numbers so it orders numerically
    # rather than lexically, then sort by it
    submission_df = (
        pd.DataFrame({'id': ids, 'label': labels})
        .assign(id=lambda frame: pd.to_numeric(frame['id']))
        .sort_values(by='id')
    )

    # Write 'submission.csv'; index=False keeps the pandas index out of the
    # file so it matches the Kaggle format
    submission_df.to_csv('submission.csv', index=False)

    print("✅ Submission file 'submission.csv' has been created successfully!")
    print("\nFile preview:")
    print(submission_df.head())

✅ Submission file 'submission.csv' has been created successfully!

File preview:
   id  label
0   1      0
1   2      0
2   3      0
3   4      0
4   5      1
