# Feature Extraction and Transformation Pipeline

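This notebook demonstrates feature extraction (ORB and SIFT), clustering, and histogram creation using custom transformers and scikit-learn pipelines. An IDF weighting transformer is imported as well, but the IDF step is currently disabled in the pipelines below.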

In [4]:
import os
import cv2
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from classes.feature_extractor import FeatureExtractor
from classes.clusterer import Clusterer
from classes.idf_transformer import IDFTransformer
import joblib
import matplotlib.pyplot as plt

## Data Preparation

Load images, create a DataFrame, encode labels, and split the data into training and testing sets.

In [5]:
def load_images_from_directory(directory):
    """Recursively load every JPG/PNG file under ``directory`` as grayscale.

    Parameters
    ----------
    directory : str or path-like
        Root folder to walk (sub-folders are included).

    Returns
    -------
    list of dict
        One ``{'filename': <base name>, 'image': <cv2.imread result>}`` record
        per file that OpenCV could read. Files for which ``cv2.imread``
        returns ``None`` are skipped.
    """
    image_extensions = ('.jpg', '.png')
    data = []
    for root, dirs, files in os.walk(directory):
        # os.walk yields entries in arbitrary filesystem order; sorting both
        # the sub-folders (in place, which steers the walk) and the files keeps
        # the record order stable, so downstream seeded splits are reproducible.
        dirs.sort()
        for filename in sorted(files):
            # Compare case-insensitively so 'scan.JPG' / 'scan.PNG' are not skipped.
            if not filename.lower().endswith(image_extensions):
                continue
            img = cv2.imread(os.path.join(root, filename), cv2.IMREAD_GRAYSCALE)
            if img is not None:
                data.append({'filename': filename, 'image': img})
    return data


In [6]:
# Specify the directory containing images
image_directory = 'data/preprocessed'

# Load images
images_data = load_images_from_directory(image_directory)
if not images_data:
    # Fail early with a clear message: an empty record list produces a
    # DataFrame without a 'filename' column, and set_index would then raise
    # an opaque KeyError.
    raise ValueError(f"No readable .jpg/.png images found under '{image_directory}'.")

# Convert to a DataFrame indexed by filename (no in-place mutation, so
# re-running the cell always starts from the freshly loaded records)
df = pd.DataFrame(images_data).set_index('filename')

# The label is the filename prefix before the first underscore
# (e.g. 'user043_ghazaal_007.png' -> 'user043')
df['Target'] = df.index.str.split('_').str[0]

# Encode the string labels as integers
label_encoder = LabelEncoder()
df['Target'] = label_encoder.fit_transform(df['Target'])

# Split the data into training and testing sets. Each part is copied so that
# later cells can add columns to train_df / test_df without writing into (or
# being flagged as a slice of) the parent frame.
train_df, test_df = (
    part.copy() for part in train_test_split(df, test_size=0.2, random_state=42)
)

## Define the Pipelines

Create separate ORB and SIFT pipelines. Each one chains the same steps: select the image column, extract features, and cluster them into histograms.

In [7]:
# Pipeline entry step: select the image column of the incoming frame
def extract_images(df):
    """Return the ``'image'`` column of ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no ``'image'`` column.
    """
    if 'image' in df.columns:
        return df['image']
    raise KeyError("The DataFrame does not contain an 'image' column.")

def build_histogram_pipeline(method, num_clusters=100, use_idf=False):
    """Build the image -> features -> clusterer pipeline for one descriptor type.

    Parameters
    ----------
    method : str
        Forwarded to ``FeatureExtractor(method=...)``; this notebook uses
        ``'ORB'`` and ``'SIFT'``.
    num_clusters : int, default 100
        Forwarded to ``Clusterer(num_clusters=...)``.
    use_idf : bool, default False
        When True, append an ``IDFTransformer()`` step after the clusterer.
        Off by default, matching the previously commented-out step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline with steps ``extract_images``,
        ``feature_extractor``, ``clusterer`` and, optionally,
        ``idf_transformer``.
    """
    steps = [
        ('extract_images', FunctionTransformer(extract_images, validate=False)),
        ('feature_extractor', FeatureExtractor(method=method)),
        ('clusterer', Clusterer(num_clusters=num_clusters)),
    ]
    if use_idf:
        steps.append(('idf_transformer', IDFTransformer()))
    return Pipeline(steps)


# Create the pipeline for ORB
pipeline_ORB = build_histogram_pipeline('ORB')

# Create the pipeline for SIFT
pipeline_SIFT = build_histogram_pipeline('SIFT')

## Transform the Data and Add Histograms to DataFrame

Fit the pipelines on the training data, transform both the training and the testing data, and add the resulting histograms back into the corresponding DataFrames.

In [5]:
# Fit each pipeline on the training split and keep its output for that split
orb_histograms_train = pipeline_ORB.fit_transform(train_df)
sift_histograms_train = pipeline_SIFT.fit_transform(train_df)

# Apply the already-fitted pipelines to the testing split
orb_histograms_test = pipeline_ORB.transform(test_df)
sift_histograms_test = pipeline_SIFT.transform(test_df)

# Store one histogram per row in each split's DataFrame.
# NOTE(review): the columns carry an `_IDF` suffix; confirm that it matches
# the pipeline steps that are actually enabled.
for split_df, orb_rows, sift_rows in (
    (train_df, orb_histograms_train, sift_histograms_train),
    (test_df, orb_histograms_test, sift_histograms_test),
):
    split_df['histogram_ORB_IDF'] = list(orb_rows)
    split_df['histogram_SIFT_IDF'] = list(sift_rows)



In [None]:
# Make sure the output folder exists before writing into it.
os.makedirs('models', exist_ok=True)

# Save the fitted pipelines under the pipeline file names. Previously the
# training histograms were written to these paths instead.
joblib.dump(pipeline_ORB, 'models/pipeline_ORB_100.joblib')
joblib.dump(pipeline_SIFT, 'models/pipeline_SIFT_100.joblib')

# Save the training histograms under the file names that the reload cell
# reads ('models/orb_histograms.joblib', 'models/sift_histograms.joblib').
joblib.dump(orb_histograms_train, 'models/orb_histograms.joblib')
joblib.dump(sift_histograms_train, 'models/sift_histograms.joblib')

In [9]:
# Reload the saved ORB and SIFT histograms from disk
orb_histograms, sift_histograms = (
    joblib.load(path)
    for path in ('models/orb_histograms.joblib', 'models/sift_histograms.joblib')
)

In [None]:
# Display the first histogram to verify
if not train_df.empty:
    # Position 0 is the first row; reading position 1 showed the second row
    # and raised IndexError when the frame held a single row.
    first_user = train_df.index[0]
    first_histogram = train_df.iloc[0]['histogram_ORB_IDF']
    # Size the x-axis from the histogram itself so the plot still works when
    # the number of clusters is not 100.
    plt.bar(range(len(first_histogram)), first_histogram)
    plt.title(f'Visual Words Histogram for User: {first_user}')
    plt.xlabel('Visual Word Index')
    plt.ylabel('Frequency')
    plt.show()

In [8]:
# -----------------------------
# Create Pipelines
# -----------------------------
from sklearn.svm import SVC


def build_svm_pipeline(method, num_clusters=100, use_idf=False):
    """Build the image -> features -> clusterer -> SVM pipeline for one descriptor type.

    Parameters
    ----------
    method : str
        Forwarded to ``FeatureExtractor(method=...)``; this notebook uses
        ``'ORB'`` and ``'SIFT'``.
    num_clusters : int, default 100
        Forwarded to ``Clusterer(num_clusters=...)``. Reachable in a grid
        search as ``clusterer__num_clusters``.
    use_idf : bool, default False
        When True, insert an ``IDFTransformer()`` step between the clusterer
        and the classifier. Off by default, matching the previously
        commented-out step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline ending in ``SVC(kernel='rbf', C=50, random_state=0)``.
    """
    steps = [
        ('extract_images', FunctionTransformer(extract_images, validate=False)),
        ('feature_extractor', FeatureExtractor(method=method)),
        ('clusterer', Clusterer(num_clusters=num_clusters)),
    ]
    if use_idf:
        steps.append(('idf_transformer', IDFTransformer()))
    steps.append(('classifier', SVC(kernel='rbf', C=50, random_state=0)))
    return Pipeline(steps)


# Pipeline for ORB Features with SVM Classifier
pipeline_ORB_SVM = build_svm_pipeline('ORB')

# Pipeline for SIFT Features with SVM Classifier
pipeline_SIFT_SVM = build_svm_pipeline('SIFT')

In [9]:
# Inspect the encoded labels of the testing split
test_df.loc[:, 'Target']

filename
user043_ghazaal_007.png      42
user047_azan_002.png         46
user053_abjadiyah_039.png    52
user010_ghaleez_012.png       9
user081_qashtah_027.png      80
                             ..
user025_azan_006.png         24
user012_abjadiyah_039.png    11
user071_ghazaal_003.png      70
user073_azan_009.png         72
user040_qashtah_027.png      39
Name: Target, Length: 1629, dtype: int64

In [10]:
# Train the ORB + SVM pipeline: the frame supplies the images, 'Target' the labels.
# The SIFT + SVM pipeline is not fitted in this cell.
pipeline_ORB_SVM.fit(train_df, y=train_df['Target'])


In [11]:
# Save the ORB + SVM pipeline to disk
joblib.dump(pipeline_ORB_SVM, filename='models/pipeline_ORB_SVM_100.joblib')

['models/pipeline_ORB_SVM_100.joblib']

In [12]:
# Restore the ORB + SVM pipeline from the saved file
pipeline_ORB_SVM = joblib.load(filename='models/pipeline_ORB_SVM_100.joblib')

In [13]:
# Run the ORB + SVM pipeline on the testing split
x = pipeline_ORB_SVM.predict(X=test_df)

In [51]:
# Ensure all target labels are of type np.int64 in both splits
# (np is already imported at the top of the notebook)
for split_df in (train_df, test_df):
    split_df['Target'] = split_df['Target'].astype(np.int64)


In [None]:
# Show the dtype of the training labels
target_dtype = train_df['Target'].dtype
print(target_dtype)

In [14]:
# Evaluate the ORB + SVM pipeline on the testing split
from sklearn.metrics import classification_report, accuracy_score

print("Making predictions on the test set...")
predictions = pipeline_ORB_SVM.predict(test_df)
print("Predictions completed.\n")

# Compare the predicted labels against the true 'Target' labels
accuracy = accuracy_score(y_true=test_df['Target'], y_pred=predictions)
print(f"Accuracy of the pipeline: {accuracy:.2f}")

# Optional per-class report (left disabled):
# report = classification_report(test_df['Target'], predictions, target_names=label_encoder.classes_)
# print("\nClassification Report:\n", report)

Making predictions on the test set...
Predictions completed.

Accuracy of the pipeline: 0.31


In [47]:
# Print every test label whose type is not exactly np.int64
for x in test_df['Target'].values:
    if type(x) is np.int64:
        continue
    print(x)

In [15]:
# -----------------------------
# Grid search setup for the ORB pipeline
# -----------------------------
from sklearn.model_selection import GridSearchCV

# Candidate values for the clusterer step's num_clusters parameter: 200, 300, ..., 800
param_grid = {'clusterer__num_clusters': list(range(200, 900, 100))}

# 5-fold cross-validated search scored on accuracy, using all available cores
grid_search_ORB = GridSearchCV(
    pipeline_ORB_SVM,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)


In [None]:
# -----------------------------
# Fit GridSearchCV for ORB Pipeline
# -----------------------------
from sklearn.metrics import classification_report, accuracy_score


print("Starting GridSearchCV for ORB Pipeline...")
grid_search_ORB.fit(train_df, train_df['Target'])
print("GridSearchCV for ORB Pipeline completed.\n")

# -----------------------------
# Best Parameters and Score for ORB
# -----------------------------
print("Best Parameters for ORB Pipeline:", grid_search_ORB.best_params_)
print("Best Cross-Validation Accuracy for ORB Pipeline:", grid_search_ORB.best_score_)

# -----------------------------
# Predict and Evaluate ORB Pipeline
# -----------------------------
best_ORB = grid_search_ORB.best_estimator_
# best_ORB is the complete pipeline, and its first step (extract_images) reads
# the 'image' column of a DataFrame. It must therefore receive the DataFrame
# itself; passing the 'histogram_ORB_IDF' Series fails in that first step
# because a Series has no `.columns`.
predictions_ORB = best_ORB.predict(test_df)

print("\nORB SVM Classification Report:")
print(classification_report(test_df['Target'], predictions_ORB))
print("ORB SVM Accuracy:", accuracy_score(test_df['Target'], predictions_ORB))

Starting GridSearchCV for ORB Pipeline...
Fitting 5 folds for each of 7 candidates, totalling 35 fits
[CV] END ........................clusterer__num_clusters=200; total time= 5.7min
[CV] END ........................clusterer__num_clusters=200; total time= 5.8min
[CV] END ........................clusterer__num_clusters=200; total time= 3.1min
[CV] END ........................clusterer__num_clusters=200; total time= 4.3min
[CV] END ........................clusterer__num_clusters=200; total time= 3.6min
[CV] END ........................clusterer__num_clusters=300; total time= 5.7min
[CV] END ........................clusterer__num_clusters=300; total time= 5.8min
[CV] END ........................clusterer__num_clusters=300; total time= 3.1min
[CV] END ........................clusterer__num_clusters=300; total time= 4.3min
[CV] END ........................clusterer__num_clusters=300; total time= 3.7min
