# Feature Extraction and Transformation Pipeline

This notebook demonstrates the process of feature extraction, clustering, histogram creation, and IDF computation using custom transformers and pipelines.

In [1]:
import os
import cv2
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from classes.feature_extractor import FeatureExtractor
from classes.clusterer import Clusterer
from utilities import utils, process, modify, evaluate
import joblib
import matplotlib.pyplot as plt

In [2]:
import numpy as np
import random

# Seed Python's and NumPy's global random number generators with the same
# value so that repeated runs of the notebook draw identical random sequences.
for seed_rng in (random.seed, np.random.seed):
    seed_rng(42)

## Data Preparation

Load images, create a DataFrame, encode labels, and split the data into training and testing sets.

In [2]:
# Load the images under 'data/preprocessed' into a DataFrame via the project helper
# (the preview below shows an 'image' and a 'Target' column indexed by filename).
df = utils.load_images_to_dataframe('data/preprocessed')

# Replace the raw 'Target' labels with integer codes. The fitted encoder is kept
# because it is reused later in the notebook.
label_encoder = LabelEncoder()
encoded_targets = label_encoder.fit_transform(df['Target'])
df['Target'] = encoded_targets

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
split_options = {'test_size': 0.2, 'random_state': 42}
train_df, test_df = train_test_split(df, **split_options)

# Preview the first five rows.
df.head()

Unnamed: 0_level_0,image,Target
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
user064_qashtah_029.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",63
user035_ghaleez_011.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",34
user051_ghazaal_009.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",50
user081_mehras_046.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",80
user037_sakhar_017.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",36


## Find the Optimal Number of K-Means Clusters



In [None]:
# Run the project's cluster-count search on the training split.
# NOTE(review): what this call reports or returns is defined in classes.clusterer — confirm there.
Clusterer.find_best_cluster_number(train_df)

## Transform the Data and Add Histograms to DataFrame

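Use the pipelines to fit and transform the training data, and add the resulting histograms back into the DataFrame. The cell below then plots one of these histograms as a sanity check; it expects `train_df` to already contain a `histogram_ORB_IDF` column.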

In [None]:
# Display the first histogram to verify
# Guard against a missing histogram column as well as an empty frame, so the cell
# reports the problem instead of failing with a KeyError.
histogram_column = 'histogram_ORB_IDF'
if train_df.empty or histogram_column not in train_df.columns:
    print(f"Nothing to plot: train_df is empty or has no '{histogram_column}' column.")
else:
    # Position 0 is the first row; it also exists when the frame has a single row.
    first_user = train_df.index[0]
    first_histogram = train_df[histogram_column].iloc[0]
    # Size the x axis from the histogram itself instead of assuming a fixed bin count,
    # so the plot works for any number of clusters.
    plt.bar(range(len(first_histogram)), first_histogram)
    plt.title(f'Visual Words Histogram for User: {first_user}')
    plt.xlabel('Visual Word Index')
    plt.ylabel('Frequency')
    plt.show()

In [3]:
def extract_images(df):
    """Return the 'image' column of ``df``.

    Args:
        df: A DataFrame expected to have an 'image' column.

    Returns:
        The ``df['image']`` column.

    Raises:
        KeyError: If ``df`` has no 'image' column.
    """
    if 'image' in df.columns:
        return df['image']
    raise KeyError("The DataFrame does not contain an 'image' column.")

In [4]:
# -----------------------------
# Create Pipelines
# -----------------------------
from sklearn.svm import SVC


def build_svm_pipeline(method, num_clusters=500):
    """Build an unfitted image -> features -> clusters -> SVM pipeline.

    Both pipelines in this notebook differ only in the feature extraction
    method, so they are built from this single factory.

    Args:
        method: Value passed to ``FeatureExtractor(method=...)``; this
            notebook uses 'ORB' and 'SIFT'.
        num_clusters: Value passed to ``Clusterer(num_clusters=...)``.

    Returns:
        A ``Pipeline`` with the steps 'extract_images', 'feature_extractor',
        'clusterer' and 'classifier', in that order.
    """
    return Pipeline([
        ('extract_images', FunctionTransformer(utils.extract_images, validate=False)),
        ('feature_extractor', FeatureExtractor(method=method)),
        ('clusterer', Clusterer(num_clusters=num_clusters)),
        # ('idf_transformer', IDFTransformer()),  # IDF step is currently disabled
        # `degree` only affects the 'poly' kernel; with SVC's default 'rbf' kernel it is ignored.
        ('classifier', SVC(class_weight='balanced', C=50, random_state=0, gamma=0.001, degree=2)),
    ])


# Pipeline for ORB Features with SVM Classifier
pipeline_ORB_SVM = build_svm_pipeline('ORB')

# Pipeline for SIFT Features with SVM Classifier
pipeline_SIFT_SVM = build_svm_pipeline('SIFT')

In [5]:
import time


def fit_and_time(pipeline, name, data):
    """Fit ``pipeline`` on ``data`` and return the elapsed time in seconds.

    Args:
        pipeline: Estimator exposing ``fit(X, y)``.
        name: Label used in the printed timing message (e.g. 'ORB').
        data: DataFrame passed as ``X``; its 'Target' column is passed as ``y``.

    Returns:
        Elapsed fit time in seconds as a float.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted mid-run.
    started = time.perf_counter()
    pipeline.fit(data, data['Target'])
    elapsed = time.perf_counter() - started
    print(f"Time taken to fit {name} pipeline: {elapsed:.2f} seconds")
    return elapsed


# Measure the time to fit the ORB pipeline
time_taken_orb = fit_and_time(pipeline_ORB_SVM, 'ORB', train_df)

# Measure the time to fit the SIFT pipeline
time_taken_sift = fit_and_time(pipeline_SIFT_SVM, 'SIFT', train_df)

Time taken to fit ORB pipeline: 1091.94 seconds
Time taken to fit SIFT pipeline: 450.77 seconds


In [8]:
# Persist both fitted pipelines. joblib.dump does not create missing directories,
# so make sure 'models/' exists before writing into it.
os.makedirs('models', exist_ok=True)
joblib.dump(pipeline_ORB_SVM, 'models/pipeline_ORB_SVM_500.joblib')
joblib.dump(pipeline_SIFT_SVM, 'models/pipeline_SIFT_SVM_500.joblib')

['models/pipeline_SIFT_SVM_500.joblib']

In [6]:
# Reload the persisted pipelines from disk (ORB first, then SIFT).
# joblib files are pickle-based: only load files produced by a trusted source.
loaded_pipeline_ORB_SVM, loaded_pipeline_SIFT_SVM = map(
    joblib.load,
    ('models/pipeline_ORB_SVM_500.joblib', 'models/pipeline_SIFT_SVM_500.joblib'),
)

In [9]:
# Score the two in-memory pipelines (not the reloaded copies) on the held-out test split
# using the project helper; the resulting frame has one row per model (see output below).
accuracy_df = evaluate.evaluate_models(pipeline_ORB_SVM, pipeline_SIFT_SVM, test_df)
accuracy_df

Unnamed: 0,Model,Accuracy
0,ORB,0.311234
1,SIFT,0.319214


In [10]:
# Attach the fit durations measured earlier as a 'Time' column (seconds).
# The list order relies on accuracy_df listing ORB first and SIFT second.
fit_durations = [time_taken_orb, time_taken_sift]
accuracy_df = accuracy_df.assign(Time=fit_durations)
accuracy_df

Unnamed: 0,Model,Accuracy,Time
0,ORB,0.311234,1091.93586
1,SIFT,0.319214,450.765967


In [11]:
# Export the accuracy/time table as LaTeX. to_latex does not create missing
# directories, so make sure 'tables/' exists before writing into it.
os.makedirs('tables', exist_ok=True)
accuracy_df.to_latex('tables/accuracy_table.tex', index=False)

In [8]:
# Evaluate the reloaded pipelines on the transformed image sets under data/output
# (the log and results below show noise, rotate and scaling subdirectories).
# NOTE(review): the name `dict` shadows Python's built-in dict type for the rest of the
# session; later cells read this variable, so a rename has to be applied to all of them together.
# NOTE(review): the SIFT pipeline is passed before the ORB one here — confirm this matches
# the parameter order of process.process_output_directory.
directory = 'data/output'
dict = process.process_output_directory(directory, loaded_pipeline_SIFT_SVM, loaded_pipeline_ORB_SVM, label_encoder)

Processing subdirectory: data/output/noise/noise_10


Loaded DataFrame for noise_10:
                                                                     image  \
filename                                                                     
user064_qashtah_029.png  [[255, 255, 255, 255, 255, 255, 255, 255, 255,...   
user035_ghaleez_011.png  [[255, 255, 255, 255, 255, 255, 255, 255, 255,...   
user051_ghazaal_009.png  [[255, 255, 255, 255, 255, 255, 255, 255, 255,...   
user081_mehras_046.png   [[255, 255, 255, 255, 255, 255, 255, 255, 255,...   
user037_sakhar_017.png   [[255, 255, 255, 255, 255, 255, 255, 255, 255,...   

                         Target  
filename                         
user064_qashtah_029.png      63  
user035_ghaleez_011.png      34  
user051_ghazaal_009.png      50  
user081_mehras_046.png       80  
user037_sakhar_017.png       36  
SIFT Predictions for noise_10: [34 34 62 ... 15 57 15]
SIFT Accuracy for noise_10: 0.03524066797642436
ORB Predictions for noise_10: [63 57 28 ... 18  4  9]
ORB Accuracy for noise_10: 

In [17]:
# Show the full result mapping: keys are subdirectory paths and values are
# per-transformation accuracy tables (see output below).
dict

{'data/output/noise':    transformation  accuracy_sift  accuracy_orb
 0  noise_noise_10       0.035241      0.070481
 1  noise_noise_20       0.040152      0.055869
 2  noise_noise_30       0.047520      0.063482,
 'data/output/rotate':   transformation  accuracy_sift  accuracy_orb
 0      rotate_90       0.098600      0.060413
 1      rotate_45       0.042731      0.028733
 2     rotate_135       0.036960      0.031434,
 'data/output/scaling':        transformation  accuracy_sift  accuracy_orb
 0   scaling_scale_0_5       0.169941      0.019524
 1   scaling_scale_1_5       0.006262      0.017927
 2  scaling_scale_0_75       0.276891      0.286100
 3  scaling_scale_1_25       0.006017      0.051940}

In [16]:
import IPython.display as display


# Show each accuracy table and export it as a LaTeX file named after the
# first transformation listed in that table.
for df_temp in dict.values():
    display.display(df_temp)
    # Look the name up outside the f-string: reusing the enclosing quote character
    # inside an f-string expression is a SyntaxError before Python 3.12.
    first_transformation = df_temp['transformation'].iloc[0]
    df_temp.to_latex(f'{first_transformation}.tex')

Unnamed: 0,transformation,accuracy_sift,accuracy_orb
0,noise_noise_10,0.035241,0.070481
1,noise_noise_20,0.040152,0.055869
2,noise_noise_30,0.04752,0.063482


Unnamed: 0,transformation,accuracy_sift,accuracy_orb
0,rotate_90,0.0986,0.060413
1,rotate_45,0.042731,0.028733
2,rotate_135,0.03696,0.031434


Unnamed: 0,transformation,accuracy_sift,accuracy_orb
0,scaling_scale_0_5,0.169941,0.019524
1,scaling_scale_1_5,0.006262,0.017927
2,scaling_scale_0_75,0.276891,0.2861
3,scaling_scale_1_25,0.006017,0.05194


In [20]:
# Look up the accuracy table for the noise transformations by its directory key.
dict['data/output/noise']

Unnamed: 0,transformation,accuracy_sift,accuracy_orb
0,noise_noise_10,0.035241,0.070481
1,noise_noise_20,0.040152,0.055869
2,noise_noise_30,0.04752,0.063482


In [None]:
# -----------------------------
# Define Parameter Grid for GridSearchCV
# -----------------------------
from sklearn.model_selection import GridSearchCV

# Candidate values for the 'clusterer' step's num_clusters: 200, 300, ..., 800.
param_grid = {'clusterer__num_clusters': list(range(200, 801, 100))}

# -----------------------------
# Initialize GridSearchCV for ORB Pipeline
# -----------------------------
# 5-fold cross-validation scored by accuracy, run on all available processors,
# with per-fit progress messages.
search_settings = {'cv': 5, 'scoring': 'accuracy', 'n_jobs': -1, 'verbose': 2}
grid_search_ORB = GridSearchCV(pipeline_ORB_SVM, param_grid, **search_settings)


In [None]:
# -----------------------------
# Fit GridSearchCV for ORB Pipeline
# -----------------------------
from sklearn.metrics import classification_report, accuracy_score


print("Starting GridSearchCV for ORB Pipeline...")
grid_search_ORB.fit(train_df, train_df['Target'])
print("GridSearchCV for ORB Pipeline completed.\n")

# -----------------------------
# Best Parameters and Score for ORB
# -----------------------------
print("Best Parameters for ORB Pipeline:", grid_search_ORB.best_params_)
print("Best Cross-Validation Accuracy for ORB Pipeline:", grid_search_ORB.best_score_)

# -----------------------------
# Predict and Evaluate ORB Pipeline
# -----------------------------
best_ORB = grid_search_ORB.best_estimator_
# The pipeline was fitted on the full training DataFrame and its first step reads the
# 'image' column, so prediction must receive the test DataFrame itself rather than a
# precomputed histogram column.
predictions_ORB = best_ORB.predict(test_df)

print("\nORB SVM Classification Report:")
print(classification_report(test_df['Target'], predictions_ORB))
print("ORB SVM Accuracy:", accuracy_score(test_df['Target'], predictions_ORB))

In [None]:
# Inspect the pipeline that the grid search refitted with its best parameter setting.
grid_search_ORB.best_estimator_