# Feature Extraction and Transformation Pipeline

This notebook demonstrates the process of feature extraction, clustering, histogram creation, and IDF computation using custom transformers and pipelines.

In [1]:
import os
import cv2
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from classes.feature_extractor import FeatureExtractor
from classes.clusterer import Clusterer
from classes.idf_transformer import IDFTransformer
from utilities import utils, modify
import joblib
import matplotlib.pyplot as plt

## Data Preparation

Load images, create a DataFrame, encode labels, and split the data into training and testing sets.

In [2]:
# Load the preprocessed images into a DataFrame with 'image' and 'Target' columns.
df = utils.load_images_to_dataframe('data/preprocessed')

# Learn the label -> integer mapping first, then apply it. The fitted encoder is
# kept so the integer codes can be mapped back to the original class labels.
label_encoder = LabelEncoder().fit(df['Target'])
df['Target'] = label_encoder.transform(df['Target'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# Preview the first five rows.
df.head()

Unnamed: 0_level_0,image,Target
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
user001_abjadiyah_031.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",0
user001_abjadiyah_032.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",0
user001_abjadiyah_033.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",0
user001_abjadiyah_034.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",0
user001_abjadiyah_035.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",0


## Find the Optimal Number of K-Means Clusters



In [None]:
# Search for a suitable number of k-means clusters using the training split only.
# NOTE(review): this is invoked on the Clusterer class itself rather than on an
# instance, and what it returns or plots is not visible from this notebook --
# confirm against classes/clusterer.py.
Clusterer.find_best_cluster_number(train_df)

## Transform the Data and Add Histograms to DataFrame

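Use the pipelines to fit and transform the training data, and add the resulting histograms back into the DataFrame. The cell below plots one of those histograms as a sanity check; it reads the `histogram_ORB_IDF` column of `train_df`, so that column must already exist when the cell is run.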

In [None]:
# Display the first histogram to verify.
# The column is looked up defensively: nothing in this notebook is guaranteed to
# have created it before this cell runs.
HISTOGRAM_COLUMN = 'histogram_ORB_IDF'

if train_df.empty:
    print("train_df is empty; nothing to plot.")
elif HISTOGRAM_COLUMN not in train_df.columns:
    print(f"Column '{HISTOGRAM_COLUMN}' not found in train_df; compute the histograms before plotting.")
else:
    # Position 0 is the first row (the original read position 1, i.e. the second row,
    # which also fails when the frame holds a single row).
    first_user = train_df.index[0]
    first_histogram = train_df.iloc[0][HISTOGRAM_COLUMN]

    fig, ax = plt.subplots()
    # Size the x axis from the histogram itself so any vocabulary size plots correctly.
    ax.bar(range(len(first_histogram)), first_histogram)
    ax.set_title(f'Visual Words Histogram for User: {first_user}')
    ax.set_xlabel('Visual Word Index')
    ax.set_ylabel('Frequency')
    plt.show()

In [3]:
def extract_images(df):
    """Return the 'image' column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame expected to carry an 'image' column.

    Returns
    -------
    pandas.Series
        The 'image' column, unchanged.

    Raises
    ------
    KeyError
        If ``df`` has no 'image' column.
    """
    has_image_column = 'image' in df.columns
    if has_image_column:
        return df['image']
    raise KeyError("The DataFrame does not contain an 'image' column.")

In [4]:
# -----------------------------
# Create Pipelines
# -----------------------------
from sklearn.svm import SVC


def build_bovw_svm_pipeline(method, num_clusters=500, use_idf=False):
    """Build an image -> descriptors -> clustering -> SVM classification pipeline.

    Both pipelines in this notebook share every step except the descriptor
    method, so they are produced by this single factory instead of two
    copy-pasted definitions.

    Parameters
    ----------
    method : str
        Descriptor type handed to ``FeatureExtractor`` ('ORB' or 'SIFT' here).
    num_clusters : int, default=500
        Value passed to ``Clusterer(num_clusters=...)``.
    use_idf : bool, default=False
        When True, insert an ``IDFTransformer()`` step between the clusterer
        and the classifier. Off by default, matching the original pipelines
        where that step was commented out.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps named 'extract_images',
        'feature_extractor', 'clusterer', optionally 'idf_transformer',
        and 'classifier'.
    """
    steps = [
        ('extract_images', FunctionTransformer(utils.extract_images, validate=False)),
        ('feature_extractor', FeatureExtractor(method=method)),
        ('clusterer', Clusterer(num_clusters=num_clusters)),
    ]
    if use_idf:
        steps.append(('idf_transformer', IDFTransformer()))
    # NOTE: `degree` only affects the 'poly' kernel; with SVC's default 'rbf'
    # kernel it is ignored. It is kept so the estimator parameters are unchanged.
    steps.append(
        ('classifier', SVC(class_weight='balanced', C=50, random_state=0, gamma=0.001, degree=2))
    )
    return Pipeline(steps)


# Pipeline for ORB Features with SVM Classifier
pipeline_ORB_SVM = build_bovw_svm_pipeline('ORB')

# Pipeline for SIFT Features with SVM Classifier
pipeline_SIFT_SVM = build_bovw_svm_pipeline('SIFT')

In [5]:
# Fit the pipelines on the training data.
# Both are fitted because later cells persist and evaluate `pipeline_SIFT_SVM`
# as well; leaving it unfitted makes those cells operate on the wrong model.
# These fits are expensive (descriptor extraction + clustering + SVM training).
pipeline_ORB_SVM.fit(train_df, train_df['Target'])
pipeline_SIFT_SVM.fit(train_df, train_df['Target'])


KeyboardInterrupt: 

In [None]:
# Persist each pipeline under its own file name.
# (The original wrote the ORB pipeline to both files, so the "SIFT" artifact
# silently contained the ORB model.)
os.makedirs('models', exist_ok=True)  # joblib.dump does not create missing directories

pipelines_to_save = {
    'models/pipeline_ORB_SVM_500.joblib': pipeline_ORB_SVM,
    'models/pipeline_SIFT_SVM_500.joblib': pipeline_SIFT_SVM,
}
for model_path, pipeline_to_save in pipelines_to_save.items():
    # SVC only exposes `classes_` after a completed fit(); skipping unfitted
    # pipelines avoids overwriting a good artifact with an untrained one.
    if hasattr(pipeline_to_save.named_steps['classifier'], 'classes_'):
        joblib.dump(pipeline_to_save, model_path)
        print(f"Saved {model_path}")
    else:
        print(f"Skipping {model_path}: the pipeline has not been fitted.")

In [12]:
# Reload the persisted pipelines from disk, replacing the in-memory objects.
# NOTE: joblib.load unpickles arbitrary Python objects -- only load model files
# that come from a trusted source.
pipeline_ORB_SVM = joblib.load('models/pipeline_ORB_SVM_500.joblib')
pipeline_SIFT_SVM = joblib.load('models/pipeline_SIFT_SVM_500.joblib')

In [13]:
# Show the ORB pipeline as the cell output to inspect its steps and parameters.
pipeline_ORB_SVM

In [None]:
# Run the ORB pipeline over the whole test split; the predicted labels are stored in `x`.
# NOTE(review): `x` is a very generic name for test-set predictions -- consider a
# descriptive one if this result is used further.
x = pipeline_ORB_SVM.predict(test_df)

In [None]:
# Ensure all target labels are of type np.int64.
# `train_df` / `test_df` come from train_test_split, so writing a column into them
# in place can trigger pandas' SettingWithCopyWarning. `assign` builds a new frame
# and rebinds the name, which is warning-free and safe to re-run.
# (`np` is already imported in the notebook's import cell.)
train_df = train_df.assign(Target=train_df['Target'].astype(np.int64))
test_df = test_df.assign(Target=test_df['Target'].astype(np.int64))


In [None]:
# Print the dtype of the training targets to check the label column's type.
print(train_df['Target'].dtype)

In [15]:
# Evaluate the pipeline bound to `pipeline_SIFT_SVM` on the held-out test split.
from sklearn.metrics import classification_report, accuracy_score


# The whole test frame is passed in; the pipeline pulls the images out itself.
print("Making predictions on the test set...")
predictions = pipeline_SIFT_SVM.predict(test_df)
print("Predictions completed.\n")

# Accuracy = fraction of test rows whose predicted label equals the encoded target.
accuracy = accuracy_score(y_true=test_df['Target'], y_pred=predictions)
print(f"Accuracy of the pipeline: {accuracy:.2f}")

# Per-class report, currently disabled:
# report = classification_report(test_df['Target'], predictions, target_names=label_encoder.classes_)
# print("\nClassification Report:\n", report)

Making predictions on the test set...


Predictions completed.

Accuracy of the pipeline: 0.31


In [None]:
# Print every test target whose exact type is not np.int64 (prints nothing when all match).
for target_value in test_df['Target'].values:
    if type(target_value) is np.int64:
        continue
    print(target_value)

In [None]:
# -----------------------------
# Define Parameter Grid for GridSearchCV
# -----------------------------
from sklearn.model_selection import GridSearchCV


# Candidate cluster counts for the 'clusterer' step: 200, 300, ..., 800.
param_grid = {'clusterer__num_clusters': list(range(200, 801, 100))}

# -----------------------------
# Initialize GridSearchCV for ORB Pipeline
# -----------------------------
# 5-fold cross-validation scored by accuracy, using all available cores.
grid_search_ORB = GridSearchCV(
    pipeline_ORB_SVM,
    param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)


In [None]:
# -----------------------------
# Fit GridSearchCV for ORB Pipeline
# -----------------------------
from sklearn.metrics import classification_report, accuracy_score


print("Starting GridSearchCV for ORB Pipeline...")
grid_search_ORB.fit(train_df, train_df['Target'])
print("GridSearchCV for ORB Pipeline completed.\n")

# -----------------------------
# Best Parameters and Score for ORB
# -----------------------------
print("Best Parameters for ORB Pipeline:", grid_search_ORB.best_params_)
print("Best Cross-Validation Accuracy for ORB Pipeline:", grid_search_ORB.best_score_)

# -----------------------------
# Predict and Evaluate ORB Pipeline
# -----------------------------
best_ORB = grid_search_ORB.best_estimator_
# The pipeline starts from the raw frame (its first step extracts the 'image'
# column), so it must be given `test_df` itself -- exactly what it was fitted on --
# not a precomputed histogram column.
predictions_ORB = best_ORB.predict(test_df)

print("\nORB SVM Classification Report:")
print(classification_report(test_df['Target'], predictions_ORB))
print("ORB SVM Accuracy:", accuracy_score(test_df['Target'], predictions_ORB))

In [None]:
# Show the best pipeline found by the grid search as the cell output.
# `best_estimator_` is available because GridSearchCV was created with its default refit=True.
grid_search_ORB.best_estimator_