In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
import itertools
from tabulate import tabulate
import xgboost as xgb
import os
from tqdm import tqdm
from multiprocessing import Pool
from sklearn.preprocessing import LabelEncoder
from openpyxl import Workbook


In [None]:
# Number of rows handed to pandas per chunk when streaming the large CSV files.
# Lower this value if the machine runs short of memory.
batch_size = 30_000

In [None]:
# Directory containing the CSV files with features extracted from images.
# The location can be overridden through the COFFEEBEANS_CSV_DIR environment
# variable so the notebook also runs on other machines; the original local
# path is kept as the default.
csv_directory = os.environ.get('COFFEEBEANS_CSV_DIR',
                               '/Users/tony/Desktop/coffeebeans/DataSet')

In [None]:
# CSV files holding the different feature sets to compare
csv_files = ['original_features.csv', 
             'original_features_augmented.csv', 
             'denoised_features.csv', 
             'denoised_features_augmented.csv']

# Maps each CSV file name to the DataFrame loaded from it
dfs = {}
for file in csv_files:
    file_path = os.path.join(csv_directory, file) # Construct full path for each file
    try:
        df_list = [] # Chunks of the current file, concatenated once reading finishes
        # Open the file ourselves (binary mode) so the progress bar can follow the
        # on-disk read position. The bar's total is the file size in bytes, so it
        # must be advanced in bytes read from disk, not in bytes of memory used by
        # the parsed chunks.
        with open(file_path, 'rb') as csv_handle:
            with tqdm(total=os.path.getsize(file_path), # File size drives the progress bar
                      desc=f'Reading {file}', # Description for the progress bar
                      unit='B', # Unit of measurement for progress bar
                      unit_scale=True, # Scale unit measurement
                      bar_format='{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}{postfix}]',
                      colour='GREEN') as pbar: # Set color for the progress bar
                for chunk in pd.read_csv(csv_handle, chunksize=batch_size):
                    df_list.append(chunk) # Keep each chunk until the file is fully read
                    # Advance the bar to the current position in the file
                    pbar.update(csv_handle.tell() - pbar.n)
        dfs[file] = pd.concat(df_list, ignore_index=True) # Concatenate all chunks into a single dataframe
    except FileNotFoundError:
        print(f"Error: CSV file not found: {file_path}") # Report which file is missing
        # Re-raise instead of calling exit(): the cell stops with a clear
        # traceback and later cells never run against a partially filled `dfs`.
        raise

In [None]:
# Classifiers, with fixed hyperparameters, that every feature subset is
# trained and evaluated with
models = [
    RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
    ),
    KNeighborsClassifier(
        n_neighbors=10,
        weights='distance',
    ),
    DecisionTreeClassifier(
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=1,
        random_state=42,
    ),
    GradientBoostingClassifier(
        n_estimators=300,
        learning_rate=0.2,
        max_depth=5,
        random_state=42,
    ),
    ExtraTreesClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
    ),
    xgb.XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=10,
        eval_metric='logloss',
        random_state=42,
    ),
]

In [None]:
# Score a fitted model on held-out data with several metrics
def evaluate_model(model, X_test, y_test):
    """
    Predict on the test data and score the predictions.

    Parameters:
    - model: Trained model exposing a ``predict`` method
    - X_test: Features of the test dataset
    - y_test: True labels of the test dataset

    Returns:
    - Tuple (accuracy, precision, recall, F1 score, MSE); precision, recall and
      F1 use weighted averaging. Returns None (after printing the error) when
      prediction or a metric raises ValueError.
    """
    try:
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        # The three class-wise metrics share the same call signature
        precision, recall, f1 = (
            metric(y_test, predictions, average='weighted')
            for metric in (precision_score, recall_score, f1_score)
        )
        mse = mean_squared_error(y_test, predictions)
    except ValueError as e:
        print(f"Error: Invalid input for evaluation metrics: {str(e)}")
        return None
    return accuracy, precision, recall, f1, mse

In [None]:
# Sort the dataset's columns into texture, color and shape feature groups
def extract_features(X):
    """
    Split the column names of X into texture, color and shape features.

    A column belongs to a group when its name contains one of that group's
    marker substrings.

    parameters:
        X: DataFrame containing the features

    returns:
        three lists containing the names of texture, color and shape features respectively
    """
    marker_groups = (
        # texture
        ('LBP_Pattern_', 'GLCM_Feature_', 'HOG_', 'Gabor_Feature_'),
        # color
        ('Color_Moment_', 'Color_Histogram_', 'Color_Coherence_Vector_', 'Color_Name_Histogram_'),
        # shape
        ('Hu_Moment_', 'Contour_', 'Fourier_Descriptor_', 'Zernike_Moment_'),
    )

    def columns_with(markers):
        # Keep the original column order within each group
        return [name for name in X.columns
                if any(marker in name for marker in markers)]

    texture_features, color_features, shape_features = map(columns_with, marker_groups)
    return texture_features, color_features, shape_features

In [None]:
# Train and evaluate models on a single train/test split (no k-fold cross-validation)
def train_and_evaluate_models(X_train, y_train, X_test, y_test, models):
    """
    Trains and evaluates a list of models on training and test datasets.
    
    Parameters:
    - X_train: Training features
    - y_train: Training labels
    - X_test: Test features
    - y_test: Test labels
    - models: List of models to train and evaluate (each is fitted in place)
    
    Returns:
    - List with one row per successfully evaluated model:
      [model class name, accuracy, precision, recall, F1 score, MSE].
      A model whose evaluation fails is reported and left out of the list.
    """
    results = []
    for model in models:
        model.fit(X_train, y_train) # Train model
        metrics = evaluate_model(model, X_test, y_test) # Evaluate model
        if metrics is None:
            # evaluate_model returns None (after printing the cause) when the
            # metrics cannot be computed; unpacking that would raise an opaque
            # TypeError, so skip the model explicitly instead.
            print(f"Warning: Skipping {type(model).__name__}: evaluation failed")
            continue
        results.append([type(model).__name__, *metrics])
    return results

In [None]:
# Train and evaluate models using k-fold cross-validation
def train_and_evaluate_models_kfold(X, y, models, k=5):
    """
    Trains and evaluates models using k-fold cross-validation.
    
    Parameters:
    - X: Features of the dataset (DataFrame; rows are selected with ``.iloc``)
    - y: Labels of the dataset (any array-like, aligned with the rows of X)
    - models: List of models to train and evaluate (each is refitted per fold)
    - k: Number of folds for cross-validation (default: 5)
    
    Returns:
    - List with one row per model:
      [model class name, mean accuracy, mean precision, mean recall,
       mean F1 score, mean MSE], averaged over the folds that could be
      evaluated. A model for which no fold could be evaluated is reported and
      left out of the list.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    # The fold indices are positional. Converting to an array keeps
    # ``y[train_index]`` positional even when y is a pandas Series whose index
    # is not 0..n-1 (which would otherwise be looked up by label).
    y = np.asarray(y)
    results = []
    for model in models:
        fold_metrics = [] # One (accuracy, precision, recall, f1, mse) tuple per fold
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train) # Train model on each fold
            metrics = evaluate_model(model, X_test, y_test) # Evaluate model
            if metrics is None:
                # evaluate_model already printed the cause; leave the fold out
                # of the average instead of crashing on the unpacking.
                print(f"Warning: Skipping one fold for {type(model).__name__}: evaluation failed")
                continue
            fold_metrics.append(metrics)

        if not fold_metrics:
            print(f"Warning: Skipping {type(model).__name__}: no fold could be evaluated")
            continue

        # Store average results across all evaluated folds, one mean per metric
        results.append([type(model).__name__]
                       + [np.mean(column) for column in zip(*fold_metrics)])
    return results

In [None]:
# Columns that identify a sample (or hold its class) rather than describe it
_NON_FEATURE_COLUMNS = ('path', 'filename', 'label')
# Column headers of every printed result table
_RESULT_HEADERS = ["Model", "Accuracy", "Precision", "Recall", "F1 Score", "MSE"]


def _impute_feature_columns(df):
    """Return df with missing values in its numeric feature columns KNN-imputed."""
    numeric_columns = [col for col in df.select_dtypes(include=[np.number]).columns
                       if col not in _NON_FEATURE_COLUMNS]
    if not numeric_columns or not df[numeric_columns].isnull().values.any():
        return df

    # KNNImputer drops columns without a single observed value from its output,
    # which would break the column alignment below, so remove them up front.
    empty_columns = [col for col in numeric_columns if df[col].isnull().all()]
    if empty_columns:
        print(f"Warning: Dropping feature columns without any observed value: {empty_columns}")
        df = df.drop(columns=empty_columns)
        numeric_columns = [col for col in numeric_columns if col not in empty_columns]
        if not numeric_columns:
            return df

    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(df[numeric_columns])
    df = df.copy() # Leave the caller's DataFrame untouched
    df[numeric_columns] = imputed
    print("Missing values have been imputed using KNN.")
    return df


def _group_features_by_method(features):
    """Group column names by extraction method (the name without its last '_' part)."""
    feature_methods = {}
    for feature in features:
        method = '_'.join(feature.split('_')[:-1])
        feature_methods.setdefault(method, []).append(feature)
    return feature_methods


def _best_accuracy(results):
    """Return the highest non-NaN accuracy among the result rows, or None if there is none."""
    accuracies = [r[1] for r in results if r[1] is not None and not np.isnan(r[1])]
    return max(accuracies) if accuracies else None


def _report_results(worksheet, row, results):
    """Print a result table, append it to the worksheet and return the next free row."""
    print(tabulate(results, headers=_RESULT_HEADERS, tablefmt="grid"))
    # append() writes directly below the last written row; every caller writes a
    # header cell at row - 1 first, so the table starts exactly at `row`.
    for r in results:
        worksheet.append([''] + r)
    return row + len(results) + 1 # Leave one blank row after the table


def _evaluate_methods(worksheet, row, grouped_sets, suffix, run_models):
    """Evaluate each extraction method of each feature set; return (next row, best method per set)."""
    best_features = {}
    best_scores = {}
    for feature_set, feature_methods in grouped_sets:
        category = feature_set.lower()
        best_features[category] = None
        best_scores[category] = 0

        print(f"\nFeature set: {feature_set}{suffix}")
        worksheet.cell(row=row, column=1, value=f"Feature set: {feature_set}{suffix}")
        row += 1

        for method, method_features in feature_methods.items():
            print(f"\nFeature extraction: {method}")
            worksheet.cell(row=row, column=1, value=f"Feature extraction: {method}")
            row += 1

            results = run_models(method_features)
            row = _report_results(worksheet, row, results)

            # Track the method whose best model reaches the highest accuracy
            top_accuracy = _best_accuracy(results)
            if top_accuracy is not None and top_accuracy > best_scores[category]:
                best_scores[category] = top_accuracy
                best_features[category] = method
    return row, best_features


def _report_best_features(worksheet, row, title, best_features):
    """Print and write the best method per category; return the next free row."""
    print(f"\n{title}")
    for category, method in best_features.items():
        print(f"{category.capitalize()}: {method}")

    worksheet.cell(row=row, column=1, value=title)
    row += 1
    for category, method in best_features.items():
        worksheet.cell(row=row, column=1, value=f"{category.capitalize()}: {method}")
        row += 1
    return row


def _evaluate_combinations(worksheet, row, title, combinations, run_models):
    """Evaluate every named feature combination; return the next free row."""
    print(f"\n{title}")
    worksheet.cell(row=row, column=1, value=title)
    row += 1
    for combination_name, features in combinations:
        print(f"\nFeature combination: {combination_name}")
        worksheet.cell(row=row, column=1, value=f"Feature combination: {combination_name}")
        row += 1
        row = _report_results(worksheet, row, run_models(features))
    return row


def evaluate_features(file, df, workbook):
    """
    Evaluate and compare various feature extraction methods on the dataset, 
    both with and without k-fold cross-validation, and save the results to an Excel workbook.

    Steps: impute missing feature values, encode the labels, evaluate every
    extraction method of the texture/color/shape groups with the module-level
    ``models`` (hold-out split, then k-fold), and finally evaluate all
    combinations of the per-category best methods found on the hold-out split.
    Returns None; returns early (after printing the reason) when a required
    column is missing or the train/test split is not possible.
    
    Parameters:
    - file: The name of the dataset file being evaluated (used as sheet title).
    - df: The DataFrame containing the dataset; must contain the columns
      'path', 'filename' and 'label'.
    - workbook: The Excel workbook where results will be saved.
    """
    worksheet = workbook.create_sheet(title=file)

    # Check for missing values in the dataset
    if df.isnull().values.any():
        print("Warning: Missing values detected in the dataset.")
        # Only the numeric feature columns can be imputed: running KNNImputer on
        # the string columns (path, filename, label) raises a ValueError.
        df = _impute_feature_columns(df)
    
    # Separate features (X) and labels (y)
    try:
        X = df.drop(list(_NON_FEATURE_COLUMNS), axis=1) # Drop non-feature columns
        y = df['label']
    except KeyError as e:
        print(f"Error: Required column not found in the dataset: {str(e)}")
        return

    # Encode labels to numerical format
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    
    # Extract different sets of features (texture, color, shape)
    texture_features, color_features, shape_features = extract_features(X)
    
    # Split the dataset into training and testing sets
    try:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    except ValueError as e:
        print(f"Error: Invalid input for train-test split: {str(e)}")
        return

    # Group the columns of every feature set by extraction method once; both
    # evaluation passes and the combination step reuse this grouping.
    grouped_sets = [
        (feature_set, _group_features_by_method(features))
        for feature_set, features in zip(['Texture', 'Color', 'Shape'],
                                         [texture_features, color_features, shape_features])
    ]
    columns_by_category = {feature_set.lower(): feature_methods
                           for feature_set, feature_methods in grouped_sets}

    def run_holdout(columns):
        # Single train/test split
        return train_and_evaluate_models(X_train[columns], y_train, X_test[columns], y_test, models)

    def run_kfold(columns):
        # k-fold cross-validation on the full dataset
        return train_and_evaluate_models_kfold(X[columns], y, models)

    row = 1 # Start writing to Excel from the first row

    # Evaluate each feature extraction method without k-fold cross-validation
    row, best_features = _evaluate_methods(worksheet, row, grouped_sets, "", run_holdout)
    row = _report_best_features(worksheet, row, "Best features (without k-fold):", best_features)

    # Evaluate each feature extraction method with k-fold cross-validation
    row += 1 # Add a blank row before the next section
    row, best_features_kfold = _evaluate_methods(worksheet, row, grouped_sets, " (with k-fold)", run_kfold)
    row = _report_best_features(worksheet, row, "Best features (with k-fold):", best_features_kfold)

    # Generate combinations of the best features (hold-out winners) for evaluation.
    # Categories without a winning method (no columns, or no accuracy above 0)
    # are left out: a None method cannot be matched against column names.
    selected = [(category, method) for category, method in best_features.items()
                if method is not None]
    best_feature_combinations = []
    for r in range(1, len(selected) + 1):
        for combination in itertools.combinations(selected, r):
            combination_name = "test_" + "_".join([f"{category}_{method}" for category, method in combination])
            feature_combination = []
            for category, method in combination:
                # Use exactly the columns the method was evaluated with instead
                # of a substring match that may pull in unrelated columns.
                feature_combination.extend(columns_by_category[category][method])
            # Remove duplicates while keeping the column order
            best_feature_combinations.append((combination_name, list(dict.fromkeys(feature_combination))))
    
    # Evaluate best feature combinations without k-fold cross-validation
    row += 1
    row = _evaluate_combinations(worksheet, row, "Best feature combinations (without k-fold):",
                                 best_feature_combinations, run_holdout)

    # Evaluate best feature combinations with k-fold cross-validation
    row = _evaluate_combinations(worksheet, row, "Best feature combinations (with k-fold):",
                                 best_feature_combinations, run_kfold)

In [None]:
if __name__ == '__main__':
    # Main execution block for the feature evaluation script: evaluates every
    # loaded dataset and stores the results in one Excel workbook, one sheet
    # per dataset.
    workbook = Workbook() # Initialize an Excel workbook to store results

    if not dfs:
        print("No datasets loaded; nothing to evaluate.")

    # Iterate over each dataset, evaluating features and saving results
    for file, df in dfs.items():
        print(f"\n=============== {file} ===============")
        evaluate_features(file, df, workbook) # Evaluate features for each file

        # Clean up the default sheet created by openpyxl, but only once a result
        # sheet exists so the workbook never ends up without any sheet.
        if 'Sheet' in workbook.sheetnames and len(workbook.sheetnames) > 1:
            workbook.remove(workbook['Sheet'])

        # Save after every dataset: the evaluation is long-running, and a
        # failure on a later dataset must not discard the finished results.
        workbook.save("classification_results.xlsx")