In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import time
from pathlib import Path
from scipy import ndimage, fft
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, precision_recall_curve, average_precision_score
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.utils import shuffle

In [3]:
class LightFluxProcessor:

    def __init__(self, fourier=True, normalize=True, gaussian=True, standardize=True):
        self.fourier = fourier
        self.normalize = normalize
        self.gaussian = gaussian
        self.standardize = standardize

    def fourier_transform(self, X):
        return np.abs(np.fft.fft(X, n=X.size))

    def process(self, df_train_x, df_dev_x):
        if self.fourier:
            print("Applying Fourier...")
            df_train_x = np.apply_along_axis(self.fourier_transform, axis=1, arr=df_train_x)
            df_dev_x = np.apply_along_axis(self.fourier_transform, axis=1, arr=df_dev_x)

            df_train_x = df_train_x[:, :(df_train_x.shape[1] // 2)]
            df_dev_x = df_dev_x[:, :(df_dev_x.shape[1] // 2)]

        if self.normalize:
            print("Normalizing...")
            df_train_x = normalize(df_train_x)
            df_dev_x = normalize(df_dev_x)

        if self.gaussian:
            print("Applying Gaussian Filter...")
            df_train_x = ndimage.filters.gaussian_filter(df_train_x, sigma=10)
            df_dev_x = ndimage.filters.gaussian_filter(df_dev_x, sigma=10)

        if self.standardize:
            print("Standardizing...")
            std_scaler = StandardScaler()
            df_train_x = std_scaler.fit_transform(df_train_x)
            df_dev_x = std_scaler.transform(df_dev_x)

        print("Finished Processing!")
        return df_train_x, df_dev_x

In [4]:
def np_X_Y_from_df(df):
    df = shuffle(df)
    df_X = df.drop(['LABEL'], axis=1)
    X = np.array(df_X)
    Y_raw = np.array(df['LABEL']).reshape((len(df['LABEL']),1))
    Y = Y_raw == 2
    return X, Y

In [None]:
def main():
    train_dataset_path = "Data/exoTrain.csv"
    dev_dataset_path = "Data/exoTest.csv"

    print("Loading datasets...")
    df_train = pd.read_csv(train_dataset_path, encoding = "ISO-8859-1")
    df_dev = pd.read_csv(dev_dataset_path, encoding = "ISO-8859-1")
    print("Loaded datasets!")

    df_train_x = df_train.drop('LABEL', axis=1)
    df_dev_x = df_dev.drop('LABEL', axis=1)
    df_train_y = df_train.LABEL
    df_dev_y = df_dev.LABEL
    LFP = LightFluxProcessor(
        fourier=True,
        normalize=False,
        gaussian=False,
        standardize=False)
    df_train_x, df_dev_x = LFP.process(df_train_x, df_dev_x)

    df_train_processed = pd.DataFrame(df_train_x).join(pd.DataFrame(df_train_y))
    df_dev_processed = pd.DataFrame(df_dev_x).join(pd.DataFrame(df_dev_y))

    X_train, Y_train = np_X_Y_from_df(df_train_processed)
    X_dev, Y_dev = np_X_Y_from_df(df_dev_processed)
    
    print("Describing Datasets...")
    (num_examples, n_x) = X_train.shape
    n_y = Y_train.shape[1] 
    print("X_train.shape: ", X_train.shape)
    print("Y_train.shape: ", Y_train.shape)
    print("X_dev.shape: ", X_dev.shape)
    print("Y_dev.shape: ", Y_dev.shape)
    print("n_x: ", n_x)
    print("num_examples: ", num_examples)
    print("n_y: ", n_y)
    
    ## 
    # Perform hyperparameter tuning
    best_params, model = perform_grid_search(X_train, Y_train)

    # Train the best model
    print("Training with the best parameters...", best_params)
    

    X_train_sm, Y_train_sm = X_train, Y_train

    # Train
    print("Training...")
    model.fit(X_train_sm, Y_train_sm)

    train_outputs = model.predict(X_train_sm)
    dev_outputs = model.predict(X_dev)
    print("Finished Training!")
    

    train_prob = model.decision_function(X_train)  
    dev_prob = model.decision_function(X_dev)


    train_outputs = (train_prob > 0).astype(int)
    dev_outputs = (dev_prob > 0).astype(int)

    accuracy_train = accuracy_score(Y_train_sm, train_outputs)
    accuracy_dev = accuracy_score(Y_dev, dev_outputs)
    precision_train = precision_score(Y_train_sm, train_outputs)
    precision_dev = precision_score(Y_dev, dev_outputs)
    recall_train = recall_score(Y_train_sm, train_outputs)
    recall_dev = recall_score(Y_dev, dev_outputs)
    confusion_matrix_train = confusion_matrix(Y_train_sm, train_outputs)
    confusion_matrix_dev = confusion_matrix(Y_dev, dev_outputs)
    classification_report_train = classification_report(Y_train_sm, train_outputs)
    classification_report_dev = classification_report(Y_dev, dev_outputs)

    # Calculate AUC scores
    ap_train = average_precision_score(Y_train_sm, train_prob)
    ap_dev = average_precision_score(Y_dev, dev_prob)

    # Display metrics
    print("AUC training set: %.3f" % ap_train)
    print("AUC dev set: %.3f" % ap_dev)
    print("Accuracy training set: %.3f" % accuracy_train)
    print("Accuracy dev set: %.3f" % accuracy_dev)
    print("Precision training set: %.3f" % precision_train)
    print("Precision dev set: %.3f" % precision_dev)
    print("Recall training set: %.3f" % recall_train)
    print("Recall dev set: %.3f" % recall_dev)
    print(" ")

    # Precision-Recall Curve
    precision, recall, thresholds = precision_recall_curve(Y_train_sm, train_prob)
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='b', alpha=0.8)
    plt.fill_between(recall, precision, alpha=0.2, color='b', step='post')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall Curve')
    plt.show()

    # Confusion matrices
    print("Confusion Matrix - Train Set")
    print(confusion_matrix_train)
    print("Confusion Matrix - Dev Set")
    print(confusion_matrix_dev)
    
main()


Loading datasets...
Loaded datasets!
Applying Fourier...
Finished Processing!
Describing Datasets...
X_train.shape:  (5087, 1598)
Y_train.shape:  (5087, 1)
X_dev.shape:  (570, 1598)
Y_dev.shape:  (570, 1)
n_x:  1598
num_examples:  5087
n_y:  1
Hyperparameter tuning in progress...


Grid Search Progress:   0%|                                                            | 0/72 [00:00<?, ?combination/s]

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [1]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd
from tqdm import tqdm
import joblib
import json

def perform_grid_search(X_train, Y_train):
    param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto', 0.1],
        'shrinking': [True, False],
        'max_iter': [1000, 2000],
    }

    svc = SVC()
    grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
    
    print("Hyperparameter tuning in progress...")

    try:
        with tqdm(total=len(param_grid['C']) * len(param_grid['kernel']) * len(param_grid['gamma']) *
                  len(param_grid['shrinking']) * len(param_grid['max_iter']),
                  desc="Grid Search Progress", unit="combination") as progress_bar:
            grid_search.fit(X_train, Y_train)
            progress_bar.update(1)  # Manually update the progress bar as GridSearchCV doesn't provide progress updates
        print("\nGrid search complete.")
    except Exception as e:
        print(f"Error during grid search: {str(e)}")
        return None, None

    # Save the best model to a binary file using joblib
    best_model = grid_search.best_estimator_
    joblib.dump(best_model, 'best_model.joblib')

    # Save the best parameters to a JSON file
    best_params = grid_search.best_params_
    with open('best_params.json', 'w') as json_file:
        json.dump(best_params, json_file)

    # Extract results into a DataFrame and print it
    results_df = pd.DataFrame(grid_search.cv_results_)
    print("\nGrid search results:")
    print(results_df)

    # Save results to a CSV file
    results_df.to_csv('grid_search_results.csv', index=False)

    return 'best_model.joblib', 'best_params.json'
