In [1]:
import torch as t
import pandas as pd

# Misc
from typing import Callable
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
class GreyWolfOptimiser:
    """
    Binary Grey Wolf Optimiser for feature selection in a large dataset.

    Each wolf is a 0/1 vector with one entry per feature (1 = include the
    feature, 0 = exclude it). On every iteration the three fittest wolves
    (alpha, beta, delta) are kept unchanged and every other wolf (a
    "follower") is moved towards them. Fitness is

        c_err * error_function(wolf) + c_size * (selected features / all features)

    and lower is better.
    """

    def __init__(self, error_function : Callable[[t.Tensor], float], 
                 population_size : int, num_dimensions : int, num_iterations : int, 
                 mean : float = 1.0, c_err : float = 1.0, c_size : float = 1.0, a : float = 2.0) -> None:
        """
        Parameters
        ----------------
        error_function (type: Callable[[t.Tensor], float]):
            scores one wolf (tensor of length num_dimensions); lower is better
        population_size (type: int):
            number of wolves, at least 3
        num_dimensions (type: int):
            number of features to select from, at least 1
        num_iterations (type: int):
            number of update rounds run by optimise()
        mean (type: float):
            exponent of the generalised mean used to merge the three candidate
            positions (1.0 = arithmetic mean); must be non-zero
        c_err, c_size (type: float):
            weights of the error term and of the feature-set-size term
        a (type: float):
            initial exploration coefficient, decayed linearly over the iterations

        Raises
        ----------------
        ValueError: if population_size < 3, num_dimensions < 1 or mean == 0
        """

        # Three leaders are sliced out of the population every iteration, so fewer
        # than three wolves cannot work. With exactly three there are no followers
        # and the population never changes.
        if population_size < 3:
            raise ValueError(f'population_size must be at least 3, got {population_size}')
        # The size penalty divides by the number of features.
        if num_dimensions < 1:
            raise ValueError(f'num_dimensions must be at least 1, got {num_dimensions}')
        # The generalised mean raises to the power 1 / mean.
        if mean == 0:
            raise ValueError('mean must be non-zero (use a small positive value to approximate the geometric mean)')

        # This is the bottleneck. It evaluates how good a particular subset of features is. 
        # Function specs: input = t.Tensor (dim: num_dimensions). return = float (0-1).
        self.err = error_function

        # number of wolves (min 3, raise for larger feature spaces)
        self.pop = population_size

        # number of features to select from 
        self.dims = num_dimensions

        # how many times to test solutions (raise for larger feature spaces)  
        self.iters = num_iterations

        # generalised mean exponent (1.0 = arithmetic mean, ~0.0+ = geometric)
        self.mean = mean

        # how much to prioritise error function minimisation vs. size of feature set
        self.c_err = c_err
        self.c_size = c_size

        # how much wolves explore (increase for more exploration)
        self.a = a

    def init_wolves(self) -> t.Tensor:
        """ Returns a (pop, dims) integer tensor of random 1/0s per feature to include/exclude """
        return t.randint(2, (self.pop, self.dims))

    def fitness(self, wolves : t.Tensor) -> t.Tensor:
        """ Returns a 1-D tensor with the error of each wolf in the population (lower is better) """
        # One call to the (expensive) error function per wolf; the second term
        # is the fraction of features the wolf selects.
        scores = [
            float(self.c_err * self.err(w) + self.c_size * (t.sum(w) / self.dims))
            for w in wolves
        ]
        return t.tensor(scores, dtype=t.get_default_dtype())

    def compare_wolves(self, wolves : t.Tensor) -> tuple:
        """ 
        Ranks the population by fitness (ascending, best first)

        Returns
        ----------------
        (positions, fitness, followers): 
            positions and fitness of the three best wolves (alpha, beta, delta),
            followed by the positions of all remaining wolves
        """
        f = self.fitness(wolves)
        order = t.argsort(f)
        ranked = wolves[order]
        return ranked[:3], f[order][:3], ranked[3:]

    def next_positions(self, wolves : t.Tensor, top : t.Tensor, 
                       curr_iter : int) -> t.Tensor:
        """ 
        Move each given wolf towards alpha, beta, delta and merge the three moves 

        Parameters
        ----------------
        wolves (type: t.Tensor, dim: (n_wolves, feature_dims)): 
            positions of the wolves to move (optimise() passes the followers only)
        top (type: t.Tensor, dim: (3, feature_dims)): 
            positions of alpha, beta, delta wolves
        curr_iter (type: int): 
            current iteration number

        Returns
        ----------------
        next_positions (type: t.Tensor, dim: (3 + n_wolves, feature_dims)): 
            the unchanged top wolves followed by the new 0/1 position of every
            given wolf
        """

        # Exploration coefficient decays linearly from self.a towards 0
        a_curr = self.a * (1 - curr_iter / self.iters)
        # One pair of random vectors, shared by all wolves and all three leaders
        r1 = t.rand((1, self.dims))
        r2 = t.rand((1, self.dims))
        step = 2 * a_curr * r2 - a_curr

        # Candidate position relative to each of alpha, beta, delta
        candidates = []
        for k in range(3):
            leader = top[k:k + 1, :]
            dist = t.abs(2 * r1 * leader - wolves)
            candidates.append(leader - step * dist)

        # Merge the three candidates with the generalised (power) mean
        #   M_p = ((x1**p + x2**p + x3**p) / 3) ** (1 / p)
        p = self.mean
        if p == 1:
            # Plain arithmetic mean; candidates are used as they are.
            merged = (candidates[0] + candidates[1] + candidates[2]) / 3
        else:
            # Candidates can be negative and a fractional power of a negative
            # number is NaN, so negatives are clamped to 0 before the power.
            powered = [t.clamp(c, min=0) ** p for c in candidates]
            merged = ((powered[0] + powered[1] + powered[2]) / 3) ** (1 / p)

        # Snap back onto the binary grid
        moved = t.clamp(t.round(merged), 0, 1)

        # Preserve existing top solutions
        return t.cat( (top, moved) )

    def optimise(self) -> tuple:
        """ 
        Runs the optimisation for self.iters iterations

        Returns
        ----------------
        (best_wolf, err_history): 
            best_wolf is the top-ranked position after the last update;
            err_history holds, per iteration, the alpha wolf's fitness measured
            at the start of that iteration
        """

        wolves = self.init_wolves()
        err_history = []

        for i in range(self.iters):
            print(f'Iteration: {i}/{self.iters}')
            top, err, followers = self.compare_wolves(wolves)
            err_history.append(err[0])
            wolves = self.next_positions(followers, top, i)

        # The followers produced by the last update have not been scored yet,
        # so rank the population once more before picking the winner.
        return self.compare_wolves(wolves)[0][0], err_history

In [2]:
# Read the CSV (path is relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='../Datasets/reduced_II.csv')

In [4]:
# Target: the 'anxiety_meter' column.
y = data.loc[:, 'anxiety_meter']
# Features: every remaining column except 'P_Id' (presumably a row identifier — confirm).
X = data.drop(columns=['P_Id', 'anxiety_meter'])

# Handling missing values

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Fill missing entries with the mean of their column. The fitted imputer is kept
# in `imputer`; X is rebound to the transformed output (a NumPy array under
# scikit-learn's default output settings, so column labels are not carried over).
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
X = imputer.transform(X)

# Feature Engineering