In [133]:
import numpy as np
import pandas as pd
from numpy.random import rand, randint
from sklearn.metrics.pairwise import cosine_similarity

In [97]:
# Load the eigenvector tables and the feature tables (wavelet and gray
# variants) from ../data. The targets on the left line up one-to-one with the
# paths on the right, and the files are read in that order.
(
    eigen_vecs_wavelet,
    eigen_vecs_gray,
    wavelet_std_features,
    gray_std_features,
) = map(
    pd.read_csv,
    (
        "../data/eigen_vecs_wavelet.csv",
        "../data/eigen_vecs_gray.csv",
        "../data/wavelet_std_features.csv",
        "../data/gray_std_features.csv",
    ),
)

In [98]:
# Preview the wavelet feature table. The rendering below shows numbered feature
# columns followed by a `names` label column, plus an `Unnamed: 0` column
# (presumably an index that was written into the CSV -- confirm before treating
# it as a feature).
wavelet_std_features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1216,1217,1218,1219,1220,1221,1222,1223,1224,names
0,0.552141,0.417563,-0.097526,0.213892,0.513422,0.535479,0.470071,0.599688,0.995303,1.093655,...,0.240740,0.088242,-0.210105,-0.312634,0.003390,0.163147,0.029706,-0.016715,-0.050974,Kim_Jong-Il
1,0.707605,1.058285,0.798634,1.056439,0.772568,0.882596,1.607739,1.539196,1.368010,1.313639,...,1.434498,1.369982,0.713475,0.692875,0.546367,-0.255205,-1.064934,-1.213023,-1.171687,Kim_Jong-Il
2,0.694298,0.829691,0.983415,0.931306,0.670054,0.909874,1.397697,1.394208,1.127612,1.249970,...,-0.021223,0.298454,0.463149,0.302656,-0.201223,-0.874549,-0.554349,-0.402519,-0.482302,Kim_Jong-Il
3,0.033103,0.219485,0.268445,0.293704,0.019686,0.296015,1.463204,1.757874,1.362924,1.321935,...,-1.197654,-1.080682,-1.210680,-1.242481,-1.088001,-0.611160,-0.647857,-1.021585,-1.187903,Kim_Jong-Il
4,0.417017,0.344503,-0.173195,0.154932,0.393859,0.418852,0.484160,0.376886,0.266048,0.110025,...,1.301302,1.135282,0.729333,0.458901,0.662662,1.065471,0.985344,1.402783,2.424354,Aaron_Peirsol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7096,0.091953,0.095439,0.106280,0.096238,0.106066,0.048213,0.062412,0.070286,0.041856,0.150444,...,-0.072619,-0.235007,-0.550489,-0.497912,-0.990588,-1.339481,-0.896832,-0.597121,-0.570522,Gus_Van_Sant
7097,-0.723017,-1.374698,-0.926930,-1.261602,-1.267894,0.346733,1.062299,1.097703,1.247735,0.872234,...,-1.222377,-0.158230,0.527500,0.513077,-1.164360,-1.342067,-1.072720,-1.158803,-1.328136,Guy_Hemmings
7098,1.042449,1.232408,0.718422,1.094761,1.122931,0.980860,1.010454,1.069370,0.999389,0.840748,...,0.224155,0.487819,0.698232,0.732792,0.516362,0.046279,-0.223393,-0.111478,0.053074,Guy_Hemmings
7099,0.467225,0.456406,0.610015,0.503828,0.472147,0.435489,0.368918,0.361424,0.244407,0.269776,...,-2.436577,-2.166024,-1.954376,-1.787004,-1.640348,-1.565958,-1.446946,-1.299056,-1.207270,Guy_Ritchie


In [None]:
## Parameters for the evolutionary algorithm

# Fraction of the feature count used when building the initial bit mask.
# NOTE(review): presumably passed to genetic_algorithm.process as
# `precentage_features` -- confirm against the calling cell.
feature_percentage = 0.25



In [None]:
class Recognition_Objective:
    '''
    Recognition objective used as the fitness function of the Evolutionary Algo.

    An individual is a 0/1 mask over the columns of ``eigen_arr``. The selected
    columns are transposed and matrix-multiplied with ``data_arr``; the projected
    samples are then scored by how many of their nearest neighbours (cosine
    similarity) carry the same name.

    Parameters
    ----------
    eigen_arr : ndarray
        2-D array whose columns are selected by each individual's mask.
    data_arr : ndarray
        Data that the selected (transposed) columns are multiplied with.
        NOTE(review): the expected layout is not visible in this file; after the
        product is squeezed, ``_sort_score_fn`` needs one row per entry of
        ``names``.
    names : array-like
        Identity label of every sample, in sample order.
    '''
    def __init__(self, eigen_arr = None, data_arr = None, names = None):
        self.eigen_arr = eigen_arr
        self.data_arr = data_arr
        self.df_name = names

    def _eigen_extractor(self, pop, n_pop):
        '''
        Return the transposed, mask-selected columns of ``eigen_arr`` for the
        first ``n_pop`` individuals, stacked into one array.

        ``pop`` must be of size (n_pop, n_features). Stacking only yields a
        regular array when every individual selects the same number of columns.
        '''
        masks = np.asarray(pop).astype(bool)
        return np.asarray([self.eigen_arr[:, masks[i]].T for i in range(n_pop)])

    def _sort_score_fn(self, arr):
        '''
        Score one projection: for every sample, take as many nearest neighbours
        as there are samples of its identity (the sample itself included) and
        compute the fraction of them that share the identity. Returns the mean
        of that fraction over all samples.

        ``arr`` holds one sample per row, in the same order as the names.
        '''
        sim = cosine_similarity(arr)
        # Row i lists sample indices from most to least similar to sample i.
        sorted_indexes = np.argsort(-sim)
        n_samples = len(sorted_indexes)
        if n_samples == 0:
            return 0.0
        # Positional labels, so a pandas Series with any index works as well.
        labels = np.asarray(self.df_name)
        members = {}  # identity -> indices of its samples, computed once per identity
        recognition_score = 0.0
        for i, indx_arr in enumerate(sorted_indexes):
            name = labels[i]
            if name not in members:
                members[name] = np.where(labels == name)[0]
            true_indxs = members[name]
            # The argsort row holds neighbour ids, not ranks: look at the top
            # len(true_indxs) neighbours and count how many are true matches.
            top_neighbours = indx_arr[:len(true_indxs)]
            hits = np.isin(top_neighbours, true_indxs).sum()
            recognition_score += hits / len(true_indxs)
        return recognition_score / n_samples

    def _recognition(self, pop, n_pop):
        '''
        Return the list of recognition scores of the first ``n_pop`` individuals.

        Individuals are projected one at a time, so masks that select different
        numbers of columns (as produced by crossover/mutation) are supported and
        a population of size one keeps its own axis.
        '''
        masks = np.asarray(pop).astype(bool)
        data = np.expand_dims(self.data_arr, axis = 0)
        scores = []
        for i in range(n_pop):
            selected = np.expand_dims(self.eigen_arr[:, masks[i]].T, axis = 0)
            # Same broadcasting as the batched product, restricted to one individual.
            projected = np.squeeze(selected @ data)
            scores.append(self._sort_score_fn(projected))
        return scores
    
    
class genetic_algorithm:
    '''
    Genetic Algorithm that searches for a binary feature mask maximising the
    recognition score computed by ``Recognition_Objective``.

    Parameters
    ----------
    crossover_rate : float
        Probability that a pair of parents is recombined.
    mutation_rate : float
        Per-bit probability of flipping a bit of a child.
    eigen_arr, data_arr, names
        Forwarded unchanged to ``Recognition_Objective``.
    '''
    def __init__(self, crossover_rate = 0.9, mutation_rate = 0.001, eigen_arr=None, data_arr=None, names=None):
        self.crossover_rate = crossover_rate
        self.eigen_arr = eigen_arr
        self.data_arr = data_arr
        self.names = names
        self.mutation_rate = mutation_rate
        self.recognition = Recognition_Objective(eigen_arr=self.eigen_arr, data_arr=self.data_arr, names=self.names)

    def crossover(self, p1, p2):
        '''
        Single-point crossover of two bitstrings.

        Returns ``[c1, c2]``. The children are copies of the parents unless
        recombination is drawn, in which case their tails are swapped at a
        random interior point. Parents are never modified.
        '''
        p1, p2 = np.asarray(p1), np.asarray(p2)
        # children are copies of parents by default
        c1, c2 = p1.copy(), p2.copy()
        # randint(1, len-2) needs at least 4 bits to have a valid range
        if rand() < self.crossover_rate and len(p1) > 3:
            # select crossover point that is not on the end of the string
            pt = randint(1, len(p1)-2)
            # `+` on arrays adds element-wise; the halves must be concatenated
            c1 = np.concatenate((p1[:pt], p2[pt:]))
            c2 = np.concatenate((p2[:pt], p1[pt:]))
        return [c1, c2]

    def mutation(self, bitstring):
        '''
        Flip each bit of ``bitstring`` with probability ``mutation_rate``.

        The array is modified in place and also returned.
        '''
        mutation_indx = np.random.rand(len(bitstring))<self.mutation_rate
        bitstring[mutation_indx] = 1-bitstring[mutation_indx]
        return bitstring

    @staticmethod
    def selection(pop, scores, k=3):
        '''
        Roulette-wheel selection: return one individual of ``pop`` drawn with
        probability proportional to its score.

        Declared static so that both ``self.selection(pop, scores)`` and
        ``genetic_algorithm.selection(pop, scores)`` bind the arguments
        correctly. ``k`` is unused and kept only for interface compatibility.
        Falls back to a uniform draw when the scores do not sum to a positive
        number.
        '''
        scores = np.asarray(scores, dtype=float)
        total = scores.sum()
        probs = scores/total if total > 0 else None
        # np.random.choice only accepts 1-D input, so draw a row index instead
        indx = np.random.choice(len(pop), p = probs)
        return pop[indx]

    def step(self, pop, n_pop, best, best_eval, gen):
        '''
        Performs a single step for optimization

        Scores the population, updates the best solution seen so far, then
        builds the next generation by selection, crossover and mutation.
        Returns ``[pop, best, best_eval]`` where ``pop`` is an array of shape
        (n_pop, n_features).
        '''
        pop = np.asarray(pop)
        scores = self.recognition._recognition(pop=pop, n_pop=n_pop)
        # check for new best solution
        for i in range(n_pop):
            if scores[i] > best_eval:
                # copy so the stored best is independent of the population array
                best, best_eval = pop[i].copy(), scores[i]
                print(">%d, new best f(%s) = %.3f" % (gen,  pop[i], scores[i]))
        # parents are consumed in pairs: draw one extra when n_pop is odd
        n_parents = n_pop + (n_pop % 2)
        selected = [self.selection(pop, scores) for _ in range(n_parents)]
        # create the next generation
        children = []
        for i in range(0, n_parents, 2):
            # get selected parents in pairs
            p1, p2 = selected[i], selected[i+1]
            # crossover and mutation
            for c in self.crossover(p1, p2):
                children.append(self.mutation(c))
        # replace population; keep it an array so the next step can score it
        pop = np.asarray(children[:n_pop])
        return [pop, best, best_eval]

    def process(self, n_features, precentage_features, n_iter, n_pop):
        '''
        Run the algorithm for ``n_iter`` generations with ``n_pop`` individuals.

        Every initial individual is a random permutation of a mask whose first
        ``int(precentage_features*n_features)`` bits are 0 and the rest 1.
        Returns ``[best, best_eval]``; ``best`` stays 0 if no individual ever
        scores above 0.
        '''
        trim_indx = int(precentage_features*n_features)
        template = np.uint8(np.arange(n_features)>=trim_indx) ## valid pattern for our solution
        # one independently shuffled copy of the pattern per individual
        pop = np.array([np.random.permutation(template) for _ in range(n_pop)])
        # keep track of best solution
        best, best_eval = 0, 0
        # enumerate generations
        for gen in range(n_iter):
            pop, best, best_eval = self.step(pop, n_pop, best, best_eval, gen)
        return [best, best_eval]