In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import random

In [2]:
data = pd.read_csv("dados_normalizados.csv")

y = data["Outcome"]
X = data.drop("Outcome", axis=1)

In [3]:
def fitness_function(selected_features):
    X_train, X_test, y_train, y_test = train_test_split(X[selected_features], y, test_size=0.2, random_state=42)
    knn_model = KNeighborsClassifier(n_neighbors=3)
    knn_model.fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

In [4]:
class GeneticAlgorithm:
    def __init__(self, 
                 fitness_function,
                 max_iter,
                 crossover_prob,
                 mutation_prob,
                 X,
                 y):
        self.fitness_function = fitness_function
        self.pop_size = None
        self.max_iter = max_iter
        self.crossover_prob = crossover_prob
        self.mutation_prob = mutation_prob
        self.X = X
        self.y = y
        self.population = None
        
    def generate(self):
        n_features_selected = random.randrange(2, self.X.shape[1])
        
        features_selected = []
        
        while len(features_selected) != n_features_selected:
            
            feature = self.X.columns[random.randrange(self.X.shape[1])]
            
            if not feature in features_selected:
                features_selected.append(feature)
                
        return features_selected
    
    def generate_population(self, size):
        self.pop_size = size
        
        init_population =[]
        
        for i in range(size):
            init_population.append(self.generate())
            
        self.population = init_population
            
        return init_population
    
    def mutation(self, indiv):
        new_indiv = indiv
        
        while True:
            feature = self.X.columns[random.randrange(self.X.shape[1])]
            
            if not feature in new_indiv:
                new_indiv[random.randrange(len(new_indiv))] = feature
                break
        
        return new_indiv
    
    def cross_over(self, father, mother):
        new_indiv = []
        
        child = self.population[father] + self.population[mother]
        
        n_features_selected = random.randrange(2, self.X.shape[1])
        
        for x in range(n_features_selected):
            if not child[x] in new_indiv:
                 new_indiv.append(child[x])
                    
        return new_indiv
    
    def find_best_30(self):
        results = []
        
        for i in range(len(self.population)):
            acc = self.fitness_function(self.population[i])
            results.append({"id": i, 
                            "accuracy": acc/len(self.population[i]),
                            "real_acc": acc})
            
        results_ordened = sorted(results, key=lambda x: x['accuracy'], reverse=True)
        
        finished_results = []
        
        for i in results_ordened[:30]:
            finished_results.append(self.population[i["id"]])
            
        return finished_results
    
    def find_best(self):
        results = []
        
        for i in range(len(self.population)):
            results.append({"id": i, "accuracy": self.fitness_function(self.population[i])})
            
        results_ordened = sorted(results, key=lambda x: x['accuracy'], reverse=True)
        
        final_results = []
        for i in range(10):
            final_results.append({"id": results_ordened[i]["id"], "acc": results_ordened[i]["accuracy"], "columns": self.population[results_ordened[i]["id"]]})
        
        return final_results
        
    def evolve(self):
        for i in range(self.max_iter):
            new_population = []
            
            survivors = self.find_best_30()
            
            new_population += survivors
            
            for x in survivors:
                new_population.append(self.mutation(x))
            
            for x in range(40):
                new_population.append(self.generate())
                
            self.population = new_population
            
            
my_ga = GeneticAlgorithm(fitness_function, 50, 0.8, 0.2, X, y)

my_ga.generate_population(100)

my_ga.evolve()

for x in my_ga.find_best():
    print(x)

{'id': 72, 'acc': 0.7735849056603774, 'columns': ['Insulin', 'Glucose', 'SkinThickness', 'BMI', 'BloodPressure', 'Age', 'Pregnancies']}
{'id': 76, 'acc': 0.7735849056603774, 'columns': ['Glucose', 'Pregnancies', 'SkinThickness', 'Insulin', 'BMI', 'BloodPressure', 'Age']}
{'id': 5, 'acc': 0.7358490566037735, 'columns': ['Glucose', 'Age']}
{'id': 35, 'acc': 0.7358490566037735, 'columns': ['Glucose', 'Age']}
{'id': 71, 'acc': 0.7358490566037735, 'columns': ['BMI', 'Glucose', 'Age', 'DiabetesPedigreeFunction', 'Insulin', 'Pregnancies']}
{'id': 86, 'acc': 0.7264150943396226, 'columns': ['BMI', 'Pregnancies', 'Glucose', 'Age', 'Insulin']}
{'id': 64, 'acc': 0.7169811320754716, 'columns': ['Age', 'BMI', 'SkinThickness', 'BloodPressure', 'Glucose', 'Insulin']}
{'id': 79, 'acc': 0.7169811320754716, 'columns': ['SkinThickness', 'Insulin', 'BMI', 'Age', 'Glucose', 'BloodPressure']}
{'id': 99, 'acc': 0.7169811320754716, 'columns': ['DiabetesPedigreeFunction', 'BMI', 'SkinThickness', 'Glucose', 'Age