In [8]:
import pandas as pd
import numpy as np
import random

class GeneticDockingOptimizer:
    def __init__(self, data_path, population_size=100, mutation_rate=0.05, crossover_rate=0.7):
        self.data = pd.read_csv(data_path)
        self.population_size = population_size
        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.population = [self.random_chromosome() for _ in range(self.population_size)]

    def random_chromosome(self):
        return [random.uniform(0, 5) for _ in range(3)]

    def score_chromosome(self, chromosome):
        return self.calculate_fitness(chromosome)

    def calculate_fitness(self, weights):
        self.data['score'] = self.data.apply(lambda row: self.score_row(row, weights), axis=1)
        return np.mean(self.data['score'])

    def score_row(self, row, weights):
        w_affinity, w_rmsd_lb, w_rmsd_ub = weights
        desired_affinity_range = (-8.0, -7.5)
        if not desired_affinity_range[0] <= row["Affinity (kcal/mol)"] <= desired_affinity_range[1]:
            return -1e10
        if row["Dist from best mode (rmsd l.b.)"] == 0.00 and row["Dist from best mode (rmsd u.b.)"] == 0.00:
            return -1e10
        return (w_affinity * row["Affinity (kcal/mol)"] - w_rmsd_lb * row["Dist from best mode (rmsd l.b.)"] - w_rmsd_ub * row["Dist from best mode (rmsd u.b.)"])

    def mutate(self, chromosome):
        return [gene if random.random() > self.mutation_rate else random.uniform(0, 5) for gene in chromosome]

    def crossover(self, parent1, parent2):
        if random.random() < self.crossover_rate:
            point = random.randint(1, len(parent1) - 1)
            return parent1[:point] + parent2[point:], parent2[:point] + parent1[point:]
        return parent1, parent2

    def select_parents(self, scores):
        total_score = sum(scores)
        pick = random.uniform(0, total_score)
        current = 0
        for i in range(len(scores)):
            current += scores[i]
            if current > pick:
                return self.population[i]

    def run(self, generations=200, verbose=False):
        best_scores = []
        for generation in range(generations):
            scores = [self.score_chromosome(chromosome) for chromosome in self.population]
            best_scores.append(max(scores))
            
            if verbose and generation % 10 == 0:
                print(f"Generation {generation}: Best Score: {best_scores[-1]}")
            
            new_population = []
            for i in range(self.population_size // 2):
                parent1 = self.select_parents(scores)
                parent2 = self.select_parents(scores)
                
                if parent1 is not None and parent2 is not None:
                    child1, child2 = self.crossover(parent1, parent2)
                    new_population.append(self.mutate(child1))
                    new_population.append(self.mutate(child2))
                else:
                    new_population.append(self.random_chromosome())
                    new_population.append(self.random_chromosome())
            
            self.population = new_population
            
        return max(self.population, key=self.score_chromosome)

    def get_top_n(self, n=50):
        return self.data.sort_values(by='score', ascending=False).head(n)

    def save_top_n(self, output_path, n=50):
        top_n = self.get_top_n(n)
        top_n.to_csv(output_path, index=False)

if __name__ == "__main__":
    DATA_PATH = r"C:\Users\Rac\OneDrive\Desktop\PDBBIND2014\combined_docking_results.csv"
    OUTPUT_PATH = r"C:\Users\Rac\OneDrive\Desktop\PDBBIND2014\Genetic results\top_50_poses.csv"

    optimizer = GeneticDockingOptimizer(DATA_PATH)
    best_chromosome = optimizer.run(verbose=True)
    optimizer.save_top_n(OUTPUT_PATH)


Generation 0: Best Score: -8630136986.977423
Generation 10: Best Score: -8630136989.50296
Generation 20: Best Score: -8630136989.25872
Generation 30: Best Score: -8630136989.487026
Generation 40: Best Score: -8630136989.302256
Generation 50: Best Score: -8630136991.146952
Generation 60: Best Score: -8630136989.816593
Generation 70: Best Score: -8630136988.503757
Generation 80: Best Score: -8630136990.137362
Generation 90: Best Score: -8630136987.253218
Generation 100: Best Score: -8630136987.370098
Generation 110: Best Score: -8630136990.813065
Generation 120: Best Score: -8630136989.874575
Generation 130: Best Score: -8630136989.448341
Generation 140: Best Score: -8630136990.438776
Generation 150: Best Score: -8630136988.8239
Generation 160: Best Score: -8630136987.715788
Generation 170: Best Score: -8630136988.579243
Generation 180: Best Score: -8630136988.143627
Generation 190: Best Score: -8630136989.772043
