In [31]:
import pandas as pd
import numpy as np
import pyswarms as ps


class PSODockingOptimizer:
    """Tune pose-scoring weights for docking results with particle swarm optimisation.

    The docking results are loaded from a CSV file into ``self.data``. Every
    pose is scored as a weighted combination of its affinity and its two RMSD
    distances; poses that fall outside the desired affinity window, or whose
    two RMSD values are both zero, receive a large fixed penalty instead.
    """

    # Column names expected in the docking-results CSV.
    AFFINITY_COLUMN = "Affinity (kcal/mol)"
    RMSD_LB_COLUMN = "Dist from best mode (rmsd l.b.)"
    RMSD_UB_COLUMN = "Dist from best mode (rmsd u.b.)"

    # Inclusive affinity window a pose must fall into to be scored normally.
    DESIRED_AFFINITY_RANGE = (-8.0, -7.5)

    # Score assigned to every rejected pose.
    PENALTY_SCORE = -1e10

    def __init__(self, data_path):
        """Load the docking results found at ``data_path`` (a CSV file)."""
        self.data = pd.read_csv(data_path)
        self.score_column = "score"

    def score_row(self, row, weights):
        """Score a single pose.

        Args:
            row: Mapping-like record (e.g. a DataFrame row) holding the
                affinity and RMSD columns.
            weights: Iterable of three numbers
                ``(w_affinity, w_rmsd_lb, w_rmsd_ub)``.

        Returns:
            ``PENALTY_SCORE`` for a rejected pose, otherwise
            ``w_affinity * affinity - w_rmsd_lb * rmsd_lb - w_rmsd_ub * rmsd_ub``.
        """
        w_affinity, w_rmsd_lb, w_rmsd_ub = weights
        low, high = self.DESIRED_AFFINITY_RANGE

        # Affinity constraint: reject poses outside the desired window.
        affinity = row[self.AFFINITY_COLUMN]
        if not low <= affinity <= high:
            return self.PENALTY_SCORE

        # Reject poses with 0.00 in both RMSD columns.
        rmsd_lb = row[self.RMSD_LB_COLUMN]
        rmsd_ub = row[self.RMSD_UB_COLUMN]
        if rmsd_lb == 0.00 and rmsd_ub == 0.00:
            return self.PENALTY_SCORE

        return w_affinity * affinity - w_rmsd_lb * rmsd_lb - w_rmsd_ub * rmsd_ub

    def _score_all(self, weights):
        """Return the score of every pose in ``self.data`` as a NumPy array.

        Vectorised counterpart of ``score_row``: it applies the same rejection
        rules and the same arithmetic in the same order, but on whole columns
        at once instead of one Python call per row.
        """
        w_affinity, w_rmsd_lb, w_rmsd_ub = weights
        low, high = self.DESIRED_AFFINITY_RANGE

        affinity = self.data[self.AFFINITY_COLUMN].to_numpy(dtype=float)
        rmsd_lb = self.data[self.RMSD_LB_COLUMN].to_numpy(dtype=float)
        rmsd_ub = self.data[self.RMSD_UB_COLUMN].to_numpy(dtype=float)

        # A NaN affinity fails both comparisons and is therefore rejected,
        # exactly as in the row-wise chained comparison.
        in_range = (affinity >= low) & (affinity <= high)
        both_zero = (rmsd_lb == 0.00) & (rmsd_ub == 0.00)

        raw = w_affinity * affinity - w_rmsd_lb * rmsd_lb - w_rmsd_ub * rmsd_ub
        return np.where(in_range & ~both_zero, raw, self.PENALTY_SCORE)

    def calculate_fitness(self, weights):
        """Score every pose with ``weights`` and return the mean score.

        Side effect: the per-pose scores are stored in
        ``self.data[self.score_column]``, replacing any previous scores.
        """
        self.data[self.score_column] = self._score_all(weights)

        # Take the average score as the fitness value.
        return np.mean(self.data[self.score_column])

    def fitness_function(self, positions):
        """Compute the cost of each position in the swarm.

        Args:
            positions: Array-like of shape ``(n_particles, 3)``; each row is
                one candidate weight vector.

        Returns:
            1-D float array with one cost per particle. The cost is the
            negated mean score, because the optimiser minimises.
        """
        return np.array(
            [-self.calculate_fitness(position) for position in positions],
            dtype=float,
        )

    def optimize(self, n_particles=100, iters=200):
        """Perform the PSO optimization.

        Args:
            n_particles: Number of particles in the swarm.
            iters: Number of optimisation iterations.

        Returns:
            Tuple ``(cost, pos)``: the best cost found and the weight vector
            that produced it.
        """
        # Three dimensions for the three weights, each searched in [0, 5].
        dimensions = 3
        options = {
            'c1': 1.5,
            'c2': 1.5,
            'w': 0.5,
        }

        optimizer = ps.single.GlobalBestPSO(n_particles=n_particles,
                                            dimensions=dimensions,
                                            options=options,
                                            bounds=([0, 0, 0], [5, 5, 5]))

        # Execute optimization
        cost, pos = optimizer.optimize(self.fitness_function, iters=iters)

        # Every fitness evaluation overwrites the score column, so at this
        # point it holds the scores of whichever particle was evaluated last.
        # Re-score with the best weights so get_top_n()/save_top_n() rank the
        # poses according to the optimisation result.
        self.calculate_fitness(pos)

        return cost, pos

    def get_top_n(self, n=50):
        """Retrieve the top N records based on score.

        Rows are sorted by the score column in descending order. Rejected
        poses keep their penalty score and are not filtered out, so they can
        appear at the bottom of the result when fewer than ``n`` poses were
        scored normally.

        Raises:
            KeyError: If no scores have been computed yet.
        """
        if self.score_column not in self.data.columns:
            raise KeyError(
                f"column {self.score_column!r} not found; call optimize() or "
                "calculate_fitness() before requesting the top records"
            )
        return self.data.sort_values(by=self.score_column, ascending=False).head(n)

    def save_top_n(self, output_path, n=50):
        """Save the top N records to a CSV file at ``output_path``."""
        top_n = self.get_top_n(n)
        top_n.to_csv(output_path, index=False)


if __name__ == "__main__":
    from pathlib import Path

    DATA_PATH = r"C:\Users\Rac\OneDrive\Desktop\PDBBIND2014\combined_docking_results.csv"
    OUTPUT_PATH = r"C:\Users\Rac\OneDrive\Desktop\PDBBIND2014\PSO results\top_50_poses.csv"

    # Create the output folder up front: the CSV is written only after the
    # optimisation has finished, so a missing folder would otherwise discard
    # the whole run at the very last step.
    Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)

    optimizer = PSODockingOptimizer(DATA_PATH)
    optimizer.optimize()
    optimizer.save_top_n(OUTPUT_PATH)


2023-08-14 00:09:54,783 - pyswarms.single.global_best - INFO - Optimize for 200 iters with {'c1': 1.5, 'c2': 1.5, 'w': 0.5}
pyswarms.single.global_best: 100%|████████████|200/200, best_cost=8.63e+9
2023-08-14 00:13:30,455 - pyswarms.single.global_best - INFO - Optimization finished | best cost: 8630136986.410986, best pos: [0.00440337 0.10325613 0.0790321 ]


In [None]:
# Define the path to the pdbqt file
#file_path = r"C:\Users\Rac\OneDrive\Desktop\PDBBIND2014\docked_results\2g70\docked_result.pdbqt"

# Open and read the contents of the file
#with open(file_path, 'r') as file:
 #   content = file.read()

# Display the content
#print(content)