# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import matplotlib.pyplot as plt
import seaborn as sns

class MolecularDescriptorCalculator:
    """Compute a fixed set of RDKit descriptors from a SMILES string."""

    @staticmethod
    def calculate_descriptors(smiles):
        """Return physico-chemical descriptors for a single molecule.

        Args:
            smiles: SMILES string describing the molecule.

        Returns:
            dict with the keys ``MolWt``, ``LogP``, ``NumHDonors``,
            ``NumHAcceptors``, ``NumRotatableBonds`` and ``TPSA``.

        Raises:
            ValueError: if RDKit cannot parse ``smiles`` into a molecule.
        """
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles reports a parse failure by returning None instead of
        # raising; fail here with a clear message rather than inside the
        # first descriptor call.
        if mol is None:
            raise ValueError(f"Invalid SMILES string: {smiles!r}")
        return {
            'MolWt': Descriptors.MolWt(mol),
            'LogP': Descriptors.MolLogP(mol),
            'NumHDonors': Descriptors.NumHDonors(mol),
            'NumHAcceptors': Descriptors.NumHAcceptors(mol),
            'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
            'TPSA': Descriptors.TPSA(mol),
        }

class ProteinSequenceAnalyzer:
    """Derive simple numeric features from a protein sequence."""

    @staticmethod
    def analyze_sequence(sequence):
        """Return summary features for one amino-acid sequence.

        Args:
            sequence: protein sequence in one-letter code (``str`` or any
                object whose ``str()`` is the sequence, e.g. a ``Seq``).

        Returns:
            dict with the keys ``Length``, ``MolecularWeight``,
            ``IsoelectricPoint`` and ``GCContent``.

        Raises:
            ValueError: if the sequence is empty.
        """
        # Function-scope import: the protein helpers live in
        # Bio.SeqUtils.ProtParam, which the file's top-level imports do not
        # bring into scope (``SeqIO`` has no ``SeqUtils`` attribute).
        from Bio.SeqUtils.ProtParam import ProteinAnalysis

        residues = str(sequence).upper()
        if not residues:
            raise ValueError("Protein sequence must not be empty")

        analysis = ProteinAnalysis(residues)

        # 'GCContent' is a nucleotide statistic and carries no biological
        # meaning for a protein; it is retained only so the feature columns
        # stay the same. Computed like the legacy Bio.SeqUtils.GC helper:
        # percentage of G, C and S letters, case-insensitive.
        gc_letters = sum(residues.count(letter) for letter in "GCS")

        return {
            'Length': len(residues),
            'MolecularWeight': analysis.molecular_weight(),
            'IsoelectricPoint': analysis.isoelectric_point(),
            'GCContent': 100.0 * gc_letters / len(residues),
        }

class DrugDiscoveryML:
    """Random-forest classifier over combined compound and target features.

    Each training row describes one (compound, target) pair: the compound's
    descriptor dict and the target's sequence-feature dict are merged into a
    single feature row, and the matching entry of ``activities`` is its label.
    """

    def __init__(self):
        self.model = RandomForestClassifier(n_estimators=100, random_state=42)
        self.X = None  # feature DataFrame, set by prepare_data()
        self.y = None  # label Series, set by prepare_data()

    @staticmethod
    def _combine_feature_rows(compound_rows, target_rows):
        """Merge paired feature dicts into a DataFrame with one row per pair."""
        return pd.DataFrame(
            [compound | target for compound, target in zip(compound_rows, target_rows)]
        )

    def prepare_data(self, compounds, targets, activities):
        """Build the feature matrix ``self.X`` and label vector ``self.y``.

        Args:
            compounds: iterable of SMILES strings.
            targets: iterable of protein sequences, paired by position with
                ``compounds``.
            activities: iterable of class labels, one per pair.

        Raises:
            ValueError: if the three inputs do not have the same length.
        """
        compounds = list(compounds)
        targets = list(targets)
        activities = list(activities)
        if not (len(compounds) == len(targets) == len(activities)):
            raise ValueError(
                "compounds, targets and activities must have the same length "
                f"(got {len(compounds)}, {len(targets)}, {len(activities)})"
            )

        compound_rows = [
            MolecularDescriptorCalculator.calculate_descriptors(compound)
            for compound in compounds
        ]
        target_rows = [
            ProteinSequenceAnalyzer.analyze_sequence(target)
            for target in targets
        ]

        # One row per pair, using the same layout predict_activity() builds.
        # Concatenating the two lists instead would yield 2N half-empty rows
        # that no longer line up with the N labels.
        self.X = self._combine_feature_rows(compound_rows, target_rows)
        self.y = pd.Series(activities)

    def train_model(self):
        """Fit the model on an 80/20 split and print weighted test metrics.

        Returns:
            dict mapping ``Accuracy``, ``Precision``, ``Recall`` and
            ``F1-score`` to the values that were printed.

        Raises:
            RuntimeError: if prepare_data() has not been called.
        """
        if self.X is None or self.y is None:
            raise RuntimeError("prepare_data() must be called before train_model()")

        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.2, random_state=42
        )
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)

        metrics = {
            "Accuracy": accuracy_score(y_test, y_pred),
            "Precision": precision_score(y_test, y_pred, average='weighted'),
            "Recall": recall_score(y_test, y_pred, average='weighted'),
            "F1-score": f1_score(y_test, y_pred, average='weighted'),
        }
        for label, value in metrics.items():
            print(f"{label}: {value:.2f}")
        return metrics

    def predict_activity(self, compound, target):
        """Return the predicted probability of class label ``1`` for one pair.

        Args:
            compound: SMILES string of the compound.
            target: protein sequence of the target.

        Returns:
            Probability assigned to label ``1``; ``0.0`` when the fitted
            model never saw that label during training.
        """
        compound_features = MolecularDescriptorCalculator.calculate_descriptors(compound)
        target_features = ProteinSequenceAnalyzer.analyze_sequence(target)
        features = self._combine_feature_rows([compound_features], [target_features])

        probabilities = self.model.predict_proba(features)[0]
        # predict_proba columns follow model.classes_. Look the label up
        # rather than assuming column 1 exists: a training split containing
        # a single class produces only one column.
        for label, probability in zip(self.model.classes_, probabilities):
            if label == 1:
                return probability
        return 0.0

    def visualize_feature_importance(self):
        """Show a horizontal bar plot of the fitted model's feature importances.

        Raises:
            RuntimeError: if prepare_data() has not been called.
        """
        if self.X is None:
            raise RuntimeError(
                "prepare_data() and train_model() must be called before plotting"
            )

        feature_importance = self.model.feature_importances_
        feature_names = self.X.columns

        plt.figure(figsize=(10, 6))
        sns.barplot(x=feature_importance, y=feature_names)
        plt.title("Feature Importance in Drug Discovery Model")
        plt.xlabel("Importance")
        plt.ylabel("Features")
        plt.tight_layout()
        plt.show()

# Example inputs: three SMILES strings that differ only in the final pair of
# substituents (F, Cl, Br).
compounds = [
    "CC1=C(C(=O)NC(=O)N1)N2CCN(CC2)C3=CC=C(C=C3)OCC4COC(O4)(F)F",
    "CC1=C(C(=O)NC(=O)N1)N2CCN(CC2)C3=CC=C(C=C3)OCC4COC(O4)(Cl)Cl",
    "CC1=C(C(=O)NC(=O)N1)N2CCN(CC2)C3=CC=C(C=C3)OCC4COC(O4)(Br)Br"
]

# Amino-acid sequences handed to prepare_data() as the targets.
targets = [
    "MKLSSGHASVLLIWFFLLVLFPIAGSQVQLVQSGAEVKKPGASVKVSCKASGYTFTGYYMHWVRQAPGQGLEWMGWINPNSGGTNYAQKFQGRVTMTRDTSISTAYMELSRLRSDDTAVYYCARGGWLLLSFDYWGQGTLVTVSS",
    "MKLPVRLLVLMFWIPASSSDVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKDRLSITIRPRYYGMDVWGQGTTVTVSS",
    "MEFGLSWLFLVAILKGVQCEVQLVESGGGLVQPGGSLRLSCAASGFPFNNYAMHWVRQAPGKGLEWVSVISYDGSNKYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARWGGDGFYAMDYWGQGTLVTVSS"
]

# Class labels handed to prepare_data(), which stores them as the training
# target (self.y).
activities = [1, 0, 1]

# Build the features, then fit the model; train_model() prints accuracy,
# precision, recall and F1 for its held-out split.
drug_discovery = DrugDiscoveryML()
drug_discovery.prepare_data(compounds, targets, activities)
drug_discovery.train_model()

# A compound/target pair that is not part of the training lists above.
new_compound = "CC1=C(C(=O)NC(=O)N1)N2CCN(CC2)C3=CC=C(C=C3)OCC4COC(O4)(I)I"
new_target = "MEFGLSWLFLVAILKGVQCEVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMSWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCAKGGWLLLSFDYWGQGTLVTVSS"

# Must run after train_model(): predict_activity() queries the fitted model.
activity_probability = drug_discovery.predict_activity(new_compound, new_target)
print(f"Predicted activity probability: {activity_probability:.2f}")


# Plot the fitted model's per-feature importances (opens a matplotlib figure).
drug_discovery.visualize_feature_importance()
