In [None]:
!pip install rdkit==2023.3.2

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import label_binarize
from rdkit import Chem
from rdkit.Chem import Descriptors


class DescriptorCalculator:
    """Evaluate a list of named descriptor functions on a molecule.

    Parameters
    ----------
    desc_list : list of (str, callable), optional
        Pairs of descriptor name and a function that takes the molecule.
        When omitted (or empty), ``Descriptors.descList`` is used.
    """

    def __init__(self, desc_list=None):
        self.desc_list = desc_list if desc_list else Descriptors.descList

    def calculate_descriptors(self, mol):
        """Return a ``{name: value}`` dict with every descriptor applied to ``mol``."""
        return {name: function(mol) for name, function in self.desc_list}

def count_atoms(mol, atom_symbol):
    """Return how many atoms of ``mol`` have the element symbol ``atom_symbol``."""
    symbols = (atom.GetSymbol() for atom in mol.GetAtoms())
    return sum(1 for symbol in symbols if symbol == atom_symbol)


def count_nitrogen_atoms(mol):
    """Return the number of atoms in ``mol`` whose symbol is "N"."""
    return count_atoms(mol, "N")


def count_oxygen_atoms(mol):
    """Return the number of atoms in ``mol`` whose symbol is "O"."""
    return count_atoms(mol, "O")


# (name, function) pairs handed to DescriptorCalculator; the names become the
# descriptor column names, in this order.
default_descriptors = list(
    {
        "ExactMolWt": Descriptors.ExactMolWt,
        "MolLogP": Descriptors.MolLogP,
        "TPSA": Descriptors.TPSA,
        "NumHDonors": Descriptors.NumHDonors,
        "NumHAcceptors": Descriptors.NumHAcceptors,
        "NumRotatableBonds": Descriptors.NumRotatableBonds,
        "FractionCSP3": Descriptors.FractionCSP3,
        "NumAromaticRings": Descriptors.NumAromaticRings,
        "MaxPartialCharge": Descriptors.MaxPartialCharge,
        "MinPartialCharge": Descriptors.MinPartialCharge,
        # Element counts implemented in this notebook rather than taken from RDKit.
        "NumNitrogen": count_nitrogen_atoms,
        "NumOxygen": count_oxygen_atoms,
    }.items()
)


def append_smiles_descriptors_to_df(
    df, smiles_col="SMILES", desc_list=None, keep_smiles_col=False
):
    """Append descriptors calculated from SMILES to a dataframe.

    Rows whose SMILES cannot be parsed, or for which a descriptor function
    raises, are reported on stdout, left out of the returned dataframe and
    listed in the second return value. The input dataframe is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing a column with SMILES.
    smiles_col : str, optional
        Name of the column containing SMILES, by default "SMILES".
    desc_list : list, optional
        List of ``(name, function)`` descriptors, by default None
        (``DescriptorCalculator`` then falls back to its own default list).
    keep_smiles_col : bool, optional
        Whether to keep the column containing SMILES, by default False.

    Returns
    -------
    df_with_descriptors : pandas.DataFrame
        The successfully processed rows of ``df`` (index reset to 0..n-1)
        with one additional column per descriptor.
    problematic_smiles : list of tuple
        ``(index_label, smiles)`` for every row that was skipped.
    """

    calculator = DescriptorCalculator(desc_list)

    problematic_smiles = []
    descriptors_list = []
    valid_positions = []

    # Track row positions instead of index labels: selecting by label would
    # return every row sharing a label, so a dataframe with a non-unique index
    # would gain rows and end up misaligned with the descriptor table.
    for position, (idx, smiles) in enumerate(df[smiles_col].items()):
        try:
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                raise ValueError("Invalid molecule")
            descriptors = calculator.calculate_descriptors(mol)
            descriptors_list.append(descriptors)
            valid_positions.append(position)
        except Exception as e:
            # Deliberately best-effort: one bad row must not abort the whole table.
            print(f"Problematic SMILES at index {idx}: {smiles}, Error: {e}")
            problematic_smiles.append((idx, smiles))

    descriptors_df = pd.DataFrame(descriptors_list)
    kept_rows = df.iloc[valid_positions].reset_index(drop=True)
    df_with_descriptors = pd.concat([kept_rows, descriptors_df], axis=1)

    if not keep_smiles_col:
        df_with_descriptors = df_with_descriptors.drop(columns=[smiles_col])

    return df_with_descriptors, problematic_smiles

In [None]:
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour, TomekLinks

In [None]:
# Training table; `df` is later expanded with descriptors and supplies the target columns.
df = pd.read_csv("/kaggle/input/pqq-related/training0827v1.csv")
df

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Separate table used for the "BBB" model (its "label" column is the target).
df_bbb = pd.read_csv("/kaggle/input/deepbbb/BBBtrain.csv")
df_bbb

In [None]:
df_bbb.duplicated().sum()

In [None]:
# Compounds to predict on; later cells read its "SMILES" and "cid" columns.
df_test = pd.read_csv("/kaggle/input/pqq-related/test.csv")
df_test

In [None]:
df_test.duplicated().sum()

In [None]:
# Training rows whose SMILES string also occurs in the test table. This is an
# exact string comparison, so the same molecule written as a different SMILES
# string is not detected.
duplicates = df[df["SMILES"].isin(df_test["SMILES"])]

print(duplicates)

In [None]:
# Same overlap check for the BBB table; note that this rebinds `duplicates`.
duplicates = df_bbb[df_bbb["SMILES"].isin(df_test["SMILES"])]

print(duplicates)

In [None]:
import plotly.express as px

def draw_pie_for_target(df, target):
    """Show a donut chart of how often each value of column ``target`` occurs in ``df``."""
    palette = ['#4c78a8', '#72b7b2']
    fig = px.pie(
        df,
        names=target,
        title=f'{target} Overview',
        hole=0.7,
        width=600,
        height=400,
        color_discrete_sequence=palette,
    )

    # Trace styling: labels (percentage + category) are drawn outside the ring.
    trace_style = dict(
        hovertemplate=None,
        textposition='outside',
        textinfo='percent+label',
        rotation=0,
    )
    fig.update_traces(**trace_style)

    # Figure-level styling: margins, background colors and fonts; the legend is hidden.
    layout_style = dict(
        margin=dict(t=100, b=30, l=0, r=0),
        showlegend=False,
        plot_bgcolor='#fafafa',
        paper_bgcolor='#fafafa',
        title_font=dict(size=20, color='#555', family="Lato, sans-serif"),
        font=dict(size=17, color='#8a8d93'),
        hoverlabel=dict(bgcolor="#444", font_size=13, font_family="Lato, sans-serif"),
    )
    fig.update_layout(**layout_style)

    fig.show()

In [None]:
import plotly.express as px
from plotly.subplots import make_subplots
import math

def draw_pie_for_targets(df, targets):
    """Show one donut chart per column in ``targets``, laid out on a near-square grid.

    ``targets`` is either a list of column names or a single column name.
    One value -> color mapping, built from the values of all requested
    columns, is passed to every chart.
    """
    target_list = targets if isinstance(targets, list) else [targets]

    n_charts = len(target_list)
    n_cols = math.ceil(math.sqrt(n_charts))
    n_rows = math.ceil(n_charts / n_cols)

    # Every grid cell must be declared as a pie-type subplot.
    pie_specs = [[{'type': 'pie'} for _ in range(n_cols)] for _ in range(n_rows)]
    fig = make_subplots(
        rows=n_rows,
        cols=n_cols,
        subplot_titles=target_list,  # titles shown above each pie chart
        specs=pie_specs,
    )

    # Alternate the two palette colors over the sorted distinct values.
    palette = ['#4c78a8', '#72b7b2']
    distinct_values = set()
    for target in target_list:
        distinct_values |= set(df[target].unique())
    color_map = {
        value: palette[position % len(palette)]
        for position, value in enumerate(sorted(distinct_values))
    }

    for position, target in enumerate(target_list):
        grid_row, grid_col = divmod(position, n_cols)
        donut = px.pie(df, names=target, hole=0.7, color=target, color_discrete_map=color_map)
        for trace in donut.data:
            fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

    # Same trace and layout styling as the single-target chart, with smaller fonts.
    fig.update_traces(
        hovertemplate=None,
        textposition='outside',
        textinfo='percent+label',
        rotation=0,
    )
    fig.update_layout(
        margin=dict(t=100, b=30, l=0, r=0),
        showlegend=False,
        plot_bgcolor='#fafafa',
        paper_bgcolor='#fafafa',
        font=dict(size=13, color='#8a8d93'),
        hoverlabel=dict(bgcolor="#444", font_size=13, font_family="Lato, sans-serif"),
        title_font=dict(size=15, color='#555', family="Lato, sans-serif"),
    )

    fig.show()

In [None]:
def run_experiments(create_model_func, X, y, resampler=None, threshold=0.5):
    """Cross-validate a binary classifier and print its averaged metrics.

    A fresh model is created for every split of a 5-fold stratified
    cross-validation repeated 5 times (``random_state=42``), fitted on the
    training part and scored on the held-out part.

    Parameters
    ----------
    create_model_func : callable
        Zero-argument factory returning an unfitted estimator that provides
        ``fit`` and ``predict_proba``.
    X : pandas.DataFrame
        Feature table (rows are selected with ``.iloc``).
    y : pandas.Series
        Binary labels encoded as 0/1; column 1 of ``predict_proba`` is used
        as the score of the positive class.
    resampler : object, optional
        Object with ``fit_resample(X, y)``. When given it is applied to the
        training part of each split only, never to the held-out part.
    threshold : float, optional
        Scores greater than or equal to this value are predicted as 1.

    Returns
    -------
    list of numpy.ndarray
        One 2x2 confusion matrix per split, with label order ``[0, 1]``.
    """
    accs, precs, recs, f1s, cms = [], [], [], [], []

    rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=5, random_state=42)

    for train_index, test_index in rskf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        clf = create_model_func()

        if resampler is not None:
            X_train, y_train = resampler.fit_resample(X_train, y_train)

        clf.fit(X_train, y_train)

        # Score the held-out rows and apply the decision threshold.
        y_proba = clf.predict_proba(X_test)[:, 1]
        y_pred = (y_proba >= threshold).astype(int)

        # zero_division=0 yields the same 0.0 as the default but without a
        # warning when a split has no predicted (or no true) positives.
        accs.append(accuracy_score(y_test, y_pred))
        precs.append(precision_score(y_test, y_pred, zero_division=0))
        recs.append(recall_score(y_test, y_pred, zero_division=0))
        f1s.append(f1_score(y_test, y_pred, zero_division=0))
        # Fix the label set so every matrix is 2x2 even if a split contains a
        # single class; otherwise the matrices could not be summed afterwards.
        cms.append(confusion_matrix(y_test, y_pred, labels=[0, 1]))

    # Print the average metrics
    print(f"Average Accuracy: {np.mean(accs)}")
    print(f"Average Precision: {np.mean(precs)}")
    print(f"Average Recall: {np.mean(recs)}")
    print(f"Average F1 Score: {np.mean(f1s)}")

    return cms

In [None]:
def create_model():
    """Return a fresh, unfitted random forest with the notebook's default settings.

    The seed is fixed so that repeated runs of the notebook give the same
    cross-validation metrics and the same final model.
    """
    return RandomForestClassifier(
        class_weight='balanced_subsample',
        n_estimators=90,
        min_samples_split=2,
        min_samples_leaf=1,
        max_depth=15,
        random_state=42,
    )

In [None]:
# List the columns of the training table.
df.columns

In [None]:
# Label columns of `df`; one classifier is trained per entry in the
# "target N" sections, which select them by position (targets[0] ... targets[8]).
targets = ['WP:3844', 'GO:0000165', 'GO:0004896', 'KEGG:hsa04064',
       'KEGG:hsa04210', 'KEGG:hsa04630', 'GO:0098869', 'GO:0072593', 'GO:0006281']
targets

In [None]:
# Descriptor columns used as model inputs; same names and order as the
# entries of `default_descriptors`.
features = ['ExactMolWt', 'MolLogP', 'TPSA', 'NumHDonors', 'NumHAcceptors', 'NumRotatableBonds',
           'FractionCSP3', 'NumAromaticRings', 'MaxPartialCharge', 'MinPartialCharge', 
           'NumNitrogen', 'NumOxygen']
features

In [None]:
# Value distribution of every target column, one donut chart each.
draw_pie_for_targets(df, targets)

In [None]:
# Larger single chart for the second target ('GO:0000165').
draw_pie_for_target(df, targets[1])

In [None]:
from rdkit import RDLogger

# Disable RDKit warnings
RDLogger.DisableLog('rdApp.*')  

In [None]:
# Replace the SMILES column of the training table by the descriptor columns;
# the returned list of problematic SMILES is discarded here.
dfnew, _ = append_smiles_descriptors_to_df(df, desc_list=default_descriptors)
dfnew

In [None]:
# Number of missing values per column.
dfnew.isna().sum()

In [None]:
# Inspect the rows for which MaxPartialCharge is missing.
dfnew[dfnew["MaxPartialCharge"].isna()]

In [None]:
# Drop every row that has at least one missing value.
dfnew = dfnew.dropna()
dfnew

In [None]:
# Identify rows with infinity or NaN values
# (NaN rows were already removed by dropna above, so this effectively flags
# infinities; np.isinf/np.isnan assume every remaining column is numeric — TODO confirm.)
invalid_rows = np.any(np.isinf(dfnew) | np.isnan(dfnew), axis=1)
invalid_rows

In [None]:
# Keep only the rows that were not flagged above.
dfnew = dfnew[~invalid_rows]
dfnew

In [None]:
dfnew_bbb, _ = append_smiles_descriptors_to_df(df_bbb, desc_list=default_descriptors)
dfnew_bbb

In [None]:
# Identify rows with infinity or NaN values
invalid_rows = np.any(np.isinf(dfnew_bbb) | np.isnan(dfnew_bbb), axis=1)
invalid_rows

In [None]:
dfnew_bbb = dfnew_bbb[~invalid_rows]
dfnew_bbb

In [None]:
# Use CUDA when PyTorch reports it as available, otherwise the CPU.
# NOTE(review): `device` is not referenced by any other cell visible in this notebook.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

### target BBB

In [None]:
X = dfnew_bbb[features]
y = dfnew_bbb["label"]

In [None]:
tbbb = create_model

In [None]:
cms = run_experiments(tbbb, X, y)

In [None]:
cm_sum = np.sum(cms, axis=0)
plt.figure(figsize=(10, 7))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Sum of Confusion Matrices of all folds")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
clf_targetbbb = tbbb()
clf_targetbbb.fit(X, y)

In [None]:
# Rebind X from the BBB table to the descriptor columns of the main training
# table; every "target N" section below passes this X to run_experiments and fit.
X = dfnew[features]
X

### target 1

In [None]:
y = dfnew[targets[0]]
y

In [None]:
t1 = lambda: XGBClassifier(n_estimators=98, max_depth=11, scale_pos_weight=9.036)

In [None]:
cms = run_experiments(t1, X, y)

In [None]:
cm_sum = np.sum(cms, axis=0)
plt.figure(figsize=(10, 7))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Sum of Confusion Matrices of all folds")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
clf_target1 = t1()
clf_target1.fit(X, y)

### target 2

In [None]:
y = dfnew[targets[1]]

In [None]:
t2 = lambda: XGBClassifier(n_estimators=50, max_depth=20, scale_pos_weight=9.483)

In [None]:
cms = run_experiments(t2, X, y)

In [None]:
cm_sum = np.sum(cms, axis=0)
plt.figure(figsize=(10, 7))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Sum of Confusion Matrices of all folds")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
clf_target2 = t2()
clf_target2.fit(X, y)

### target 3

In [None]:
y = dfnew[targets[2]]

In [None]:
t3 = lambda: CatBoostClassifier(iterations=276, depth=9, verbose=False)

In [None]:
cms = run_experiments(t3, X, y)

In [None]:
cm_sum = np.sum(cms, axis=0)
plt.figure(figsize=(10, 7))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Sum of Confusion Matrices of all folds")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
clf_target3 = t3()
clf_target3.fit(X, y)

### target 4

In [None]:
y = dfnew[targets[3]]

In [None]:
t4 = lambda: CatBoostClassifier(iterations=288, depth=7, verbose=False)

In [None]:
cms = run_experiments(t4, X, y)

In [None]:
cm_sum = np.sum(cms, axis=0)
plt.figure(figsize=(10, 7))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Sum of Confusion Matrices of all folds")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
clf_target4 = t4()
clf_target4.fit(X, y)

### target 5

In [None]:
y = dfnew[targets[4]]

In [None]:
t5 = lambda: CatBoostClassifier(iterations=295, depth=9, verbose=False)

In [None]:
cms = run_experiments(t5, X, y)

In [None]:
cm_sum = np.sum(cms, axis=0)
plt.figure(figsize=(10, 7))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Sum of Confusion Matrices of all folds")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
clf_target5 = t5()
clf_target5.fit(X, y)

### target 6

In [None]:
y = dfnew[targets[5]]

In [None]:
t6 = lambda: XGBClassifier(n_estimators=110, max_depth=9, scale_pos_weight=8.830)

In [None]:
cms = run_experiments(t6, X, y)

In [None]:
cm_sum = np.sum(cms, axis=0)
plt.figure(figsize=(10, 7))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Sum of Confusion Matrices of all folds")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
clf_target6 = t6()
clf_target6.fit(X, y)

### target 7

In [None]:
y = dfnew[targets[6]]

In [None]:
t7 = lambda: XGBClassifier(n_estimators=149, max_depth=6, scale_pos_weight=6.929)

In [None]:
cms = run_experiments(t7, X, y)

In [None]:
cm_sum = np.sum(cms, axis=0)
plt.figure(figsize=(10, 7))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Sum of Confusion Matrices of all folds")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
clf_target7 = t7()
clf_target7.fit(X, y)

### target 8

In [None]:
y = dfnew[targets[7]]

In [None]:
t8 = lambda: XGBClassifier(n_estimators=130, max_depth=18, scale_pos_weight=2.499)

In [None]:
cms = run_experiments(t8, X, y)  # resampler=SMOTE()

In [None]:
cm_sum = np.sum(cms, axis=0)
plt.figure(figsize=(10, 7))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Sum of Confusion Matrices of all folds")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
clf_target8 = t8()
clf_target8.fit(X, y)

### target 9

In [None]:
y = dfnew[targets[8]]

In [None]:
t9 = lambda: XGBClassifier(n_estimators=140, max_depth=20, scale_pos_weight=1.822)

In [None]:
cms = run_experiments(t9, X, y)

In [None]:
cm_sum = np.sum(cms, axis=0)
plt.figure(figsize=(10, 7))
sns.heatmap(cm_sum, annot=True, fmt="d", cmap="YlGnBu")
plt.title("Sum of Confusion Matrices of all folds")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
clf_target9 = t9()
clf_target9.fit(X, y)

### Put together

In [None]:
all_models = [clf_targetbbb, clf_target1, clf_target2, clf_target3, clf_target4, clf_target5, clf_target6, clf_target7,
              clf_target8, clf_target9]

In [None]:
targets = ["BBB"] + targets
targets

In [None]:
dfnew_test, _ = append_smiles_descriptors_to_df(df_test, desc_list=default_descriptors)
dfnew_test

In [None]:
X_test = dfnew_test.drop(columns=["cid"])
X_test

In [None]:
predicted_features = {}
threshold = 0.5

for i, model in enumerate(all_models):
    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    feature_name = targets[i]
    predicted_features[feature_name] = y_pred

# Convert the dictionary of predictions into a DataFrame
y_pred_df = pd.DataFrame(predicted_features)
y_pred_df

In [None]:
y_pred_df['count_1s'] = y_pred_df.sum(axis=1)
# Concatenate X_test with y_pred_df
result = pd.concat([dfnew_test["cid"], X_test, y_pred_df], axis=1)

## Result

In [None]:
# One row per test compound: cid, descriptor values, per-model 0/1 predictions and their sum.
result

In [None]:
# Compounds with the highest number of positive predictions.
result[result['count_1s'] == result['count_1s'].max()]

In [None]:
# Compounds with the lowest number of positive predictions.
result[result['count_1s'] == result['count_1s'].min()]

In [None]:
# Write the full result table to the working directory, without the index column.
result.to_csv("result.csv", index=False)