In [2]:
import pandas as pd
import numpy as np
from catboost import Pool, CatBoostRanker
from sklearn.model_selection import train_test_split

# Load your dataset
df = pd.read_csv('morg3.csv')

# Preprocessing: Ensuring the target variable is non-negative
df['MtbH37Rv-Inhibition'] = df['MtbH37Rv-Inhibition'].apply(lambda x: max(x, 0))

# Selecting features and target
X = df.select_dtypes(include=[np.number]).drop(columns=['MtbH37Rv-Inhibition'])
y = df['MtbH37Rv-Inhibition']

# Generate a unique group id for each compound (for illustration)
# In a real scenario, these should be meaningful groups
group_id = np.arange(len(df))

# Splitting the dataset
X_train, X_test, y_train, y_test, group_id_train, group_id_test = train_test_split(X, y, group_id, test_size=0.2, random_state=42)

# Prepare the Pool objects
train_pool = Pool(data=X_train, label=y_train, group_id=group_id_train)
test_pool = Pool(data=X_test, label=y_test, group_id=group_id_test)

# Initialize the CatBoostRanker
ranker = CatBoostRanker(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    loss_function='PairLogitPairwise',
    verbose=10,
    random_seed=42
)

# Train the model
ranker.fit(train_pool)

# Make predictions on the test set
predictions = ranker.predict(test_pool)

# Ranking the test set based on the predictions
# Note: Higher scores imply higher ranks
ranked_indices = np.argsort(-predictions)  # Descending order
top_25_indices = ranked_indices[:25]
top_25_compounds = X_test.iloc[top_25_indices]

print("Indices of Top 25 Compounds based on Direct Ranking:")
print(top_25_indices)

# If you want to display more information about the top compounds,
# ensure to maintain a reference to the original compound identifiers in your dataset.


  df = pd.read_csv('morg3.csv')


0:	learn: 0.0000000	total: 147ms	remaining: 14.5s
10:	learn: 0.0000000	total: 1.03s	remaining: 8.33s
20:	learn: 0.0000000	total: 1.91s	remaining: 7.18s
30:	learn: 0.0000000	total: 2.72s	remaining: 6.06s
40:	learn: 0.0000000	total: 3.53s	remaining: 5.08s
50:	learn: 0.0000000	total: 4.36s	remaining: 4.19s
60:	learn: 0.0000000	total: 5.18s	remaining: 3.31s
70:	learn: 0.0000000	total: 6.01s	remaining: 2.45s
80:	learn: 0.0000000	total: 6.82s	remaining: 1.6s
90:	learn: 0.0000000	total: 7.63s	remaining: 754ms
99:	learn: 0.0000000	total: 8.36s	remaining: 0us
Indices of Top 25 Compounds based on Direct Ranking:
[    0 28597 28598 28599 28600 28601 28602 28603 28596 28604 28606 28607
 28608 28609 28610 28611 28612 28605 28595 28594 28593 28576 28577 28578
 28579]
