In [1]:
import pandas as pd

In [62]:
import pandas as pd
import requests
from tqdm import tqdm
import time
import re

def extract_id_from_url(url):
    """Return the paper ID embedded in an OpenReview URL.

    The ID is whatever follows the first ``id=`` in the string, up to the next
    ``&`` or the end of the string. Returns ``None`` when no ``id=`` is present.
    """
    found = re.search(r'id=([^&]+)', url)
    if not found:
        return None
    return found.group(1)

def get_value(field):
    """Unwrap a ``{'value': ...}`` wrapper; return anything else unchanged."""
    is_wrapped = isinstance(field, dict) and 'value' in field
    return field['value'] if is_wrapped else field

def get_paper_info(url, timeout=30):
    """Get paper information using OpenReview REST API v2.

    Args:
        url: OpenReview URL carrying the paper ID in an ``id=`` query parameter.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scraping loop.

    Returns:
        A dict with the keys ``url``, ``title``, ``authors``, ``abstract``,
        ``keywords``, ``tldr`` and ``pdf_url``; or ``None`` when the URL has no
        ID, the API returns no note, or any error occurs (a message is printed
        in the latter two cases).
    """
    try:
        paper_id = extract_id_from_url(url)
        if not paper_id:
            return None

        api_url = f'https://api2.openreview.net/notes?id={paper_id}'

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'application/json'
        }

        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()

        notes = response.json().get('notes', [])
        if not notes:
            print(f"No paper data found for {url}")
            return None

        # Only the first returned note is used.
        content = notes[0].get('content', {})

        # Each field is passed through get_value, which unwraps {'value': ...} dicts.
        return {
            'url': url,
            'title': get_value(content.get('title', '')),
            'authors': get_value(content.get('authors', [])),
            'abstract': get_value(content.get('abstract', '')),
            'keywords': get_value(content.get('keywords', [])),
            'tldr': get_value(content.get('tldr', '')),
            'pdf_url': f"https://openreview.net/pdf?id={paper_id}"
        }

    except Exception as e:
        # Deliberately best-effort: one bad URL or failed request must not
        # abort a long scraping run, so report it and let the caller skip it.
        print(f"Error processing {url}: {str(e)}")
        return None

# Load the two header-less URL lists and label every row with its acceptance type.
spotlight = pd.read_csv('accepted-spotlight-aidrugx.csv', names=['url']).assign(type='spotlight')

# `accepted` ends up holding spotlight rows followed by poster rows.
accepted = pd.concat([
    spotlight,
    pd.read_csv('accepted-posters-aidrugx.csv', names=['url']).assign(type='accept'),
])


In [None]:
# Process each URL, carrying the acceptance type along with its own paper.
# Failed lookups are skipped, so the type has to be attached per record:
# copying accepted['type'].values onto the finished frame afterwards raises a
# length-mismatch error as soon as a single lookup fails.
papers_info = []
for url, paper_type in tqdm(
    zip(accepted['url'], accepted['type']),
    total=len(accepted),  # zip() has no length; give tqdm one so it can show progress
    desc="Processing papers",
):
    paper_info = get_paper_info(url)
    if paper_info:
        paper_info['type'] = paper_type
        papers_info.append(paper_info)
    # Add a small delay to be nice to the server
    time.sleep(1)

# Convert to DataFrame ('type' is the last key of every record, so it stays the last column)
papers_df = pd.DataFrame(papers_info)

# Save to CSV
papers_df.to_csv('aidrugx_papers_info.csv', index=False)

In [92]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import plotly.express as px

# 1. Load the data
# Reads the CSV written by the scraping cell ('aidrugx_papers_info.csv').
# NOTE(review): list-valued fields (authors, keywords) presumably come back as
# their string representation after the CSV round trip; the code below checks
# for strings and parses them.
papers_df = pd.read_csv('aidrugx_papers_info.csv')

# 2. Prepare text for embedding
def prepare_text(row):
    """Combine title, abstract and keywords of one paper into a single string.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with 'title', 'abstract'
            and 'keywords' entries. 'keywords' may be a list/tuple, the string
            representation of one (as read back from CSV), a plain string, or
            a missing value.

    Returns:
        'Title: ... . Abstract: ... . Keywords: k1, k2' as one string; newlines
        in the abstract are replaced by spaces.
    """
    import ast  # function-scope import keeps this cell self-contained

    raw_keywords = row['keywords']
    if isinstance(raw_keywords, str) and raw_keywords != 'nan':
        # The text comes from scraped data, so parse it as a literal only;
        # eval() would execute whatever expression the CSV happens to contain.
        try:
            keywords = ast.literal_eval(raw_keywords)
        except (ValueError, SyntaxError, TypeError):
            keywords = raw_keywords
        if not isinstance(keywords, (list, tuple, set)):
            # Not a list literal (e.g. plain "drug discovery"): keep the raw
            # text as a single keyword instead of crashing or joining characters.
            keywords = [raw_keywords]
    elif isinstance(raw_keywords, (list, tuple)):
        keywords = raw_keywords
    else:
        # NaN / None: no keywords available
        keywords = []

    elements = [
        'Title: ' + str(row['title']),
        'Abstract: ' + str(row['abstract']).replace('\n', ' '),
        'Keywords: ' + ', '.join(str(keyword) for keyword in keywords),
    ]
    return '. '.join(elements)

# Create combined text for each paper
# axis=1 passes one row at a time to prepare_text; the result is stored in a new column.
papers_df['combined_text'] = papers_df.apply(prepare_text, axis=1)

In [94]:
# 3. Generate embeddings
# Encode every combined text with the 'all-MiniLM-L6-v2' sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(
    list(papers_df['combined_text']),
    show_progress_bar=True,
)


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [129]:
# 4. Reduce dimensionality for visualization
# Project the embeddings onto their first three principal components.
pca = PCA(n_components=3)
embeddings_3d = pca.fit_transform(embeddings)
# Share of the total variance captured by each of the three components.
print("Explained variance ratio:", pca.explained_variance_ratio_)


Explained variance ratio: [0.085468   0.06098753 0.05101749]


In [130]:
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt
import plotly.graph_objects as go

# 5. Create interactive visualization
def format_authors(authors_str):
    """Format an author list as 'First ... Last'.

    Args:
        authors_str: A list of author names, or the string representation of
            such a list (as read back from CSV).

    Returns:
        All names joined by ', ' when there are at most two authors, otherwise
        'first ... last' with stray commas stripped from both names. Returns ''
        when the input is missing or cannot be interpreted as a list of names.
    """
    import ast  # function-scope import keeps this cell self-contained

    try:
        # The text comes from scraped data, so parse it as a literal only;
        # eval() would execute whatever expression the CSV happens to contain.
        authors = ast.literal_eval(authors_str) if isinstance(authors_str, str) else authors_str
        if isinstance(authors, str):
            # A single quoted name rather than a list of names
            authors = [authors]
        if len(authors) <= 2:
            return ", ".join(authors)
        return f"{authors[0].strip(',')} ... {authors[-1].strip(',')}"
    except (ValueError, SyntaxError, TypeError, AttributeError, IndexError, KeyError):
        # Unparseable or missing author data (e.g. NaN): show nothing. Only
        # data-related errors are swallowed, so Ctrl-C still interrupts.
        return ""


# Create an HTML table with links and cluster-based styling
def create_paper_table(df, linkage_matrix, n_clusters=5):
    """Render the papers as an HTML table grouped by hierarchical cluster.

    Args:
        df: DataFrame with 'title', 'authors', 'type' and 'url' columns. Its
            rows are expected to be in the dendrogram leaf order of
            ``linkage_matrix``, because cluster labels are matched to rows by
            position.
        linkage_matrix: SciPy hierarchical-clustering linkage matrix.
        n_clusters: Maximum number of flat clusters to cut the tree into.

    Returns:
        An ``IPython.display.HTML`` object; the raw markup is in its ``.data``.
    """
    # Function-scope imports: the notebook never imports these names, so the
    # function would otherwise only work if an earlier session had defined HTML.
    from html import escape
    from IPython.display import HTML

    # Cut the tree to get cluster assignments, then put them in leaf order.
    # The order is derived from linkage_matrix itself rather than read from a
    # notebook-level variable, so the result depends only on the arguments.
    leaf_order = sch.leaves_list(linkage_matrix)
    clusters = sch.fcluster(linkage_matrix, n_clusters, criterion='maxclust')[leaf_order]

    parts = ["""
    <table style="width:100%; border-collapse: collapse;">
      <tr style="background-color: #f2f2f2;">
        <th style="padding: 8px; border: 1px solid #ddd;">Title</th>
        <th style="padding: 8px; border: 1px solid #ddd;">Authors</th>
        <th style="padding: 8px; border: 1px solid #ddd;">Type</th>
        <th style="padding: 8px; border: 1px solid #ddd;">Link</th>
      </tr>
    """]

    current_cluster = None
    bg_color = "#ffffff"
    for (_, row), cluster in zip(df.iterrows(), clusters):
        cluster = int(cluster)
        # Add visual separator between clusters
        if current_cluster != cluster:
            # Alternate the row background between consecutive clusters
            bg_color = "#f9f9f9" if cluster % 2 == 0 else "#ffffff"
            parts.append(
                f'<tr><td colspan="4" style="background-color: #e6e6e6; padding: 4px; text-align: center;"><b>Cluster {cluster}</b></td></tr>'
            )
            current_cluster = cluster

        paper_type = str(row['type'])
        # Style spotlight papers differently
        type_style = 'font-weight: bold; color: #FF6B00;' if paper_type == 'spotlight' else ''

        # Titles, authors and URLs are scraped text: escape them so characters
        # such as '&', '<' or '"' cannot break the markup or inject HTML.
        title = escape(str(row['title']))
        authors = escape(str(row['authors']))
        type_label = escape(paper_type.title())
        link = escape(str(row['url']), quote=True)

        parts.append(f"""
        <tr style="background-color: {bg_color}">
            <td style="padding: 8px; border: 1px solid #ddd;">{title}</td>
            <td style="padding: 8px; border: 1px solid #ddd;">{authors}</td>
            <td style="padding: 8px; border: 1px solid #ddd; {type_style}">{type_label}</td>
            <td style="padding: 8px; border: 1px solid #ddd;">
                <a href="{link}" target="_blank">Open Paper</a>
            </td>
        </tr>
        """)
    parts.append("</table>")
    return HTML("".join(parts))

In [146]:

# Perform hierarchical clustering on the full embeddings (not the PCA projection)
# Calculate the pairwise distance matrix between paper embeddings
distances = pdist(embeddings)
# Create linkage matrix using Ward linkage
linkage_matrix = sch.linkage(distances, method='ward')
# Get the dendrogram leaf ordering of the papers (positions into papers_df)
cluster_order = sch.leaves_list(linkage_matrix)

# Build the plotting frame and reorder its rows into leaf order.
# .iloc keeps the original index labels, so rows can still be matched
# back to papers_df by label.
df_plot = pd.DataFrame({
    'PC1': embeddings_3d[:, 0],
    'PC2': embeddings_3d[:, 1],
    'PC3': embeddings_3d[:, 2],
    'title': papers_df['title'],
    'authors': papers_df['authors'].apply(format_authors),
    'url': papers_df['url']
}).iloc[cluster_order]

# Get cluster assignments: cut the tree into at most n_clusters flat clusters
n_clusters = 15  # adjust as needed
clusters = sch.fcluster(linkage_matrix, n_clusters, criterion='maxclust')
clusters = clusters[cluster_order]  # reorder clusters to match df

# Add clusters to df_plot (a plain array, so it is assigned by position)
df_plot['cluster'] = clusters

# Define discrete colors for clusters: one per cluster, taken from the Light24 palette
discrete_colors = px.colors.qualitative.Light24[:n_clusters]  # 24 distinct colors

# Add type information to df_plot; both sides went through the same
# .iloc[cluster_order] reordering, so their index labels line up
df_plot['type'] = papers_df['type'].iloc[cluster_order]

# Map cluster numbers to colors directly (labels are treated as 1-based, hence c-1)
cluster_colors = [discrete_colors[int(c-1)] for c in df_plot['cluster']]

def wrap_text(text, width=60):
    """Insert an HTML line break after every ``width`` characters of ``text``.

    The split is purely positional: words are not kept together.
    """
    starts = range(0, len(text), width)
    pieces = [text[start:start + width] for start in starts]
    return '<br>'.join(pieces)


# 2D scatter of the papers on the first two principal components.
# df_plot also carries PC3, but only PC1 and PC2 are plotted here.

# Create the figure. Trace order matters: trace 0 is the per-type view,
# trace 1 the per-cluster view, and traces 2.. are legend-only entries.
# The toggle buttons below address traces by this position.
fig = go.Figure()

# Trace 0: papers colored by acceptance type (initially visible)
fig.add_trace(
    go.Scatter(
        x=df_plot['PC1'],
        y=df_plot['PC2'],
        mode='markers',
        marker=dict(
            # Spotlight papers are drawn as larger orange stars, all others as small blue circles
            size=df_plot['type'].apply(lambda x: 15 if x == 'spotlight' else 8),
            color=df_plot['type'].apply(lambda x: 'orange' if x == 'spotlight' else 'blue'),
            symbol=df_plot['type'].apply(lambda x: 'star' if x == 'spotlight' else 'circle')
        ),
        name='Papers',
        # Hover text: wrapped title plus the shortened author list
        text=df_plot.apply(lambda x: f"<b>Title:</b> {wrap_text(x['title'])}<br><b>Authors:</b> {x['authors']}", axis=1),
        # <extra></extra> suppresses the secondary hover box with the trace name
        hovertemplate="%{text}<extra></extra>",
        visible=True
    )
)

# Trace 1: same points colored by cluster (initially hidden)
fig.add_trace(
    go.Scatter(
        x=df_plot['PC1'],
        y=df_plot['PC2'],
        mode='markers',
        marker=dict(
            size=df_plot['type'].apply(lambda x: 15 if x == 'spotlight' else 8),
            symbol=df_plot['type'].apply(lambda x: 'star' if x == 'spotlight' else 'circle'),
            color=cluster_colors,
        ),
        name='Clusters',
        # Hover text additionally shows the cluster number
        text=df_plot.apply(lambda x: f"<b>Title:</b> {wrap_text(x['title'])}<br><b>Authors:</b> {x['authors']}<br><b>Cluster:</b> {int(x['cluster'])}", axis=1),
        hovertemplate="%{text}<extra></extra>",
        visible=False
    )
)

# Traces 2..: one point-less trace per cluster, used only to get a legend
# entry in that cluster's color (hidden until the cluster view is selected)
for i in range(n_clusters):
    fig.add_trace(
        go.Scatter(
            x=[None], y=[None],
            mode='markers',
            marker=dict(size=10, color=discrete_colors[i]),
            name=f'Cluster {i+1}',
            showlegend=True,
            visible=False
        )
    )

# Buttons that switch between the two views by toggling trace visibility.
# Each "visible" list has one entry per trace, in the order the traces were added.
fig.update_layout(
    updatemenus=[
        dict(
            type="buttons",
            direction="left",
            buttons=list([
                dict(
                    # Show trace 0 only; hide the cluster trace and its legend entries
                    args=[{"visible": [True, False] + [False]*n_clusters}],
                    label="Spotlight",
                    method="restyle"
                ),
                dict(
                    # Show the cluster trace and its legend entries; hide trace 0
                    args=[{"visible": [False, True] + [True]*n_clusters}],
                    label="Semantic Clusters",
                    method="restyle"
                )
            ]),
            pad={"r": 10, "t": 10},
            showactive=True,
            x=0.1,
            xanchor="left",
            y=1.1,
            yanchor="top"
        ),
    ],
    # Add 2D-specific layout settings
    xaxis_title="PC1",
    yaxis_title="PC2",
    width=1000,
    height=800,
    hovermode='closest',  # hover shows the single nearest point
    hoverdistance=100,
    hoverlabel=dict(
        bgcolor="white",
        font_size=12,
        font_family="Arial"
    ),
    hoverlabel_align='left',
    margin=dict(r=200, t=100, b=100, l=100)  # Adjusted margins
)

# Render the interactive figure, then the cluster-grouped paper table below it
fig.show()
display(create_paper_table(df_plot, linkage_matrix, n_clusters=n_clusters))

Title,Authors,Type,Link
Cluster 1,Cluster 1,Cluster 1,Cluster 1
DiffER: Categorical Diffusion Models for Chemical Retrosynthesis,Sean Current ... srinivasan parthasarathy,Accept,Open Paper
Homomorphism Counts as Structural Encodings for Molecular Property Prediction,Linus Bao ... Matthias Lanzinger,Accept,Open Paper
SmileyLlama: Modifying Large Language Models \\for Directed Chemical Space Exploration,Joe Cavanagh ... Thomas D. Bannister,Accept,Open Paper
An Efficient Tokenization for Molecular Language Models,Seojin Kim ... Jinwoo Shin,Accept,Open Paper
Chain-of-thoughts for molecular understanding,Yunhui Jang ... Sungsoo Ahn,Accept,Open Paper
Cluster 2,Cluster 2,Cluster 2,Cluster 2
Deep Interactions for Multimodal Molecular Property Prediction,Patrick Soga ... Jundong Li,Accept,Open Paper
Geometry-text Multi-modal Foundation Model for Reactivity-oriented Molecule Editing,Haorui Li ... Anima Anandkumar,Accept,Open Paper
3D Interaction Geometric Pre-training for Molecular Relational Learning,Namkyeong Lee ... Chanyoung Park,Accept,Open Paper


In [147]:
# Export Plotly visualization as standalone HTML
fig.write_html(
    "aidrugx_semantic_viz.html",
    include_plotlyjs="cdn",  # Use CDN for plotly.js
    full_html=True,          # Create standalone HTML file
    config={
        'displayModeBar': True,
        'responsive': True
    }
)

# Export table
# The table is rebuilt here and its raw markup (.data) is embedded in a small
# HTML fragment. Doubled braces are f-string escapes: the file receives literal
# "{% raw %}" / "{% endraw %}" markers and single-brace CSS rules.
# NOTE(review): the raw/endraw markers presumably protect the fragment from a
# template engine on the site that includes this file; confirm.
table_content = f"""
{{% raw %}}
<div class="table-container">
    {create_paper_table(df_plot, linkage_matrix, n_clusters=n_clusters).data}
</div>

<style>
    .table-container {{
        margin-top: 40px;
        overflow-x: auto;
    }}
</style>
{{% endraw %}}
"""

# Written as UTF-8 so non-ASCII characters in titles and author names are preserved
with open('aidrugx_papers_table.html', 'w', encoding='utf-8') as f:
    f.write(table_content)
