# Load

In [None]:
import pandas as pd
# Read the raw CSV into `data`; the model-selection and training cells draw
# their samples from this frame.
data = pd.read_csv(filepath_or_buffer="dblp-v11.csv")

# Print column names, dtypes and non-null counts as a quick sanity check.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  100000 non-null  int64 
 1   abstract    82990 non-null   object
 2   authors     99999 non-null   object
 3   n_citation  100000 non-null  int64 
 4   references  87625 non-null   object
 5   title       100000 non-null  object
 6   venue       82305 non-null   object
 7   year        100000 non-null  int64 
 8   id          100000 non-null  object
dtypes: int64(3), object(6)
memory usage: 6.9+ MB


# Regressor

### Model selection

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, DMatrix
import torch

# Check for GPU availability. Every library call below takes its device from
# here, so the cell also runs (slowly) on a CPU-only machine instead of
# crashing on a hard-coded 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
device_name = device.type  # plain 'cuda' / 'cpu' string for the calls below

# Sample data for faster prototyping
df = data.sample(n=1_000, random_state=1)

# Preprocess data
# Handle missing values: a missing abstract becomes '' so the concatenation
# below does not turn the whole text into NaN.
df['abstract'] = df['abstract'].fillna('')
df['text'] = df['title'] + " " + df['abstract']

# Prepare features: publication year rescaled to [0, 1] over this sample
scaler = MinMaxScaler()
df['year_scaled'] = scaler.fit_transform(df[['year']])

# Prepare target with log transformation (inverted with expm1 when scoring)
df['log_citation'] = np.log1p(df['n_citation'])

# Generate sentence embeddings on the selected device
model = SentenceTransformer('all-mpnet-base-v2', device=device_name)
embeddings = model.encode(
    df['text'].tolist(),
    batch_size=128,
    show_progress_bar=True,
    device=device_name
)

# Combine features: one row per paper = [text embedding..., scaled year]
X = np.hstack([embeddings, df['year_scaled'].values.reshape(-1, 1)])
y = df['log_citation'].values

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define and train models with GridSearch.
# The old `predictor='gpu_predictor'` argument is gone: the recorded run
# reported it as an unused parameter, and `device` alone selects the hardware.
models = [
    {
        'name': 'XGBoost',
        'model': XGBRegressor(tree_method="hist", device=device_name),
        'params': {
            'n_estimators': [200, 300],
            'max_depth': [6, 8],
            'learning_rate': [0.05, 0.1],
            'subsample': [0.8, 1.0]
        }
    },
    {
        'name': 'RandomForest',
        # Seeded so the comparison against XGBoost repeats between runs.
        'model': RandomForestRegressor(random_state=42),
        'params': {
            'n_estimators': [200, 300],
            'max_depth': [None, 20],
            'min_samples_split': [3, 5]
        }
    }
]

best_model = None
best_score = float('inf')

for m in models:
    print(f"\nTraining {m['name']}...")
    # NOTE(review): n_jobs=-1 fans the search out over all CPU cores; with a
    # single GPU the XGBoost fits presumably contend for it — confirm whether
    # n_jobs=1 is faster for the GPU model.
    grid = GridSearchCV(
        m['model'],
        m['params'],
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )

    # Both estimators follow the scikit-learn API and take the NumPy arrays
    # directly; wrapping them in a DMatrix only to unwrap them again added
    # nothing, so every model goes through the same code path.
    grid.fit(X_train, y_train)

    # Evaluate on the held-out split, back on the original citation scale
    val_pred = grid.best_estimator_.predict(X_val)

    rmse = np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(val_pred)))
    print(f"{m['name']} Validation RMSE: {rmse:.2f}")

    if rmse < best_score:
        best_score = rmse
        best_model = grid.best_estimator_

# Evaluate best model (same validation split that was used to select it)
print(f"\nBest Model: {best_model}")
final_pred = best_model.predict(X_val)

final_rmse = np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(final_pred)))
final_mae = mean_absolute_error(np.expm1(y_val), np.expm1(final_pred))

print(f"Final RMSE: {final_rmse:.2f}")
print(f"Final MAE: {final_mae:.2f}")

Using device: cuda


Batches:   0%|          | 0/8 [00:00<?, ?it/s]


Training XGBoost...



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"



XGBoost Validation RMSE: 54.23

Training RandomForest...
RandomForest Validation RMSE: 54.32

Best Model: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device='cuda:0', early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.05, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=200, n_jobs=None,
             num_parallel_tree=None, predictor='gpu_predictor', ...)
Final RMSE: 54.23
Final MAE: 23.98


### Training selected model

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
import torch

# Check for GPU availability; the embedding model and XGBoost both take their
# device from here rather than from a hard-coded 'cuda'.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
device_name = device.type  # plain 'cuda' / 'cpu' string for the calls below

# Sample data for faster prototyping
df = data.sample(n=10_000, random_state=1)

# Preprocess data: a missing abstract becomes '' so the text is never NaN
df['abstract'] = df['abstract'].fillna('')
df['text'] = df['title'] + " " + df['abstract']

# Scale 'year' feature. This fitted `scaler` is reused by the test cell.
scaler = MinMaxScaler()
df['year_scaled'] = scaler.fit_transform(df[['year']])

# Log transformation for target variable (inverted with expm1 when scoring)
df['log_citation'] = np.log1p(df['n_citation'])

# Generate sentence embeddings on the selected device
model = SentenceTransformer('all-mpnet-base-v2', device=device_name)
embeddings = model.encode(
    df['text'].tolist(),
    batch_size=128,
    show_progress_bar=True,
    device=device_name
)

# Combine features: one row per paper = [text embedding..., scaled year]
X = np.hstack([embeddings, df['year_scaled'].values.reshape(-1, 1)])
y = df['log_citation'].values

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define XGBoost model. `predictor` is no longer passed: the recorded run
# reported it as an unused parameter, and `device` alone selects the hardware.
xgb_model = XGBRegressor(
    tree_method="hist",
    device=device_name,
    n_estimators=300,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8
)

# Train the model; verbose=True prints the validation RMSE (log scale) after
# every boosting round.
# NOTE(review): in the recorded log the validation RMSE flattens after roughly
# 25 rounds while 300 are trained — consider early stopping on a separate split.
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

# Predictions
val_pred = xgb_model.predict(X_val)

# Evaluate model on the original citation scale
final_rmse = np.sqrt(mean_squared_error(np.expm1(y_val), np.expm1(val_pred)))
final_mae = mean_absolute_error(np.expm1(y_val), np.expm1(val_pred))

print(f"Final RMSE: {final_rmse:.2f}")
print(f"Final MAE: {final_mae:.2f}")


Using device: cuda


Batches:   0%|          | 0/79 [00:00<?, ?it/s]

[0]	validation_0-rmse:1.73811
[1]	validation_0-rmse:1.70124


Parameters: { "predictor" } are not used.



[2]	validation_0-rmse:1.66960
[3]	validation_0-rmse:1.64754
[4]	validation_0-rmse:1.62967
[5]	validation_0-rmse:1.61450
[6]	validation_0-rmse:1.60012
[7]	validation_0-rmse:1.58993
[8]	validation_0-rmse:1.58093
[9]	validation_0-rmse:1.57076
[10]	validation_0-rmse:1.56556
[11]	validation_0-rmse:1.56057
[12]	validation_0-rmse:1.55657
[13]	validation_0-rmse:1.55478
[14]	validation_0-rmse:1.55072
[15]	validation_0-rmse:1.54898
[16]	validation_0-rmse:1.54518
[17]	validation_0-rmse:1.54427
[18]	validation_0-rmse:1.54305
[19]	validation_0-rmse:1.54245
[20]	validation_0-rmse:1.54107
[21]	validation_0-rmse:1.53897
[22]	validation_0-rmse:1.53853
[23]	validation_0-rmse:1.53753
[24]	validation_0-rmse:1.53837
[25]	validation_0-rmse:1.53842
[26]	validation_0-rmse:1.53715
[27]	validation_0-rmse:1.53738
[28]	validation_0-rmse:1.53842
[29]	validation_0-rmse:1.53723
[30]	validation_0-rmse:1.53832
[31]	validation_0-rmse:1.53784
[32]	validation_0-rmse:1.53869
[33]	validation_0-rmse:1.53745
[34]	validation_

### Testing trained model

In [None]:
import pandas as pd
import numpy as np
import sqlite3
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
import torch

# ----------------------------------------
# 1. Set up and check GPU availability
# ----------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
device_name = device.type  # plain 'cuda' / 'cpu' string for the calls below

# ----------------------------------------
# 2. Load and preprocess the test data
# ----------------------------------------
# Connect to the SQLite database and load the data. The connection is closed
# even when the query fails.
conn = sqlite3.connect('papers.db')
try:
    test_df = pd.read_sql_query("SELECT * FROM papers", conn)
finally:
    conn.close()

# Preprocess test data: handle missing values and combine title and abstract.
# A missing title is treated as '' here as well, so no row ends up with NaN text.
test_df['abstract'] = test_df['abstract'].fillna('')
test_df['text'] = test_df['title'].fillna('') + " " + test_df['abstract']

# Scale the 'year' feature with the `scaler` fitted in the training cell.
# NOTE(review): years outside the range seen at fit time map outside [0, 1] —
# confirm the model extrapolates sensibly for them.
test_df['year_scaled'] = scaler.transform(test_df[['year']])

# Prepare the target variable in the test set.
# (Test set uses the column 'citations' for the citation count.)
test_df['log_citation'] = np.log1p(test_df['citations'])

# ----------------------------------------
# 3. Generate text embeddings using a SentenceTransformer model
# ----------------------------------------
# Load the same pre-trained SentenceTransformer checkpoint used for training,
# on the selected device.
embed_model = SentenceTransformer('all-mpnet-base-v2', device=device_name)

# Generate embeddings for the combined text of the test papers.
print("Encoding test data...")
test_embeddings = embed_model.encode(
    test_df['text'].tolist(),
    batch_size=128,
    show_progress_bar=True,
    device=device_name
)

# ----------------------------------------
# 4. Combine features (embeddings + scaled year) and prepare datasets
# ----------------------------------------
X_test = np.hstack([test_embeddings, test_df['year_scaled'].values.reshape(-1, 1)])
y_test = test_df['log_citation'].values

# ----------------------------------------
# 5. Predict on the test set and evaluate regression metrics
# ----------------------------------------
# Predict in the log domain with the `xgb_model` trained in the previous cell
y_test_pred_log = xgb_model.predict(X_test)

# Convert predictions back to original citation counts using the inverse transformation
y_test_pred = np.expm1(y_test_pred_log)
y_test_true = np.expm1(y_test)

# Calculate regression metrics
rmse = np.sqrt(mean_squared_error(y_test_true, y_test_pred))
mae = mean_absolute_error(y_test_true, y_test_pred)
r2 = r2_score(y_test_true, y_test_pred)

print("\nTest Set Regression Metrics:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2 Score: {r2:.2f}")


Using device: cuda
Encoding test data...


Batches:   0%|          | 0/74 [00:00<?, ?it/s]


Test Set Regression Metrics:
RMSE: 210.04
MAE: 107.10
R2 Score: -0.34


In [None]:
# `y_test_pred` is already on the original citation scale: expm1 was applied
# to the log-domain predictions when the test metrics were computed. Applying
# expm1 a second time is what produced predictions in the 1e13 range.
test_df['predicted_citations'] = y_test_pred

# Display the three papers with the highest predicted citations
top_predictions = test_df[['title', 'year', 'topic', 'predicted_citations']].sort_values(
    by='predicted_citations', ascending=False
).head(3)

print("\nTop Predicted Citations:")
print(top_predictions)


Top Predicted Citations:
                                                  title  year  \
6131  DSI-Net: Deep Synergistic Interaction Network ...  2021   
1077  Uni-ControlNet: All-in-One Control to Text-to-...  2023   
246   Anchor Diffusion for Unsupervised Video Object...  2019   

                 topic  predicted_citations  
6131               LLM         1.189171e+13  
1077  Diffusion Models         8.380618e+12  
246   Diffusion Models         3.803213e+12  


# Author-Author Network


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import sqlite3
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# -------------------------------
# 1. Load and Combine Data
# -------------------------------

# Load post-2017 dataset from papers.db; the connection is closed even when
# the query fails.
conn = sqlite3.connect('papers.db')
try:
    papers_df = pd.read_sql_query("SELECT * FROM papers", conn)
finally:
    conn.close()

# Standardize column names: the DBLP sample (`df`, left over from the training
# cell) calls the target 'n_citation', while papers.db already uses 'citations'
# and therefore needs no rename.
df = df.rename(columns={'n_citation': 'citations'})

# Ensure 'authors' field is consistent across datasets
df['authors'] = df['authors'].fillna('')
papers_df['authors'] = papers_df['authors'].fillna('')

# Combine datasets
combined_df = pd.concat([df, papers_df], ignore_index=True)

# Process authors (semicolon-separated), dropping empty fragments.
# NOTE(review): assumes both sources store authors as one ';'-separated
# string — confirm this holds for the DBLP CSV.
combined_df['author_list'] = combined_df['authors'].apply(
    lambda raw: [name for name in (part.strip() for part in raw.split(';')) if name]
)

# Target variable: log-transform citation counts
combined_df['log_citations'] = np.log1p(combined_df['citations'])

# -------------------------------
# 2. Train-Test Split
# -------------------------------
train_df, test_df = train_test_split(combined_df, test_size=0.2, random_state=42)

# -------------------------------
# 3. Build the Author-Author Graph
# -------------------------------
# Collect all unique authors from the entire dataset. The graph therefore also
# contains authors and co-author links that only occur in test papers.
all_authors = sorted({a for authors in combined_df['author_list'] for a in authors})
author_to_id = {author: idx for idx, author in enumerate(all_authors)}
num_authors = len(all_authors)

# Build edge list: add an undirected edge (both directions) for each pair of
# co-authors in a paper. Collecting into a set removes duplicates as we go.
edge_set = set()
for authors in combined_df['author_list']:
    # Every name was registered above, so the lookup cannot miss.
    author_ids = [author_to_id[a] for a in authors]
    for i in range(len(author_ids)):
        for j in range(i + 1, len(author_ids)):
            edge_set.add((author_ids[i], author_ids[j]))
            edge_set.add((author_ids[j], author_ids[i]))

# Sorted so the edge order is the same on every run.
edges = sorted(edge_set)
# reshape(-1, 2) keeps the expected (2, num_edges) layout even when there are
# no co-author pairs at all (an empty list would otherwise give shape (0,)).
edge_index = torch.tensor(edges, dtype=torch.long).reshape(-1, 2).t().contiguous()

# Create the PyTorch Geometric Data object.
# NOTE: this rebinds `data`, which held the DBLP DataFrame loaded in the first
# cell; re-running an earlier cell after this one requires reloading the CSV.
data = Data(edge_index=edge_index)
data.num_nodes = num_authors

# -------------------------------
# 4. Map Papers to Author Node Indices
# -------------------------------
def get_author_ids(author_list):
    """Translate author names into graph node ids.

    Names that are not keys of the module-level `author_to_id` mapping are
    skipped; the order of the remaining names is preserved.
    """
    node_ids = []
    for name in author_list:
        if name in author_to_id:
            node_ids.append(author_to_id[name])
    return node_ids

# Resolve every paper's author names to node ids: one list per paper, in the
# row order of the corresponding split.
train_paper_author_ids = [get_author_ids(names) for names in train_df['author_list']]
test_paper_author_ids = [get_author_ids(names) for names in test_df['author_list']]

# Regression targets (log1p citation counts) as float32 tensors
y_train = torch.tensor(train_df['log_citations'].to_numpy(), dtype=torch.float)
y_test = torch.tensor(test_df['log_citations'].to_numpy(), dtype=torch.float)

# -------------------------------
# 5. Define the GNN Regressor Model
# -------------------------------
class AuthorGNNRegressor(nn.Module):
    """Score papers from learned author embeddings refined by two GCN layers.

    Each author node owns a trainable embedding. Two `GCNConv` layers (with a
    ReLU in between) propagate the embeddings over the co-author graph, a
    paper is represented by the mean of its authors' vectors, and a linear
    head maps that vector to a single regression output.

    Args:
        num_nodes: Number of author nodes (rows of the embedding table).
        in_channels: Size of the initial author embedding.
        hidden_channels: Output size of the first GCN layer.
        out_channels: Output size of the second GCN layer and input size of
            the regression head.
    """

    def __init__(self, num_nodes, in_channels=32, hidden_channels=64, out_channels=64):
        super().__init__()
        self.author_embeddings = nn.Embedding(num_nodes, in_channels)
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.regressor = nn.Linear(out_channels, 1)

    def forward(self, data, paper_author_ids):
        """Predict one value per paper.

        Args:
            data: Graph object exposing `edge_index`.
            paper_author_ids: One list of author node ids per paper; a paper
                with an empty list is represented by a zero vector.

        Returns:
            A 1-D tensor with one prediction per paper, in input order. The
            result stays 1-D for a single paper and for an empty batch.
        """
        # Message passing runs over the full embedding table, not a mini-batch.
        x = self.author_embeddings.weight
        x = self.conv1(x, data.edge_index)
        x = torch.relu(x)
        x = self.conv2(x, data.edge_index)

        paper_embeddings = []
        for author_ids in paper_author_ids:
            if len(author_ids) > 0:
                # Mean-pool the vectors of this paper's authors.
                paper_emb = x[author_ids].mean(dim=0)
            else:
                paper_emb = torch.zeros(x.size(1), device=x.device)
            paper_embeddings.append(paper_emb)

        if paper_embeddings:
            paper_embeddings = torch.stack(paper_embeddings, dim=0)
        else:
            # torch.stack rejects an empty list; build the (0, dim) batch directly.
            paper_embeddings = x.new_zeros((0, x.size(1)))

        out = self.regressor(paper_embeddings)
        # Drop only the trailing feature axis. A bare squeeze() would also
        # collapse the batch axis when exactly one paper is passed.
        return out.squeeze(-1)

# Instantiate the model, optimizer, and loss function.
# Seed torch first: the author embeddings and layer weights are randomly
# initialised, so without a seed the metrics below change on every run.
torch.manual_seed(42)
model = AuthorGNNRegressor(num_nodes=num_authors)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

# -------------------------------
# 6. Training Loop
# -------------------------------
# Full-batch training: every epoch runs the whole graph and all training
# papers through the model once and takes a single optimizer step.
num_epochs = 100
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    preds = model(data, train_paper_author_ids)
    loss = criterion(preds, y_train)  # MSE on log1p(citations)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item():.4f}")

# -------------------------------
# 7. Evaluation on Test Set
# -------------------------------
model.eval()
with torch.no_grad():
    test_preds = model(data, test_paper_author_ids)
    # Undo the log1p transform so the metrics are in citation counts.
    test_preds_citations = torch.expm1(test_preds).cpu().numpy()
    y_test_citations = torch.expm1(y_test).cpu().numpy()

rmse = np.sqrt(mean_squared_error(y_test_citations, test_preds_citations))
mae = mean_absolute_error(y_test_citations, test_preds_citations)
r2 = r2_score(y_test_citations, test_preds_citations)

print("\nTest Set Regression Metrics:")
print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2 Score: {r2:.2f}")

# -------------------------------
# 8. Display Top Predicted Citations
# -------------------------------
# Work on a copy so the new column is not written into a slice of combined_df.
test_df = test_df.copy()
test_df['predicted_citations'] = test_preds_citations
top_predictions = test_df[['title', 'year', 'topic', 'predicted_citations']].sort_values(
    by='predicted_citations', ascending=False
).head(10)

print("\nTop Predicted Citations:")
print(top_predictions)


Epoch 10/100, Loss: 3.7697
Epoch 20/100, Loss: 2.6772
Epoch 30/100, Loss: 1.7634
Epoch 40/100, Loss: 1.0372
Epoch 50/100, Loss: 0.5276
Epoch 60/100, Loss: 0.2340
Epoch 70/100, Loss: 0.1039
Epoch 80/100, Loss: 0.0525
Epoch 90/100, Loss: 0.0334
Epoch 100/100, Loss: 0.0260

Test Set Regression Metrics:
RMSE: 144.20
MAE: 60.33
R2 Score: -0.27

Top Predicted Citations:
                                                   title  year  \
4119   The Good, the Bad, and the Expert: How Consume...  2015   
4740   Chinese Term Recognition and Extraction Based ...  2008   
4045   High efficient hardware allocation framework o...  2015   
10845  Score-Based Generative Modeling through Stocha...  2020   
12922  Normalizing Flows for Probabilistic Modeling a...  2019   
15288                  Large language models in medicine  2023   
14175      Diffusion Models Beat GANs on Image Synthesis  2021   
15382  RePaint: Inpainting using Denoising Diffusion ...  2022   
16567   Vote-Selling: Infrastructure an