In [2]:
!pip install text-unidecode

Collecting text-unidecode
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
Installing collected packages: text-unidecode
Successfully installed text-unidecode-1.3



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
import re
import unicodedata
from text_unidecode import unidecode
from collections import defaultdict
import ast 

In [3]:
# --- Load and Prepare Your Dataset ---
# Make sure your CSV file is in the same directory or provide the full path.
try:
    df = pd.read_csv("Normalized_Dataset/AND_Normalized1.csv")
    print("Dataset loaded successfully!")
except FileNotFoundError:
    print("Error: Please replace 'your_dataset.csv' with the actual name of your data file.")

Dataset loaded successfully!


In [25]:
# --- THE CORE FIX: Guarantee a single, unique 'publication_id' ---
# 1. Drop ALL potential old ID columns to prevent any conflicts.
#    errors='ignore' ensures this runs even if a column doesn't exist.
df = df.drop(columns=['publication_id', 'Paper_id', 'Unnamed: 0'], errors='ignore')

# 2. Reset the index. This creates a new 'index' column with unique integers (0, 1, 2...).
df = df.reset_index(drop=False)

# 3. Rename the new, unique 'index' column to be our one and only 'publication_id'.
df = df.rename(columns={'index': 'publication_id'})
print("Created a new, guaranteed-unique 'publication_id' column.")

# --- RENAME COLUMNS and CLEAN DATA ---
rename_map = {
    'Full_Name': 'author_name', 'Publication Year': 'year', 'Collaborators': 'co_authors',
    'Address': 'affiliation', 'journal': 'venue', 'title': 'title'
}
df = df.rename(columns=lambda c: rename_map.get(c, c))
df['ground_truth_author_id'] = df['OID'].astype(str)
nan_mask = df['ground_truth_author_id'].isin(['nan', 'None'])
df.loc[nan_mask, 'ground_truth_author_id'] = [f'unknown_{i}' for i in range(nan_mask.sum())]

key_text_cols = ['author_name', 'co_authors', 'affiliation', 'venue', 'title']
for col in key_text_cols:
    if col in df.columns:
        df[col] = df[col].fillna('')
if 'year' in df.columns:
    df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(0).astype(int)

# --- NORMALIZE DATA ---
def normalize_name(name):
    if not isinstance(name, str) or not name: return ""
    name = unidecode(name).lower()
    name = re.sub(r'[^a-z\s,]', '', name)
    parts = [p.strip() for p in re.split(r'[\s,]+', name) if p.strip()]
    if not parts: return ""
    lastname = parts[-1]; initials = [p[0] for p in parts[:-1]]
    return " ".join(initials) + " " + lastname

def normalize_text_generic(text):
    if not isinstance(text, str): return ""
    text = unidecode(text).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def normalize_coauthors(coauthor_str):
    if not isinstance(coauthor_str, str) or not coauthor_str: return []
    try:
        coauthor_list = ast.literal_eval(coauthor_str)
        if isinstance(coauthor_list, list): return sorted([normalize_name(name) for name in coauthor_list])
    except: return sorted([normalize_name(name) for name in coauthor_str.split(';')])
    return []

df_normalized = df.copy()
df_normalized['norm_author_name'] = df['author_name'].apply(normalize_name)
df_normalized['norm_co_authors'] = df['co_authors'].apply(normalize_coauthors)
df_normalized['norm_title'] = df['title'].apply(normalize_text_generic)
df_normalized['norm_venue'] = df['venue'].apply(normalize_text_generic)
df_normalized['norm_affiliation'] = df['affiliation'].apply(normalize_text_generic)

# --- VERIFY THE FIX ---
assert df_normalized.columns.is_unique, "FATAL ERROR: Duplicate columns were still created."
print("\nVerification successful: All column names are unique.")

Created a new, guaranteed-unique 'publication_id' column.

Verification successful: All column names are unique.


In [26]:
df_normalized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55429 entries, 0 to 55428
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   publication_id          55429 non-null  int64  
 1   SurName                 55426 non-null  object 
 2   GivenNames              55392 non-null  object 
 3   author_name             55429 non-null  object 
 4   Emailid                 55429 non-null  object 
 5   Caveats_email           55429 non-null  bool   
 6   CorsAu                  55429 non-null  bool   
 7   OID                     45614 non-null  object 
 8   RID                     41662 non-null  object 
 9   co_authors              55429 non-null  object 
 10  Publication Month       55429 non-null  float64
 11  year                    55429 non-null  int64  
 12  Subject_cat             55429 non-null  object 
 13  affiliation             55429 non-null  object 
 14  AuNames                 55429 non-null

In [27]:
# ==============================================================================
# BLOCK 2: BLOCKING AND CANDIDATE PAIR GENERATION
# ==============================================================================

def create_blocks_and_pairs(df_norm):
    print("\nStarting blocking and pair generation...")
    df_temp = df_norm[['publication_id', 'norm_author_name']].copy()
    def get_block_key(name):
        if isinstance(name, str) and ' ' in name:
            parts = name.split()
            return f"{parts[-1]} {parts[0][0]}"
        return None
    df_temp['block_key'] = df_temp['norm_author_name'].apply(get_block_key)
    df_temp = df_temp.dropna(subset=['block_key'])

    grouped = df_temp.groupby('block_key')['publication_id'].apply(list)
    blocks = grouped[grouped.str.len() > 1]

    candidate_pairs = set()
    for pub_ids_list in blocks:
        for i in range(len(pub_ids_list)):
            for j in range(i + 1, len(pub_ids_list)):
                id1, id2 = sorted((pub_ids_list[i], pub_ids_list[j]))
                candidate_pairs.add((id1, id2))
    print("Blocking and pair generation complete.")
    return list(candidate_pairs)

candidate_pairs = create_blocks_and_pairs(df_normalized)
print(f"\nGenerated {len(candidate_pairs)} candidate pairs for comparison.")
if len(candidate_pairs) > 5:
    print("Example pairs:", candidate_pairs[:5])
else:
    print("Example pairs:", candidate_pairs)

# --- FINAL STEP: SET INDEX ---
# This will now succeed because the DataFrame is correctly structured.
df_normalized = df_normalized.set_index('publication_id', drop=False)
print("\n--- Pipeline Complete: Successfully set 'publication_id' as the index. ---")


Starting blocking and pair generation...
Blocking and pair generation complete.

Generated 491020 candidate pairs for comparison.
Example pairs: [(7747, 7749), (16623, 16624), (46843, 47445), (34051, 34058), (18585, 18627)]

--- Pipeline Complete: Successfully set 'publication_id' as the index. ---


In [29]:
!pip install scikit-learn lightgbm sentence-transformers python-Levenshtein networkx

Collecting lightgbm
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting networkx
  Using cached networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.9.0-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-1.0.1-py3-none-any.whl.metadata (13 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-win_amd64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-Levenshtein)
  Dow


[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import lightgbm as lgb
import Levenshtein
from Levenshtein import jaro_winkler as jarowinkler # Correct import/alias of the function
import networkx as nx
import numpy as np # <-- Added: Missing numpy import

# Initialize the SBERT model once
print("\nLoading SBERT model...")
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

# ----------------------------------------------------------------------
# Note: The 'jarowinkler' function is now the direct Jaro-Winkler similarity function.
# ----------------------------------------------------------------------
def compute_features(pub1, pub2, sbert_embeddings):
    """Computes a feature vector for a pair of publications (adapted for your data)."""
    features = {}
    
    # Name Similarity (Jaro-Winkler)
    # FIX: Directly call the aliased function 'jarowinkler'
    features['name_jaro'] = jarowinkler(pub1['norm_author_name'], pub2['norm_author_name'])

    # Affiliation Similarity (Token Jaccard)
    aff1 = set(pub1['norm_affiliation'].split())
    aff2 = set(pub2['norm_affiliation'].split())
    features['aff_jaccard'] = len(aff1.intersection(aff2)) / len(aff1.union(aff2)) if aff1.union(aff2) else 0

    # Co-author Similarity (Jaccard)
    # Note: Assuming 'norm_co_authors' is a list/set of co-author names
    coauth1 = set(pub1['norm_co_authors'])
    coauth2 = set(pub2['norm_co_authors'])
    features['coauth_jaccard'] = len(coauth1.intersection(coauth2)) / len(coauth1.union(coauth2)) if coauth1.union(coauth2) else 0
    
    # Venue (Journal) Similarity (Normalized Levenshtein)
    venue1, venue2 = pub1['norm_venue'], pub2['norm_venue']
    max_len = max(len(venue1), len(venue2))
    # Levenshtein distance normalized by the maximum length
    features['venue_lev'] = 1 - (Levenshtein.distance(venue1, venue2) / max_len) if max_len > 0 else 1

    # Temporal Proximity (Exponential Decay)
    year_diff = abs(pub1['year'] - pub2['year'])
    features['year_prox'] = np.exp(-0.1 * year_diff)

    # Title Semantic Similarity (SBERT)
    emb1 = sbert_embeddings[pub1.name] # .name gives the index (publication_id)
    emb2 = sbert_embeddings[pub2.name]
    # cosine_similarity expects 2D arrays, hence the list wrappers
    features['title_sbert_sim'] = cosine_similarity([emb1], [emb2])[0][0]
    
    return features

# --- Prepare data for ML model ---
print("\nComputing SBERT embeddings for all titles...")
# Assuming df_normalized is available and has 'title' and 'publication_id' as index
title_embeddings = sbert_model.encode(df_normalized['title'].tolist(), show_progress_bar=True)
# Create a mapping from publication_id to its embedding
sbert_map = dict(zip(df_normalized.index, title_embeddings))

print("\nEngineering features for candidate pairs...")
X = []
y = []

# Assuming candidate_pairs is available as a list of tuples: [(id1, id2), ...]
for id1, id2 in candidate_pairs:
    pub1 = df_normalized.loc[id1]
    pub2 = df_normalized.loc[id2]
    
    # Check if ground_truth_author_id is a Series (happens if index is not unique)
    # If loc returns a DataFrame, take the first row (common issue with non-unique indices)
    if isinstance(pub1, pd.DataFrame):
         pub1 = pub1.iloc[0]
         pub2 = pub2.iloc[0]
    
    features = compute_features(pub1, pub2, sbert_map)
    X.append(list(features.values()))
    
    # Assuming 'ground_truth_author_id' is accessible
    label = 1 if pub1['ground_truth_author_id'] == pub2['ground_truth_author_id'] else 0
    y.append(label)

X = np.array(X)
y = np.array(y)

print(f"\nCreated feature matrix of shape: {X.shape}")
print(f"Positive pairs: {np.sum(y)} | Negative pairs: {len(y) - np.sum(y)}")

# --- Train the Fusion Model (LightGBM) ---
if len(X) > 0 and np.sum(y) > 1:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    lgb_classifier = lgb.LGBMClassifier(objective='binary', random_state=42)
    lgb_classifier.fit(X_train, y_train)

    print("\nClassifier training complete.")
    from sklearn.metrics import classification_report
    y_pred = lgb_classifier.predict(X_test)
    print("\nClassifier Performance Report:")
    print(classification_report(y_test, y_pred))
else:
    print("\nSkipping model training due to insufficient data or lack of positive samples.")
    lgb_classifier = None


Loading SBERT model...

Computing SBERT embeddings for all titles...


Batches:   0%|          | 0/1733 [00:00<?, ?it/s]


Engineering features for candidate pairs...

Created feature matrix of shape: (491020, 6)
Positive pairs: 220318 | Negative pairs: 270702
[LightGBM] [Info] Number of positive: 154223, number of negative: 189491
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005999 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 812
[LightGBM] [Info] Number of data points in the train set: 343714, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448696 -> initscore=-0.205942
[LightGBM] [Info] Start training from score -0.205942

Classifier training complete.

Classifier Performance Report:
              precision    recall  f1-score   support

           0       1.00      0.81      0.90     81211
           1       0.81      1.00      0.90     66095

    accuracy                           0.90    147306
   macro avg       0.



In [35]:
import networkx as nx
from networkx.algorithms import community

if lgb_classifier and len(candidate_pairs) > 0:
    # --- Step 1: Construct the Learned Similarity Graph ---
    print("\nConstructing similarity graph...")
    pair_probabilities = lgb_classifier.predict_proba(X)[:, 1]

    G = nx.Graph()
    G.add_nodes_from(df_normalized.index)

    for i, (id1, id2) in enumerate(candidate_pairs):
        prob = pair_probabilities[i]
        if prob > 0.5: # Use a 50% probability threshold to create an edge
            G.add_edge(id1, id2, weight=prob)
            
    print(f"Graph constructed with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

    # --- Step 2 & 3: Community Detection ---
    # We will use a single, robust method: Connected Components
    # This is simple and effective. Louvain is better for graphs with complex community structure.
    print("\nDetecting communities (clusters)...")
    clusters = list(nx.connected_components(G))
    
    # --- Assigning Cluster IDs ---
    final_clusters = {}
    cluster_id_counter = 0
    for component in clusters:
        for node in component:
            final_clusters[node] = cluster_id_counter
        cluster_id_counter += 1

    # Handle publications that were not in any pair (singletons)
    all_nodes = set(df_normalized.index)
    clustered_nodes = set(final_clusters.keys())
    singleton_nodes = all_nodes - clustered_nodes
    for node in singleton_nodes:
        final_clusters[node] = cluster_id_counter
        cluster_id_counter += 1
        
    df_normalized['predicted_cluster_id'] = df_normalized.index.map(final_clusters)
    
    print("\nFinal Disambiguation Results (Sample):")
    print(df_normalized[['author_name', 'ground_truth_author_id', 'predicted_cluster_id']].sort_values('predicted_cluster_id').head(10))

else:
    print("\nSkipping clustering because the model was not trained.")
    df_normalized['predicted_cluster_id'] = range(len(df_normalized)) # Assign all to unique clusters




Constructing similarity graph...
Graph constructed with 55429 nodes and 270354 edges.

Detecting communities (clusters)...

Final Disambiguation Results (Sample):
                       author_name               ground_truth_author_id  \
publication_id                                                            
0                    akhavian reza   akhavian, reza/0000-0001-9691-8016   
1                    akhavian reza   akhavian, reza/0000-0001-9691-8016   
4               budhitama subagdja                            unknown_2   
2               budhitama subagdja                            unknown_0   
3               budhitama subagdja                            unknown_1   
6                          feng yu         feng, yu/0000-0001-6433-5035   
5                          feng yu         feng, yu/0000-0001-6433-5035   
7                   mansour nasser  mansour, nasser/0000-0001-5707-7373   
8                   mansour nasser  mansour, nasser/0000-0001-5707-7373   
9          

In [36]:
from sklearn.metrics import normalized_mutual_info_score

if lgb_classifier and len(candidate_pairs) > 0:
    gt_labels = df_normalized['ground_truth_author_id'].values
    pred_labels = df_normalized['predicted_cluster_id'].values

    nmi_score = normalized_mutual_info_score(gt_labels, pred_labels)
    print(f"\n--- Evaluation ---")
    print(f"Normalized Mutual Information (NMI) Score: {nmi_score:.4f}")

    def b_cubed_score(true_labels_df, pred_labels_df):
        true_map = true_labels_df.groupby(true_labels_df).groups
        pred_map = pred_labels_df.groupby(pred_labels_df).groups
        
        precision_sum = 0.0
        recall_sum = 0.0
        
        for item_idx in true_labels_df.index:
            true_cluster_label = true_labels_df[item_idx]
            pred_cluster_label = pred_labels_df[item_idx]
            
            true_cluster = set(true_map[true_cluster_label])
            pred_cluster = set(pred_map[pred_cluster_label])

            intersection_size = len(true_cluster.intersection(pred_cluster))
            
            precision_sum += intersection_size / len(pred_cluster)
            recall_sum += intersection_size / len(true_cluster)
            
        p = precision_sum / len(true_labels_df)
        r = recall_sum / len(true_labels_df)
        f1 = (2 * p * r) / (p + r) if (p + r) > 0 else 0
        return p, r, f1

    precision, recall, f1 = b_cubed_score(
        df_normalized['ground_truth_author_id'],
        df_normalized['predicted_cluster_id']
    )
    print(f"B-Cubed Precision: {precision:.4f}")
    print(f"B-Cubed Recall: {recall:.4f}")
    print(f"B-Cubed F1-Score: {f1:.4f}")
else:
    print("\nSkipping evaluation.")


--- Evaluation ---
Normalized Mutual Information (NMI) Score: 0.9823
B-Cubed Precision: 0.8589
B-Cubed Recall: 0.9984
B-Cubed F1-Score: 0.9234


In [38]:
import joblib
# ==============================================================================
# BLOCK 3: MODEL TRAINING AND SAVING
# ==============================================================================
if len(X) > 0 and np.sum(y) > 1:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    lgb_classifier = lgb.LGBMClassifier(objective='binary', random_state=42)
    print("Training LightGBM classifier...")
    lgb_classifier.fit(X_train, y_train)
    print("Training complete.")

    # --- SAVE THE MODELS ---
    # Save the trained LightGBM model
    joblib.dump(lgb_classifier, 'and_model.pkl')
    print("LightGBM model saved to and_model.pkl")

    # The SentenceTransformer model saves itself as a folder
    sbert_model.save('sbert_model')
    print("SentenceTransformer model saved to the 'sbert_model/' directory.")
    
    # Save the full normalized data frame, which we will use as our database
    df_normalized.to_csv('publication_database.csv')
    print("Normalized publication data saved to publication_database.csv")

else:
    print("Skipping model training and saving due to insufficient data.")

print("--- Pipeline Finished ---")

Training LightGBM classifier...
[LightGBM] [Info] Number of positive: 165238, number of negative: 203027
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 810
[LightGBM] [Info] Number of data points in the train set: 368265, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.448693 -> initscore=-0.205952
[LightGBM] [Info] Start training from score -0.205952
Training complete.
LightGBM model saved to and_model.pkl
SentenceTransformer model saved to the 'sbert_model/' directory.
Normalized publication data saved to publication_database.csv
--- Pipeline Finished ---
