In [95]:
import pandas as pd

# Load datasets
scopus = pd.read_csv("RQ1Scopus.csv")
acm = pd.read_csv("RQ1ACM.csv")
ieee = pd.read_csv("RQ1IEEE.csv")


print(len(scopus), len(acm), len(ieee))

187 302 135


In [None]:
import re
def pick_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

def normalize_title(title):
    """Lowercase, remove punctuation, collapse spaces."""
    if pd.isna(title):
        return ""
    title = title.lower()
    title = re.sub(r'[^a-z0-9 ]+', ' ', title)
    title = re.sub(r'\s+', ' ', title).strip()
    return title

def normalize_doi(doi):
    if pd.isna(doi):
        return ""
    doi = doi.strip().lower()
    doi = doi.replace("https://doi.org/", "")
    return doi

def standardize(df, source_name, mapping):
    title_col = pick_col(df, mapping["title"])
    abs_col   = pick_col(df, mapping["abstract"])
    year_col  = pick_col(df, mapping["year"])
    doi_col   = pick_col(df, mapping["doi"])
    cite_col  = pick_col(df, mapping["citations"])

    out = pd.DataFrame({
        "title": df[title_col] if title_col else "",
        "abstract": df[abs_col] if abs_col else "",
        "doi": df[doi_col] if doi_col else "",
        "year": df[year_col] if year_col else pd.NA,
        "citations": df[cite_col] if cite_col else pd.NA,
    })

    out["source"] = source_name  # optional; remove if you truly don't want it

    out["doi"] = out["doi"].apply(normalize_doi)
    out["year"] = pd.to_numeric(out["year"], errors="coerce").astype("Int64")
    out["citations"] = pd.to_numeric(out["citations"], errors="coerce").astype("Int64")
    out["title_norm"] = out["title"].apply(normalize_title)

    return out



In [None]:

# Column mappings per dataset (based on your headers)
scopus_map = {
    "title": ["Title"],
    "abstract": ["Abstract"],  # Scopus export may not include Abstract; code handles missing
    "doi": ["DOI"],
    "year": ["Year"],
    "citations": ["Cited by"]
}

acm_map = {
    "title": ["title", "Title"],
    "abstract": ["abstract", "Abstract"],
    "doi": ["doi", "DOI"],
    "year": ["year", "Year"],
    "citations": ["citations", "Cited by", "cited_by"]  # often missing -> NA
}

ieee_map = {
    "title": ["Document Title"],
    "abstract": ["Abstract"],
    "doi": ["DOI"],
    "year": ["Publication Year"],
    "citations": ["Article Citation Count"]
}

In [98]:

scopus_std = standardize(scopus, "Scopus", scopus_map)
acm_std    = standardize(acm, "ACM", acm_map)
ieee_std   = standardize(ieee, "IEEE", ieee_map)



In [99]:
scopus_std

Unnamed: 0,title,abstract,doi,year,citations,authors,source,title_norm,authors_norm
0,Graph-patchformer: Patch interaction transform...,,10.1016/j.neunet.2025.108140,2026,0,"Hou, C.; Yu, Y.; Ji, J.; Zhang, S.; Shen, X.; ...",Scopus,graph patchformer patch interaction transforme...,c|hou|j|j|ji|s|shen|x|y|yan|yu|zhang
1,Dual Attention Transformer with Multi-scale Pe...,,10.1007/978-981-95-3052-6_24,2026,0,"Li, F.; Peng, W.; Zhang, M.; Wang, M.; Zhang, H.",Scopus,dual attention transformer with multi scale pe...,f|h|li|m|m|peng|w|wang|zhang|zhang
2,Enhancing Salesforce Sales Forecasting with Co...,,10.1007/978-3-032-03558-5_29,2026,0,"Gorantla, S.; Machapatri, V.V.",Scopus,enhancing salesforce sales forecasting with co...,gorantla|machapatri|s|v v
3,M3E: Mixture of Multi-scale Multi-modal Expert...,,10.1007/978-981-95-3398-5_6,2026,0,"Xie, S.; Jiang, H.; Zhao, C.; Yang, X.",Scopus,m3e mixture of multi scale multi modal experts...,c|h|jiang|s|x|xie|yang|zhao
4,DiM: Improving multivariate time series foreca...,,10.1016/j.neucom.2025.131777,2026,0,"Mo, Y.; Wang, H.; Yao, Z.; Yang, C.; Li, B.; J...",Scopus,dim improving multivariate time series forecas...,b|c|fan|h|jiang|li|mo|mo|s|s|wang|y|y|yang|yao|z
...,...,...,...,...,...,...,...,...,...
182,Learning knowledge-enriched company embeddings...,,10.1145/3490354.3494390,2021,8,"Ang, G.; Lim, E.-P.",Scopus,learning knowledge enriched company embeddings...,ang|e p|g|lim
183,DISCRETE GRAPH STRUCTURE LEARNING FOR FORECAST...,,,2021,153,"Shang, C.; Bi, J.; Chen, J.",Scopus,discrete graph structure learning for forecast...,bi|c|chen|j|j|shang
184,Attentive Neural Controlled Differential Equat...,,10.1109/icdm51629.2021.00035,2021,15,"Jhin, S.Y.; Shin, H.; Hong, S.; Jo, M.; Park, ...",Scopus,attentive neural controlled differential equat...,h|h|hong|jeon|jhin|jo|lee|m|maeng|n|park|park|...
185,Dependency Learning Graph Neural Network for M...,,10.1007/978-3-030-92307-5_14,2021,1,"Patel, A.; Sriramulu, A.; Bergmeir, C.; Fourri...",Scopus,dependency learning graph neural network for m...,a|a|bergmeir|c|fourrier|n|patel|sriramulu


In [100]:
all_df = pd.concat([scopus_std, acm_std, ieee_std], ignore_index=True)
print("Merged (before dedup):", len(all_df))



Merged (before dedup): 624


In [101]:
all_df.drop_duplicates(subset=["doi"], inplace=True)
all_df.drop_duplicates(subset=["title_norm"], inplace=True)
all_df.drop(columns=["title_norm"], inplace=True)
print("After dedup:", len(all_df))


After dedup: 535


In [102]:
all_df.to_csv("RQ1_Merged_Deduped.csv", index=False)

In [104]:
import pandas as pd
import re
from difflib import SequenceMatcher
from collections import defaultdict

# Make sure i/j match row positions
all_df = all_df.reset_index(drop=True)

def bigrams(s):
    s = s.replace(" ", "")
    return {s[i:i+2] for i in range(len(s)-1)} if len(s) >= 2 else set()

# Ensure title_norm exists
all_df["title_norm"] = all_df["title"].apply(normalize_title)

# Build inverted index by bigram to generate candidates
inv = defaultdict(set)
bgs = []

for i, t in enumerate(all_df["title_norm"].tolist()):
    bg = bigrams(t)
    bgs.append(bg)
    for g in bg:
        inv[g].add(i)

def jaccard(a, b):
    if not a and not b:
        return 0.0
    return len(a & b) / len(a | b)

threshold_jaccard = 0.35
threshold_sim = 0.90

seen = set()
rows = []

titles_norm = all_df["title_norm"].tolist()  # faster than iloc inside loops

for i in range(len(all_df)):
    cand = set()
    for g in bgs[i]:
        cand |= inv[g]
    cand.discard(i)

    for j in cand:
        if j < i:
            continue
        if (i, j) in seen:
            continue
        seen.add((i, j))

        jac = jaccard(bgs[i], bgs[j])
        if jac < threshold_jaccard:
            continue

        sim = SequenceMatcher(None, titles_norm[i], titles_norm[j]).ratio()
        if sim >= threshold_sim:
            rows.append({
                "idx1": i,
                "idx2": j,
                "similarity": round(sim, 4),
                "title_1": all_df.iloc[i]["title"],
                "title_2": all_df.iloc[j]["title"],
                "doi_1": all_df.iloc[i].get("doi", ""),
                "doi_2": all_df.iloc[j].get("doi", ""),
                "year_1": all_df.iloc[i].get("year", pd.NA),
                "year_2": all_df.iloc[j].get("year", pd.NA),
            })

dup_candidates = pd.DataFrame(rows).sort_values("similarity", ascending=False)
print("Near-duplicate candidates found:", len(dup_candidates))
dup_candidates.head(30)

Near-duplicate candidates found: 3


Unnamed: 0,idx1,idx2,similarity,title_1,title_2,doi_1,doi_2,year_1,year_2
2,120,142,0.9669,Dynamic Hypergraph Structure Learning for Mult...,Dynamic graph structure learning for multivari...,10.1109/tbdata.2024.3362188,10.1016/j.patcog.2023.109423,2024,2023
1,67,99,0.9211,A Novel Discrete Time Series Representation Wi...,A Novel Discrete Time Series Representation wi...,10.1109/access.2025.3588507,10.1109/dsaa61799.2024.10722826,2025,2024
0,35,142,0.9116,Evolving graph structure learning for multivar...,Dynamic graph structure learning for multivari...,10.1016/j.knosys.2025.113190,10.1016/j.patcog.2023.109423,2025,2023
