In [10]:
import pandas as pd

# Read the three source CSV files into DataFrames (same order as the names)
scopus, acm, ieee = (
    pd.read_csv(csv_name)
    for csv_name in ("RQ2Scopus.csv", "RQ2ACM.csv", "RQ2IEEE.csv")
)

# Row counts per source, in the order Scopus, ACM, IEEE
print(len(scopus), len(acm), len(ieee))

158 164 110


In [11]:
import re
def pick_col(df, candidates):
    """Return the first entry of ``candidates`` that is a column of ``df``.

    Candidates are tried in the order given; ``None`` is returned when none
    of them is present.
    """
    return next((name for name in candidates if name in df.columns), None)

def normalize_title(title):
    """Build a comparison key from a title.

    The title is lowercased, every run of characters other than ``a-z``,
    ``0-9`` and space is replaced by a single space, whitespace runs are
    collapsed, and the ends are trimmed. Missing values (``None``/NaN/NA)
    yield an empty string.
    """
    if pd.isna(title):
        return ""
    alnum_only = re.sub(r'[^a-z0-9 ]+', ' ', title.lower())
    return re.sub(r'\s+', ' ', alnum_only).strip()

def normalize_doi(doi):
    """Return a lowercase, bare DOI suitable for cross-source comparison.

    Surrounding whitespace is trimmed, the value is lowercased, and a leading
    resolver URL (``http://`` or ``https://``, ``doi.org`` or ``dx.doi.org``)
    or a leading ``doi:`` label is removed, so the same DOI written in
    different forms normalizes to the same key.

    Parameters
    ----------
    doi : str or missing value
        Raw DOI cell. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The normalized DOI, or ``""`` when the input is missing.
    """
    if pd.isna(doi):
        return ""
    doi = str(doi).strip().lower()
    # Anchored at the start so only a prefix is removed, never text inside the DOI.
    doi = re.sub(r'^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)', '', doi)
    return doi.strip()

def standardize(df, source_name, mapping):
    """Project one source export onto the shared column schema.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw export from a single source.
    source_name : str
        Label written to the ``source`` column of every row.
    mapping : dict
        Maps each of ``"title"``, ``"abstract"``, ``"year"``, ``"doi"`` and
        ``"citations"`` to a list of candidate column names; the first
        candidate present in ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``abstract``, ``doi``, ``year``, ``citations``,
        ``source`` and ``title_norm``, one row per row of ``df``. Fields with
        no matching source column are filled with ``""`` (text fields) or
        ``pd.NA`` (``year``/``citations``).
    """
    title_col = pick_col(df, mapping["title"])
    abs_col   = pick_col(df, mapping["abstract"])
    year_col  = pick_col(df, mapping["year"])
    doi_col   = pick_col(df, mapping["doi"])
    cite_col  = pick_col(df, mapping["citations"])

    # index=df.index lets the scalar fallbacks broadcast even when *no*
    # candidate column is found; without it, an all-scalar dict makes the
    # DataFrame constructor raise ValueError.
    out = pd.DataFrame(
        {
            "title": df[title_col] if title_col else "",
            "abstract": df[abs_col] if abs_col else "",
            "doi": df[doi_col] if doi_col else "",
            "year": df[year_col] if year_col else pd.NA,
            "citations": df[cite_col] if cite_col else pd.NA,
        },
        index=df.index,
    )

    # Provenance label so merged rows can be traced back to their source.
    out["source"] = source_name

    out["doi"] = out["doi"].apply(normalize_doi)
    # Unparseable values become <NA>; nullable Int64 keeps the columns integral.
    out["year"] = pd.to_numeric(out["year"], errors="coerce").astype("Int64")
    out["citations"] = pd.to_numeric(out["citations"], errors="coerce").astype("Int64")
    out["title_norm"] = out["title"].apply(normalize_title)

    return out


In [12]:

# Per-source column mappings: each standardized field lists the candidate
# header names to look for, in order of preference.
scopus_map = dict(
    title=["Title"],
    abstract=["Abstract"],  # may be absent from the Scopus export; standardize() tolerates that
    doi=["DOI"],
    year=["Year"],
    citations=["Cited by"],
)

acm_map = dict(
    title=["title", "Title"],
    abstract=["abstract", "Abstract"],
    doi=["doi", "DOI"],
    year=["year", "Year"],
    citations=["citations", "Cited by", "cited_by"],  # often missing -> NA
)

ieee_map = dict(
    title=["Document Title"],
    abstract=["Abstract"],
    doi=["DOI"],
    year=["Publication Year"],
    citations=["Article Citation Count"],
)

In [13]:

# Standardize each raw export with its own column mapping and source label
scopus_std, acm_std, ieee_std = (
    standardize(raw, label, colmap)
    for raw, label, colmap in (
        (scopus, "Scopus", scopus_map),
        (acm, "ACM", acm_map),
        (ieee, "IEEE", ieee_map),
    )
)



In [14]:
# Display the standardized Scopus frame to eyeball the result of standardize()
scopus_std

Unnamed: 0,title,abstract,doi,year,citations,source,title_norm
0,Deep Learning or Trees? A Trade-off Analysis f...,,10.1007/978-3-032-02725-2_48,2026,0,Scopus,deep learning or trees a trade off analysis fo...
1,Attention-based spatial-temporal interactive c...,,10.1016/j.ins.2025.122647,2026,0,Scopus,attention based spatial temporal interactive c...
2,Multivariate Time Series forecasting based on ...,,10.1016/j.engappai.2025.112074,2025,2,Scopus,multivariate time series forecasting based on ...
3,Sensitivity-propagated dual-frequency graph ne...,,10.1016/j.neucom.2025.131644,2025,0,Scopus,sensitivity propagated dual frequency graph ne...
4,HUTFormer: Hierarchical U-Net transformer for ...,,10.1016/j.commtr.2025.100218,2025,0,Scopus,hutformer hierarchical u net transformer for l...
...,...,...,...,...,...,...,...
153,Inductive Graph Neural Networks for Spatiotemp...,,10.1609/aaai.v35i5.16575,2021,136,Scopus,inductive graph neural networks for spatiotemp...
154,METRO: A Generic Graph Neural Network Framewor...,,10.14778/3489496.3489503,2021,61,Scopus,metro a generic graph neural network framework...
155,Multi Scale Graph Wavenet for Wind Speed Forec...,,10.1109/bigdata52589.2021.9671624,2021,27,Scopus,multi scale graph wavenet for wind speed forec...
156,Dependency Learning Graph Neural Network for M...,,10.1007/978-3-030-92307-5_14,2021,1,Scopus,dependency learning graph neural network for m...


In [15]:
# Stack the three standardized frames; ignore_index renumbers rows from 0
all_df = pd.concat(
    objs=(scopus_std, acm_std, ieee_std),
    ignore_index=True,
)
print("Merged (before dedup):", len(all_df))



Merged (before dedup): 432


In [16]:
# Exact de-duplication in two passes: by DOI, then by normalized title among
# the rows that survive the first pass. The first occurrence is kept.
#
# normalize_doi / normalize_title turn missing values into "", so blank keys
# are excluded from the duplicate test. Otherwise every record lacking a DOI
# (or a title) after the first one would be discarded as a "duplicate" of the
# other blank records.
doi_dupe = all_df["doi"].ne("") & all_df.duplicated(subset=["doi"])
all_df = all_df[~doi_dupe]

title_dupe = all_df["title_norm"].ne("") & all_df.duplicated(subset=["title_norm"])
# title_norm is only a matching key, so it is dropped from the merged table.
all_df = all_df[~title_dupe].drop(columns=["title_norm"])

print("After dedup:", len(all_df))
all_df

After dedup: 380


Unnamed: 0,title,abstract,doi,year,citations,source
0,Deep Learning or Trees? A Trade-off Analysis f...,,10.1007/978-3-032-02725-2_48,2026,0,Scopus
1,Attention-based spatial-temporal interactive c...,,10.1016/j.ins.2025.122647,2026,0,Scopus
2,Multivariate Time Series forecasting based on ...,,10.1016/j.engappai.2025.112074,2025,2,Scopus
3,Sensitivity-propagated dual-frequency graph ne...,,10.1016/j.neucom.2025.131644,2025,0,Scopus
4,HUTFormer: Hierarchical U-Net transformer for ...,,10.1016/j.commtr.2025.100218,2025,0,Scopus
...,...,...,...,...,...,...
426,Network Traffic Forecasting via Fuzzy Spatial-...,Spatial-temporal network traffic prediction re...,10.1109/iscmi63661.2024.10851677,2024,2,IEEE
427,Mobile Traffic Prediction in Consumer Applicat...,Mobile traffic prediction is an important yet ...,10.1109/tce.2024.3361037,2024,63,IEEE
428,Large Language Models for Wireless Cellular Tr...,Wireless cellular traffic prediction is essent...,10.1109/globecom52923.2024.10901784,2024,2,IEEE
429,Graph Attention LSTM Network: A New Model for ...,For the road networks containing multiple inte...,10.1109/icisce.2018.00058,2018,45,IEEE


In [17]:
# Write the deduplicated table to disk; index=False leaves the row index out of the file
all_df.to_csv("RQ2_Merged_Deduped.csv", index=False)

In [18]:
import pandas as pd
import re
from difflib import SequenceMatcher
from collections import defaultdict

# Renumber rows 0..n-1 (discarding the old labels) so the positional i/j
# used in the pair search below coincide with the index labels
all_df = all_df.reset_index(drop=True)

def bigrams(s):
    """Return the set of adjacent character pairs in ``s``, ignoring spaces.

    Spaces are removed first, so pairs may span word boundaries. Strings with
    fewer than two remaining characters give an empty set.
    """
    compact = s.replace(" ", "")
    # Pairing the string with itself shifted by one yields each adjacent pair;
    # zip produces nothing when fewer than two characters remain.
    return {left + right for left, right in zip(compact, compact[1:])}

# Recreate the normalized-title column used as the matching key
all_df["title_norm"] = all_df["title"].apply(normalize_title)

# One bigram set per row, in row order
bgs = [bigrams(norm_title) for norm_title in all_df["title_norm"].tolist()]

# Inverted index: bigram -> positions of the rows whose title contains it.
# It is used to restrict comparisons to rows sharing at least one bigram.
inv = defaultdict(set)
for row_pos, grams in enumerate(bgs):
    for gram in grams:
        inv[gram].add(row_pos)

def jaccard(a, b):
    """Jaccard similarity of two sets: ``|a & b| / |a | b|``.

    Returns ``0.0`` when both sets are empty, where the ratio is undefined.
    """
    union = a | b
    if not union:
        return 0.0
    return len(a & b) / len(union)

# A pair must reach this bigram-set Jaccard overlap before the more expensive
# SequenceMatcher comparison is run on it.
threshold_jaccard = 0.35
# Minimum SequenceMatcher ratio for a pair to be reported as a near-duplicate.
threshold_sim = 0.90

titles_norm = all_df["title_norm"].tolist()  # plain list: cheaper than .iloc inside the loops

# Explicit schema so the result frame has these columns even when no pair
# qualifies; sorting an empty, column-less frame by "similarity" would raise
# KeyError.
dup_columns = [
    "idx1", "idx2", "similarity",
    "title_1", "title_2",
    "doi_1", "doi_2",
    "year_1", "year_2",
]
rows = []

for i in range(len(all_df)):
    # Every row sharing at least one bigram with row i is a candidate.
    cand = set()
    for g in bgs[i]:
        cand |= inv[g]

    # Restricting to j > i skips the self-pair and visits each unordered pair
    # exactly once, so no separate "seen" bookkeeping is needed. Sorting makes
    # the order of `rows` reproducible.
    for j in sorted(c for c in cand if c > i):
        jac = jaccard(bgs[i], bgs[j])
        if jac < threshold_jaccard:
            continue

        sim = SequenceMatcher(None, titles_norm[i], titles_norm[j]).ratio()
        if sim < threshold_sim:
            continue

        rec_i = all_df.iloc[i]
        rec_j = all_df.iloc[j]
        rows.append({
            "idx1": i,
            "idx2": j,
            "similarity": round(sim, 4),
            "title_1": rec_i["title"],
            "title_2": rec_j["title"],
            "doi_1": rec_i.get("doi", ""),
            "doi_2": rec_j.get("doi", ""),
            "year_1": rec_i.get("year", pd.NA),
            "year_2": rec_j.get("year", pd.NA),
        })

dup_candidates = (
    pd.DataFrame(rows, columns=dup_columns)
    .sort_values("similarity", ascending=False)
)
print("Near-duplicate candidates found:", len(dup_candidates))
dup_candidates.head(30)

Near-duplicate candidates found: 2


Unnamed: 0,idx1,idx2,similarity,title_1,title_2,doi_1,doi_2,year_1,year_2
1,97,113,0.9669,Dynamic Hypergraph Structure Learning for Mult...,Dynamic graph structure learning for multivari...,10.1109/tbdata.2024.3362188,10.1016/j.patcog.2023.109423,2024,2023
0,22,113,0.9116,Evolving graph structure learning for multivar...,Dynamic graph structure learning for multivari...,10.1016/j.knosys.2025.113190,10.1016/j.patcog.2023.109423,2025,2023
