In [9]:
import pandas as pd

RQ1= pd.read_csv("RQ1_Merged_Deduped.csv")
RQ2= pd.read_csv("RQ2_Merged_Deduped.csv")

all_df = pd.concat([RQ1, RQ2], ignore_index=True)
print("Before dedup:", len(all_df))

Before dedup: 915


In [10]:
import pandas as pd
import re

def pick_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

# ---------- Normalization helpers ----------

def normalize_title(title):
    if pd.isna(title):
        return ""
    title = title.lower()
    title = re.sub(r'[^a-z0-9 ]+', ' ', title)
    title = re.sub(r'\s+', ' ', title).strip()
    return title

def normalize_doi(doi):
    if pd.isna(doi):
        return ""
    doi = doi.strip().lower()
    doi = doi.replace("https://doi.org/", "")
    doi = doi.replace("http://doi.org/", "")
    doi = doi.replace("doi:", "")
    return doi.strip()

def normalize_df(df):
    df = df.copy()

    # Normalize DOI
    df["doi_norm"] = df["doi"].apply(normalize_doi)
    df.loc[df["doi_norm"] == "", "doi_norm"] = pd.NA

    # Normalize title
    df["title_norm"] = df["title"].apply(normalize_title)
    df.loc[df["title_norm"] == "", "title_norm"] = pd.NA

    # Ensure numeric types
    df["year"] = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
    df["citations"] = pd.to_numeric(df["citations"], errors="coerce").astype("Int64")

    return df



In [11]:
RQ1n = normalize_df(RQ1)
RQ2n = normalize_df(RQ2)

all_df = pd.concat([RQ1n, RQ2n], ignore_index=True)
print("Before dedup:", len(all_df))
all_df

    

Before dedup: 915


Unnamed: 0,title,abstract,doi,year,citations,source,doi_norm,title_norm
0,Graph-patchformer: Patch interaction transform...,,10.1016/j.neunet.2025.108140,2026,0,Scopus,10.1016/j.neunet.2025.108140,graph patchformer patch interaction transforme...
1,Dual Attention Transformer with Multi-scale Pe...,,10.1007/978-981-95-3052-6_24,2026,0,Scopus,10.1007/978-981-95-3052-6_24,dual attention transformer with multi scale pe...
2,Enhancing Salesforce Sales Forecasting with Co...,,10.1007/978-3-032-03558-5_29,2026,0,Scopus,10.1007/978-3-032-03558-5_29,enhancing salesforce sales forecasting with co...
3,M3E: Mixture of Multi-scale Multi-modal Expert...,,10.1007/978-981-95-3398-5_6,2026,0,Scopus,10.1007/978-981-95-3398-5_6,m3e mixture of multi scale multi modal experts...
4,DiM: Improving multivariate time series foreca...,,10.1016/j.neucom.2025.131777,2026,0,Scopus,10.1016/j.neucom.2025.131777,dim improving multivariate time series forecas...
...,...,...,...,...,...,...,...,...
910,Network Traffic Forecasting via Fuzzy Spatial-...,Spatial-temporal network traffic prediction re...,10.1109/iscmi63661.2024.10851677,2024,2,IEEE,10.1109/iscmi63661.2024.10851677,network traffic forecasting via fuzzy spatial ...
911,Mobile Traffic Prediction in Consumer Applicat...,Mobile traffic prediction is an important yet ...,10.1109/tce.2024.3361037,2024,63,IEEE,10.1109/tce.2024.3361037,mobile traffic prediction in consumer applicat...
912,Large Language Models for Wireless Cellular Tr...,Wireless cellular traffic prediction is essent...,10.1109/globecom52923.2024.10901784,2024,2,IEEE,10.1109/globecom52923.2024.10901784,large language models for wireless cellular tr...
913,Graph Attention LSTM Network: A New Model for ...,For the road networks containing multiple inte...,10.1109/icisce.2018.00058,2018,45,IEEE,10.1109/icisce.2018.00058,graph attention lstm network a new model for t...


In [None]:
all_df.drop_duplicates(subset=["doi"], inplace=True)
all_df.drop_duplicates(subset=["title_norm"], inplace=True)
all_df.drop(columns=["title_norm"], inplace=True)
print("After dedup:", len(all_df))


After dedup: 760


In [None]:
all_df.to_csv("RQ1+RQ2.csv", index=False)