# Deduplicate Literature Review CSV Files

This notebook:
1. Loads three CSV files (ACM, IEEE, Scopus).
2. Normalizes titles and DOIs.
3. Identifies duplicated records (by DOI and by title).
4. Shows the duplicated entries in dataframes.
5. Produces a deduplicated dataframe and saves it to disk.


In [None]:
import re
import unicodedata
from pathlib import Path

import pandas as pd
from IPython.display import display

# ---------- Helper Functions ----------

def normalize_title(title):
    """Return a comparison key for a publication title.

    The key is produced by folding diacritics to their base letters,
    lowercasing, replacing every run of characters other than ``a-z``,
    ``0-9`` and space with a single space, and collapsing/stripping
    whitespace.

    Parameters
    ----------
    title : str or missing value
        Raw title. Missing values (``None``/``NaN``/``pd.NA``) yield ``""``;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The normalized title, or ``""`` when the input is missing.
    """
    if pd.isna(title):
        return ""
    # Decompose accented characters (e.g. "é" -> "e" + combining accent) and
    # drop the combining marks; otherwise the ASCII-only filter below would
    # split the word ("réseau" -> "r seau") and the same title written with
    # and without diacritics would not match.
    decomposed = unicodedata.normalize("NFKD", str(title))
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = text.lower()
    text = re.sub(r'[^a-z0-9 ]+', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

def normalize_doi(doi):
    """Return a comparison key for a DOI.

    The value is stripped, lowercased, and any leading resolver URL
    (``http://`` or ``https://``, ``doi.org`` or ``dx.doi.org``) or ``doi:``
    label is removed, so the same DOI written in different forms compares
    equal.

    Parameters
    ----------
    doi : str or missing value
        Raw DOI. Missing values (``None``/``NaN``/``pd.NA``) yield ``""``;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The bare lowercase DOI, or ``""`` when the input is missing.
    """
    if pd.isna(doi):
        return ""
    doi = str(doi).strip().lower()
    # Only a prefix at the start of the value is removed; the identifier
    # itself is left untouched.
    doi = re.sub(r'^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)', '', doi)
    return doi.strip()

# ---------- Load Files ----------

# NOTE(review): these are machine-specific absolute paths; adjust them (or move
# them to a configurable data directory) before running on another machine.
files = [
    r"C:\Users\Andre Silva\Desktop\Literature Review\RQ1\acm.csv",
    r"C:\Users\Andre Silva\Desktop\Literature Review\RQ1\IEEEexport2025.12.11-16.55.31.csv",
    r"C:\Users\Andre Silva\Desktop\Literature Review\RQ1\scopus_export_Dec 11-2025_daae2a1c-ddb4-43f4-93f5-6ff62a5fbc23.csv"
]

dfs = []
for f in files:
    print(f"Loading {f}")
    # dtype=str keeps identifiers (DOI, ISBN, ...) as text.
    df = pd.read_csv(f, dtype=str)
    df["source_file"] = Path(f).name
    dfs.append(df)

data = pd.concat(dfs, ignore_index=True)

print(f"Total records loaded: {len(data)}")
display(data.head())

# ---------- Normalize fields ----------

def _first_nonempty(frame, wanted_names, normalizer):
    """Coalesce same-meaning columns into one normalized Series.

    Every column of ``frame`` whose stripped, lower-cased header is in
    ``wanted_names`` is passed through ``normalizer``; for each row the first
    non-empty normalized value (in column order) is kept, ``""`` if none.
    """
    merged = pd.Series("", index=frame.index, dtype=object)
    for col in frame.columns:
        if str(col).strip().lower() in wanted_names:
            candidate = frame[col].apply(normalizer)
            # Keep what is already filled; take the candidate only for blanks.
            merged = merged.where(merged != "", candidate)
    return merged

# Kept so the saved CSV always carries a "doi" column.
if "doi" not in data.columns:
    data["doi"] = ""

# The exports do not share header names, so after concatenation each record's
# DOI/title may live in a differently named column. Headers are matched
# case-insensitively and coalesced row by row instead of reading one column.
# NOTE(review): the accepted header names are assumptions about the export
# formats (e.g. "DOI", "Title", "Document Title") -- confirm against the files.
data["norm_doi"] = _first_nonempty(data, {"doi"}, normalize_doi)
data["norm_title"] = _first_nonempty(data, {"title", "document title"}, normalize_title)

has_doi = data["norm_doi"] != ""
has_title = data["norm_title"] != ""

# ---------- Find duplicates ----------

# Duplicates by DOI (ignoring empty DOIs)
duplicates_by_doi = data[
    has_doi & data.duplicated(subset=["norm_doi"], keep=False)
]

print(f"\nNumber of records involved in DOI duplicates: {len(duplicates_by_doi)}")
display(duplicates_by_doi.head(20))

# Duplicates by title (ignoring empty titles, which would otherwise all be
# reported as duplicates of one another)
duplicates_by_title = data[
    has_title & data.duplicated(subset=["norm_title"], keep=False)
]

print(f"\nNumber of records involved in title duplicates: {len(duplicates_by_title)}")
display(duplicates_by_title.head(20))

# ---------- Deduplication Logic ----------
# A blank key means "unknown", not "same record": drop_duplicates treats equal
# values (including "") as duplicates, so rows with a blank key are set aside
# and re-appended untouched instead of being collapsed into a single row.

# Rule 1: Deduplicate by DOI when present
unique_by_doi = (
    data[has_doi]
    .sort_values("norm_doi", kind="mergesort")
    .drop_duplicates(subset=["norm_doi"], keep="first")
)
no_duplicate = pd.concat([unique_by_doi, data[~has_doi]])

# Rule 2: Deduplicate remaining by normalized title
# The stable sort keeps DOI-bearing rows (placed first above) ahead of rows
# without a DOI for the same title, so keep="first" prefers the record that
# has a DOI.
titled = no_duplicate["norm_title"] != ""
unique_by_title = (
    no_duplicate[titled]
    .sort_values("norm_title", kind="mergesort")
    .drop_duplicates(subset=["norm_title"], keep="first")
)
no_duplicate = pd.concat([unique_by_title, no_duplicate[~titled]])

print(f"\nDeduplicated records: {len(no_duplicate)}")
display(no_duplicate.head(20))

# ---------- Save Output ----------
output_path = r"C:\Users\Andre Silva\Desktop\Literature Review\RQ1\deduplicated_publications.csv"
no_duplicate.to_csv(output_path, index=False)

print(f"\nDeduplicated file saved to: {output_path}")


Loading C:\Users\Andre Silva\Desktop\Literature Review\RQ1\acm.csv
Loading C:\Users\Andre Silva\Desktop\Literature Review\RQ1\IEEEexport2025.12.11-16.55.31.csv
Loading C:\Users\Andre Silva\Desktop\Literature Review\RQ1\scopus_export_Dec 11-2025_daae2a1c-ddb4-43f4-93f5-6ff62a5fbc23.csv
Total records loaded: 684


Unnamed: 0,type,id,abstract,address,articleno,author,booktitle,doi,isbn,issn,...,Art. No.,Page start,Page end,Cited by,Link,Document Type,Publication Stage,Open Access,Source,EID
0,inproceedings,10.1145/3539618.3591784,Knowledge graph embedding (KGE) aims to projec...,"New York, NY, USA",,"Zhang, Zhao and Guan, Zhanpeng and Zhang, Fuwe...",Proceedings of the 46th International ACM SIGI...,10.1145/3539618.3591784,9781450394086.0,,...,,,,,,,,,,
1,inproceedings,10.1145/3711896.3737046,Multivariate Time Series Forecasting (MTSF) in...,"New York, NY, USA",,"Yu, Chengqing and Wang, Fei and Yang, Chuangua...",Proceedings of the 31st ACM SIGKDD Conference ...,10.1145/3711896.3737046,9798400714542.0,,...,,,,,,,,,,
2,article,10.1145/3718091,Real-world time series data is inherently comp...,"New York, NY, USA",72.0,"Fan, Jinxiao and Wang, Pengfei and Liu, Liang ...",,10.1145/3718091,,2157-6904,...,,,,,,,,,,
3,inproceedings,10.1145/3746252.3761007,Multivariate time series (MTS) forecasting is ...,"New York, NY, USA",,"Li, Xinhui and Yue, Kun and Yu, Lixing and Yan...",Proceedings of the 34th ACM International Conf...,10.1145/3746252.3761007,9798400720406.0,,...,,,,,,,,,,
4,inproceedings,10.1145/3690624.3709202,Leveraging graph structures for time-series fo...,"New York, NY, USA",,"Chen, Hongjie and Rossi, Ryan A. and Kim, Sung...",Proceedings of the 31st ACM SIGKDD Conference ...,10.1145/3690624.3709202,9798400712456.0,,...,,,,,,,,,,



Number of records involved in DOI duplicates: 0


Unnamed: 0,type,id,abstract,address,articleno,author,booktitle,doi,isbn,issn,...,Page end,Cited by,Link,Document Type,Publication Stage,Open Access,Source,EID,norm_doi,norm_title



Number of records involved in title duplicates: 382


Unnamed: 0,type,id,abstract,address,articleno,author,booktitle,doi,isbn,issn,...,Page end,Cited by,Link,Document Type,Publication Stage,Open Access,Source,EID,norm_doi,norm_title
302,,,,,,,,,,,...,,,,,,,,,,
303,,,,,,,,,,,...,,,,,,,,,,
304,,,,,,,,,,,...,,,,,,,,,,
305,,,,,,,,,,,...,,,,,,,,,,
306,,,,,,,,,,,...,,,,,,,,,,
307,,,,,,,,,,,...,,,,,,,,,,
308,,,,,,,,,,,...,,,,,,,,,,
309,,,,,,,,,,,...,,,,,,,,,,
310,,,,,,,,,,,...,,,,,,,,,,
311,,,,,,,,,,,...,,,,,,,,,,



Deduplicated records: 293


Unnamed: 0,type,id,abstract,address,articleno,author,booktitle,doi,isbn,issn,...,Page end,Cited by,Link,Document Type,Publication Stage,Open Access,Source,EID,norm_doi,norm_title
341,,,,,,,,,,,...,,,,,,,,,,
272,article,10.1145/3451394,Crowd flow prediction is an essential task ben...,"New York, NY, USA",110.0,"Xia, Tong and Lin, Junjie and Li, Yong and Fen...",,10.1145/3451394,,1556-4681,...,,,,,,,,,10.1145/3451394,3dgcn 3 dimensional dynamic graph convolutiona...
145,inproceedings,10.1145/3583780.3614759,"In this work, we focus on robust time series r...","New York, NY, USA",,"Zhang, Weiqi and Zhang, Jianfeng and Li, Jia a...",Proceedings of the 32nd ACM International Conf...,10.1145/3583780.3614759,9798400701245.0,,...,,,,,,,,,10.1145/3583780.3614759,a co training approach for noisy time series l...
247,article,10.1145/3749156,With recent advancements in graph neural netwo...,"New York, NY, USA",238.0,"Liao, Ningyi and Liu, Haoyu and Zhu, Zulun and...",,10.1145/3749156,,,...,,,,,,,,,10.1145/3749156,a comprehensive benchmark on spectral gnns the...
208,inproceedings,10.1145/3589334.3645391,Accurate customer LifeTime Value (LTV) predict...,"New York, NY, USA",,"Zhou, Zhiyuan and Lin, Li and Wang, Hai and Zh...",Proceedings of the ACM Web Conference 2024,10.1145/3589334.3645391,9798400701719.0,,...,,,,,,,,,10.1145/3589334.3645391,a cross domain method for customer lifetime va...
39,inproceedings,10.1145/3704558.3707079,Tobacco pest is one of the main factors that h...,"New York, NY, USA",,"Zhu, Liming and Kong, Xu and Li, Ming and Qin,...",Proceedings of the 2024 2nd International Conf...,10.1145/3704558.3707079,9798400710681.0,,...,,,,,,,,,10.1145/3704558.3707079,a distribution graph guided network with dual ...
244,article,10.1145/3759440,This research aims to develop a novel framewor...,"New York, NY, USA",47.0,"E, Subha and V, Jothi Prakash and S, Arul Antr...",,10.1145/3759440,,1559-1131,...,,,,,,,,,10.1145/3759440,a graph based framework for temporal and causa...
24,article,10.14778/3705829.3705842,Accurate long-term forecasting from multivaria...,,,"Cheng, Yunyao and Guo, Chenjuan and Yang, Bin ...",,10.14778/3705829.3705842,,2150-8097,...,,,,,,,,,10.14778/3705829.3705842,a memory guided transformer for time series fo...
249,inproceedings,10.1145/3748777.3748784,Accurate vessel trajectory prediction facilita...,"New York, NY, USA",,"Yu, Haomin and Li, Tianyi and Torp, Kristian a...",Proceedings of the 19th International Symposiu...,10.1145/3748777.3748784,,,...,,,,,,,,,10.1145/3748777.3748784,a multi modal knowledge enhanced framework for...
234,article,10.14778/3654621.3654637,"Time series data, including univariate and mul...",,,"Zhong, Shuhan and Song, Sizhe and Zhuo, Weipen...",,10.14778/3654621.3654637,,2150-8097,...,,,,,,,,,10.14778/3654621.3654637,a multi scale decomposition mlp mixer for time...



Deduplicated file saved to: C:\Users\Andre Silva\Desktop\Literature Review\RQ1\deduplicated_publications.csv


In [None]:
# ---------- Combined Duplicate Records ----------

# Stack the DOI-based and title-based duplicate tables on a fresh 0..n-1
# index, then discard rows that are identical in every column (a record
# flagged by both criteria appears in both tables).
combined_duplicates = pd.concat(
    [duplicates_by_doi, duplicates_by_title],
    ignore_index=True,
).drop_duplicates()

print(f"\nTotal unique duplicated records (DOI + Title): {len(combined_duplicates)}")

display(combined_duplicates)

In [None]:
# Reload the saved CSV to check the export. dtype=str matches how the source
# files were read, so identifier-like columns (DOI, ISBN, ...) stay text
# instead of being re-inferred as numbers.
deduplicated_results = pd.read_csv(
    r"C:\Users\Andre Silva\Desktop\Literature Review\RQ1\deduplicated_publications.csv",
    dtype=str,
)
deduplicated_results.head()

Unnamed: 0,type,id,abstract,address,articleno,author,booktitle,doi,isbn,issn,...,Page end,Cited by,Link,Document Type,Publication Stage,Open Access,Source,EID,norm_doi,norm_title
