In [None]:
import pandas as pd
import bibtexparser # Used to read .bib files
import re
# Display option: None removes the cap on how many columns are shown when a
# DataFrame is rendered, so wide frames are not elided.
pd.set_option("display.max_columns", None)

In [None]:

# Read the IEEE records from the CSV file into a DataFrame.
ieee_df = pd.read_csv("dataset/IEEE.csv")
# Bare last expression: the notebook renders the frame as the cell output.
ieee_df


In [None]:
# Parse the ACM BibTeX file. The file is opened as UTF-8 text and is closed
# automatically when the `with` block ends.
with open("dataset/acm.bib", encoding="utf-8") as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)

# Convert entries to a pandas DataFrame
# (presumably `entries` is a list of per-record dicts, giving one row per
# BibTeX entry — confirm against the bibtexparser version in use).
acm_df = pd.DataFrame(bib_database.entries)
# Bare last expression: the notebook renders the frame as the cell output.
acm_df



In [None]:
# Read the Springer records from the parquet file into a DataFrame.
springer_df = pd.read_parquet("dataset/springer.parquet")
# Bare last expression: the notebook renders the frame as the cell output.
springer_df

In [None]:
# Read the PubMed records from the parquet file into a DataFrame.
pubmed_df = pd.read_parquet("dataset/pubmed.parquet")

# Bare last expression: the notebook renders the frame as the cell output.
pubmed_df

In [None]:
# Read the ScienceDirect records from the parquet file into a DataFrame.
sciencedirect_df = pd.read_parquet("dataset/sciencedirect.parquet")
# `elsevier_df` is a second name for the SAME DataFrame object (no copy is
# made), so any in-place change made through one name is visible through the other.
elsevier_df = sciencedirect_df

In [None]:
# Cross-library duplicate-title report.
# Works on the raw, per-library frames and prints every title that occurs more
# than once in the stacked list, together with the libraries it came from.
# A title repeated inside a single library is reported as well, with that
# library listed once per occurrence.

# 1. Create a list of tuples: (dataframe_name, title_series)
df_sources = [
    ("ACM", acm_df["title"]),
    ("Elsevier", elsevier_df["title"]),
    ("IEEE", ieee_df["Document Title"]),
    ("PubMed", pubmed_df["Title"]),
    ("Springer", springer_df["Item Title"])
]

# 2. Stack the titles into one long frame, each row tagged with its source.
#    The comparison key is lower-cased AND stripped of surrounding whitespace,
#    i.e. the same key the later de-duplication cells build from
#    combined_df['title']; lower-casing alone would let titles that differ only
#    by leading/trailing blanks slip through this report.
combined_data = [
    pd.DataFrame({
        'title': title_series.str.lower().str.strip(),
        'source': source_name
    })
    for source_name, title_series in df_sources
]
all_titles_df = pd.concat(combined_data, ignore_index=True)

# 3. Remove rows with missing titles
all_titles_df = all_titles_df.dropna(subset=['title'])

# 4. Find titles that appear more than once
title_counts = all_titles_df['title'].value_counts()
duplicate_titles = title_counts[title_counts > 1].index.tolist()

# 5. Filter to show only duplicates and group by title
if not duplicate_titles:
    print("No duplicate titles found across the DataFrames.")
else:
    duplicates_df = all_titles_df[all_titles_df['title'].isin(duplicate_titles)]

    print(f"Found {len(duplicate_titles)} duplicate titles across datasets")
    print(f"Total duplicate entries: {len(duplicates_df)}")
    print("=" * 80)
    # Group once instead of re-filtering the whole frame for every title.
    # sort=True walks the titles in ascending order; rows inside a group keep
    # their original order, so sources are listed in df_sources order.
    for title, title_sources in duplicates_df.groupby('title', sort=True)['source']:
        sources = title_sources.tolist()
        count = len(sources)

        print(f"\nTitle: {title}")
        print(f"   Total occurrences: {count}")
        print(f"   Found in: {', '.join(sources)}")

In [None]:
# Standardize all dataframes to common schema: id, title, abstract, library, authors, doi, journal, date

_STANDARD_COLUMNS = ["id", "title", "abstract", "library", "authors", "doi", "journal", "date"]


def _standardize_library(raw_df, library, column_map, *, generate_id=False, date_as_text=False):
    """Return a copy of ``raw_df`` reshaped to the common schema.

    The input frame is never modified: renaming produces a new frame and all
    later assignments are made on that new frame.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Records of one library, with that library's own column names.
    library : str
        Value written to the ``library`` column of every row.
    column_map : dict
        Source column name -> standard column name. Source columns that already
        carry the standard name need no entry; keys that are absent from
        ``raw_df`` are ignored by ``rename``.
    generate_id : bool, default False
        When true and no ``id`` column exists after renaming, fill ``id`` with
        the running row number 0..n-1.
    date_as_text : bool, default False
        When true and a ``date`` column exists, convert its values to text.
        Missing values stay missing instead of becoming the literal strings
        "nan" / "None" that a plain ``astype(str)`` would produce.

    Returns
    -------
    pandas.DataFrame
        Exactly the columns of ``_STANDARD_COLUMNS``, in that order. Standard
        columns the library does not provide are filled with ``pd.NA``.
    """
    frame = raw_df.rename(columns=column_map).assign(library=library)

    if generate_id and "id" not in frame.columns:
        frame["id"] = range(len(frame))

    if date_as_text and "date" in frame.columns:
        has_date = frame["date"].notna()
        # Convert first, then blank out the positions that were missing.
        frame["date"] = frame["date"].astype(str).where(has_date)

    for col in _STANDARD_COLUMNS:
        if col not in frame.columns:
            frame[col] = pd.NA

    return frame[_STANDARD_COLUMNS]


# ACM - title, abstract, doi and journal already use the standard names.
acm_df = _standardize_library(
    acm_df, "acm",
    {"ID": "id", "author": "authors", "year": "date"},
)

# Elsevier - same source column names as ACM; dates are stored as text.
elsevier_df = _standardize_library(
    elsevier_df, "elsevier",
    {"ID": "id", "author": "authors", "year": "date"},
    date_as_text=True,
)

# IEEE
# NOTE(review): "ISBNs" is used as the record id. An ISBN normally identifies a
# whole volume rather than one paper, so this id is probably not unique per
# record - confirm before relying on it as a key.
ieee_df = _standardize_library(
    ieee_df, "ieee",
    {
        "ISBNs": "id", "Document Title": "title", "Abstract": "abstract", "Authors": "authors",
        "DOI": "doi", "Publication Title": "journal", "Online Date": "date",
    },
)

# PubMed - the abstract column is expected under its lower-case name.
pubmed_df = _standardize_library(
    pubmed_df, "pubmed",
    {
        "PMID": "id", "Title": "title", "Authors": "authors",
        "DOI": "doi", "Journal/Book": "journal", "Create Date": "date",
    },
)

# Springer - USE ACTUAL COLUMN NAMES from the parquet file
# No source column is mapped to "id", so a running row number is generated
# when the file does not already provide one.
springer_df = _standardize_library(
    springer_df, "springer",
    {
        "Item Title": "title", "Abstract": "abstract", "Authors": "authors",
        "Item DOI": "doi", "Publication Title": "journal", "Publication Year": "date",
    },
    generate_id=True,
    date_as_text=True,
)


In [None]:
# Sanity check after standardization: one row count per library, in a fixed order.
normalized_frames = (
    ("ACM", acm_df),
    ("IEEE", ieee_df),
    ("PubMed", pubmed_df),
    ("Elsevier", elsevier_df),
    ("Springer", springer_df),
)
for library_label, library_frame in normalized_frames:
    print(f"{library_label} records after normalization: {len(library_frame)}")

# Closer look at Springer: its column list and the values in its library column.
springer_columns = springer_df.columns.tolist()
springer_library_counts = springer_df['library'].value_counts()
print(f"\nSpringer columns: {springer_columns}")
print(f"Springer library value counts:\n{springer_library_counts}")


In [None]:
# Stack the five per-library frames row-wise into one frame.
# ignore_index=True discards the per-frame row labels and renumbers 0..n-1.
combined_df = pd.concat([acm_df, elsevier_df, ieee_df, pubmed_df, springer_df], ignore_index=True)
# Total number of rows after stacking.
print(len(combined_df))
# Bare last expression: the notebook renders the frame as the cell output.
combined_df

In [None]:
# Per-library row counts of the freshly concatenated frame, plus the Springer
# count on its own line.
library_column = combined_df['library']
springer_rows = library_column.eq('springer').sum()

print("Library counts immediately after concat:")
print(library_column.value_counts())
print(f"Springer count: {springer_rows}")


In [None]:
# Snapshot of the per-library counts while all duplicate rows are still present.
print("Library counts BEFORE duplicate removal:")
print(combined_df['library'].value_counts())
print(f"\nTotal records: {len(combined_df)}\n")

# Comparison key: the title lower-cased with surrounding whitespace removed.
# It is stored on the frame as the helper column 'title_norm'.
normalized_titles = combined_df['title'].str.lower().str.strip()
combined_df['title_norm'] = normalized_titles

# keep=False flags every member of a repeated key, not only the later ones.
is_repeated = combined_df.duplicated(subset='title_norm', keep=False)
dupes = combined_df.loc[is_repeated]

# One row per repeated key, holding the distinct libraries that key occurs in.
libraries_per_title = dupes.groupby('title_norm')['library'].unique()
result = libraries_per_title.reset_index()

print(result)


In [None]:
# DROP DUPLICATES
# Keep exactly one record per title and drop the redundant copies.
#
# The previous logic flagged keys with duplicated(keep=False) and then removed
# every row carrying such a key. That deleted ALL copies of a paper indexed by
# more than one library, and - because missing values count as duplicates of
# each other - also every row without a title. A paper found in several
# libraries should stay in the corpus once.

# Comparison key: the title lower-cased with surrounding whitespace removed.
title_norm = combined_df['title'].str.lower().str.strip()

# Keys that occur more than once (kept for inspection; not used for filtering).
dupe_keys = title_norm[title_norm.duplicated(keep=False)]

# A row is redundant when an EARLIER row already carries the same key, so the
# first occurrence in the current row order is the one that survives. Rows
# without a title cannot be compared and are therefore never flagged.
is_redundant = title_norm.notna() & title_norm.duplicated(keep='first')
combined_df = combined_df.loc[~is_redundant]

# Drop the helper column if an earlier cell left it on the frame.
combined_df = combined_df.drop(columns='title_norm', errors='ignore')

# Remove records whose journal is medRxiv; rows with a missing journal are kept.
combined_df = combined_df[combined_df['journal'] != 'medRxiv']

combined_df


In [None]:
import matplotlib.pyplot as plt

# Calculate the number of records per library (largest first), print the
# table, then draw it as a bar chart.
library_counts = combined_df.loc[:, 'library'].value_counts()
print(library_counts)
library_counts.plot.bar(title='Number of Records by Library')


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join every non-missing abstract into one space-separated text.
abstract_texts = combined_df['abstract'].dropna().astype(str)
all_abstracts = ' '.join(abstract_texts)

# Build the word cloud from that text.
cloud_options = {'width': 800, 'height': 400, 'background_color': 'white'}
wordcloud = WordCloud(**cloud_options).generate(all_abstracts)

# Draw it on a single axes with the axis decorations hidden.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Wordcloud of Abstracts')
fig.tight_layout(pad=0)
plt.show()

In [None]:
# Count the records per journal, order the counts from largest to smallest and
# keep the first fifteen.
journal_sizes = combined_df.groupby('journal').size()
top15journal = journal_sizes.sort_values(ascending=False).head(15)
# Bar chart of those fifteen journals.
top15journal.plot.bar(title='Top 15 Journals by Number of Publications')


# TODO: add a bar chart showing publications by quarter
# TODO: change the color of each plot
# TODO: build an occurrence matrix