In [12]:
import pandas as pd
import numpy as np

df = pd.read_csv("data/0510-citation-pub-data.csv")

  df = pd.read_csv("data/0510-citation-pub-data.csv")


In [13]:
df.shape

(99864, 117)

In [14]:
# Get the unique publication names
pubs = df["Journal"].unique()
print("Number of unique publication names: ", len(pubs))

Number of unique publication names:  14047


In [15]:
# Preprint servers are typically from either *rxiv or ssrn, zenodo, preprints, research square, osf preprints
df["is_preprint"] = df["Journal"].str.contains("rxiv|ssrn|zenodo|preprints|research square|osf preprints", case=False)
print("Number of preprints: ", df["is_preprint"].sum())

Number of preprints:  3487


In [16]:
unique_publishers = df["Publisher"].unique()

print(unique_publishers[:1000])

['Elsevier' nan 'IEEE' 'Springer US' 'ACM' 'Springer Nature Switzerland'
 'SAGE Publications' 'Schloss Dagstuhl-Leibniz-Zentrum für Informatik'
 'IOP Publishing' 'Springer International Publishing' 'Taylor & Francis'
 'MDPI' 'Springer Berlin Heidelberg' 'Springer Nature' 'CRC Press'
 'Sage Publications' 'Academic Press' 'Elsevier Science Publishers BV'
 'IEEE Computer Society' 'Springer Netherlands' 'Springer-Verlag'
 'John Wiley & Sons, Ltd.' 'Gesellschaft für Informatik eV'
 'North-Holland' 'IGI Global' 'Kluwer Academic Publishers' 'Springer'
 'Springer Science & Business Media' 'Dynamic Publishers, Inc.'
 'Berlin: Springer-Verlag, 1973-' 'Singapore'
 'SCS European Publishing House, Germany' 'Nova Science Publishers' 'OUP'
 'Society for Computer Simulation International'
 'World Scientific Publishing Company'
 'New York, NY: ACM Special Interest Group on Simulation; Washington, DC: IEE Computer Society, Technical Committee on Simulation], 1988-c1997.'
 'IEEE Computer Society Press' '

In [17]:
df["is_journal"] = ~df["is_preprint"].astype(bool) & (df["Journal"].str.contains("journal|transactions|j.|trans", case=False) | df["Publisher"].str.contains("hindawi|plos|bmj|nature|iop|aps|acs|rsc|sage|frontiers|pergamon|wiley|publish|plos", case=False))
print("Number of journal papers: ", df["is_journal"].sum())

Number of journal papers:  23238


In [18]:
df["is_conference"] = ~df["is_journal"].astype(bool) & ~df["is_preprint"].astype(bool) & df["Journal"].str.contains("conference|symposium|workshop|proceeding|proc|review|digest|conf|symp", case=False)
print("Number of conference papers: ", df["is_conference"].sum())

Number of conference papers:  6992


In [19]:
df["is_journal_publisher_empty"] = df["Journal"].isnull() & df["Publisher"].isnull()
print("Number of papers with empty journal field: ", df["is_journal_publisher_empty"].sum())

Number of papers with empty journal field:  21215


In [20]:
df["is_institutional"] = df["Journal"].str.contains("university|institute|college|school|center", case=False) | df["Publisher"].str.contains("university|institute|college|school|center|sloan|exeter|epfl", case=False)
print("Number of institutional papers: ", df["is_institutional"].sum())

Number of institutional papers:  1913


In [21]:
print("Number of papers with empty journal field and preprint: ", (df["is_preprint"] & df["is_journal_publisher_empty"]).sum())
print("Number of papers with empty journal field and conference: ", (df["is_conference"] & df["is_journal_publisher_empty"]).sum())
print("Number of papers with empty journal field and journal: ", (df["is_journal"] & df["is_journal_publisher_empty"]).sum())
print("Number of papers with empty journal field and institutional: ", (df["is_institutional"] & df["is_journal_publisher_empty"]).sum())

Number of papers with empty journal field and preprint:  0
Number of papers with empty journal field and conference:  0
Number of papers with empty journal field and journal:  0
Number of papers with empty journal field and institutional:  0


In [23]:
def apply_not_found(row):
    if row['is_journal_publisher_empty']:
        return 'empty_journal_publisher'
    else:
        return 'not_found'

df['publication_type'] = df.apply(apply_not_found, axis=1)
print("Number of not found papers: ", (df["publication_type"] == 'not_found').sum())
print(df['publication_type'].value_counts())

def classify_publication(row):
    if row['publication_type'] == 'not_found':
        if row['is_preprint']:
            return 'preprint'
        elif row['is_conference']:
            return 'conference'
        elif row['is_journal']:
            return 'journal'
        elif row['is_institutional']:
            return 'institutional'
        elif row['is_journal_publisher_empty']:
            return 'empty_journal_publisher'
        else:
            return 'not_found'
    else:
        return row['publication_type']

df['publication_type'] = df.apply(classify_publication, axis=1)

# Remove the dummy columns
# df.drop(columns=['is_preprint', 'is_conference', 'is_journal', 'is_journal_publisher_empty', 'is_institutional'], inplace=True)

Number of not found papers:  78649
publication_type
not_found                  78649
empty_journal_publisher    21215
Name: count, dtype: int64


In [24]:
df['publication_type'].value_counts()

publication_type
preprint                   32646
journal                    23238
empty_journal_publisher    21215
not_found                  14878
conference                  6992
institutional                895
Name: count, dtype: int64

In [25]:
unclassified = df[~df["is_preprint"].astype(bool) & ~df["is_conference"].astype(bool) & ~df["is_journal"].astype(bool) & ~df["is_journal_publisher_empty"].astype(bool) & ~df["is_institutional"].astype(bool)]
print("Number of unclassified papers: ", len(unclassified))

unique_unclassified = unclassified["Journal"].unique()
print("Number of unique unclassified publication names: ", len(unique_unclassified))

Number of unclassified papers:  14878
Number of unique unclassified publication names:  3970


In [26]:
# Check if is_* are mutually exclusive
print(df[df["is_preprint"] & df["is_conference"]][["Publisher", "Journal"]])

Empty DataFrame
Columns: [Publisher, Journal]
Index: []


In [27]:
print(unique_unclassified[:1000])

['Artificial Intelligence' 'Multimedia Tools and Applications'
 'IEEE Access'
 'Computer Science Methods for Effective and Sustainable Simulation Studies'
 'Simulation Modelling Practice and Theory'
 'Advanced Engineering Informatics' 'Report from Dagstuhl Seminar'
 'Resuscitation' 'Safety science'
 'Concurrency and Computation: Practice and Experience' 'Soft Computing'
 'Information Sciences' 'Applied Soft Computing' 'Computer Graphics Forum'
 'Computer graphics forum' 'Autonomous Agents and Multi-Agent Systems'
 'Computer Animation and Virtual Worlds'
 'Intelligent Control and Computer Engineering' 'The visual computer'
 'Future Generation Computer Systems' 'Parallel Computing'
 'New Horizons of Parallel and Distributed Computing'
 'Lecture Notes in Computer Science' 'Information and Software Technology'
 'IEEE Parallel & Distributed Technology: Systems & Applications'
 'Science of Computer Programming' 'Lecture notes in computer science'
 'Empirical Software Engineering' 'AGU Fall M

In [28]:
# Most of these are probably journals, but we can use ChatGPT to identify the ones that are not journals.

RECLASSIFY = False

if RECLASSIFY:
    system_prompt = """
    We have a list of publications. We need to classify each provided publication into a publication type, as we don't know what to classify them as.
    Output with the following JSON template, with one categorization for each publication. Don't do extra publications as it will confuse the system. 

    {"publications": [
    {
    "publication": "XYZ publication",
    "classification":  "journal" | "preprint" | "conference" | "institutional" | "other"
    }
    ]}.

    """

    import os
    from openai import OpenAI
    import dotenv
    import json
    from tqdm import tqdm

    dotenv.load_dotenv()
    client = OpenAI(
        api_key=os.getenv("OPENAI_API_KEY")
    )

    # Chunk the list of unknown publications into groups of 100
    chunks = [unique_unclassified[i:min(i + 25, len(unique_unclassified))] for i in range(0, len(unique_unclassified), 25)]
    all_classifications = {}

    for chunk in tqdm(chunks):
        print("Checking chunk: ", ", ".join(chunk))
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": "\n".join(chunk)}
            ],
            response_format={"type": "json_object"},
            max_tokens=2048
        )

        classifications = response.choices[0].message.content
        # print(classifications)
        classifications = json.loads(classifications)
        classifications = classifications["publications"]
        print(classifications)
        for classification in classifications:
            all_classifications[classification["publication"]] = classification["classification"]

    with open("data/0510-unclassified-publications-classifications.json", "w") as f:
        json.dump(all_classifications, f)
else:
    import json
    with open("data/0510-unclassified-publications-classifications.json", "r") as f:
        all_classifications = json.load(f)

In [29]:
all_classifications

{'Artificial Intelligence': 'journal',
 'Multimedia Tools and Applications': 'journal',
 'IEEE Access': 'journal',
 'Computer Science Methods for Effective and Sustainable Simulation Studies': 'other',
 'Simulation Modelling Practice and Theory': 'journal',
 'Advanced Engineering Informatics': 'journal',
 'Report from Dagstuhl Seminar': 'institutional',
 'Resuscitation': 'journal',
 'Safety science': 'journal',
 'Concurrency and Computation: Practice and Experience': 'journal',
 'Soft Computing': 'journal',
 'Information Sciences': 'journal',
 'Applied Soft Computing': 'journal',
 'Computer Graphics Forum': 'journal',
 'Computer graphics forum': 'journal',
 'Autonomous Agents and Multi-Agent Systems': 'journal',
 'Computer Animation and Virtual Worlds': 'journal',
 'Intelligent Control and Computer Engineering': 'other',
 'The visual computer': 'journal',
 'Future Generation Computer Systems': 'journal',
 'Parallel Computing': 'journal',
 'New Horizons of Parallel and Distributed Compu

In [30]:
df['publication_type'].value_counts()

publication_type
preprint                   32646
journal                    23238
empty_journal_publisher    21215
not_found                  14878
conference                  6992
institutional                895
Name: count, dtype: int64

In [31]:
import rapidfuzz
from rapidfuzz import process

def get_closest_classification(publication):
    if publication in all_classifications:
        return all_classifications[publication]
    else:
        closest_match = process.extractOne(publication, all_classifications.keys())
        if closest_match:
            return all_classifications[closest_match[0]]
        else:
            return 'not_found'

df.loc[df['publication_type'] == 'not_found', 'publication_type'] = df.loc[df['publication_type'] == 'not_found', 'Journal'].apply(get_closest_classification)

In [32]:
df['publication_type'].value_counts()

publication_type
journal                    34857
preprint                   32862
empty_journal_publisher    21215
conference                  8536
institutional               1230
other                       1158
book                           6
Name: count, dtype: int64

In [34]:
df.drop(columns=['is_preprint', 'is_conference', 'is_journal', 'is_journal_publisher_empty', 'is_institutional'], inplace=True)
df.columns

Index(['Researcher of Interest', 'School', 'Publication Link', 'Authors',
       'Publication date', 'Journal', 'Volume', 'Pages', 'Publisher',
       'Description',
       ...
       'Citations 2017', 'Citations 2018', 'Citations 2019', 'Citations 2020',
       'Citations 2021', 'Citations 2022', 'Citations 2023', 'Citations 2024',
       'Total Citation Count', 'publication_type'],
      dtype='object', length=118)

In [35]:
df.to_csv("data/0510-citation-pub-data-publication-classified.csv", index=False)