Import requirements.

In [13]:
import pandas as pd
import json
from tqdm import tqdm
import nltk
from joblib import Parallel, delayed
import os
from pprint import pprint

In [3]:
threads = os.cpu_count()

Read metadata and retrieve all the titles of articles and journals.

In [4]:
# Read metadata JSON file in order to build a dictionary
metadata = open("../Data/metadata.json")
metadata_dict = json.load(metadata)

article_titles = list()
for paper in metadata_dict:
    article_titles.append(paper["title"])

journal_titles = list()
for paper in metadata_dict:
    journal_titles.append(paper["source_title"])

Read the academic vocabulary.

In [5]:
# Read the "academic_vocabulary.csv" file into a data frame
df = pd.read_csv("../Data/academic_vocabulary.csv")
# Print the columns of the df
print(df.columns)

Index(['word', 'domain'], dtype='object')


Iterate over the values of the column "Domain" to delete empty rows.

In [6]:
df = df.dropna(subset=["domain"])

In [7]:
academic_vocabulary = dict()
for index, row in df.iterrows():
    if "+" in row["domain"]:
        domains = row["domain"].split("+")
        for domain in domains:
            if domain not in academic_vocabulary:
                academic_vocabulary[domain] = list()
            academic_vocabulary[domain].append(row["word"])
    else:
        if row["domain"] not in academic_vocabulary:
            academic_vocabulary[row["domain"]] = list()
        academic_vocabulary[row["domain"]].append(row["word"])

academic_vocabulary.keys()

dict_keys(['Edu', 'Med', 'Sci', 'His', 'Law', 'Soc', 'Hum', 'Rel', 'Fin', 'mediate', 'Sci '])

Define a function to classify titles.

In [8]:
def classify_field_of_endeavor(title):
    # Tokenize the title
    title.lower()
    tokens = nltk.word_tokenize(title)
    tokens = [token.lower() for token in tokens]

    # Create an empty dictionary to store the frequency of each field in the title
    field_counts = {}
    for field in academic_vocabulary:
        field_counts[field] = 0

    # Loop through each token and increment the count for any field that matches
    for token in tokens:
        for field in academic_vocabulary:
            if token in academic_vocabulary[field]:
                field_counts[field] += 1

    # Field with the highest frequency
    highest = max(field_counts, key=field_counts.get)
    if field_counts[highest] == 0:
        return "Unclassified"
    return highest

In [10]:
fields = Parallel(n_jobs=threads)(delayed(classify_field_of_endeavor)(title) for title in tqdm(article_titles))

100%|██████████| 49719/49719 [00:06<00:00, 7759.46it/s] 


In [11]:
overview_articles = dict()
for field in fields:
    if field not in overview_articles:
        overview_articles[field] = 0
    overview_articles[field] += 1
overview_articles

{'Med': 42355,
 'Sci': 3040,
 'Edu': 1250,
 'His': 37,
 'Hum': 244,
 'Fin': 17,
 'Soc': 52,
 'Unclassified': 2676,
 'Law': 26,
 'Rel': 22}