In [1]:
import json
import pandas as pd
import glob

In [2]:
def reconstruct_abstract(inverted_index):
    """Rebuild plain abstract text from an inverted index.

    Args:
        inverted_index: Mapping of word -> list of integer positions at which
            that word occurs in the abstract, or a missing value.

    Returns:
        The words joined by single spaces in position order, each word
        repeated once per listed position. Returns None when the input is not
        a mapping (for example None or a float NaN), and '' for an empty
        mapping.
    """
    # Anything without .items() (None, NaN, ...) is treated as "no abstract".
    if not hasattr(inverted_index, 'items'):
        return None

    # Expand every word to ALL of its positions; using only the first position
    # would drop repeated words and shorten the text.
    placed_words = [
        (position, word)
        for word, positions in inverted_index.items()
        for position in (positions or ())
    ]

    # Order by position only; the sort is stable, so ties keep mapping order.
    placed_words.sort(key=lambda pair: pair[0])

    return ' '.join(word for _, word in placed_words)
    
def extract_author_names(author_list):
    """Join the authors' display names into one '|'-separated string.

    Args:
        author_list: List (or tuple) of authorship dictionaries, each expected
            to hold an 'author' dictionary with a 'display_name' entry.

    Returns:
        The non-empty display names joined with '|' ('' when there are none),
        or None when the input is not a list/tuple (for example None or NaN),
        matching the other extractor helpers in this notebook.
    """
    if not isinstance(author_list, (list, tuple)):
        return None

    author_names = []
    for authorship in author_list:
        if not isinstance(authorship, dict):
            continue
        author = authorship.get('author')
        # 'author' may be present but null; skip it instead of raising.
        if not isinstance(author, dict):
            continue
        display_name = author.get('display_name')
        # A None or empty name would break, or pollute, the joined string.
        if display_name:
            author_names.append(display_name)

    # Join the names with '|'
    return '|'.join(author_names)

def extract_display_names(location_list):
    """Collect the source display names of a list of locations.

    Returns None when ``location_list`` is not a list. Otherwise returns the
    truthy ``location['source']['display_name']`` values, in order, joined
    with '|'. Entries that are not dictionaries, or whose 'source' is not a
    dictionary, are ignored.
    """
    if not isinstance(location_list, list):
        return None

    def _source_name(entry):
        # Name of the entry's source, or None if the structure is not as expected
        if not isinstance(entry, dict):
            return None
        source = entry.get('source')
        return source.get('display_name') if isinstance(source, dict) else None

    candidate_names = (_source_name(entry) for entry in location_list)
    return '|'.join(name for name in candidate_names if name)


def extract_concept_names(concept_list):
    """Join the display names of a list of concepts with '|'.

    Returns None when ``concept_list`` is not a list. Entries that are not
    dictionaries, or whose 'display_name' is missing or falsy, are skipped.
    """
    if not isinstance(concept_list, list):
        return None

    return '|'.join(
        entry['display_name']
        for entry in concept_list
        # .get() yields None for a missing key, so one check covers both cases
        if isinstance(entry, dict) and entry.get('display_name')
    )


In [3]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the files are always processed in the same sequence on every run.
files = sorted(glob.glob("data/*.json"))
len(files)

64

In [4]:
# Parse every JSON file into its own DataFrame
dataframes = [pd.read_json(file) for file in files]

# Stack the per-file frames, order rows by publication year (newest first),
# and renumber the rows from 0 after sorting
merged_df = (
    pd.concat(dataframes, ignore_index=True)
    .sort_values(by='publication_year', ascending=False)
    .reset_index(drop=True)
)

In [5]:
# Derive flat text columns from the nested structures of each record.
merged_df['abstract'] = merged_df['abstract_inverted_index'].apply(reconstruct_abstract)
merged_df['authors'] = merged_df['authorships'].apply(extract_author_names)

merged_df['publication'] = merged_df['locations'].apply(extract_display_names)
# 'concepts' is overwritten in place. If this cell is run a second time the
# values are already '|'-joined strings, so pass those through unchanged
# instead of letting extract_concept_names turn them into None.
merged_df['concepts'] = merged_df['concepts'].apply(
    lambda value: value if isinstance(value, str) else extract_concept_names(value)
)

In [None]:
# Persist the enriched table as CSV. No `index` argument is passed, so with
# pandas' default the row index is written as the first (unnamed) column.
merged_df.to_csv("data/database.csv")

In [None]:
# Show the column labels of the merged table, including the derived columns.
merged_df.columns

Index(['id', 'doi', 'title', 'display_name', 'relevance_score',
       'publication_year', 'publication_date', 'ids', 'language',
       'primary_location', 'type', 'type_crossref', 'open_access',
       'authorships', 'countries_distinct_count',
       'institutions_distinct_count', 'corresponding_author_ids',
       'corresponding_institution_ids', 'apc_list', 'apc_paid', 'has_fulltext',
       'fulltext_origin', 'cited_by_count', 'cited_by_percentile_year',
       'biblio', 'is_retracted', 'is_paratext', 'keywords', 'concepts', 'mesh',
       'locations_count', 'locations', 'best_oa_location',
       'sustainable_development_goals', 'grants', 'referenced_works_count',
       'referenced_works', 'related_works', 'ngrams_url',
       'abstract_inverted_index', 'cited_by_api_url', 'counts_by_year',
       'updated_date', 'created_date', 'is_authors_truncated', 'abstract',
       'authors', 'publication'],
      dtype='object')

In [None]:
# Spot-check the joined concepts string of the row at position 3
merged_df["concepts"].iloc[3]

'DART ion source|Pepper|Mass spectrometry|Chromatography|Dart|Chemistry|Linear discriminant analysis|Gas chromatography–mass spectrometry|Chemometrics|Analytical Chemistry (journal)|Mathematics|Computer science|Statistics|Electron ionization|Ion|Food science|Organic chemistry|Programming language|Ionization'

In [None]:
# Number of distinct values in the 'concepts' column
len(merged_df["concepts"].unique())

190042

In [None]:
# Spot-check the joined concepts string of the row at position 200
merged_df["concepts"].iloc[200]

'Reading (process)|Bridge (graph theory)|Computer science|Linguistics|Philosophy|Medicine|Internal medicine'

In [None]:
def load_data(path="data/database.csv"):
    """Load the exported CSV and keep only the columns used downstream.

    Args:
        path: Location of the CSV file to read. Defaults to
            "data/database.csv", the file this function originally read.

    Returns:
        DataFrame with the columns title, authors, year, abstract, type,
        publication, language, concepts and doi, in that order
        ('publication_year' is renamed to 'year').

    Raises:
        KeyError: If any of the expected columns is missing from the file.
    """
    columns_to_keep = ["title", "authors", "publication_year", "abstract", "type", "publication", "language", "concepts", "doi"]

    # Parse only the needed columns instead of the whole file. A callable
    # `usecols` does not fail on absent names, so a missing column still
    # surfaces as a KeyError in the selection below.
    wanted = set(columns_to_keep)
    df = pd.read_csv(path, usecols=lambda name: name in wanted)

    # `usecols` does not control column order; select explicitly to fix it.
    df = df[columns_to_keep]

    # Rename 'publication_year' to 'year'
    df = df.rename(columns={"publication_year": "year"})

    return df

# Load the trimmed table, drop exact duplicate rows, and renumber the rows from 0
df2 = load_data().drop_duplicates().reset_index(drop=True)

# Persist the de-duplicated table as Parquet
df2.to_parquet("data/database.parquet")

In [None]:
# Display the de-duplicated table produced by load_data().
df2

Unnamed: 0,title,authors,year,abstract,type,publication,language,concepts,doi
0,Other books received,R. Fox,2004,,article,Journal of the Royal Society of Medicine,en,World Wide Web|Computer science|Library scienc...,https://doi.org/10.1258/jrsm.97.3.151
1,In This Issue,Christopher Tomlins,2004,,article,Law and History Review,,Political science,https://doi.org/10.1017/s0738248000015467
2,Front Matter,,2004,Next article No AccessFront MatterPDFPDF PLUS ...,article,Speculum,en,Front (military)|Download|History|Library scie...,https://doi.org/10.1017/s0038713400094690
3,,K. Dodds,2004,The current study applied gas chromatography-m...,article,Journal of Historical Geography,en,DART ion source|Pepper|Mass spectrometry|Chrom...,https://doi.org/10.1016/j.jhg.2003.11.012
4,"Dancing to Utopia: Modernity, Community and th...",Colin Counsell,2004,,article,Dance Research,en,Utopia|Modernity|Art|Choir|Movement (music)|Ar...,https://doi.org/10.2307/4147310
...,...,...,...,...,...,...,...,...,...
290394,"<i>Francisco de los Cobos, Secretary to the Em...",Wallace K. Ferguson,1960,Find information about UTP Journals. Universit...,article,The Canadian historical review,en,Emperor|Publishing|Indigenous|Politics|The Ren...,https://doi.org/10.3138/chr-041-04-br17
290395,"<i>The Letters and Journals of Simon Fraser, 1...",J. H. Stewart Reed,1960,Find information about UTP Journals. Universit...,article,The Canadian historical review,en,History|Psychology,https://doi.org/10.3138/chr-041-04-br07
290396,<i>Guide to the Manuscript Maps in the William...,Theodore E. Layng,1960,Find information about UTP Journals. Universit...,article,The Canadian historical review,en,Publishing|Indigenous|Slavic languages|The Ren...,https://doi.org/10.3138/chr-041-04-br67
290397,<i>From Joseph II to the Jacobin Trials: Gover...,R. R. Palmer,1960,Find information about UTP Journals. Universit...,article,The Canadian historical review,en,Jacobin|Period (music)|Public opinion|Governme...,https://doi.org/10.3138/chr-041-04-br18


In [None]:
# Number of distinct values in the 'concepts' column after the CSV round trip
len(df2["concepts"].unique())

190042

In [None]:
# Distinct values found in the 'language' column
df2["language"].unique()

array(['en', nan, 'so', 'es', 'it', 'pt', 'ro', 'af', 'de', 'fr', 'ca',
       'ru', 'id', 'sv', 'tr', 'nl', 'vi', 'tl', 'fi', 'cy', 'sw', 'cs',
       'sl', 'da', 'hr', 'et', 'el', 'ar', 'pl', 'lt', 'sk', 'zh-tw',
       'ja', 'no', 'hu', 'ko', 'zh-cn', 'lv', 'sq', 'he', 'bg', 'th',
       'fa', 'ml'], dtype=object)

In [None]:
# Flatten the distinct '|'-joined concept strings into one sorted list of
# individual concept names. Only exact `str` values are split, so non-string
# entries are skipped. Each distinct string contributes once, and repeated
# names across different strings are kept (no de-duplication).
cons = sorted(
    name
    for joined in df2.concepts.unique()
    if type(joined) is str
    for name in joined.split("|")
)
print(len(cons))

1886466


In [None]:
from collections import Counter

In [None]:
# Tally how many times each concept name appears in `cons`
counter = Counter(cons)

# Called without an argument, most_common() returns every (name, count) pair,
# ordered from the most frequent to the least frequent
most_common_items = counter.most_common()