In [1]:
import json
import pandas as pd
import glob

In [2]:
def reconstruct_abstract(inverted_index):
    if inverted_index is None:
        return None
    else:
        # Create a list of tuples, each containing a word and its first occurrence index
        word_positions = [(word, positions[0]) for word, positions in inverted_index.items()]

        # Sort the tuples based on the index
        sorted_words = sorted(word_positions, key=lambda x: x[1])

        # Extract the sorted words
        sorted_words_only = [word for word, position in sorted_words]

        # Join the words into a single string
        abstract = ' '.join(sorted_words_only)

        return abstract
    
def extract_author_names(author_list):
    # Extract 'display_name' from each dictionary in the list
    author_names = [author['author']['display_name'] for author in author_list if 'author' in author and 'display_name' in author['author']]
    
    # Join the names with '|'
    return '|'.join(author_names)

def extract_display_names(location_list):
    if location_list is None or not isinstance(location_list, list):
        return None

    display_names = []
    for location in location_list:
        # Check if 'source' exists and is a dictionary
        if location and isinstance(location, dict) and 'source' in location and isinstance(location['source'], dict):
            # Extract 'display_name' if it exists
            display_name = location['source'].get('display_name')
            if display_name:
                display_names.append(display_name)

    return '|'.join(display_names)


def extract_concept_names(concept_list):
    if concept_list is None or not isinstance(concept_list, list):
        return None

    display_names = []
    for concept in concept_list:
        # Check if concept is a dictionary and has 'display_name'
        if concept and isinstance(concept, dict) and 'display_name' in concept:
            # Extract 'display_name'
            display_name = concept.get('display_name')
            if display_name:
                display_names.append(display_name)

    return '|'.join(display_names)


In [3]:
files = glob.glob("data/*.json")
len(files)

64

In [4]:
dataframes = []
columns_to_keep = ["title", "authors", "publication_year", "abstract", "type", "publication", "language", "concepts", "doi"]  # Add other columns as needed

# Loop through the files and append each dataframe to the list
for file in files:
    df = pd.read_json(file)
    df['abstract'] = df['abstract_inverted_index'].apply(reconstruct_abstract)
    df['authors'] = df['authorships'].apply(extract_author_names)

    df['publication'] = df['locations'].apply(extract_display_names)
    df['concepts'] = df['concepts'].apply(extract_concept_names)
    
    

    # Select only these columns
    df = df[columns_to_keep]

    # Rename 'displayname' to 'title' and 'author_names' to 'authors'
    df = df.rename(columns={"publication_year": "year"})
    dataframes.append(df)

# Concatenate all dataframes in the list
merged_df = pd.concat(dataframes, ignore_index=True)

merged_df = merged_df.sort_values(by='publication_year', ascending=False)

# Reset the index after sorting
merged_df = merged_df.reset_index(drop=True)

In [5]:
merged_df['abstract'] = merged_df['abstract_inverted_index'].apply(reconstruct_abstract)
merged_df['authors'] = merged_df['authorships'].apply(extract_author_names)

merged_df['publication'] = merged_df['locations'].apply(extract_display_names)
merged_df['concepts'] = merged_df['concepts'].apply(extract_concept_names)

In [6]:
merged_df.to_csv("data/database.csv")

In [7]:
merged_df.columns

Index(['id', 'doi', 'title', 'display_name', 'relevance_score',
       'publication_year', 'publication_date', 'ids', 'language',
       'primary_location', 'type', 'type_crossref', 'open_access',
       'authorships', 'countries_distinct_count',
       'institutions_distinct_count', 'corresponding_author_ids',
       'corresponding_institution_ids', 'apc_list', 'apc_paid', 'has_fulltext',
       'cited_by_count', 'cited_by_percentile_year', 'biblio', 'is_retracted',
       'is_paratext', 'keywords', 'concepts', 'mesh', 'locations_count',
       'locations', 'best_oa_location', 'sustainable_development_goals',
       'grants', 'referenced_works_count', 'referenced_works', 'related_works',
       'ngrams_url', 'abstract_inverted_index', 'cited_by_api_url',
       'counts_by_year', 'updated_date', 'created_date', 'fulltext_origin',
       'abstract', 'authors', 'publication'],
      dtype='object')

In [8]:
merged_df.iloc[3]["concepts"]

'Humanities|History|Art'

In [9]:
len(merged_df.concepts.unique())

18324

In [10]:
merged_df.iloc[200].concepts

'Computer science'

In [11]:
def load_data():
    df = pd.read_csv("data/database.csv")
    # List the columns you want to keep, including 'displayname' and 'author_names'
    columns_to_keep = ["title", "authors", "publication_year", "abstract", "type", "publication", "language", "concepts", "doi"]  # Add other columns as needed

    # Select only these columns
    df = df[columns_to_keep]

    # Rename 'displayname' to 'title' and 'author_names' to 'authors'
    df = df.rename(columns={"publication_year": "year"})

    return df

df2 = load_data()

df2 = df2.drop_duplicates()
df2 = df2.reset_index(drop=True)
df2.to_parquet("data/omb-database.parquet")

In [12]:
df2

Unnamed: 0,title,authors,year,abstract,type,publication,language,concepts,doi
0,"Carolingian Medical Knowledge and Practice, c....",Claire Burridge,2024,Carolingian Medical Knowledge and Practice exp...,book,,en,Perspective (graphical)|Context (archaeology)|...,https://doi.org/10.1163/9789004466173
1,Popioły i fundamenty. Jak katedra krakowska wy...,Piotr Pajor,2023,The paper focuses on the Krakow cathedral at t...,article,Historia Slavorum Occidentis,en,Coronation|Choir|Ceremony|Art|Humanities|Ancie...,https://doi.org/10.15804/hso230105
2,Monetary Circulation in Byzantine and Caroling...,Alessia Rovelli,2023,,book-chapter,Routledge eBooks,en,Byzantine architecture|Circulation (fluid dyna...,https://doi.org/10.4324/9781003420897-5
3,L’Ordre du Temple dans la Basse Vallée du Rhôn...,Michael J. Peixoto,2023,Reviewed by: L’Ordre du Temple dans la Basse V...,article,Catholic Historical Review,en,Humanities|History|Art,https://doi.org/10.1353/cat.2023.0018
4,Orta Çağ Sonrası Batı Kaligrafi Geleneği ve Bl...,Serdar KİPDEMİR|Almıla YILDIRIM,2023,,article,Kesit akademi dergisi,tr,Humanities|Art|Physics,https://doi.org/10.29228/kesit.68571
...,...,...,...,...,...,...,...,...,...
24905,Notices of Archaeological Publications,J. M. K.,1856,,article,The Archaeological Journal|Zenodo (CERN Europe...,en,Archaeology|History|Geography,https://doi.org/10.1080/00665983.1856.10851028
24906,Hornbooks,Kenneth MacKenzie,1851,,article,Notes and Queries,,Business,https://doi.org/10.1093/nq/s1-iii.69.151a
24907,Pillgarlick,X. Z.,1851,Journal Article Pillgarlick Get access X. Z. S...,article,Notes and Queries,en,Library science|Computer science,https://doi.org/10.1093/nq/s1-iii.69.150c
24908,Carte de L'Empire Carlovingien et des Empire A...,Delamarche,1850,Map shows the extent of various kingdoms and e...,article,,en,Empire|Reign|Ancient history|Kingdom|Treaty|Hi...,


In [13]:
len(df2.concepts.unique())

509681

In [14]:
df2.language.unique()

array(['en', 'pt', 'ru', 'bg', 'fr', 'es', 'id', nan, 'it', 'pl', 'uk',
       'nl', 'tr', 'ca', 'hr', 'th', 'sv', 'de', 'ar', 'ko', 'sk', 'ne',
       'af', 'so', 'hu', 'tl', 'no', 'et', 'vi', 'el', 'fi', 'ro', 'sl',
       'mk', 'cs', 'da', 'lv', 'cy', 'lt', 'zh-cn', 'sw', 'ja', 'bn',
       'sq', 'fa', 'he', 'ta', 'hi', 'zh-tw', 'ml'], dtype=object)

In [15]:
cons = []
for c in df2.concepts.unique():
    if type(c) == str:
        for item in c.split("|"):
            cons.append(item)
# cons = list(set(cons))
cons.sort()
print(len(cons))

5862531


In [16]:
from collections import Counter

In [17]:
# Use Counter to count the occurrences of each item
counter = Counter(cons)

# Get the most common items
most_common_items = counter.most_common()

In [18]:
df3 = df2.dropna()

In [19]:
df3

Unnamed: 0,title,authors,year,abstract,type,publication,language,concepts,doi
1,A Physicochemical Examination of Blue Shades i...,A. Panagopoulou|Joanita Vroom|Anno Hein|Vassil...,2023.0,This study attempts to take aspects of pottery...,article,Colorants,en,Glaze|Pottery|Polychrome|Terracotta|Archaeolog...,https://doi.org/10.3390/colorants2020021
2,Traços de colonialidade em narrativas de licen...,Pâmela Barroso de Araújo Cruz|Jéssica de Almeida,2023.0,Este artigo objetiva apresentar dados parciais...,article,Orfeu,pt,Humanities|Art|Musical|Philosophy|Visual arts,https://doi.org/10.5965/2525530408012023e0201
3,Conflict of Laws in the ALI’s First Century,Symeon C. Symeonides,2023.0,Abstract This chapter discusses the American L...,book-chapter,Oxford University Press eBooks,en,Skepticism|Flexibility (engineering)|Certainty...,https://doi.org/10.1093/oso/9780197685341.003....
4,The Need for Restatement of the Common Law,David J. Seipp,2023.0,"Abstract Restatements of the common law, chief...",book-chapter,Oxford University Press eBooks,en,Common law|Law|Supreme court|Political science...,https://doi.org/10.1093/oso/9780197685341.003....
6,Historical and cultural monuments of antiquity...,В.Т. Чшиев,2023.0,В статье рассматриваются памятники средневеков...,article,Vestnik Vladikavkazskogo naučnogo centra,ru,Middle Ages|Geography|Archaeology|Ancient hist...,https://doi.org/10.46698/vnc.2023.28.43.001
...,...,...,...,...,...,...,...,...,...
751915,<i>An Introduction to Eighteenth Century Franc...,Crane Brinton,1960.0,Find information about UTP Journals. Universit...,article,The Canadian historical review,en,History,https://doi.org/10.3138/chr-041-03-br03
751917,<i>The Papers of Henry Clay</i>. I. <i>The Ris...,Thomas J. Pressly,1960.0,Find information about UTP Journals. Universit...,article,The Canadian historical review,en,History|Ancient history,https://doi.org/10.3138/chr-041-03-br11
751919,"<i>Movements of Political Protest in Canada, 1...",W. L. Morton,1960.0,Find information about UTP Journals. Universit...,article,The Canadian historical review,en,Politics|Political science|Social movement|Pol...,https://doi.org/10.3138/chr-041-03-br10
751921,<i>A History of Modern Germany: The Reformatio...,Franklin L. Ford,1960.0,Find information about UTP Journals. Universit...,article,The Canadian historical review,en,History,https://doi.org/10.3138/chr-041-03-br05


In [13]:
# Select specific columns
df5 = df2[["title", "authors", "year", "abstract", "type", "publication"]]

# Change 'book-chapter' to 'chapter' in the 'type' column
df5['type'] = df5['type'].replace('book-chapter', 'chapter')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df5['type'] = df5['type'].replace('book-chapter', 'chapter')


In [14]:
df5

Unnamed: 0,title,authors,year,abstract,type,publication
0,"Carolingian Medical Knowledge and Practice, c....",Claire Burridge,2024,Carolingian Medical Knowledge and Practice exp...,book,
1,Popioły i fundamenty. Jak katedra krakowska wy...,Piotr Pajor,2023,The paper focuses on the Krakow cathedral at t...,article,Historia Slavorum Occidentis
2,Monetary Circulation in Byzantine and Caroling...,Alessia Rovelli,2023,,chapter,Routledge eBooks
3,L’Ordre du Temple dans la Basse Vallée du Rhôn...,Michael J. Peixoto,2023,Reviewed by: L’Ordre du Temple dans la Basse V...,article,Catholic Historical Review
4,Orta Çağ Sonrası Batı Kaligrafi Geleneği ve Bl...,Serdar KİPDEMİR|Almıla YILDIRIM,2023,,article,Kesit akademi dergisi
...,...,...,...,...,...,...
24905,Notices of Archaeological Publications,J. M. K.,1856,,article,The Archaeological Journal|Zenodo (CERN Europe...
24906,Hornbooks,Kenneth MacKenzie,1851,,article,Notes and Queries
24907,Pillgarlick,X. Z.,1851,Journal Article Pillgarlick Get access X. Z. S...,article,Notes and Queries
24908,Carte de L'Empire Carlovingien et des Empire A...,Delamarche,1850,Map shows the extent of various kingdoms and e...,article,


In [15]:
df5.to_parquet("data/omb-data.parquet")