In [1]:
import json
import pandas as pd
import glob

In [2]:
def reconstruct_abstract(inverted_index):
    """Rebuild an abstract's plain text from a word -> positions inverted index.

    Parameters
    ----------
    inverted_index : dict or None
        Mapping of each word to the list of integer positions at which that
        word occurs in the abstract.

    Returns
    -------
    str or None
        The words joined by single spaces in position order, with every
        occurrence of a repeated word included. ``None`` when
        ``inverted_index`` is not a dict (e.g. ``None`` or a NaN placeholder).
    """
    if not isinstance(inverted_index, dict):
        return None

    # Emit one (position, word) pair per occurrence; using only the first
    # position of each word would silently drop every repeated word.
    placed_words = [
        (position, word)
        for word, positions in inverted_index.items()
        for position in positions
    ]

    # Order by position only, so words themselves are never compared
    placed_words.sort(key=lambda pair: pair[0])

    return ' '.join(word for _, word in placed_words)
    
def extract_author_names(author_list):
    """Join the display names of a work's authors into one '|'-separated string.

    Parameters
    ----------
    author_list : list or None
        Authorship records; each is expected to be a dict whose 'author'
        entry is a dict carrying a 'display_name'.

    Returns
    -------
    str or None
        The display names joined with '|', in input order. Records that are
        not dicts, have no dict-valued 'author', or have a missing/empty
        'display_name' are skipped. ``None`` when ``author_list`` is not a
        list (e.g. ``None`` or a NaN placeholder).
    """
    if not isinstance(author_list, list):
        return None

    author_names = []
    for authorship in author_list:
        if not isinstance(authorship, dict):
            continue
        author = authorship.get('author')
        if not isinstance(author, dict):
            continue
        display_name = author.get('display_name')
        # A None name would make str.join raise, so only keep real names
        if display_name:
            author_names.append(display_name)

    # Join the names with '|'
    return '|'.join(author_names)

def extract_display_names(location_list):
    """Collect the source display names of a work's locations as a '|'-joined string.

    Returns None when ``location_list`` is not a list. Entries that are not
    dicts, that lack a dict-valued 'source', or whose source has a missing or
    empty 'display_name' are skipped.
    """
    if not isinstance(location_list, list):
        return None

    def source_name(entry):
        # Display name of the entry's source, or None when there is no usable source
        if not isinstance(entry, dict):
            return None
        source = entry.get('source')
        return source.get('display_name') if isinstance(source, dict) else None

    candidate_names = (source_name(entry) for entry in location_list)
    return '|'.join(name for name in candidate_names if name)




In [3]:
# glob returns matches in arbitrary, filesystem-dependent order; sorting them
# makes the order in which the files are processed reproducible across runs.
files = sorted(glob.glob("data/*.json"))
len(files)

30

In [4]:
# Read every JSON file into its own DataFrame
dataframes = [pd.read_json(file) for file in files]

# Stack all frames into one table with a fresh 0..n-1 index
merged_df = pd.concat(dataframes, ignore_index=True)

# Newest publications first. mergesort is stable, so rows sharing a
# publication_year keep their concatenation order; the default quicksort
# gives no such guarantee and leaves their relative order unspecified.
merged_df = merged_df.sort_values(by='publication_year', ascending=False, kind='mergesort')

# Reset the index after sorting
merged_df = merged_df.reset_index(drop=True)

In [5]:
# (new column, source column, converter) for each flattened text field
derived_columns = [
    ('abstract', 'abstract_inverted_index', reconstruct_abstract),
    ('authors', 'authorships', extract_author_names),
    ('publication', 'locations', extract_display_names),
]

# Apply each converter element-wise to its nested source column
for target, source, converter in derived_columns:
    merged_df[target] = merged_df[source].apply(converter)


In [6]:
# index=False: the row index is a plain 0..n-1 range after the earlier
# reset_index, so writing it would only add a redundant unnamed column.
merged_df.to_csv("data/database.csv", index=False)

In [7]:
# Show every column label of the merged frame
merged_df.columns

Index(['id', 'doi', 'title', 'display_name', 'relevance_score',
       'publication_year', 'publication_date', 'ids', 'language',
       'primary_location', 'type', 'type_crossref', 'open_access',
       'authorships', 'countries_distinct_count',
       'institutions_distinct_count', 'corresponding_author_ids',
       'corresponding_institution_ids', 'apc_list', 'apc_paid', 'has_fulltext',
       'fulltext_origin', 'cited_by_count', 'cited_by_percentile_year',
       'biblio', 'is_retracted', 'is_paratext', 'keywords', 'concepts', 'mesh',
       'locations_count', 'locations', 'best_oa_location',
       'sustainable_development_goals', 'grants', 'referenced_works_count',
       'referenced_works', 'related_works', 'ngrams_url',
       'abstract_inverted_index', 'cited_by_api_url', 'counts_by_year',
       'updated_date', 'created_date', 'abstract', 'authors', 'publication'],
      dtype='object')

In [8]:
# Peek at the raw nested 'locations' value of the fourth row
merged_df["locations"].iloc[3]

[{'is_oa': False,
  'landing_page_url': 'https://doi.org/10.1093/ehr/civ.413.1015',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S56601287',
   'display_name': 'The English Historical Review',
   'issn_l': '0013-8266',
   'issn': ['0013-8266', '1477-4534'],
   'is_oa': False,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/P4310311648',
   'host_organization_name': 'Oxford University Press',
   'host_organization_lineage': ['https://openalex.org/P4310311647',
    'https://openalex.org/P4310311648'],
   'host_organization_lineage_names': ['University of Oxford',
    'Oxford University Press'],
   'type': 'journal'},
  'license': None,
  'version': None,
  'is_accepted': False,
  'is_published': False}]

In [9]:
# Distinct values of the 'type' column (bracket access avoids attribute-name clashes)
merged_df["type"].unique()

array(['article', 'book-chapter', 'paratext', 'editorial', 'book',
       'dissertation', 'letter', 'other', 'erratum', 'report', 'dataset'],
      dtype=object)

In [10]:
def load_data(path="data/database.csv"):
    """Load the merged CSV and return only the columns used downstream.

    Parameters
    ----------
    path : str, optional
        Location of the CSV to read. Defaults to "data/database.csv".

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: title, authors, year, abstract, type,
        publication, language, doi. 'year' is 'publication_year' renamed.
    """
    # Columns to keep, in the order they should appear in the result
    columns_to_keep = ["title", "authors", "publication_year", "abstract", "type", "publication", "language", "doi"]

    # usecols makes the parser skip every other column, so the remaining
    # fields of the wide CSV are never materialised in memory
    df = pd.read_csv(path, usecols=columns_to_keep)

    # usecols does not control column order, so select explicitly to fix it
    df = df[columns_to_keep]

    # Rename 'publication_year' to 'year'
    df = df.rename(columns={"publication_year": "year"})

    return df

# Load the trimmed table, drop exact duplicate rows, and renumber the rows
df2 = (
    load_data()
    .drop_duplicates()
    .reset_index(drop=True)
)

# Persist the de-duplicated table as Parquet
df2.to_parquet("data/database.parquet")

In [11]:
# Display the de-duplicated table
df2

Unnamed: 0,title,authors,year,abstract,type,publication,language,doi
0,Reputation and Coalitions in Medieval Trade: E...,Avner Greif,1989,This article examines the economic institution...,article,The Journal of Economic History,en,https://doi.org/10.1017/s0022050700009475
1,Shorter Notices,Brendan Bradshaw,1989,Journal Article Shorter Notices Get access BRE...,article,The English Historical Review,en,https://doi.org/10.1093/ehr/civ.ccccxi.472
2,Book‐reviews,Tim Winter|Hasan Gai Eaton|Edward J. Lazzerini...,1989,Lisbeth Rocher and Fatima Cherqaoui D'une foi ...,article,Institute of Muslim Minority Affairs,en,https://doi.org/10.1080/02666958908716119
3,Shorter Notices,MICHAEL BAXANDALL,1989,Shorter Notices Get access MICHAEL BAXANDALL W...,article,The English Historical Review,en,https://doi.org/10.1093/ehr/civ.413.1015
4,REVIEWS,BRENDA M. BOLTON,1989,,article,The Journal of Theological Studies,,https://doi.org/10.1093/jts/40.1.268
...,...,...,...,...,...,...,...,...
133568,REVIEWS,IGNAZ MAYBAUM,1960,,article,Journal of Semitic Studies,,https://doi.org/10.1093/jss/5.2.208
133569,Back Matter,,1960,Previous article No AccessBack MatterPDFPDF PL...,paratext,Speculum,en,https://doi.org/10.1017/s0038713400131033
133570,Selected Reference Books of 1958-1959,Constance M. Winchell,1960,,article,College & Research Libraries,en,https://doi.org/10.5860/crl_21_01_20
133571,REVIEWS,O. K. SCHRAM,1960,Journal Article REVIEWS Get access The Old Eng...,article,The Review of English Studies,en,https://doi.org/10.1093/res/xi.42.194


In [12]:
# Number of distinct 'publication' values, counting missing as one value
df2["publication"].nunique(dropna=False)

7355

In [13]:
# Distinct values of the 'language' column
df2["language"].unique()

array(['en', nan, 'de', 'it', 'es', 'fr', 'af', 'ro', 'ca', 'lt', 'sv',
       'cy', 'et', 'el', 'hu', 'lv', 'pl', 'nl', 'pt', 'tl', 'no', 'vi',
       'sw', 'da', 'ar', 'ru', 'id', 'fi', 'sl', 'ja', 'sq', 'hr', 'cs',
       'sk', 'tr', 'so', 'fa', 'ko', 'he'], dtype=object)