In [1]:
import json
import pandas as pd
import glob

In [2]:
def reconstruct_abstract(inverted_index):
    """Rebuild an abstract's text from its inverted index.

    Args:
        inverted_index: Mapping of word -> list of integer positions at which
            that word occurs in the abstract, or a missing value (None/NaN)
            when no abstract is available.

    Returns:
        The words joined by single spaces in position order, with a word
        emitted once for every position listed for it. Returns None when
        ``inverted_index`` is not a dict, and '' for an empty dict.
    """
    if not isinstance(inverted_index, dict):
        return None

    # Expand each entry into one (position, word) pair per occurrence.
    # Keeping only positions[0] would emit repeated words (e.g. "the") a
    # single time and silently shorten the reconstructed text.
    placed_words = [
        (position, word)
        for word, positions in inverted_index.items()
        for position in (positions or ())
    ]

    # Sort on the position only; the sort is stable, so words that share a
    # position keep the mapping's iteration order.
    placed_words.sort(key=lambda pair: pair[0])

    return ' '.join(word for _, word in placed_words)
    
def extract_author_names(author_list):
    """Join the display names of a work's authors with '|'.

    Args:
        author_list: List of authorship dicts, each expected to carry an
            'author' dict with a 'display_name' entry, or a missing value
            (None/NaN).

    Returns:
        The non-empty string display names joined with '|' in list order
        ('' when none are found), or None when ``author_list`` is not a
        list/tuple. Entries without a usable 'author' dict or name are
        skipped rather than raising.
    """
    if not isinstance(author_list, (list, tuple)):
        return None

    author_names = []
    for authorship in author_list:
        author = authorship.get('author') if isinstance(authorship, dict) else None
        if not isinstance(author, dict):
            continue
        display_name = author.get('display_name')
        # A None or empty name would break '|'.join or leave stray separators.
        if isinstance(display_name, str) and display_name:
            author_names.append(display_name)

    return '|'.join(author_names)

def extract_display_names(location_list):
    """Collect the source display names of a work's locations.

    Args:
        location_list: List of location dicts; each may hold a 'source' dict
            with a 'display_name' entry.

    Returns:
        The truthy display names joined with '|' in list order ('' when
        there are none), or None when ``location_list`` is not a list.
    """
    if not isinstance(location_list, list):
        return None

    def _source_name(entry):
        # Only dict entries whose 'source' is itself a dict can carry a name.
        if not isinstance(entry, dict):
            return None
        source = entry.get('source')
        return source.get('display_name') if isinstance(source, dict) else None

    candidate_names = (_source_name(entry) for entry in location_list)
    return '|'.join(name for name in candidate_names if name)




In [3]:
# glob.glob makes no ordering guarantee, so sort the paths to keep the
# concatenation order (and everything derived from it) reproducible.
files = sorted(glob.glob("data/*.json"))
len(files)

39

In [4]:
# Read every JSON file into its own frame.
dataframes = [pd.read_json(path) for path in files]

# Stack the frames into one table, order it newest publication year first,
# and renumber the rows 0..n-1 after the sort.
merged_df = (
    pd.concat(dataframes, ignore_index=True)
    .sort_values(by='publication_year', ascending=False)
    .reset_index(drop=True)
)

In [5]:
# Derive three flat text columns from the nested fields of each row.
merged_df = merged_df.assign(
    abstract=merged_df['abstract_inverted_index'].apply(reconstruct_abstract),
    authors=merged_df['authorships'].apply(extract_author_names),
    publication=merged_df['locations'].apply(extract_display_names),
)


In [6]:
# Persist the merged table as CSV; the row index is written as the first column.
merged_df.to_csv("data/database.csv", index=True)

In [7]:
# List the column labels available in merged_df.
merged_df.columns

Index(['id', 'doi', 'title', 'display_name', 'relevance_score',
       'publication_year', 'publication_date', 'ids', 'language',
       'primary_location', 'type', 'type_crossref', 'open_access',
       'authorships', 'countries_distinct_count',
       'institutions_distinct_count', 'corresponding_author_ids',
       'corresponding_institution_ids', 'apc_list', 'apc_paid', 'has_fulltext',
       'fulltext_origin', 'cited_by_count', 'cited_by_percentile_year',
       'biblio', 'is_retracted', 'is_paratext', 'keywords', 'concepts', 'mesh',
       'locations_count', 'locations', 'best_oa_location',
       'sustainable_development_goals', 'grants', 'referenced_works_count',
       'referenced_works', 'related_works', 'ngrams_url',
       'abstract_inverted_index', 'cited_by_api_url', 'counts_by_year',
       'updated_date', 'created_date', 'abstract', 'authors', 'publication'],
      dtype='object')

In [8]:
# Inspect one raw 'locations' value (row position 3) to see its nested structure.
merged_df["locations"].iloc[3]

[{'is_oa': False,
  'landing_page_url': 'https://doi.org/10.2307/4053617',
  'pdf_url': None,
  'source': {'id': 'https://openalex.org/S58239531',
   'display_name': 'Albion',
   'issn_l': '0095-1390',
   'issn': ['0095-1390', '2326-1242'],
   'is_oa': False,
   'is_in_doaj': False,
   'host_organization': 'https://openalex.org/P4310316456',
   'host_organization_name': 'Appalachian State University',
   'host_organization_lineage': ['https://openalex.org/P4310316456'],
   'host_organization_lineage_names': ['Appalachian State University'],
   'type': 'journal'},
  'license': None,
  'version': None,
  'is_accepted': False,
  'is_published': False}]

In [9]:
# Distinct values of the 'type' column, looked up by label rather than attribute.
merged_df["type"].unique()

array(['article', 'book-chapter', 'book', 'dissertation', 'editorial',
       'other', 'paratext', 'erratum', 'report', 'letter', 'dataset'],
      dtype=object)

In [10]:
def load_data(path="data/database.csv"):
    """Load the exported CSV and reduce it to the columns used downstream.

    Args:
        path: Location of the CSV file to read. Defaults to
            "data/database.csv".

    Returns:
        A DataFrame with the columns title, authors, year, abstract, type,
        publication, language and doi, in that order. 'year' is the CSV's
        'publication_year' column under a shorter name.

    Raises:
        KeyError: If any of the expected columns is missing from the CSV.
    """
    # Columns to keep, in output order; extend this list to expose more fields.
    columns_to_keep = ["title", "authors", "publication_year", "abstract", "type", "publication", "language", "doi"]

    df = pd.read_csv(path)

    # Select only the wanted columns, then shorten 'publication_year' to 'year'.
    return df[columns_to_keep].rename(columns={"publication_year": "year"})

# Load the trimmed table, drop exact duplicate rows, and renumber the rows.
df2 = load_data().drop_duplicates().reset_index(drop=True)

# Store the result as Parquet.
df2.to_parquet("data/database.parquet")

In [11]:
# Display the de-duplicated table (the rendered output elides the middle rows).
df2

Unnamed: 0,title,authors,year,abstract,type,publication,language,doi
0,METAPSICOLOGIA DO PERDÃO,José Luiz Caon,1998,SÍNTESE - O autor serve-se de dois textos Sófo...,article,Veritas,pt,https://doi.org/10.15448/1984-6746.1998.1.35394
1,A Book Made New: Reading Propertius Reading Po...,Michael Comber,1998,In situations in which understanding is disrup...,article,Journal of Roman Studies,en,https://doi.org/10.1017/s0075435800044105
2,Summary of the spring 1998 meeting of the work...,Trina Arpin|Richard I. Macphail|Giovanni Boschian,1998,"GeoarchaeologyVolume 13, Issue 6 p. 645-647 Co...",article,Geoarchaeology-An International Journal,en,https://doi.org/10.1002/(sici)1520-6548(199808...
3,"The Wars of the Bruces: Scotland, England, and...",John McCafferty|Colm McNamee,1998,An abstract is not available for this content ...,article,Albion,en,https://doi.org/10.2307/4053617
4,The Caspian's False Promise,Martha Brill Olcott,1998,,article,Foreign Policy,en,https://doi.org/10.2307/1149381
...,...,...,...,...,...,...,...,...
216411,Illustrations of the History of Medieval Thoug...,Reginald L. Poole,1960,,book,,en,
216412,Incomes of Medieval English Doctors,Hammond Ea,1960,Journal Article Incomes of Medieval English Do...,article,Journal of the History of Medicine and Allied ...,en,https://doi.org/10.1093/jhmas/xv.2.154
216413,Early Medieval Trade Routes,,1960,THE techniques and methodology of numismatics ...,article,The American Historical Review,en,https://doi.org/10.1086/ahr/65.2.271
216414,Book Notes,,1960,,article,American Sociological Review,,https://doi.org/10.2307/2092118


In [12]:
# Number of distinct 'publication' values, with missing values counted as one.
df2["publication"].nunique(dropna=False)

11850

In [13]:
# Distinct values of the 'language' column, looked up by label rather than attribute.
df2["language"].unique()

array(['pt', 'en', 'fr', 'de', nan, 'it', 'es', 'nl', 'ro', 'no', 'et',
       'ca', 'af', 'so', 'sv', 'tl', 'vi', 'hr', 'sl', 'pl', 'da', 'hu',
       'lt', 'id', 'tr', 'lv', 'cy', 'el', 'bg', 'he', 'fi', 'fa', 'sk',
       'sw', 'cs', 'ml', 'sq', 'ru', 'zh-cn', 'ar', 'ja', 'ko'],
      dtype=object)