hello world


# OpenAIRE Bulk Data Handling

This downloads the bulk data from the sub graphs https://graph.openaire.eu/docs/downloads/subgraphs
extracts the json files and puts it into a usefull format, like parquet.

In [9]:
# Variables

# URL of the tar file
url = "https://zenodo.org/records/14887484/files/aurora.tar?download=1"

# extract the file name from the URL
file_name = url.split("/")[-1]

# remove everything after the question mark
file_name = file_name.split("?")[0]


# Path to save the downloaded tar file using file_name variable
download_path = f"./data/01_input/{file_name}"

# Path to save the extracted files
extraction_path = "./data/02_extracted"

print(f"URL: {url}")
print(f"File Name: {file_name}")
print(f"Download Path: {download_path}")
print(f"Extraction Path: {extraction_path}")

URL: https://zenodo.org/records/14887484/files/aurora.tar?download=1
File Name: aurora.tar
Download Path: ./data/01_input/aurora.tar
Extraction Path: ./data/02_extracted


Download the tar file

In [10]:
import requests
import os

# Create the directory if it doesn't exist
os.makedirs(os.path.dirname(download_path), exist_ok=True)

# Download the tar file
response = requests.get(url)
with open(download_path, 'wb') as file:
    file.write(response.content)

print("Download complete.")

KeyboardInterrupt: 

extract the tar file

In [None]:
import os
import tarfile

# Create the directory if it doesn't exist
os.makedirs(extraction_path, exist_ok=True)

# Extract the tar file
with tarfile.open(download_path, 'r') as tar:
    tar.extractall(path=extraction_path)

print("Extraction complete.")
    

In [16]:
# List the extracted files
extracted_files = os.listdir(extraction_path)

# count he number of files in the extracted folder
num_files = len(extracted_path)
print(f"Number of files: {num_files}")

# print the first 5 files
print("First 5 files:")
for file in extracted_files[:5]:
    print(file) 

# print the added subdirectories
subdirectories = [file for file in extracted_files if os.path.isdir(os.path.join(extraction_path, file))]
print("Subdirectories:")
for subdirectory in subdirectories:
    print(subdirectory)

# print the latest added subdirectory based on date modified
latest_subdirectory = sorted(subdirectories, key=lambda x: os.path.getmtime(os.path.join(extraction_path, x)))[-1]
print(f"Latest subdirectory: {latest_subdirectory}")

# make varable for the path to the latest subdirectory
latest_extraction_path = os.path.join(extraction_path, latest_subdirectory)

# print the path of the latest extraction path
print(f"Latest extraction path: {latest_extraction_path}")



Number of files: 19
First 5 files:
aurora
Subdirectories:
aurora
Latest subdirectory: aurora
Latest extraction path: ./data/02_extracted/aurora


In [None]:
import duckdb
import os

# Connect to an in-memory DuckDB database
con = duckdb.connect()

# Get the first 3 gzipped JSON files from the latest_extraction_path
gz_files = [file for file in os.listdir(latest_extraction_path) if file.endswith(".gz")][:3]

# Load the gzipped JSON files directly into DuckDB
for gz_file in gz_files:
    gz_file_path = os.path.join(latest_extraction_path, gz_file)
    con.execute("CREATE OR REPLACE TEMP TABLE temp_table AS SELECT * FROM read_json_auto(?, compression='gzip')", [gz_file_path])

# Generate the SQL schema from the temporary table
schema = con.execute("DESCRIBE temp_table").fetchall()

# Print the schema
print("SQL Schema:")
for column in schema:
    print(f"{column[0]}: {column[1]}")

# Close the DuckDB connection
con.close()

SQL Schema:
authors: STRUCT(fullName VARCHAR, "name" VARCHAR, rank BIGINT, surname VARCHAR, pid STRUCT(id STRUCT(scheme VARCHAR, "value" VARCHAR), provenance STRUCT(provenance VARCHAR, trust VARCHAR)))[]
collectedFrom: STRUCT("key" VARCHAR, "value" VARCHAR)[]
communities: STRUCT(code VARCHAR, "label" VARCHAR, provenance STRUCT(provenance VARCHAR, trust VARCHAR)[])[]
contributors: JSON[]
countries: JSON[]
coverages: JSON[]
dateOfCollection: VARCHAR
descriptions: VARCHAR[]
documentationUrls: JSON[]
formats: JSON[]
id: VARCHAR
indicators: STRUCT(citationImpact STRUCT(citationClass VARCHAR, citationCount DOUBLE, impulse DOUBLE, impulseClass VARCHAR, influence DOUBLE, influenceClass VARCHAR, popularity DOUBLE, popularityClass VARCHAR), usageCounts STRUCT(downloads BIGINT, "views" BIGINT))
instances: STRUCT(alternateIdentifiers STRUCT(scheme VARCHAR, "value" VARCHAR)[], collectedFrom STRUCT("key" VARCHAR, "value" VARCHAR), hostedBy STRUCT("key" VARCHAR, "value" VARCHAR), pids STRUCT(scheme V