In [None]:
from datetime import date
from pathlib import Path
import os
import sys

import django
import requests

# Seconds to wait for the token endpoint before giving up; without a timeout
# `requests` can block the kernel indefinitely on an unresponsive server.
TOKEN_REQUEST_TIMEOUT = 30

# Put the directory four levels above the working directory on the import
# path and initialise Django. The project imports below are deliberately
# placed after `django.setup()`.
BASE_DIR = str(Path(os.getcwd()).resolve().parent.parent.parent.parent)

if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

django.setup()

from get_data import get_scipost_raw_data, SCIPOST_TOKEN_URL
from tsosi.app_settings import app_settings

# Destination of today's raw export: <home>/Nextcloud/.../0_raw/<date>_scipost_raw.json
today_str = date.today().strftime("%Y-%m-%d")
raw_data_file = f"{today_str}_scipost_raw.json"

raw_data_folder = Path.home() / "Nextcloud/TSOSI_data/scipost/0_raw"

raw_path = str(raw_data_folder / raw_data_file)
# Left disabled on purpose: this cell only exercises the token request.
# get_scipost_raw_data(dest_file=raw_path)

print("Collecting OAuth2 token")
auth_data = app_settings.SCIPOST_AUTH

# OAuth2 password grant: user credentials travel in the form body, the client
# credentials are sent through HTTP basic auth.
payload = {
    "grant_type": "password",
    "username": auth_data["username"],
    "password": auth_data["password"],
    "scope": "read",
}
auth = (auth_data["client_id"], auth_data["client_secret"])

response = requests.post(
    SCIPOST_TOKEN_URL, data=payload, auth=auth, timeout=TOKEN_REQUEST_TIMEOUT
)
# Fail loudly on a 4xx/5xx answer instead of carrying an error body forward.
response.raise_for_status()

# Decoded JSON body of the token response (presumably a JSON object - verify
# against the SciPost API). Do not echo it as cell output: it holds a secret.
token_data: dict = response.json()

### 0 - Get SciPost data


In [None]:
import os
import sys
from datetime import date
from pathlib import Path

import django

# Bootstrap Django: expose the directory four levels above the working
# directory to the import system, point Django at the settings module
# (unless one is already configured) and initialise it.
BASE_DIR = str(Path.cwd().resolve().parent.parent.parent.parent)

if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

django.setup()

# Imported only once the path and Django are set up.
from get_data import get_scipost_raw_data

# The raw export is stored under a file name prefixed with today's date.
today_str = date.today().strftime("%Y-%m-%d")
raw_data_file = f"{today_str}_scipost_raw.json"

raw_data_folder = Path.home().joinpath("Nextcloud/TSOSI_data/scipost/0_raw")

raw_path = str(raw_data_folder.joinpath(raw_data_file))
get_scipost_raw_data(dest_file=raw_path)

In [None]:
from pathlib import Path

import pandas as pd

# Raw snapshot to inspect (the date is part of the file name).
file = Path.home().joinpath(
    "Nextcloud/TSOSI_data/scipost/0_raw/2025-05-06_scipost_raw.json"
)
df = pd.read_json(str(file))

# Keep the rows that carry a payment date while their status is not "paid".
has_payment_date = df["payment_date"].notna()
not_marked_paid = df["status"].ne("paid")
mask = has_payment_date & not_marked_paid
df[mask]

### 1 - Pre-process collected data


In [None]:
from datetime import date
from pathlib import Path
import os
import sys

import django
import pandas as pd

# Put the directory four levels above the working directory on the import
# path and initialise Django. The project import below is deliberately
# placed after `django.setup()`.
BASE_DIR = str(Path(os.getcwd()).resolve().parent.parent.parent.parent)

if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

django.setup()

from get_data import pre_process_data

# Date prefix of the raw snapshot to process; also used for the output names.
date_str = "2025-05-06"
raw_file = f"{date_str}_scipost_raw.json"
raw_folder = Path.home() / "Nextcloud/TSOSI_data/scipost/0_raw"
raw_path = str(raw_folder / raw_file)

processed_file = f"{date_str}_scipost_pre_processed.json"
processed_folder = Path.home() / "Nextcloud/TSOSI_data/scipost/1_pre_processed"
processed_path = str(processed_folder / processed_file)

df = pd.read_json(raw_path, orient="records")

data = pre_process_data(df)

# `orient="records"` never serialises the index, so `index=False` is not
# passed: it is redundant, and older pandas releases reject that combination.
data.to_json(processed_path, orient="records", indent=2)

# Same location and stem as the JSON export, with an .xlsx extension.
# `with_suffix` only swaps the final extension, whereas splitting on the first
# "." would truncate the name if `date_str` ever contained a dot.
excel_path = (processed_folder / processed_file).with_suffix(".xlsx")

data.sort_values(["emitter_ror_id", "emitter"]).to_excel(
    str(excel_path), sheet_name="Transfers", index=False
)

### Data check


In [None]:
import datetime

import pandas as pd

# Scheduled payments whose scheduled date is before this day are listed.
SCHEDULED_CUTOFF = datetime.date(2025, 3, 1)
# Invoiced payments whose invoice is older than this many days are listed.
INVOICE_MAX_AGE_DAYS = 180

# NOTE: expects `df` (the raw SciPost payments frame) from an earlier cell.
# The checks run on a copy under a new name so that `df` is left untouched
# and this cell can be re-run without failing on the dropped "subsidy" column.
# Jupyter only renders the last expression of a cell, so every check result
# is shown explicitly with `display` (provided by the IPython kernel).

# Dates
date_cols = ["date_scheduled", "invoice_date", "payment_date"]
dated = df.copy()
for c in date_cols:
    dated[c] = pd.to_datetime(dated[c])

# Flatten the nested "subsidy" records into "subsidy_*" columns.
subsidies = pd.json_normalize(dated["subsidy"]).add_prefix("subsidy_")
payments = pd.concat([dated.drop(columns=["subsidy"]), subsidies], axis=1)

# Payments check: per subsidy, compare the summed payment amounts with the
# subsidy amount and show the mismatches among publicly shown amounts.
check = payments.groupby("subsidy_url").agg(
    {
        "amount": ["sum", "count"],
        "subsidy_amount": "first",
        "subsidy_amount_publicly_shown": "first",
        "subsidy_date_from": "first",
        "subsidy_date_until": "first",
    }
)
check["diff"] = check["amount", "sum"].ne(check["subsidy_amount", "first"])
display(check[check["diff"] & check["subsidy_amount_publicly_shown", "first"]])

# Payment status description: still "scheduled" although the scheduled date
# lies before the cutoff.
display(
    payments[
        (payments["status"] == "scheduled")
        & (payments["date_scheduled"] < pd.to_datetime(SCHEDULED_CUTOFF))
    ]
)

# Invoiced payment check: still "invoiced" although the invoice date is older
# than the allowed window.
invoice_date_threshold = pd.to_datetime(datetime.date.today()) - pd.Timedelta(
    days=INVOICE_MAX_AGE_DAYS
)
display(
    payments[
        (payments["status"] == "invoiced")
        & (payments["invoice_date"] < invoice_date_threshold)
    ]
)

# Missing date value: paid or invoiced rows with neither an invoice date nor
# a payment date.
display(
    payments[
        payments["invoice_date"].isna()
        & payments["payment_date"].isna()
        & payments["status"].isin(["paid", "invoiced"])
    ]
)

### 3 - Enrich data

Manually enrich the SciPost data with the Wikidata ID of each organization that has no ROR ID.


### 5 - Generate TSOSI data file


In [None]:
import os
import sys
from datetime import date
from pathlib import Path

import django

# Bootstrap Django: expose the directory four levels above the working
# directory to the import system, point Django at the settings module
# (unless one is already configured) and initialise it.
BASE_DIR = str(Path.cwd().resolve().parent.parent.parent.parent)

if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

django.setup()

# Imported only once the path and Django are set up.
from tsosi.data.preparation.scipost.default import get_config

# Date of the data set; it prefixes the processed workbook's file name and is
# also handed to the configuration.
date_data = date(2025, 5, 6)
date_str = date_data.strftime("%Y-%m-%d")
file_path = Path.home().joinpath(
    f"Nextcloud/TSOSI_data/scipost/4_processed/{date_str}_scipost_processed.xlsx"
)

# Build the configuration for the "Transfers" sheet and generate the data file.
config = get_config(str(file_path), "Transfers", date_data)
config.generate_data_file()