### 1 - Pre-process data files


##### Couperin 2023 participation data


In [None]:
from pathlib import Path
import pandas as pd
from datetime import date

# Extract being processed, and where the raw / pre-processed files live.
date_str = "2025-04-30"
input_sheet = "Transfers"
input_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/0_raw"
output_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/1_pre_processed"
input_data_file = f"{date_str}_DOAJ_Couperin_2023_breakdown_raw.xlsx"
output_data_file = f"{date_str}_DOAJ_Couperin_2023_breakdown_pre_processed.xlsx"

# Columns of the raw sheet that are kept; every other column is dropped.
columns = [
    "Member name",
    "Country",
    "Agent",
    "agent_ror_id",
    "Sales price",
    "Currency",
    "Paid Y/N",
]

# Load the sheet, keep the selected columns and stamp both payment dates
# with 2023-01-01 as the default value.
default_payment_date = date(2023, 1, 1)
data = pd.read_excel(
    str(input_folder / input_data_file), sheet_name=input_sheet
)[columns].assign(
    date_payment_emitter=default_payment_date,
    date_payment_recipient=default_payment_date,
)

# Rows whose "Paid Y/N" value is "No" get 2025-01-01 as recipient date.
unpaid_rows = data["Paid Y/N"] == "No"
data.loc[unpaid_rows, "date_payment_recipient"] = date(2025, 1, 1)

# Write the pre-processed table without the DataFrame index.
data.to_excel(
    str(output_folder / output_data_file), sheet_name="Transfers", index=False
)

##### Couperin 2024 participation data


In [None]:
from pathlib import Path
import pandas as pd
from datetime import date

# Extract being processed, and where the raw / pre-processed files live.
date_str = "2025-04-29"
input_sheet = "Transfers"
input_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/0_raw"
output_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/1_pre_processed"
input_data_file = f"{date_str}_DOAJ_Couperin_2024_breakdown_raw.xlsx"
output_data_file = f"{date_str}_DOAJ_Couperin_2024_breakdown_pre_processed.xlsx"

# Columns of the raw sheet that are kept; every other column is dropped.
columns = [
    "Member name",
    "Country",
    "Agent",
    "agent_ror_id",
    "Sales price",
    "Currency",
]

# Load the sheet, keep the selected columns and date every transfer
# 2024-01-01 on the recipient side.
data = pd.read_excel(
    str(input_folder / input_data_file), sheet_name=input_sheet
)[columns].assign(date_payment_recipient=date(2024, 1, 1))

# Write the pre-processed table without the DataFrame index.
data.to_excel(
    str(output_folder / output_data_file), sheet_name="Transfers", index=False
)

### 2 - Prepare manual matching


##### Couperin 2023 data file


In [None]:
from pathlib import Path
import pandas as pd
import sys
import os
import django
from asgiref.sync import sync_to_async

# Add the parent directory to the system path and setup django
BASE_DIR = str(Path(os.getcwd()).resolve().parent.parent.parent.parent)

if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

django.setup()

from tsosi.data.pid_matching import prepare_manual_matching

date_str = "2025-04-30"
input_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/1_pre_processed"
input_data_file = f"{date_str}_DOAJ_Couperin_2023_breakdown_pre_processed.xlsx"
input_sheet = "Transfers"
output_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/2_matched"
output_data_file = f"{date_str}_DOAJ_Couperin_2023_breakdown_matched.xlsx"

data = pd.read_excel(
    str(input_folder / input_data_file), sheet_name=input_sheet
)


@sync_to_async
def _run():
    prepared = prepare_manual_matching(
        data, name_column="Member name", country_column="Country"
    )
    return prepared


prepared = await _run()
prepared.to_excel(
    str(output_folder / output_data_file), sheet_name=input_sheet, index=False
)

##### Couperin 2024 data file


In [None]:
from pathlib import Path
import pandas as pd
import sys
import os
import django
from asgiref.sync import sync_to_async

# Add the parent directory to the system path and setup django
BASE_DIR = str(Path(os.getcwd()).resolve().parent.parent.parent.parent)

if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

django.setup()

from tsosi.data.pid_matching import prepare_manual_matching

date_str = "2025-04-29"
input_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/1_pre_processed"
input_data_file = f"{date_str}_DOAJ_Couperin_2024_breakdown_pre_processed.xlsx"
input_sheet = "Transfers"
output_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/2_matched"
output_data_file = f"{date_str}_DOAJ_Couperin_2024_breakdown_matched.xlsx"

data = pd.read_excel(
    str(input_folder / input_data_file), sheet_name=input_sheet
)


@sync_to_async
def _run():
    prepared = prepare_manual_matching(
        data, name_column="Member name", country_column="Country"
    )
    return prepared


prepared = await _run()
prepared.to_excel(
    str(output_folder / output_data_file), sheet_name=input_sheet, index=False
)

### 4 - Prepare data files


##### Couperin 2023 data file


In [None]:
from pathlib import Path
import pandas as pd
import sys
import os
import django
from asgiref.sync import sync_to_async

# Add the parent directory to the system path and setup django
BASE_DIR = str(Path(os.getcwd()).resolve().parent.parent.parent.parent)

if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

django.setup()

from tsosi.data.pid_matching import process_enriched_data

date_str = "2025-04-30"
input_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/3_enriched"
input_data_file = f"{date_str}_DOAJ_Couperin_2023_breakdown_enriched_done.xlsx"
input_sheet = "Transfers"
output_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/4_prepared"
output_data_file = f"{date_str}_DOAJ_Couperin_2023_breakdown_prepared.xlsx"

date_cols = [
    "date_payment_emitter",
    "date_payment_recipient",
]

data = pd.read_excel(
    str(input_folder / input_data_file), sheet_name=input_sheet
)


@sync_to_async
def _run():
    processed = process_enriched_data(data, "Member name", "emitter")
    return processed


processed = await _run()
for c in date_cols:
    processed[c] = processed[c].dt.date

processed.to_excel(
    str(output_folder / output_data_file), sheet_name=input_sheet, index=False
)

##### Couperin 2024 data file


In [None]:
from pathlib import Path
import pandas as pd
import sys
import os
import django
from asgiref.sync import sync_to_async

# Add the parent directory to the system path and setup django
BASE_DIR = str(Path(os.getcwd()).resolve().parent.parent.parent.parent)

if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

django.setup()

from tsosi.data.pid_matching import process_enriched_data

date_str = "2025-04-29"
input_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/3_enriched"
input_data_file = f"{date_str}_DOAJ_Couperin_2024_breakdown_enriched_done.xlsx"
input_sheet = "Transfers"
output_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/4_prepared"
output_data_file = f"{date_str}_DOAJ_Couperin_2024_breakdown_prepared.xlsx"

date_cols = [
    "date_payment_recipient",
]

data = pd.read_excel(
    str(input_folder / input_data_file), sheet_name=input_sheet
)


@sync_to_async
def _run():
    processed = process_enriched_data(data, "Member name", "emitter")
    return processed


processed = await _run()
for c in date_cols:
    processed[c] = processed[c].dt.date

processed.to_excel(
    str(output_folder / output_data_file), sheet_name=input_sheet, index=False
)

##### Clean DOAJ 2023 Library data file

Remove all the lines with Couperin as an agent (except those emitted by Couperin itself) and concatenate the standalone Couperin data.


In [None]:
from pathlib import Path
import pandas as pd

# ROR identifier used to recognise Couperin in the *_ror_id columns.
couperin_ror_id = "035c9qf67"
sheet_name = "Transfers"
folder = Path.home() / "Nextcloud/TSOSI_data/doaj/4_prepared"
doaj_file = "2024-09-20-DOAJ_Library_Report_2023_prepared.xlsx"
couperin_file = "2025-04-30_DOAJ_Couperin_2023_breakdown_prepared.xlsx"
output_file = "2025-05-05_DOAJ_Library_Report_2023_prepared_final.xlsx"

# Both inputs come from the same folder and use the same sheet name.
doaj_data = pd.read_excel(str(folder / doaj_file), sheet_name=sheet_name)
couperin_data = pd.read_excel(
    str(folder / couperin_file), sheet_name=sheet_name
)

# Reduce the Couperin date columns to their date component.
date_cols = [
    "date_payment_emitter",
    "date_payment_recipient",
]
for c in date_cols:
    couperin_data[c] = couperin_data[c].dt.date

# Rename the Couperin columns to the names used in the DOAJ 2023 report.
column_mapping = {
    "Member name": "Institution name",
    "Country": "country",
    "Agent": "agent/consortium",
    "Sales price": "amount",
    "Currency": "currency",
}
couperin_data = couperin_data.rename(columns=column_mapping)

# Rows to discard: Couperin is the agent but not the emitter.
agent_is_couperin = doaj_data["agent_ror_id"] == couperin_ror_id
emitter_is_not_couperin = doaj_data["emitter_ror_id"] != couperin_ror_id
doaj_c_mask = agent_is_couperin & emitter_is_not_couperin
doaj_data = doaj_data.loc[~doaj_c_mask]

# Stack the remaining DOAJ rows and the standalone Couperin rows.
merged = pd.concat([doaj_data, couperin_data])

merged.to_excel(str(folder / output_file), sheet_name=sheet_name, index=False)

##### Clean DOAJ 2024 Library data file


Replace Couperin transfers with Couperin data extract


In [None]:
from pathlib import Path
import pandas as pd

# ROR identifier used to recognise Couperin in the *_ror_id columns.
couperin_ror_id = "035c9qf67"
sheet_name = "Transfers"
folder = Path.home() / "Nextcloud/TSOSI_data/doaj/4_prepared"
doaj_file = "2025-01-15-DOAJ_Library_Report_2024_prepared.xlsx"
couperin_file = "2025-04-29_DOAJ_Couperin_2024_breakdown_prepared.xlsx"
output_file = "2025-05-05_DOAJ_Library_Report_2024_prepared_Couperin_clean.xlsx"

# Both inputs come from the same folder and use the same sheet name.
doaj_data = pd.read_excel(str(folder / doaj_file), sheet_name=sheet_name)
couperin_data = pd.read_excel(
    str(folder / couperin_file), sheet_name=sheet_name
)

# Reduce the Couperin date column to its date component.
date_cols = [
    "date_payment_recipient",
]
for c in date_cols:
    couperin_data[c] = couperin_data[c].dt.date

# Rename the Couperin columns to the names used in the DOAJ 2024 report
# (several names are already identical and map to themselves).
column_mapping = {
    "Member name": "Company",
    "Country": "Country",
    "Agent": "Agent",
    "Sales price": "Support amount",
    "Currency": "Currency",
}
couperin_data = couperin_data.rename(columns=column_mapping)

# Rows to discard: Couperin is the agent but not the emitter.
agent_is_couperin = doaj_data["agent_ror_id"] == couperin_ror_id
emitter_is_not_couperin = doaj_data["emitter_ror_id"] != couperin_ror_id
doaj_c_mask = agent_is_couperin & emitter_is_not_couperin
doaj_data = doaj_data.loc[~doaj_c_mask]

# NOTE(review): the date conversion of the DOAJ columns below is disabled in
# the original cell; kept here unchanged - confirm whether it is still needed.
# date_cols = ["Invoice date", "Support end date", "Paid up until"]
# for c in date_cols:
#     doaj_data[c] = pd.to_datetime(doaj_data[c]).dt.date


# Stack the remaining DOAJ rows and the standalone Couperin rows.
merged = pd.concat([doaj_data, couperin_data])

merged.to_excel(str(folder / output_file), sheet_name=sheet_name, index=False)

Remove transfers flagged as duplicates


In [None]:
from pathlib import Path
import pandas as pd

# Prepared 2024 library file (Couperin already cleaned) and the manual review
# file in which duplicated transfers were flagged.
prepared_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/4_prepared"
prepared_file = (
    "2025-05-05_DOAJ_Library_Report_2024_prepared_Couperin_clean.xlsx"
)
duplicate_folder = Path.home() / "Nextcloud/TSOSI_data/doaj/0_raw"
duplicate_file = "2025-04-29_DOAJ_Library_2024_Invoice_data_review.xlsx"
output_file = "2025-05-05_DOAJ_Library_Report_2024_prepared_final.xlsx"
sheet_name = "Transfers"

prepared = pd.read_excel(
    str(prepared_folder / prepared_file), sheet_name=sheet_name
)
duplicates = pd.read_excel(
    str(duplicate_folder / duplicate_file), sheet_name="Sheet1"
)

# Keep only the rows flagged as duplicates in the review file.
duplicates = duplicates[duplicates["Duplicated (guess)"] == 1]
# Remove Couperin data, already handled
duplicates = duplicates[
    duplicates["Agent"].str.lower().str.strip() != "couperin"
]

# Remember each prepared row's position so matched rows can be removed later.
prepared["_index"] = prepared.index.copy(deep=True)

# Columns that must all agree for a flagged row to identify a prepared row.
merge_cols = [
    "Company",
    "Country",
    "Support amount",
    "Currency",
    "Invoice date",
    "Agent",
]

res = duplicates.merge(
    prepared[["_index", *merge_cols]], on=merge_cols, how="left"
)

# Explicit checks instead of bare asserts: they are not stripped under -O and
# they report which rows are at fault.
# A left merge yields more rows than its left side when a flagged row matches
# several prepared rows.
if len(res) != len(duplicates):
    raise ValueError(
        "Some flagged duplicates match more than one prepared transfer: "
        f"{len(duplicates)} flagged rows produced {len(res)} matches."
    )
unmatched = res[res["_index"].isna()]
if not unmatched.empty:
    raise ValueError(
        f"{len(unmatched)} flagged duplicate(s) have no matching prepared "
        f"transfer:\n{unmatched[merge_cols].to_string()}"
    )

# Drop every prepared row that was matched by a flagged duplicate.
clean = prepared[~prepared["_index"].isin(res["_index"].to_list())].copy()
print(
    f"""
Original table size: {len(prepared)}.
Without duplicates: {len(clean)}
Delta: {len(res)}
"""
)
# The helper column is only needed for the matching above.
del clean["_index"]
# Reduce the date column to its date component before the export.
date_cols = ["date_payment_recipient"]
for c in date_cols:
    clean[c] = clean[c].dt.date

clean.to_excel(
    str(prepared_folder / output_file), sheet_name=sheet_name, index=False
)

### 5 - Generate TSOSI data file


In [None]:
from pathlib import Path
import sys
import os
import django
from datetime import date

# Put the directory four levels above the current working directory on
# sys.path, then configure Django before importing from the project.
project_root = Path(os.getcwd()).resolve()
for _ in range(4):
    project_root = project_root.parent
BASE_DIR = str(project_root)

if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

django.setup()

# Kept after django.setup(), as in the original cell order.
from tsosi.data.preparation.doaj.libraries_2021_2022 import (
    get_config as config_libraries_2021_2022,
)
from tsosi.data.preparation.doaj.libraries_2023 import (
    get_config as config_libraries_2023,
)
from tsosi.data.preparation.doaj.libraries_2024_now import (
    get_config as config_libraries_2024_now,
)
from tsosi.data.preparation.doaj.publishers_202X import (
    get_config as config_publishers,
)

# Folder (relative to the home directory) holding every prepared file.
prepared_folder = "Nextcloud/TSOSI_data/doaj/4_prepared"

# Date passed to the config factory for each family of files.
date_data_2021_2022_2023 = date(2024, 9, 20)
date_data_2023 = date(2025, 5, 5)
date_data_2024 = date(2025, 5, 5)
date_data_sponsors = date(2025, 1, 7)

# One entry per data file to generate, in execution order:
# (config factory, leading positional args, prepared file name, data date).
# The 2023 libraries factory takes no year argument, hence the empty tuple.
generation_jobs = [
    ## Libraries
    # Libraries 2021 & 2022
    (
        config_libraries_2021_2022,
        (2021,),
        "2024-09-20-DOAJ_Library_Report_2021_prepared.xlsx",
        date_data_2021_2022_2023,
    ),
    (
        config_libraries_2021_2022,
        (2022,),
        "2024-09-20-DOAJ_Library_Report_2022_prepared.xlsx",
        date_data_2021_2022_2023,
    ),
    # Libraries 2023
    (
        config_libraries_2023,
        (),
        "2025-05-05_DOAJ_Library_Report_2023_prepared_final.xlsx",
        date_data_2023,
    ),
    # Libraries 2024
    (
        config_libraries_2024_now,
        (2024,),
        "2025-05-05_DOAJ_Library_Report_2024_prepared_final.xlsx",
        date_data_2024,
    ),
    ## Publishers
    # Publishers 2021
    (
        config_publishers,
        (2021,),
        "2025-01-07-DOAJ_Publisher_Report_2021_prepared.xlsx",
        date_data_sponsors,
    ),
    # Publishers 2022
    (
        config_publishers,
        (2022,),
        "2025-01-07-DOAJ_Publisher_Report_2022_prepared.xlsx",
        date_data_sponsors,
    ),
    # Publishers 2023
    (
        config_publishers,
        (2023,),
        "2025-01-07-DOAJ_Publisher_Report_2023_prepared.xlsx",
        date_data_sponsors,
    ),
    # Publishers 2024
    (
        config_publishers,
        (2024,),
        "2025-01-07-DOAJ_Publisher_Report_2024_prepared.xlsx",
        date_data_sponsors,
    ),
]

# Build each config from its prepared file and generate the data file.
for make_config, leading_args, file_name, data_date in generation_jobs:
    file_path = Path.home() / prepared_folder / file_name
    config = make_config(*leading_args, str(file_path), "Transfers", data_date)
    config.generate_data_file()

### X - Misc


##### Check for transfers with ambiguous invoice dates


In [None]:
import pandas as pd
from pathlib import Path
from datetime import date


# Enriched 2024 library report to inspect.
file_path = (
    Path.home()
    / "Nextcloud/TSOSI_data/doaj/3_enriched/2025-01-15-DOAJ_Library_Report_2024_enriched_done.xlsx"
)

df = pd.read_excel(file_path)

# Parse the invoice dates; values that cannot be parsed become NaT.
df["_invoice_date"] = pd.to_datetime(df["Invoice date"], errors="coerce")

# Invoice dates strictly before this day are the ones to review.
date_bound = date(2023, 12, 1)


def _is_before_bound(invoice_date):
    """Return True for a missing date or a date strictly before date_bound."""
    if pd.isna(invoice_date):
        return True
    return invoice_date.date() < date_bound


df["_date_before_bound"] = df["_invoice_date"].apply(_is_before_bound)


# Select rows with an amount and a parsed invoice date earlier than the bound.
mask = (
    df["Support amount"].notna()
    & df["_invoice_date"].notna()
    & df["_date_before_bound"]
)
# Helper columns (names starting with "_") are left out of the review table.
columns = [c for c in df.columns if c[0] != "_"]
to_review = df.loc[mask, columns]
# to_review.to_excel(
#     "TSOSI_DOAJ_Library_2024_Invoice_data_review.xlsx", index=False
# )
to_review

##### Check for duplicated transfers


In [None]:
import pandas as pd
from pathlib import Path
from datetime import date


# Enriched 2021 library report to inspect.
file_path = (
    Path.home()
    / "Nextcloud/TSOSI_data/doaj/3_enriched/2024-09-20-DOAJ_Library_Report_2021_enriched_done.xlsx"
)

# Column holding the transfer amount and column identifying the entity.
amount_field = "amount"
entity_field = "Institution name"

df = pd.read_excel(file_path)

# Leave out helper columns (labels starting with "_"); str() keeps the filter
# working for empty or non-string column labels.
columns = [c for c in df.columns if not str(c).startswith("_")]
# Drop lines with missing amounts - They correspond to past transfers.
df = df[~df[amount_field].isna()][columns].copy()
# Check for entities with multiple occurrences: count the transfers per
# entity and keep the entities that appear more than once.
grouped = df.groupby(entity_field)[amount_field].count()
repeated_entities = grouped[grouped > 1].index
mask = df[entity_field].isin(repeated_entities)
to_review = df[mask].sort_values(entity_field)
# Export
# NOTE(review): the file name below mentions 2024 while the input above is
# the 2021 report - confirm the intended name before re-enabling the export.
# to_review.to_excel(
#     "TSOSI_DOAJ_Library_2024_Duplicated_transfers_review.xlsx", index=False
# )
to_review