### 5 - Generate TSOSI data file


In [None]:
from pathlib import Path
import sys
import os
import django
from datetime import date

# Make the project importable from this notebook and initialise Django.
# BASE_DIR is the fourth ancestor of the current working directory; it is
# appended to sys.path only when it is not already listed there.
BASE_DIR = str(Path(os.getcwd()).resolve().parent.parent.parent.parent)

if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

# setdefault keeps any DJANGO_SETTINGS_MODULE already present in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

# NOTE(review): presumably has to run before the tsosi imports that follow — confirm.
django.setup()

from tsosi.data.preparation.doaj.libraries_2021_2022 import (
    get_config as config_libraries_2021_2022,
)
from tsosi.data.preparation.doaj.libraries_2023 import (
    get_config as config_libraries_2023,
)
from tsosi.data.preparation.doaj.libraries_2024_now import (
    get_config as config_libraries_2024_now,
)
from tsosi.data.preparation.doaj.publishers_202X import (
    get_config as config_publishers,
)

# Build one configuration per prepared DOAJ report and generate its data file.
# Reports are processed in a fixed order: libraries 2021 -> 2024, then
# publishers 2021 -> 2024. Every report is read from the "Transfers" sheet.
prepared_folder = "Nextcloud/TSOSI_data/doaj/4_prepared"
prepared_dir = Path.home() / prepared_folder
sheet_name = "Transfers"

## Libraries
# Libraries 2021 & 2022
# The year is taken from the loop variable so that the year handed to the
# config always matches the year in the report file name.
date_data_2021_2022_2023 = date(2024, 9, 20)
for year in (2021, 2022):
    file_path = (
        prepared_dir / f"2024-09-20-DOAJ_Library_Report_{year}_prepared.xlsx"
    )
    config = config_libraries_2021_2022(
        year, str(file_path), sheet_name, date_data_2021_2022_2023
    )
    config.generate_data_file()

# Libraries 2023 (this config factory takes no year argument)
file_path = prepared_dir / "2024-09-20-DOAJ_Library_Report_2023_prepared.xlsx"
config = config_libraries_2023(
    str(file_path), sheet_name, date_data_2021_2022_2023
)
config.generate_data_file()

# Libraries 2024
date_data_2024 = date(2025, 1, 15)
file_path = prepared_dir / "2025-01-15-DOAJ_Library_Report_2024_prepared.xlsx"
config = config_libraries_2024_now(
    2024, str(file_path), sheet_name, date_data_2024
)
config.generate_data_file()

## Publishers
# Publishers 2021 to 2024 - all reports share the same data date.
date_data_sponsors = date(2025, 1, 7)
for year in (2021, 2022, 2023, 2024):
    file_path = (
        prepared_dir / f"2025-01-07-DOAJ_Publisher_Report_{year}_prepared.xlsx"
    )
    config = config_publishers(
        year, str(file_path), sheet_name, date_data_sponsors
    )
    config.generate_data_file()

### Check for transfers with ambiguous invoice dates


In [None]:
import pandas as pd
from pathlib import Path
from datetime import date


# Enriched DOAJ library report (2024) whose invoice dates are reviewed here.
file_path = Path.home().joinpath(
    "Nextcloud/TSOSI_data/doaj/3_enriched/2025-01-15-DOAJ_Library_Report_2024_enriched_done.xlsx"
)

df = pd.read_excel(file_path)

# Values that cannot be parsed as dates are coerced to NaT instead of raising.
df["_invoice_date"] = pd.to_datetime(df["Invoice date"], errors="coerce")

date_bound = date(2023, 12, 1)


def _is_missing_or_before_bound(value):
    """Return True when `value` is missing or dated strictly before `date_bound`."""
    if pd.isna(value):
        return True
    return value.date() < date_bound


df["_date_before_bound"] = df["_invoice_date"].apply(
    _is_missing_or_before_bound
)

# Keep rows that have an amount and a valid invoice date earlier than the bound.
has_amount = df["Support amount"].notna()
has_invoice_date = df["_invoice_date"].notna()
mask = has_amount & has_invoice_date & df["_date_before_bound"]

# Working columns are prefixed with "_" and are left out of the review table.
columns = [c for c in df.columns if c[0] != "_"]
flagged_rows = df[mask]
to_review = flagged_rows[columns]
# Optional export of the rows to review:
# to_review.to_excel(
#     "TSOSI_DOAJ_Library_2024_Invoice_data_review.xlsx", index=False
# )
to_review

### Check for duplicated transfers


In [None]:
import pandas as pd
from pathlib import Path
from datetime import date


# Enriched DOAJ library report (2021) scanned for repeated entities.
file_path = Path.home().joinpath(
    "Nextcloud/TSOSI_data/doaj/3_enriched/2024-09-20-DOAJ_Library_Report_2021_enriched_done.xlsx"
)

amount_field = "amount"
entity_field = "Institution name"

df = pd.read_excel(file_path)

# Working columns are prefixed with "_" and are left out of the review table.
columns = [c for c in df.columns if c[0] != "_"]
# Drop lines with missing amounts - They correspond to past transfers.
has_amount = df[amount_field].notna()
df = df[has_amount][columns].copy()
# Count the remaining transfers per entity and keep entities seen more than once.
grouped = df.groupby(entity_field)[amount_field].count()
repeated_entities = grouped[grouped > 1].index
mask = df[entity_field].isin(repeated_entities)
to_review = df[mask].sort_values(entity_field)
# Optional export of the rows to review.
# NOTE(review): the file name mentions 2024 while the input report is 2021 — confirm.
# to_review.to_excel(
#     "TSOSI_DOAJ_Library_2024_Duplicated_transfers_review.xlsx", index=False
# )
to_review