### Export TSOSI data file


In [None]:
from pathlib import Path
import sys
import os
import django

# Django bootstrap: expose the directory two levels above the current working
# directory on sys.path, point Django at the settings module and initialise it.
BASE_DIR = str(Path.cwd().resolve().parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")
django.setup()

# The tsosi imports are kept after django.setup(), as in the original cell.
from tsosi.data.preparation.doaj.publishers_202X import (
    get_config as doaj_publisher_config,
)
from tsosi.data.preparation.doaj.libraries_2024_now import (
    get_config as doaj_library_2024_config,
)
from tsosi.data.preparation.doaj.libraries_2023 import (
    get_config as doaj_library_2023_config,
)
from tsosi.data.preparation.doaj.libraries_2021_2022 import (
    get_config as doaj_library_2021_2022_config,
)

# Prepared report to export and the sheet it lives in
sheet_name = "Sheet1"
file_path = "/home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/doaj/4_prepared/2025-01-07-DOAJ_Publisher_Report_2024_prepared.xlsx"
year = 2024

# Alternative configuration for a library report:
# config = doaj_library_2023_config(file_path, sheet_name)
config = doaj_publisher_config(year, file_path, sheet_name)
config.generate_data_file()

### Prepare raw data for manual matching


In [None]:
from pathlib import Path
import sys
import os
import django
import pandas as pd
from asgiref.sync import sync_to_async

# Django bootstrap: expose the directory two levels above the current working
# directory on sys.path, point Django at the settings module and initialise it.
BASE_DIR = str(Path.cwd().resolve().parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")
django.setup()

from tsosi.data.pid_matching import prepare_manual_matching


# Raw report to prepare, plus the columns passed on as name / country columns
file_folder = "/home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/doab/1_raw/"
file_name = "2023-11-27-DOAB_Library_Report_2023.xlsx"
sheet = "Sheet1"
name_column = "Institution"
country_colum = "Country"


@sync_to_async
def to_run():
    xls = pd.ExcelFile(f"{file_folder}{file_name}")
    data = pd.read_excel(xls, sheet)
    return prepare_manual_matching(
        data, name_column, country_column=country_colum
    )


res = await to_run()

In [None]:
# Optional clean-up: store plain dates rather than datetimes in the
# support date columns of `res`
date_cols = [
    "Support Confirmation Date",
    "Support Start Date",
    "Support End Date",
]
for column_name in date_cols:
    datetimes = res[column_name]
    res[column_name] = datetimes.dt.date

In [5]:
# Write the manual-matching result to the "2_matched" folder
file_to_write = "2023-11-27-DOAB_Library_Report_2023_matched.xlsx"
file_folder = "/home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/doab/2_matched/"
file_path = file_folder + file_to_write
res.to_excel(excel_writer=file_path, index=False)

In [None]:
# Display the rows whose `_processed` flag is anything other than True
d = res.copy()
d[d["_processed"].ne(True)]

### Process enriched data


In [None]:
from pathlib import Path
import sys
import os
import django
import pandas as pd
from asgiref.sync import sync_to_async

# Django bootstrap: expose the directory two levels above the current working
# directory on sys.path, point Django at the settings module and initialise it.
BASE_DIR = str(Path.cwd().resolve().parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")
django.setup()

from tsosi.data.utils import clean_null_values
from tsosi.data.preparation.cleaning_utils import clean_cell_value
from tsosi.data.pid_matching import process_enriched_data
from tsosi.models.transfert import (
    TRANSFERT_ENTITY_TYPE_EMITTER,
    TRANSFERT_ENTITY_TYPE_AGENT,
)

# Set to False to skip the agents/consortiums enrichment step
process_agents = True
folder = "/home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/doaj"
file_base_name = "2024-09-20-DOAJ_Library_Report_2023"
enriched_file_path = f"{folder}/3_enriched/{file_base_name}_enriched_done.xlsx"
sheet_transferts = "Transferts"
sheet_agent = "Consortiums"
name_column = "Institution name"

# Read every required sheet up front inside a context manager so that the
# workbook handle is closed immediately instead of staying open for the whole
# session. A missing agents sheet is also reported before any processing runs.
with pd.ExcelFile(enriched_file_path) as xls:
    data = pd.read_excel(xls, sheet_transferts)
    if process_agents:
        agents = pd.read_excel(xls, sheet_agent)

res = process_enriched_data(data, name_column, TRANSFERT_ENTITY_TYPE_EMITTER)

if process_agents:
    # Match agents data back to the transferts spreadsheet
    print("Processing agents/consortiums data")
    agent_col = "agent/consortium"
    country_col = "country"
    # Normalise the join keys on both sides so that the merge below matches
    # on cleaned values.
    for col in [agent_col, country_col]:
        res[col] = res[col].apply(clean_cell_value)
        agents[col] = agents[col].apply(clean_cell_value)
    clean_null_values(agents)
    clean_null_values(res)
    res = res.merge(agents, on=[agent_col, country_col], how="left")
    # Remove agents not in the agent spreadsheet (DOAJ mixed up consortium name and institution type)
    wrong_agents = res[~res[agent_col].isin(agents[agent_col])]
    res.loc[wrong_agents.index, agent_col] = None

    # Process agents data enrichment
    res = process_enriched_data(res, agent_col, TRANSFERT_ENTITY_TYPE_AGENT)
clean_null_values(res)

In [None]:
# Spot-check: display the rows whose "Agent" value is "Baden-Wurttemberg"
res.loc[res["Agent"] == "Baden-Wurttemberg"]

In [3]:
# Write the processed data to the "4_prepared" folder, sheet "Transferts"
prepared_file_path = f"{folder}/4_prepared/{file_base_name}_prepared.xlsx"
res.to_excel(
    excel_writer=prepared_file_path,
    index=False,
    sheet_name="Transferts",
)

In [None]:
import pycountry

# Display a mapping from each country's name to its pycountry record
{country.name: country for country in pycountry.countries}

### Redis setup


In [None]:
from pathlib import Path
import sys
import os
import django
from asgiref.sync import sync_to_async

# Django bootstrap: expose the directory two levels above the current working
# directory on sys.path, point Django at the settings module and initialise it.
BASE_DIR = str(Path.cwd().resolve().parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")
django.setup()

# from tsosi.data.token_bucket import ror_token_bucket as bucket
from tsosi.data.token_bucket import (
    WIKIPEDIA_TOKEN_BUCKET as bucket,
    REDIS_CLIENT,
)


async def to_run():
    tokens = bucket.consume(80)

    return tokens


res = await to_run()

In [None]:
from pycountry import countries


# Scratch check of pycountry's fuzzy search for a single name.
# NOTE: this rebinds `res`, replacing whatever an earlier cell stored in it.
res = countries.search_fuzzy("Guadeloupe")
res

### Extract agents/consortiums


In [None]:
import pandas as pd
import re
import numpy as np

# Raw DOAB report to extract the agents/consortiums from
folder = "/home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/doab/1_raw"
file_name = "2023-11-27-DOAB_Library_Report_2023.xlsx"
sheet_name = "Sheet1"

# Open the workbook through a context manager so that its file handle is
# closed as soon as the sheet has been loaded.
with pd.ExcelFile(f"{folder}/{file_name}") as xls:
    df = pd.read_excel(xls, sheet_name)


def is_true(val) -> bool:
    """
    Interpret a spreadsheet cell value as a boolean.

    - Strings are true when they equal "true", ignoring case and
      surrounding whitespace.
    - Booleans (Python or NumPy) are returned as a Python ``bool``.
    - Numbers (Python or NumPy) are true only when equal to 1.
    - Anything else (``None``, NaN, other objects) is false.
    """
    if isinstance(val, str):
        return val.strip().lower() == "true"
    # NumPy scalars are not subclasses of `bool`/`int`, so they need to be
    # listed explicitly; otherwise np.True_ or np.int64(1) would be reported
    # as False.
    if isinstance(val, (bool, np.bool_)):
        return bool(val)
    if isinstance(val, (int, float, np.integer, np.floating)):
        # bool() keeps the return type a plain Python bool for NumPy inputs.
        return bool(val == 1)
    return False


def clean_cell_value[T](s: T) -> T:
    """
    Clean the value from a spreadsheet cell:
    - Normalize spacing values.
    - Strip whitespaces.
    """
    if not s or not isinstance(s, str):
        return s
    return re.sub(r"\s+", " ", s).strip()


agent_column = "Agent/Contact"
country_column = "Country"
# Values of the agent column that should be excluded (none at the moment)
# not_agents = ["standalone library", "Funder", "Consortium", "publisher"]
not_agents = []

# Keep the rows that have an agent value which is not an excluded label, and
# only the two columns that are needed. The explicit copy makes the column
# assignments below operate on an independent frame (no SettingWithCopyWarning)
# and avoids cleaning columns that would be discarded anyway.
is_agent = df[agent_column].notna() & ~df[agent_column].isin(not_agents)
df = df.loc[is_agent, [agent_column, country_column]].copy()
for c in df.columns:
    df[c] = df[c].apply(clean_cell_value)
# Duplicates are dropped after cleaning so that values differing only by
# whitespace are merged.
df = df.drop_duplicates()
df.replace(to_replace=[np.nan, pd.NA, pd.NaT], value=None, inplace=True)
# Last expression of the cell: displays the distinct pairs sorted by agent
df.sort_values(by=agent_column)