In [4]:
from pathlib import Path
import sys
import os
import django
from datetime import date
import pandas as pd
from asgiref.sync import sync_to_async

# Put the directory two levels above the current working directory on the
# import path, then initialise Django (the project import below relies on it).
BASE_DIR = str(Path.cwd().resolve().parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")
django.setup()

from tsosi.data.preparation.doaj.publishers_202X import get_config

# Build the DOAJ publisher configuration for one report year from the
# prepared workbook, then ask it to generate its data file.
year = 2024
sheet_name = "Sheet1"
file_path = (
    "/home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/doaj/4_prepared/"
    "2025-01-07-DOAJ_Publisher_Report_2024_prepared.xlsx"
)
config = get_config(year, file_path, sheet_name)
config.generate_data_file()

INFO raw_data_config - Preparing data with config `doaj_publisher`.
INFO raw_data_config - Successfully prepared the data for config doaj_publisher
INFO raw_data_config - Successfully write TSOSI data file at /home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/_exports/2025-02-04_doaj_publisher_2024_full.json


### Prepare raw data for manual matching


In [None]:
from pathlib import Path
import sys
import os
import django
import pandas as pd
from asgiref.sync import sync_to_async

# Put the directory two levels above the current working directory on the
# import path, then initialise Django (the project import below relies on it).
BASE_DIR = str(Path.cwd().resolve().parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")
django.setup()

from tsosi.data.pid_matching import prepare_manual_matching


file_name = "2023-11-27-DOAB_Library_Report_2023.xlsx"
file_folder = "/home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/doab/1_raw/"
sheet = "Sheet1"
name_column = "Institution"
country_colum = "Country"


@sync_to_async
def to_run():
    xls = pd.ExcelFile(f"{file_folder}{file_name}")
    data = pd.read_excel(xls, sheet)
    return prepare_manual_matching(
        data, name_column, country_column=country_colum
    )


res = await to_run()

In [None]:
# Display the current value of `res` (assigned by the awaited `to_run()` call
# in the previous cell when the notebook is run in order).
res

In [5]:
# Save `res` as an Excel workbook in the "2_matched" folder; `index=False`
# keeps the index out of the written sheet.
file_to_write = "2023-11-27-DOAB_Library_Report_2023_matched.xlsx"
file_folder = "/home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/doab/2_matched/"
file_path = file_folder + file_to_write
res.to_excel(file_path, index=False)

In [None]:
# Work on a copy of `res` and show the rows whose `_processed` value is
# anything other than True.
d = res.copy()
d.loc[d["_processed"].ne(True)]

### Prepare enriched data


In [None]:
from pathlib import Path
import sys
import os
import django
import pandas as pd
from asgiref.sync import sync_to_async

# Put the directory two levels above the current working directory on the
# import path, then initialise Django (the project imports below rely on it).
BASE_DIR = str(Path.cwd().resolve().parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")
django.setup()

from tsosi.data.pid_matching import process_enriched_data
from tsosi.models.transfert import TRANSFERT_ENTITY_TYPE_EMITTER

# Enriched workbook to read, its worksheet, and the column passed to
# `process_enriched_data` as the name column.
file = "/home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/doaj/3_enriched/2025-01-07-DOAJ_Publisher_Report_2024_enriched_done.xlsx"
sheet = "Sheet1"
name_column = "Company"

# Open the workbook in a `with` block so its file handle is closed once the
# sheet has been loaded (it was previously left open).
with pd.ExcelFile(file) as xls:
    data = pd.read_excel(xls, sheet)

res = process_enriched_data(data, name_column, TRANSFERT_ENTITY_TYPE_EMITTER)

# Write the processed result to the "4_prepared" folder; `index=False` keeps
# the index out of the written sheet.
file_to_write = "/home/guillaume-alzieu/Dev/TSOSI/TSOSI_data/doaj/4_prepared/2025-01-07-DOAJ_Publisher_Report_2024_prepared.xlsx"
res.to_excel(file_to_write, index=False)

In [None]:
# Display the current value of `res` (the `process_enriched_data` result when
# the previous cell was the last one to assign it).
res

In [None]:
import pycountry

# Show a mapping from each country's `name` to its pycountry record.
dict((country.name, country) for country in pycountry.countries)

### Redis setup


In [None]:
from pathlib import Path
import sys
import os
import django
from asgiref.sync import sync_to_async

# Put the directory two levels above the current working directory on the
# import path, then initialise Django (the project import below relies on it).
BASE_DIR = str(Path.cwd().resolve().parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")
django.setup()

# from tsosi.data.token_bucket import ror_token_bucket as bucket
from tsosi.data.token_bucket import (
    WIKIPEDIA_TOKEN_BUCKET as bucket,
    REDIS_CLIENT,
)


async def to_run():
    tokens = bucket.consume(80)

    return tokens


res = await to_run()

In [None]:
from pycountry import countries


# Run pycountry's fuzzy country search for one name and display the matches.
fuzzy_query = "Guadeloupe"
res = countries.search_fuzzy(fuzzy_query)
res

In [2]:
from pathlib import Path
import sys
import os
import django
from asgiref.sync import sync_to_async

# Put the directory two levels above the current working directory on the
# import path, then initialise Django (the project imports below rely on it).
BASE_DIR = str(Path.cwd().resolve().parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")
django.setup()

from tsosi.data.ingestion import ingest_data_file
from tsosi.models import empty_db

publisher_files = [
    "2025-02-04_doaj_publisher_2021_full.json",
    "2025-02-04_doaj_publisher_2022_full.json",
    "2025-02-04_doaj_publisher_2023_full.json",
    "2025-02-04_doaj_publisher_2024_full.json",
]


@sync_to_async
def to_run():
    # empty_db()
    # for file in publisher_files:
    #     ingest(file)
    # return


res = await to_run()

INFO ingestion - Ingesting 42 transfert records.
INFO ingestion - Created 42 Entity records
INFO ingestion - Created 39 Identifier and IdentifierEntityMatching records.
INFO ingestion - Created 42 Transfert records
INFO ingestion - Created 84 TransfertEntityMatching records
INFO ingestion - Successfully ingested 42 records.
INFO tasks - Triggering post-ingestion pipeline.
INFO tasks - Triggering new identifier fetching for registries: ['ror', 'wikidata']
INFO ingestion - Ingesting 43 transfert records.
INFO ingestion - Created 4 Entity records
INFO ingestion - Created 1 Identifier and IdentifierEntityMatching records.
INFO ingestion - Created 43 Transfert records
INFO ingestion - Created 86 TransfertEntityMatching records
INFO ingestion - Successfully ingested 43 records.
INFO tasks - Triggering post-ingestion pipeline.
INFO tasks - Triggering new identifier fetching for registries: ['wikidata']
INFO ingestion - Ingesting 51 transfert records.
INFO ingestion - Created 12 Entity records