### 1 - Pre-process raw data

Map the identifier data to the emitters (and consortiums) in the transfer list.


Identifiers manually added:

- Library of the University of Ottawa -> Q57942111
- Société Française d'Écologie et d'Évolution (SFE²) -> Q3488296


In [None]:
from pathlib import Path
import pandas as pd
import os
import sys
import django
from IPython.display import display

# Make the directory four levels above the current working directory
# importable, then initialise Django so project modules can be imported.
BASE_DIR = str(Path.cwd().resolve().parent.parent.parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

# Respect a settings module already chosen through the environment and only
# fall back to the project default when none is set.
if "DJANGO_SETTINGS_MODULE" not in os.environ:
    os.environ["DJANGO_SETTINGS_MODULE"] = "backend_site.settings"

django.setup()

from tsosi.data.preparation.cleaning_utils import clean_cell_value

# Raw funding report, named after its export date.
date_str = "2025-03-03"
raw_folder = Path.home() / "Nextcloud/TSOSI_data/pci/0_raw"
raw_file = f"{date_str}_PCI_Funding_Report.xlsx"
raw_path = str(raw_folder / raw_file)

# Identifier workbook: one sheet for emitters, one for consortiums.
id_file = "2025-01-06_PCI_Funding_Data_Identifiers.xlsx"
id_path = str(raw_folder / id_file)
emitters_sheet = "Emitters"
consortiums_sheet = "Consortiums"

# Load each sheet and pass every cell through clean_cell_value right away.
data = pd.read_excel(raw_path).map(clean_cell_value)
ids_emitters = pd.read_excel(id_path, sheet_name=emitters_sheet).map(
    clean_cell_value
)
ids_consortiums = pd.read_excel(id_path, sheet_name=consortiums_sheet).map(
    clean_cell_value
)

## Map emitters to the transfers df
# Emitters are matched on a temporary lowercase key so that the join is
# case-insensitive.
data["_emitter"] = data["From organization"].str.lower()
emitters_cols = {
    "Institution/origine du soutien": "emitter_name",
    "ROR": "emitter_ror_id",
    "wikidata": "emitter_wikidata_id",
    "Country": "emitter_country",
}
# Keep only the identifier columns, under their target names.
ids_emitters = ids_emitters.rename(columns=emitters_cols)[
    list(emitters_cols.values())
].copy()
ids_emitters["_emitter"] = ids_emitters["emitter_name"].str.lower()

# Fully identical rows are simply collapsed. Two *different* rows for the same
# emitter would duplicate transfers in the left merge below, so stop instead.
ids_emitters = ids_emitters.drop_duplicates()
duplicates = ids_emitters[ids_emitters.duplicated(subset="_emitter")]
if not duplicates.empty:
    print("WARNING: The following emitters appear twice in the spreadsheet")
    display(duplicates)
    raise Exception("CF output.")
ids_emitters.drop(columns=["emitter_name"], inplace=True)
# Marker column: 1 on transfers that found a row in the identifier sheet,
# NaN on the others after the left merge.
ids_emitters["_test"] = 1

data = data.merge(ids_emitters, on="_emitter", how="left")

# Check unmatched emitters (there might be missing inputs in PCI matching spreadsheet)
# data[data["_test"] != 1]

# Check data with no identifier: matched emitters that have neither a ROR nor
# a Wikidata ID. The comparison is parenthesised because `&` binds tighter
# than `==`; without the parentheses the whole conjunction is compared to 1.
mask = (
    data["emitter_ror_id"].isna()
    & data["emitter_wikidata_id"].isna()
    & (data["_test"] == 1)
)
# A bare expression in the middle of a cell is not rendered, so display the
# check result explicitly.
display(data[mask])

data.drop(columns=["_emitter", "_test"], inplace=True)


## Map consortiums to the transfers df
# Consortiums are matched on a temporary lowercase key so that the join is
# case-insensitive.
data["_agent"] = data["Via?"].str.lower()
agent_cols = {
    "Consortia": "agent_name",
    "ROR": "agent_ror_id",
    "wikidata": "agent_wikidata_id",
    "Website": "agent_website",
}

# Keep only the identifier columns, under their target names.
ids_consortiums = ids_consortiums.rename(columns=agent_cols)[
    list(agent_cols.values())
].copy()
ids_consortiums["_agent"] = ids_consortiums["agent_name"].str.lower()

# Fully identical rows are simply collapsed. Two *different* rows for the same
# consortium would duplicate transfers in the left merge below, so stop instead.
ids_consortiums = ids_consortiums.drop_duplicates()
duplicates = ids_consortiums[ids_consortiums.duplicated(subset="_agent")]
if not duplicates.empty:
    print("WARNING: The following consortiums appear twice in the spreadsheet")
    display(duplicates)
    raise Exception("CF output.")
ids_consortiums.drop(columns=["agent_name"], inplace=True)
# Marker column: 1 on transfers that found a row in the consortium sheet,
# NaN on the others after the left merge.
ids_consortiums["_test"] = 1

data = data.merge(ids_consortiums, on="_agent", how="left")

# Check consortium data is okay: transfers that name a consortium which has no
# row in the identifier sheet.
mask = ~data["_agent"].isna() & (data["_test"] != 1)
# A bare expression in the middle of a cell is not rendered, so display the
# check result explicitly.
display(data[mask])

data.drop(columns=["_agent", "_test"], inplace=True)

In [None]:
# Write the enriched transfers to the "prepared" folder, in a workbook named
# after the same date as the raw export.
prepared_folder = Path.home().joinpath("Nextcloud/TSOSI_data/pci/4_prepared/")
prepared_file = f"{date_str}_PCI_Funding_Report_prepared.xlsx"
prepared_path = str(prepared_folder.joinpath(prepared_file))
data.to_excel(
    prepared_path,
    sheet_name="Transfers",
    index=False,
)

### 5 - Generate TSOSI data file


In [None]:
from pathlib import Path
import sys
import os
import django
from datetime import date

# Make the directory four levels above the current working directory
# importable, then initialise Django so project modules can be imported.
BASE_DIR = str(Path.cwd().resolve().parent.parent.parent.parent)
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

# Respect a settings module already chosen through the environment and only
# fall back to the project default when none is set.
if "DJANGO_SETTINGS_MODULE" not in os.environ:
    os.environ["DJANGO_SETTINGS_MODULE"] = "backend_site.settings"

django.setup()

from tsosi.data.preparation.pci.default import get_config

# Date of the PCI export; it is both passed to the config and used as the
# prefix of the prepared workbook's file name.
date_data = date(2025, 3, 3)

file_folder = Path.home() / "Nextcloud/TSOSI_data/pci/4_prepared"
# Derive the file name from date_data so the two cannot drift apart when the
# date is updated (isoformat() gives YYYY-MM-DD, e.g. "2025-03-03").
file_name = f"{date_data.isoformat()}_PCI_Funding_Report_prepared.xlsx"
file_path = str(file_folder / file_name)
sheet_name = "Transfers"

# Build the PCI preparation config for this workbook/sheet and generate the
# TSOSI data file from it.
config = get_config(file_path, sheet_name, date_data)
config.generate_data_file()