### 0 - Raw file pre-processing


In [None]:
from pathlib import Path
import pandas as pd


# Folder containing the raw workbook read by `pre_process`.
file_input_folder = Path.home() / "Nextcloud/TSOSI_data/doab/0_raw/"
# Folder receiving the pre-processed workbook when `pre_process` exports.
file_output_folder = Path.home() / "Nextcloud/TSOSI_data/doab/1_pre_processed/"
# Raw workbook file name, relative to `file_input_folder`.
file_name = "2025-02-11-DOAB_Library_Report.xlsx"
# Pre-processed workbook file name, relative to `file_output_folder`.
output_name = "2025-02-11-DOAB_Library_Report_pre_processed.xlsx"


def pre_process(export=False):
    """
    Filter the raw report down to usable rows and columns.

    A row is kept when at least one of the annual amount columns is filled
    and both the support start date and the support end date are filled.
    The number of discarded rows is printed.

    :param export:  When True, write the kept rows (useful columns only) to
                    `file_output_folder / output_name`, sheet "Transferts".
    :returns:       The discarded rows, with all the original columns.
    """
    amount_columns = [
        "Annual amount (EUR)",
        "Annual amount (USD)",
        "Annual amount (GBP)",
    ]
    useful_columns = (
        ["Company", "Country"]
        + amount_columns
        + [
            "Year",
            "Supporter type",
            "Agent",
            "Support start date",
            "Support end date",
            "Invoice preference",
            "Commitment period (years)",
            "Support confirmation date",
        ]
    )

    raw = pd.read_excel(f"{file_input_folder}/{file_name}")

    # At least one amount, whatever the currency.
    has_amount = raw[amount_columns].notna().any(axis=1)
    # Both bounds of the support period.
    has_period = (
        raw["Support start date"].notna() & raw["Support end date"].notna()
    )
    keep = has_amount & has_period
    kept = raw.loc[keep, useful_columns].copy()

    total = len(raw)
    print(f"Discarded {total - len(kept)} rows out of {total} rows.")
    if export:
        kept.to_excel(
            f"{file_output_folder}/{output_name}",
            sheet_name="Transferts",
            index=False,
        )
    return raw[~keep]


# Run the pre-processing, write the filtered workbook and keep the discarded
# rows around for inspection.
discarded = pre_process(export=True)

### 1 - Pre-processed data -> ROR matching


In [None]:
from pathlib import Path
import sys
import os
import django
import pandas as pd
from asgiref.sync import sync_to_async

# Make the Django project importable and configure Django before importing
# any project module. BASE_DIR is the directory four levels above the current
# working directory.
BASE_DIR = str(Path(os.getcwd()).resolve().parent.parent.parent.parent)

# Guard against adding the same entry again when the cell is re-run.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

# setdefault keeps a DJANGO_SETTINGS_MODULE already present in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "backend_site.settings")

django.setup()

from tsosi.data.pid_matching import prepare_manual_matching


# Folder and file name of the pre-processed workbook read by `to_run`.
file_folder = Path.home() / "Nextcloud/TSOSI_data/doab/1_pre_processed/"
file_name = "2025-02-11-DOAB_Library_Report_pre_processed.xlsx"
# Worksheet to read; also reused as the sheet name of the matched export.
sheet = "Transferts"
# Columns handed to `prepare_manual_matching` as entity name and country.
name_column = "Company"
country_colum = "Country"


@sync_to_async
def to_run():
    """
    Load the pre-processed sheet and run the manual-matching preparation.

    Reads sheet `sheet` of `file_folder / file_name` and passes the resulting
    frame to `prepare_manual_matching` with `name_column` as the name column
    and `country_colum` as the country column.

    NOTE(review): wrapped in `sync_to_async`, presumably because
    `prepare_manual_matching` performs synchronous Django calls that cannot
    run directly in the notebook's event loop - confirm.

    :returns:   Whatever `prepare_manual_matching` returns.
    """
    # Give read_excel the path directly so that pandas opens and closes the
    # workbook itself, instead of leaving an ExcelFile handle open.
    data = pd.read_excel(file_folder / file_name, sheet_name=sheet)
    return prepare_manual_matching(
        data, name_column, country_column=country_colum
    )


# Top-level `await`: only valid where the interpreter provides a running
# event loop for the cell (e.g. a notebook), not in a plain Python module.
res = await to_run()

In [None]:
# Write the matching result to the "2_matched" folder, reusing the input
# sheet name and dropping the frame index.
export_folder = Path.home() / "Nextcloud/TSOSI_data/doab/2_matched/"
output_name = "2025-02-11-DOAB_Library_Report_matched.xlsx"
res.to_excel(str(export_folder / output_name), sheet_name=sheet, index=False)

In [None]:
# Distinct (Agent, Country) pairs among the rows having an agent, sorted by
# agent.
(
    res.loc[res["Agent"].notna(), ["Agent", "Country"]]
    .drop_duplicates()
    .sort_values("Agent")
)

In [None]:
# Ad-hoc inspection: country of the second distinct (Agent, Country) pair of
# the "CDL" agent. `iloc[1]` raises IndexError when there are fewer than two
# distinct pairs.
t = res[res["Agent"] == "CDL"][["Agent", "Country"]]
t.drop_duplicates().iloc[1]["Country"]

### 3 - Prepare enriched data


### 4 - Organize transferts

Compute the Year when it's missing from the support period


In [None]:
from pathlib import Path
import pandas as pd
from functools import reduce


# Input workbook of this section: the pre-processed export.
# NOTE(review): `file_output_folder` and `output_name` are not used by the
# code visible in this section - confirm where the final file is written.
file_input_folder = Path.home() / "Nextcloud/TSOSI_data/doab/1_pre_processed/"
file_output_folder = Path.home() / "Nextcloud/TSOSI_data/doab/1_pre_processed/"
file_name = "2025-02-11-DOAB_Library_Report_pre_processed.xlsx"
output_name = "2025-02-11-DOAB_Library_Report_final.xlsx"
sheet_name = "Transferts"

# Use the ExcelFile as a context manager so that the underlying file handle
# is released once the sheet has been read.
with pd.ExcelFile(str(file_input_folder / file_name)) as xls:
    df = pd.read_excel(xls, sheet_name=sheet_name)

# Columns, in order, of the frames returned by the `handle_*_transferts`
# functions below.
final_cols = [
    "Company",
    "Country",
    "date_start",
    "date_end",
    "Commitment period (years)",
    "Supporter type",
    "Agent",
    "amount_EUR",
    "amount_USD",
    "amount_GBP",
]


def handle_one_year_transferts(transferts: pd.DataFrame):
    """
    Extract the one-year-commitment transferts in the final column layout.

    Rows are selected on their lower-cased, stripped "Invoice preference".
    A message listing the companies is printed when a selected row has a
    commitment period above 1 or a support period rounding to more than
    1 year. Those rows are still returned.

    :param transferts:  Frame with the raw columns plus the parsed
                        `_date_start` and `_date_end` columns.
    :returns:           A copy of the selected rows restricted to `final_cols`,
                        with amounts and dates copied as-is.
    """
    one_year_labels = ["one year commitment", "annual, one year commitment"]
    preference = transferts["Invoice preference"].str.lower().str.strip()
    selection = transferts[preference.isin(one_year_labels)].copy()

    # Length of the support period, in rounded years.
    period = selection["_date_end"].dt.date - selection["_date_start"].dt.date
    selection["_spanned_years"] = period.apply(
        lambda delta: round(delta.days / 365)
    )

    too_long = (selection["Commitment period (years)"] > 1) | (
        selection["_spanned_years"] > 1
    )
    suspicious = selection[too_long]
    if not suspicious.empty:
        companies = suspicious["Company"].drop_duplicates().to_list()
        print(
            "The following entities have 'One year commitment' with commitment "
            "period > 1 year\n"
            f"{companies}\n"
        )

    # Final column name -> source column, copied without transformation.
    copied_columns = {
        "amount_EUR": "Annual amount (EUR)",
        "amount_USD": "Annual amount (USD)",
        "amount_GBP": "Annual amount (GBP)",
        "date_start": "_date_start",
        "date_end": "_date_end",
    }
    for target, source in copied_columns.items():
        selection[target] = selection[source]
    return selection[final_cols].copy()


def _yearly_period_starts(date_start, date_end):
    """Return the sorted start dates of the yearly slices of a support period.

    The slices start on `date_start` and on every 1st of January strictly
    between `date_start` and `date_end`.
    """
    year_starts = pd.date_range(
        date_start, date_end, freq="YS", inclusive="neither"
    )
    # Sort the combined index so that the slices come out chronologically.
    return year_starts.append(pd.DatetimeIndex([date_start])).sort_values()


def handle_annual_transferts(transferts: pd.DataFrame):
    """
    Extract the "annual" transferts in the final column layout.

    Rows are selected on their stripped, lower-cased "Invoice preference".
    The support period length is rounded to whole years, with a minimum of 1.

    - Rows spanning more than 1 year with a matching commitment period are
      split into one row per calendar year, each carrying the annual amount.
    - Rows spanning more than 1 year with a different commitment period are
      reported with a printed warning and returned unsplit.
    - Other rows are returned with their original dates.

    :param transferts:  Frame with the raw columns plus the parsed
                        `_date_start` and `_date_end` columns.
    :returns:           A frame restricted to `final_cols`.
    """
    annuals = transferts[
        transferts["Invoice preference"].str.strip().str.lower() == "annual"
    ].copy()
    annuals["_spanned_years"] = (
        annuals["_date_end"].dt.date - annuals["_date_start"].dt.date
    ).apply(lambda x: max(round(x.days / 365), 1))

    # Annual invoice transferts spanning more than 1 year should be split into
    # individual years... The raw data is supposed to be this way
    # ERRORS - Check the consistency of the commitment period and the support period
    error_mask = (annuals["_spanned_years"] > 1) & (
        annuals["_spanned_years"] != annuals["Commitment period (years)"]
    )
    errors = annuals[error_mask]
    if not errors.empty:
        companies = errors["Company"].drop_duplicates().to_list()
        print(
            "WARNING - The following Entities have Annual transferts spanning "
            "more than 1 year and inconsistent commitment period:\n"
            f"{companies}\n"
        )

    to_split_mask = (annuals["_spanned_years"] > 1) & (
        annuals["_spanned_years"] == annuals["Commitment period (years)"]
    )
    to_split = annuals[to_split_mask].copy()

    # Rows kept as they are: single-year ones and inconsistent ones.
    defaults = annuals[~to_split_mask].copy()
    defaults["date_start"] = defaults["_date_start"]
    defaults["date_end"] = defaults["_date_end"]

    if to_split.empty:
        # Nothing to split. A row-wise apply on an empty frame returns a
        # DataFrame, which cannot be assigned to a single column.
        annuals_clean = defaults
    else:
        to_split["_date_range"] = to_split.apply(
            lambda row: _yearly_period_starts(
                row["_date_start"], row["_date_end"]
            ),
            axis=1,
        )
        # One row per yearly slice.
        splitted = to_split.explode("_date_range")
        # Each slice ends at the end of its calendar year at the latest.
        splitted["_date_range_end"] = splitted["_date_range"].apply(
            lambda x: pd.to_datetime(f"{x.year}-12-31")
        )
        # Clamp every slice to the original support period.
        splitted["date_start"] = splitted[["_date_start", "_date_range"]].max(
            axis=1
        )
        splitted["date_end"] = splitted[["_date_end", "_date_range_end"]].min(
            axis=1
        )
        # Group back all Annual invoices
        annuals_clean = pd.concat([splitted, defaults])

    annuals_clean["amount_EUR"] = annuals_clean["Annual amount (EUR)"]
    annuals_clean["amount_USD"] = annuals_clean["Annual amount (USD)"]
    annuals_clean["amount_GBP"] = annuals_clean["Annual amount (GBP)"]

    return annuals_clean[final_cols].copy()


def handle_upfront_transferts(transferts: pd.DataFrame):
    """
    Merge the "upfront" transferts into one transfert per commitment.

    Rows are selected on their stripped, lower-cased "Invoice preference" and
    grouped by entity, supporter type, agent, commitment period and annual
    amounts. A group is consistent when both its row count and its rounded
    period length in years (minimum 1) equal the commitment period.

    - Consistent groups become one row spanning the earliest start to the
      latest end, with amount = annual amount * commitment period.
    - Rows of inconsistent groups are reported with a printed message and
      returned untouched, with their own dates and annual amounts.

    :param transferts:  Frame with the raw columns plus the parsed
                        `_date_start` and `_date_end` columns.
    :returns:           A frame restricted to `final_cols`, merged rows first.
    """
    # Final amount column -> raw annual amount column.
    amount_columns = {
        "amount_EUR": "Annual amount (EUR)",
        "amount_USD": "Annual amount (USD)",
        "amount_GBP": "Annual amount (GBP)",
    }

    is_upfront = (
        transferts["Invoice preference"].str.strip().str.lower() == "upfront"
    )
    source_rows = transferts[is_upfront].copy()

    group_by = [
        "Company",
        "Country",
        "Supporter type",
        "Agent",
        "Commitment period (years)",
        *amount_columns.values(),
    ]
    # dropna=False keeps the groups whose key contains missing values.
    per_group = source_rows.groupby(group_by, dropna=False)
    summary = per_group.agg(
        date_start=("_date_start", "min"),
        date_end=("_date_end", "max"),
        number=("_date_start", "count"),
    )
    # Remember which source rows make up each group.
    summary["original_ind"] = per_group.apply(
        lambda rows: list(rows.index), include_groups=False
    )
    summary = summary.reset_index()

    elapsed = summary["date_end"].dt.date - summary["date_start"].dt.date
    summary["_spanned_years"] = elapsed.apply(
        lambda delta: max(round(delta.days / 365), 1)
    )

    # ERRORS - groups that do not follow the expected model
    commitment = summary["Commitment period (years)"]
    inconsistent = (summary["number"] != commitment) | (
        summary["_spanned_years"] != commitment
    )
    bad_groups = summary[inconsistent]
    if not bad_groups.empty:
        companies = bad_groups["Company"].drop_duplicates().to_list()
        print(
            "ERROR - There are some inconsistent data within the Upfront transferts\n"
            f"Check the following entities:\n{companies}\n"
        )

    # Source rows of the inconsistent groups are passed through unchanged.
    bad_row_labels = [
        label
        for labels in bad_groups["original_ind"].to_list()
        for label in labels
    ]
    untouched = source_rows[source_rows.index.isin(bad_row_labels)].copy()
    untouched["date_start"] = untouched["_date_start"]
    untouched["date_end"] = untouched["_date_end"]
    for target, source in amount_columns.items():
        untouched[target] = untouched[source]

    # Consistent groups: aggregated dates, amount paid upfront for the whole
    # commitment period.
    merged = summary[~inconsistent].copy()
    for target, source in amount_columns.items():
        merged[target] = merged[source] * merged["Commitment period (years)"]

    return pd.concat([merged[final_cols], untouched[final_cols]])


def organize_transferts(df: pd.DataFrame):
    """
    Parse the support dates and build the final transferts.

    Adds the `_date_start`, `_date_end` and `_spanned_time` columns to `df`
    in place, then concatenates the output of the one-year, annual and
    upfront handlers, in that order. Each handler only selects the rows of
    its own invoice preference.

    :param df:  Pre-processed frame. Dates are parsed with the "%d/%m/%Y"
                format and parsing errors are raised.
    :returns:   The concatenated frames returned by the three handlers.
    """
    parsed_columns = (
        ("_date_start", "Support start date"),
        ("_date_end", "Support end date"),
    )
    for target, source in parsed_columns:
        df[target] = pd.to_datetime(
            df[source], format="%d/%m/%Y", errors="raise"
        )
    df["_spanned_time"] = df["_date_end"] - df["_date_start"]

    # One handler per kind of Invoice preference
    handlers = (
        handle_one_year_transferts,
        handle_annual_transferts,
        handle_upfront_transferts,
    )
    return pd.concat([handler(df) for handler in handlers])


# Build the final transferts. This also adds the parsed `_date_start`,
# `_date_end` and `_spanned_time` columns to `df` itself.
res = organize_transferts(df)

In [None]:
# Row counts for a quick sanity check. In a notebook cell only the value of
# the last expression is displayed, so the first count is computed but not
# shown.
len(df[df["Invoice preference"] == "Upfront"])
len(res)

In [None]:
# Check for overlapping periods: expand every transfert into one row per
# covered day, then list the companies having the same day covered twice.
# `res` only carries the final column names ("date_start" / "date_end"), not
# the intermediate "_date_start" / "_date_end" ones.
columns = [
    "Company",
    "date_start",
    "date_end",
    "Country",
    "Agent",
]
d = res[columns].copy(deep=True)

d["_date_range"] = d.apply(
    lambda row: pd.date_range(row["date_start"], row["date_end"]),
    axis=1,
)
d = d.explode("_date_range")
# Positional mask: the exploded frame has repeated index labels.
overlapping = d[["Company", "_date_range"]].duplicated().to_numpy()
d.loc[overlapping, "Company"].unique()

In [None]:
#### ONE YEAR COMMITMENT transferts

In [None]:
#### UPFRONTS transferts

In [None]:
#### TODO: Decide if we want to do this
# For correct Annual invoice transferts,
# try to compute the support period start & end from the commitment years
# NOTE(review): `res` as returned by `organize_transferts` is restricted to
# `final_cols`, which has none of "Invoice preference", "_date_start",
# "_date_end" or the "Annual amount (...)" columns used below. This cell
# presumably predates that restriction or was meant to read `df` - confirm
# before relying on it.
annuals = res[res["Invoice preference"] == "Annual"].copy()
# Support period length in rounded years.
annuals["_spanned_years"] = (
    annuals["_date_end"].dt.date - annuals["_date_start"].dt.date
).apply(lambda x: round(x.days / 365))
# Keep the transferts spanning at most one year.
mask = annuals["_spanned_years"] <= 1
corrects = annuals[mask].copy()
grouping_keys = [
    "Company",
    "Country",
    "Supporter type",
    "Agent",
    "Commitment period (years)",
    "Annual amount (EUR)",
    "Annual amount (USD)",
    "Annual amount (GBP)",
]
# dropna=False keeps the groups whose key contains missing values.
grouped = corrects.groupby(grouping_keys, dropna=False)
corrects = grouped.agg(
    date_start=pd.NamedAgg(column="_date_start", aggfunc="min"),
    date_end=pd.NamedAgg(column="_date_end", aggfunc="max"),
    number=pd.NamedAgg(column="_date_start", aggfunc="count"),
)
# Index labels of the source rows making up each group.
corrects["original_ind"] = grouped.apply(
    lambda group: list(group.index), include_groups=False
)
corrects.reset_index(inplace=True)

# Groups whose number of rows differs from the commitment period.
errors = corrects["number"] != corrects["Commitment period (years)"]
corrects[errors]

In [None]:
# NOTE(review): "Invoice preference" is not part of `final_cols`, so this
# lookup fails on the frame returned by `organize_transferts` - confirm
# whether `df` was intended.
res["Invoice preference"]