From dde4c9feebcc5d1031727a1423b8b9899dba2253 Mon Sep 17 00:00:00 2001 From: Adrien Matissart Date: Thu, 16 May 2024 17:24:18 +0200 Subject: [PATCH] read vouches in TournesolInput --- backend/ml/inputs.py | 12 +++++++ backend/tournesol/lib/public_dataset.py | 10 +++--- solidago/src/solidago/pipeline/inputs.py | 44 ++++++++++++++++++------ 3 files changed, 52 insertions(+), 14 deletions(-) diff --git a/backend/ml/inputs.py b/backend/ml/inputs.py index 9f8272378f..52bfd6003b 100644 --- a/backend/ml/inputs.py +++ b/backend/ml/inputs.py @@ -14,6 +14,7 @@ ContributorScaling, Entity, ) +from vouch.models import Voucher class MlInputFromDb(TournesolInput): @@ -189,3 +190,14 @@ def get_individual_scores( dtf = pd.DataFrame(values) return dtf[["user_id", "entity", "criteria", "raw_score"]] + + def get_vouches(self): + values = Voucher.objects.filter( + by__is_active=True, + to__is_active=True, + ).values( + voucher="by__id", + vouchee="to__id", + vouch="value", + ) + return pd.DataFrame(values) diff --git a/backend/tournesol/lib/public_dataset.py b/backend/tournesol/lib/public_dataset.py index 8612436d52..58d8e3ef16 100644 --- a/backend/tournesol/lib/public_dataset.py +++ b/backend/tournesol/lib/public_dataset.py @@ -291,7 +291,7 @@ def write_comparisons_file( "criteria", "score", "score_max", - "week_date" + "week_date", ] writer = csv.DictWriter(write_target, fieldnames=fieldnames) writer.writeheader() @@ -413,7 +413,9 @@ def write_vouchers_file(write_target): "to_username": voucher.to.username, "value": voucher.value, } - for voucher in Voucher.objects.filter(is_public=True) - .select_related("by", "to") - .order_by("by__username", "to__username") + for voucher in ( + Voucher.objects.filter(is_public=True, by__is_active=True, to__is_active=True) + .select_related("by", "to") + .order_by("by__username", "to__username") + ) ) diff --git a/solidago/src/solidago/pipeline/inputs.py b/solidago/src/solidago/pipeline/inputs.py index fdfcd0703b..8109f8394c 100644 --- a/solidago/src/solidago/pipeline/inputs.py +++ b/solidago/src/solidago/pipeline/inputs.py @@ -57,14 +57,26 @@ def get_individual_scores( ) -> Optional[pd.DataFrame]: raise NotImplementedError + @abstractmethod def get_vouches(self): - # TODO: make abstract and implement in subclasses - return pd.DataFrame(columns=["voucher", "vouchee", "vouch"]) + """Fetch data about vouches shared between users + + Returns: + - DataFrame with columns + * `voucher`: int, user_id of the user who gives the vouch + * `vouchee`: int, user_id of the user who receives the vouch + * `vouch`: float, value of this vouch + """ + raise NotImplementedError + + def get_users(self): + users = self.ratings_properties.groupby("user_id").first()[["trust_score"]] + users["is_pretrusted"] = users["trust_score"] >= 0.8 + return users def get_pipeline_kwargs(self, criterion: str): ratings_properties = self.ratings_properties - users = ratings_properties.groupby("user_id").first()[["trust_score"]] - users["is_pretrusted"] = users["trust_score"] >= 0.8 + users = self.get_users() vouches = self.get_vouches() comparisons = self.get_comparisons(criteria=criterion) entities_ids = set(comparisons["entity_a"].unique()) | set( @@ -134,26 +146,25 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]): # Fill trust_score on newly created users for which it was not computed yet self.users.trust_score = pd.to_numeric(self.users.trust_score).fillna(0.0) - username_to_user_id = pd.Series( + self.username_to_user_id = pd.Series( data=self.users.index, index=self.users["public_username"] ) - self.comparisons = self.comparisons.join(username_to_user_id, on="public_username") - + self.comparisons = self.comparisons.join(self.username_to_user_id, on="public_username") + with (zipfile.Path(zip_file) / "vouchers.csv").open(mode="rb") as vouchers_file: # keep_default_na=False is required otherwise some public usernames # such as "NA" are converted to float NaN. self.vouchers = pd.read_csv(vouchers_file, keep_default_na=False) - + with (zipfile.Path(zip_file) / "collective_criteria_scores.csv").open(mode="rb") as collective_scores_file: # keep_default_na=False is required otherwise some public usernames # such as "NA" are converted to float NaN. self.collective_scores = pd.read_csv(collective_scores_file, keep_default_na=False) - + with (zipfile.Path(zip_file) / "individual_criteria_scores.csv").open(mode="rb") as individual_scores_file: # keep_default_na=False is required otherwise some public usernames # such as "NA" are converted to float NaN. self.individual_scores = pd.read_csv(individual_scores_file, keep_default_na=False) - @classmethod def download(cls) -> "TournesolInputFromPublicDataset": @@ -197,3 +208,16 @@ def get_individual_scores( ) -> Optional[pd.DataFrame]: # TODO: read contributor scores from individual_scores.csv return None + + def get_vouches(self): + vouchers = self.vouchers[ + self.vouchers.by_username.isin(self.username_to_user_id.index) + & self.vouchers.to_username.isin(self.username_to_user_id.index) + ] + return pd.DataFrame( + { + "voucher": vouchers.by_username.map(self.username_to_user_id), + "vouchee": vouchers.to_username.map(self.username_to_user_id), + "vouch": vouchers.value, + } + )