diff --git a/backend/ml/inputs.py b/backend/ml/inputs.py
index 9f8272378f..52bfd6003b 100644
--- a/backend/ml/inputs.py
+++ b/backend/ml/inputs.py
@@ -14,6 +14,7 @@
     ContributorScaling,
     Entity,
 )
+from vouch.models import Voucher
 
 
 class MlInputFromDb(TournesolInput):
@@ -189,3 +190,14 @@ def get_individual_scores(
 
         dtf = pd.DataFrame(values)
         return dtf[["user_id", "entity", "criteria", "raw_score"]]
+
+    def get_vouches(self):
+        values = Voucher.objects.filter(
+            by__is_active=True,
+            to__is_active=True,
+        ).values(
+            voucher="by__id",
+            vouchee="to__id",
+            vouch="value",
+        )
+        return pd.DataFrame(values)
diff --git a/backend/tournesol/lib/public_dataset.py b/backend/tournesol/lib/public_dataset.py
index 8612436d52..58d8e3ef16 100644
--- a/backend/tournesol/lib/public_dataset.py
+++ b/backend/tournesol/lib/public_dataset.py
@@ -291,7 +291,7 @@ def write_comparisons_file(
         "criteria",
         "score",
         "score_max",
-        "week_date"
+        "week_date",
     ]
     writer = csv.DictWriter(write_target, fieldnames=fieldnames)
     writer.writeheader()
@@ -413,7 +413,9 @@ def write_vouchers_file(write_target):
             "to_username": voucher.to.username,
             "value": voucher.value,
         }
-        for voucher in Voucher.objects.filter(is_public=True)
-        .select_related("by", "to")
-        .order_by("by__username", "to__username")
+        for voucher in (
+            Voucher.objects.filter(is_public=True, by__is_active=True, to__is_active=True)
+            .select_related("by", "to")
+            .order_by("by__username", "to__username")
+        )
     )
diff --git a/solidago/experiments/tournesol.py b/solidago/experiments/tournesol.py
index 6d46f016bd..a25b403635 100644
--- a/solidago/experiments/tournesol.py
+++ b/solidago/experiments/tournesol.py
@@ -38,8 +38,6 @@
     for entity_id, video_id in enumerate(inputs.entity_id_to_video_id)
 }
 
-logger.info("Preprocessing data for the pipeline")
-users, vouches, all_entities, privacy = inputs.get_pipeline_objects()
 
 # criteria = set(inputs.comparisons["criteria"])
 criteria = { "largely_recommended" }
@@ -89,13 +87,18 @@
 
 user_outputs, entities, voting_rights, scaled_user_models = dict(), dict(), dict(), dict()
 
-users = pipeline.trust_propagation(users, vouches)
 
 for c in criteria:
     logger.info(f"Running the pipeline for criterion `{c}`")
-    
-    judgments = inputs.get_judgments(c)
-    
+
+    pipeline_objects = inputs.get_pipeline_kwargs(criterion=c)
+    users = pipeline_objects["users"]
+    vouches = pipeline_objects["vouches"]
+    all_entities = pipeline_objects["entities"]
+    privacy = pipeline_objects["privacy"]
+    judgments = pipeline_objects["judgments"]
+
+    users = pipeline.trust_propagation(users, vouches)
     voting_rights[c], entities[c] = pipeline.voting_rights(users, all_entities, vouches, privacy)
     user_models = pipeline.preference_learning(judgments, users, entities[c])
     scaled_user_models[c] = pipeline.scaling(user_models, users, entities[c], voting_rights[c], privacy)
diff --git a/solidago/src/solidago/pipeline/__init__.py b/solidago/src/solidago/pipeline/__init__.py
index 57bb379f53..fa97097024 100644
--- a/solidago/src/solidago/pipeline/__init__.py
+++ b/solidago/src/solidago/pipeline/__init__.py
@@ -1,3 +1,5 @@
 from .inputs import TournesolInput
 from .outputs import PipelineOutput
 from .pipeline import DefaultPipeline, Pipeline
+
+__all__ = ["TournesolInput", "DefaultPipeline", "Pipeline", "PipelineOutput"]
diff --git a/solidago/src/solidago/pipeline/inputs.py b/solidago/src/solidago/pipeline/inputs.py
index 9e880f5c2a..8109f8394c 100644
--- a/solidago/src/solidago/pipeline/inputs.py
+++ b/solidago/src/solidago/pipeline/inputs.py
@@ -57,6 +57,58 @@ def get_individual_scores(
     ) -> Optional[pd.DataFrame]:
         raise NotImplementedError
 
+    @abstractmethod
+    def get_vouches(self):
+        """Fetch data about vouches shared between users
+
+        Returns:
+          - DataFrame with columns
+            * `voucher`: int, user_id of the user who gives the vouch
+            * `vouchee`: int, user_id of the user who receives the vouch
+            * `vouch`: float, value of this vouch
+        """
+        raise NotImplementedError
+
+    def get_users(self):
+        users = self.ratings_properties.groupby("user_id").first()[["trust_score"]]
+        users["is_pretrusted"] = users["trust_score"] >= 0.8
+        return users
+
+    def get_pipeline_kwargs(self, criterion: str):
+        ratings_properties = self.ratings_properties
+        users = self.get_users()
+        vouches = self.get_vouches()
+        comparisons = self.get_comparisons(criteria=criterion)
+        entities_ids = set(comparisons["entity_a"].unique()) | set(
+            comparisons["entity_b"].unique()
+        )
+        entities = pd.DataFrame(index=list(entities_ids))
+
+        privacy = PrivacySettings()
+        user_entity_pairs = set(
+            comparisons[["user_id", "entity_a"]].itertuples(index=False, name=None)
+        ).union(comparisons[["user_id", "entity_b"]].itertuples(index=False, name=None))
+        for rating in ratings_properties.itertuples():
+            if (rating.user_id, rating.entity_id) in user_entity_pairs:
+                privacy[(rating.user_id, rating.entity_id)] = not rating.is_public
+
+        judgments = DataFrameJudgments(
+            comparisons=comparisons.rename(
+                columns={
+                    "score": "comparison",
+                    "score_max": "comparison_max",
+                }
+            )
+        )
+
+        return {
+            "users": users,
+            "vouches": vouches,
+            "entities": entities,
+            "privacy": privacy,
+            "judgments": judgments,
+        }
+
 
 class TournesolInputFromPublicDataset(TournesolInput):
     def __init__(self, dataset_zip: Union[str, BinaryIO]):
@@ -72,14 +124,18 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]):
             self.comparisons = pd.read_csv(comparison_file, keep_default_na=False)
             self.entity_id_to_video_id = pd.Series(
                 list(set(self.comparisons.video_a) | set(self.comparisons.video_b)),
-                name="video_id"
+                name="video_id",
             )
             video_id_to_entity_id = {
                 video_id: entity_id
                 for (entity_id, video_id) in self.entity_id_to_video_id.items()
             }
-            self.comparisons["entity_a"] = self.comparisons["video_a"].map(video_id_to_entity_id)
-            self.comparisons["entity_b"] = self.comparisons["video_b"].map(video_id_to_entity_id)
+            self.comparisons["entity_a"] = self.comparisons["video_a"].map(
+                video_id_to_entity_id
+            )
+            self.comparisons["entity_b"] = self.comparisons["video_b"].map(
+                video_id_to_entity_id
+            )
             self.comparisons.drop(columns=["video_a", "video_b"], inplace=True)
 
         with (zipfile.Path(zip_file) / "users.csv").open(mode="rb") as users_file:
@@ -90,26 +146,25 @@ def __init__(self, dataset_zip: Union[str, BinaryIO]):
             # Fill trust_score on newly created users for which it was not computed yet
             self.users.trust_score = pd.to_numeric(self.users.trust_score).fillna(0.0)
 
-        username_to_user_id = pd.Series(
+        self.username_to_user_id = pd.Series(
             data=self.users.index, index=self.users["public_username"]
         )
-        self.comparisons = self.comparisons.join(username_to_user_id, on="public_username")
-        
+        self.comparisons = self.comparisons.join(self.username_to_user_id, on="public_username")
+
         with (zipfile.Path(zip_file) / "vouchers.csv").open(mode="rb") as vouchers_file:
             # keep_default_na=False is required otherwise some public usernames
             # such as "NA" are converted to float NaN.
             self.vouchers = pd.read_csv(vouchers_file, keep_default_na=False)
-        
+
         with (zipfile.Path(zip_file) / "collective_criteria_scores.csv").open(mode="rb") as collective_scores_file:
             # keep_default_na=False is required otherwise some public usernames
             # such as "NA" are converted to float NaN.
             self.collective_scores = pd.read_csv(collective_scores_file, keep_default_na=False)
-        
+
         with (zipfile.Path(zip_file) / "individual_criteria_scores.csv").open(mode="rb") as individual_scores_file:
             # keep_default_na=False is required otherwise some public usernames
             # such as "NA" are converted to float NaN.
             self.individual_scores = pd.read_csv(individual_scores_file, keep_default_na=False)
-
 
     @classmethod
     def download(cls) -> "TournesolInputFromPublicDataset":
@@ -153,27 +208,16 @@ def get_individual_scores(
     ) -> Optional[pd.DataFrame]:
         # TODO: read contributor scores from individual_scores.csv
         return None
-
-    def get_pipeline_objects(self):
-        users = self.users
-        users = users.assign(is_pretrusted=(users["trust_score"] >= 0.8))
-        vouches = pd.DataFrame(columns=["voucher", "vouchee", "vouch"])
-        entities_indices = set(self.comparisons["entity_a"]) | set(self.comparisons["entity_b"])
-        entities = pd.DataFrame(index=list(entities_indices))
-        entities.index.name = "entity_id"
-        privacy = PrivacySettings()
-        for (user_id, entity_id) in set(
-            self.comparisons[["user_id", "entity_a"]].itertuples(index=False, name=None)
-        ).union(
-            self.comparisons[["user_id", "entity_b"]].itertuples(index=False, name=None)
-        ):
-            privacy[user_id, entity_id] = False
-        return users, vouches, entities, privacy
-
-    def get_judgments(self, criterion):
-        comparisons = self.comparisons
-        if criterion is not None:
-            comparisons = comparisons[comparisons["criteria"] == criterion]
-        comparisons = comparisons.rename(columns={"score": "comparison"})
-        comparisons = comparisons.assign(comparison_max=[10] * len(comparisons))
-        return DataFrameJudgments(comparisons=comparisons)
+
+    def get_vouches(self):
+        vouchers = self.vouchers[
+            self.vouchers.by_username.isin(self.username_to_user_id.index)
+            & self.vouchers.to_username.isin(self.username_to_user_id.index)
+        ]
+        return pd.DataFrame(
+            {
+                "voucher": vouchers.by_username.map(self.username_to_user_id),
+                "vouchee": vouchers.to_username.map(self.username_to_user_id),
+                "vouch": vouchers.value,
+            }
+        )
diff --git a/solidago/src/solidago/pipeline/pipeline.py b/solidago/src/solidago/pipeline/pipeline.py
index 4e99519c57..e837d8c776 100644
--- a/solidago/src/solidago/pipeline/pipeline.py
+++ b/solidago/src/solidago/pipeline/pipeline.py
@@ -16,6 +16,7 @@
 from solidago.aggregation import Aggregation, StandardizedQrMedian, StandardizedQrQuantile, Average, EntitywiseQrQuantile
 from solidago.post_process import PostProcess, Squash, NoPostProcess
 
+from solidago.pipeline.inputs import TournesolInput
 from solidago.pipeline.outputs import PipelineOutput
 
 logger = logging.getLogger(__name__)
@@ -82,7 +83,7 @@ def __init__(
         aggregation: Aggregation = DefaultPipeline.aggregation,
         post_process: PostProcess = DefaultPipeline.post_process,
     ):
-        """ Instantiates the pipeline components.
+        """Instantiates the pipeline components.
 
         Parameters
         ----------
@@ -118,7 +119,22 @@ def from_json(cls, json) -> "Pipeline":
             aggregation=aggregation_from_json(json["aggregation"]),
             post_process=post_process_from_json(json["post_process"]),
         )
-    
+
+    def run(
+        self,
+        input: TournesolInput,
+        criterion: str,
+        output: Optional[PipelineOutput] = None
+    ):
+        # TODO: criterion should be managed by TournesolInput
+
+        # TODO: read existing individual scores from input
+        # to pass `init_user_models`
+        return self(
+            **input.get_pipeline_kwargs(criterion),
+            output=output,
+        )
+
     def __call__(
         self,
         users: pd.DataFrame,
@@ -148,8 +164,6 @@ def __call__(
             judgments[user] must yield the judgment data provided by the user
         init_user_models: dict[int, ScoringModel]
             user_models[user] is the user's model
-        skip_set: set[int]
-            Steps that are skipped in the pipeline
 
         Returns
         -------
@@ -229,8 +243,8 @@ def to_json(self):
             post_process=self.post_process.to_json()
         )
 
+    @staticmethod
     def save_individual_scalings(
-        self,
         user_models: dict[int, ScaledScoringModel],
         output: PipelineOutput,
     ):
@@ -251,8 +265,8 @@ def save_individual_scalings(
         )
         output.save_individual_scalings(scalings_df)
 
+    @staticmethod
     def save_individual_scores(
-        self,
         user_scorings: dict[int, ScoringModel],
         raw_user_scorings: dict[int, ScoringModel],
         voting_rights: VotingRights,
diff --git a/solidago/src/solidago/preference_learning/__init__.py b/solidago/src/solidago/preference_learning/__init__.py
index da46034788..d1776ea8a7 100644
--- a/solidago/src/solidago/preference_learning/__init__.py
+++ b/solidago/src/solidago/preference_learning/__init__.py
@@ -1,4 +1,4 @@
-""" Step 3 of the pipeline.
+""" **Step 3 of the pipeline**
 
 Preference learning infers, for each user and based on their data,
 a model of the user's preferences.
@@ -13,3 +13,6 @@
     from .lbfgs_generalized_bradley_terry import LBFGSUniformGBT
 except RuntimeError:
     pass
+
+
+__all__ = ["PreferenceLearning", "UniformGBT", "LBFGSUniformGBT"]
diff --git a/solidago/src/solidago/preference_learning/base.py b/solidago/src/solidago/preference_learning/base.py
index fe998b5509..3547fe3135 100644
--- a/solidago/src/solidago/preference_learning/base.py
+++ b/solidago/src/solidago/preference_learning/base.py
@@ -26,7 +26,7 @@ def __call__(
 
         Parameters
         ----------
-        user_judgments: dict[str, pd.DataFrame]
+        judgments:
             May contain different forms of judgments,
             but most likely will contain "comparisons" and/or "assessments"
         entities: DataFrame with columns
@@ -35,8 +35,9 @@ def __call__(
         initialization: dict[int, ScoringModel] or ScoringModel or None
             Starting models, added to facilitate optimization
             It is not supposed to affect the output of the training
-        new_judgments: New judgments
-            This allows to prioritize coordinate descent, starting with newly evaluated entities
+        new_judgments:
+            New judgments
+            This allows prioritizing coordinate descent, starting with newly evaluated entities
 
         Returns
         -------
@@ -79,8 +80,9 @@ def user_learn(
         initialization: ScoringModel or None
             Starting model, added to facilitate optimization
             It is not supposed to affect the output of the training
-        new_judgments: New judgments
-            This allows to prioritize coordinate descent, starting with newly evaluated entities
+        new_judgments:
+            New judgments
+            This allows prioritizing coordinate descent, starting with newly evaluated entities
 
         Returns
         -------
diff --git a/solidago/src/solidago/preference_learning/generalized_bradley_terry.py b/solidago/src/solidago/preference_learning/generalized_bradley_terry.py
index 7af2b2a55b..f1a3afae0e 100644
--- a/solidago/src/solidago/preference_learning/generalized_bradley_terry.py
+++ b/solidago/src/solidago/preference_learning/generalized_bradley_terry.py
@@ -252,12 +252,8 @@ def __init__(
     ):
         """
-        Parameters
+        Parameters (TODO)
         ----------
-        initialization: dict[int, float]
-            previously computed entity scores
-        error: float
-            tolerated error
 
         """
         super().__init__(prior_std_dev, convergence_error)
         self.cumulant_generating_function_error = cumulant_generating_function_error
diff --git a/solidago/src/solidago/preference_learning/lbfgs_generalized_bradley_terry.py b/solidago/src/solidago/preference_learning/lbfgs_generalized_bradley_terry.py
index fa2518ce4c..4de2738d11 100644
--- a/solidago/src/solidago/preference_learning/lbfgs_generalized_bradley_terry.py
+++ b/solidago/src/solidago/preference_learning/lbfgs_generalized_bradley_terry.py
@@ -175,12 +175,8 @@
         max_iter: int = 100,
     ):
         """
-        Parameters
+        Parameters (TODO)
         ----------
-        initialization: dict[int, float]
-            previously computed entity scores
-        error: float
-            tolerated error
         """
         super().__init__(prior_std_dev, convergence_error, max_iter=max_iter)
         self.cumulant_generating_function_error = cumulant_generating_function_error
diff --git a/solidago/src/solidago/trust_propagation/__init__.py b/solidago/src/solidago/trust_propagation/__init__.py
index 3a0e3908d2..f7814548d2 100644
--- a/solidago/src/solidago/trust_propagation/__init__.py
+++ b/solidago/src/solidago/trust_propagation/__init__.py
@@ -1,4 +1,4 @@
-""" Step 1 of the pipeline.
+""" **Step 1 in the pipeline**
 
 Trust propagation is tasked to combine pretrusts and vouches
 to derive trust scores for the different users.
@@ -8,3 +8,5 @@
 from .no_trust_propagation import NoTrustPropagation
 from .lipschitrust import LipschiTrust
 from .trust_all import TrustAll
+
+__all__ = ["TrustPropagation", "NoTrustPropagation", "LipschiTrust", "TrustAll"]
diff --git a/solidago/src/solidago/trust_propagation/base.py b/solidago/src/solidago/trust_propagation/base.py
index 1510283dff..a9c40a94dc 100644
--- a/solidago/src/solidago/trust_propagation/base.py
+++ b/solidago/src/solidago/trust_propagation/base.py
@@ -3,6 +3,10 @@
 import pandas as pd
 
 class TrustPropagation(ABC):
+    """
+    Base class for Trust Propagation algorithms
+    """
+
     @abstractmethod
     def __call__(self,
         users: pd.DataFrame,
@@ -12,17 +16,24 @@ def __call__(self,
 
         Parameters
         ----------
-        users: DataFrame with columns
+        users: DataFrame
+            with columns
+
             * user_id (int, index)
             * is_pretrusted (bool)
-        vouches: DataFrame with columns
+
+        vouches: DataFrame
+            with columns
+
             * voucher (str)
             * vouchee (str)
             * vouch (float)
 
         Returns
         -------
-        users: DataFrame with columns
+        users: DataFrame
+            with columns
+
             * user_id (int, index)
             * is_pretrusted (bool)
             * trust_score (float)
diff --git a/solidago/src/solidago/trust_propagation/lipschitrust.py b/solidago/src/solidago/trust_propagation/lipschitrust.py
index 0373cd3927..92615cfacc 100644
--- a/solidago/src/solidago/trust_propagation/lipschitrust.py
+++ b/solidago/src/solidago/trust_propagation/lipschitrust.py
@@ -17,14 +17,18 @@ def __init__(self,
         error: float=1e-8
     ):
         """ A robustified variant of PageRank
-        Inputs:
-        - pretrust_value is the pretrust of a pretrusted user
-            (Trust^{pre}_{checkmark} in paper)
-        - decay is the decay of trusts in voucher's vouchees
-            (beta in paper)
-        - sink_vouch is the vouch to none, used to incentivize vouching
+
+        Parameters
+        ----------
+        pretrust_value:
+            the pretrust of a pretrusted user.
+            (`Trust^{pre}_{checkmark}` in paper)
+        decay:
+            the decay of trusts in voucher's vouchees.
+            (`beta` in paper)
+        sink_vouch: the vouch to none, used to incentivize vouching
             (V^{sink}_{checkmark} in paper)
-        - error > 0 is an upper bound on error (in L1 norm)
+        error: an upper bound (> 0) on the error (in L1 norm)
             (epsilon_{LipschiTrust} in paper)
         """
         assert pretrust_value >= 0 and pretrust_value <= 1
@@ -41,22 +45,6 @@ def __call__(self,
         users: pd.DataFrame,
         vouches: pd.DataFrame
     ) -> pd.DataFrame:
-        """
-        Inputs:
-        - users: DataFrame with columns
-            * user_id (int, index)
-            * is_pretrusted (bool)
-        - vouches: DataFrame with columns
-            * voucher (str)
-            * vouchee (str)
-            * vouch (float)
-        
-        Returns:
-        - users: DataFrame with columns
-            * user_id (int, index)
-            * is_pretrusted (bool)
-            * trust_score (float)
-        """
         if len(users) == 0:
             return users.assign(trust_score=[])
 
diff --git a/solidago/src/solidago/trust_propagation/no_trust_propagation.py b/solidago/src/solidago/trust_propagation/no_trust_propagation.py
index 43a11bf575..9c81ab7282 100644
--- a/solidago/src/solidago/trust_propagation/no_trust_propagation.py
+++ b/solidago/src/solidago/trust_propagation/no_trust_propagation.py
@@ -5,32 +5,19 @@
 class NoTrustPropagation(TrustPropagation):
     def __init__(self,
         pretrust_value: float=0.8,):
+        """
+        Parameters
+        ----------
+        pretrust_value:
+            trust score to assign to pretrusted users
+        """
         self.pretrust_value = pretrust_value
 
     def __call__(self,
         users: pd.DataFrame,
         vouches: pd.DataFrame
     ) -> pd.DataFrame:
-        """ Propagates trust through vouch network
-        
-        Parameters
-        ----------
-        users: DataFrame with columns
-            * user_id (int, index)
-            * is_pretrusted (bool)
-        vouches: DataFrame with columns
-            * voucher (str)
-            * vouchee (str)
-            * vouch (float)
-            
-        Returns
-        -------
-        users: DataFrame with columns
-            * user_id (int, index)
-            * is_pretrusted (bool)
-            * trust_score (float)
-        """
-        return users.assign(trust_score=users["is_pretrusted"] * pretrust_value)
+        return users.assign(trust_score=users["is_pretrusted"] * self.pretrust_value)
 
     def __str__(self):
         return f"{type(self).__name__}(pretrust_value={self.pretrust_value})"
diff --git a/solidago/src/solidago/trust_propagation/trust_all.py b/solidago/src/solidago/trust_propagation/trust_all.py
index 53f59e204b..79f3d2e37b 100644
--- a/solidago/src/solidago/trust_propagation/trust_all.py
+++ b/solidago/src/solidago/trust_propagation/trust_all.py
@@ -1,32 +1,10 @@
-""" TrustAll is a naive solution that assignes an equal amount of trust to all users
-"""
-
 from .base import TrustPropagation
 
 import pandas as pd
-import numpy as np
+
 
 class TrustAll(TrustPropagation):
-    def __call__(self,
-        users: pd.DataFrame,
-        vouches: pd.DataFrame
-    ) -> dict[str, float]:
-        """
-        Inputs:
-        - users: DataFrame with columns
-            * user_id (int, index)
-            * is_pretrusted (bool)
-        - vouches: DataFrame with columns
-            * voucher (str)
-            * vouchee (str)
-            * vouch (float)
-        
-        Returns:
-        - users: DataFrame with columns
-            * user_id (int, index)
-            * is_pretrusted (bool)
-            * trust_score (float)
-        """
-        return users.assign(trust_score=[1.0] * len(users))
-
-
+    """`TrustAll` is a naive solution that assigns an equal amount of trust to all users"""
+
+    def __call__(self, users: pd.DataFrame, vouches: pd.DataFrame):
+        return users.assign(trust_score=1.0)
diff --git a/solidago/src/solidago/voting_rights/__init__.py b/solidago/src/solidago/voting_rights/__init__.py
index 5587816df3..2504129452 100644
--- a/solidago/src/solidago/voting_rights/__init__.py
+++ b/solidago/src/solidago/voting_rights/__init__.py
@@ -1,4 +1,4 @@
-""" Step 2 of the pipeline.
+""" **Step 2 in the pipeline**
 
 Voting rights are assigned per user and per entity,
 based on users' trust scores and privacy settings.
@@ -11,3 +11,6 @@
 
 from .affine_overtrust import AffineOvertrust
 from .compute_voting_rights import compute_voting_rights
+
+
+__all__ = ["VotingRightsAssignment", "IsTrust", "AffineOvertrust"]
diff --git a/solidago/src/solidago/voting_rights/affine_overtrust.py b/solidago/src/solidago/voting_rights/affine_overtrust.py
index 8732c4df9a..1096f3fff8 100644
--- a/solidago/src/solidago/voting_rights/affine_overtrust.py
+++ b/solidago/src/solidago/voting_rights/affine_overtrust.py
@@ -178,11 +178,10 @@ def min_voting_right(
         ---------
         max_overtrust: float
             Maximal overtrust allowed for entity_id
-        users: DataFrame with columns
-            * user_id (int, index)
-            * trust_score (float)
+        trust_scores:
+            trust score values per user
         privacy_weights: dict[int, float]
-            privacy_weights[u] is the privacy weight of user u
+            privacy weight per user
 
         Returns
         -------
diff --git a/solidago/tests/test_judgments.py b/solidago/tests/test_judgments.py
deleted file mode 100644
index ffa05c9403..0000000000
--- a/solidago/tests/test_judgments.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from solidago.pipeline.inputs import TournesolInputFromPublicDataset
-from solidago.judgments import Judgments, DataFrameJudgments
-
-def test_tournesol_import():
-    inputs = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")
-    judgments = inputs.get_judgments("largely_recommended")
-    assert "aidjango" in set(judgments.comparisons["public_username"])
diff --git a/solidago/tests/test_privacy_settings.py b/solidago/tests/test_privacy_settings.py
index 28d828388e..b92cffb441 100644
--- a/solidago/tests/test_privacy_settings.py
+++ b/solidago/tests/test_privacy_settings.py
@@ -11,10 +11,10 @@ def test_privacy_io():
 
 def test_tournesol_import():
     inputs = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")
-    privacy = inputs.get_pipeline_objects()[3]
+    privacy = inputs.get_pipeline_kwargs(criterion="largely_recommended")["privacy"]
     aidjango_id = inputs.users[inputs.users["public_username"] == "aidjango"].index[0]
     video_id_to_entity_id = {
        video_id: entity_id
        for (entity_id, video_id) in inputs.entity_id_to_video_id.items()
    }
-    assert not privacy[aidjango_id, video_id_to_entity_id['dBap_Lp-0oc']]
+    assert privacy[aidjango_id, video_id_to_entity_id['dBap_Lp-0oc']] == False
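
Note for reviewers: the sketch below is not part of the patch. It is a minimal illustration, under stated assumptions, of how the new `TournesolInput.get_pipeline_kwargs` and `Pipeline.run` entry points introduced above fit together. The dataset path and criterion are borrowed from the existing tests; using a bare `Pipeline()` (the `DefaultPipeline` components) and the printed column names are assumptions based on the code added in this diff, not a prescribed workflow.

# Hypothetical usage sketch (not part of the diff): wiring a public-dataset
# input into the refactored pipeline entry points.
from solidago.pipeline import Pipeline
from solidago.pipeline.inputs import TournesolInputFromPublicDataset

inputs = TournesolInputFromPublicDataset("tests/data/tiny_tournesol.zip")

# Inspect the per-criterion objects the pipeline consumes; vouches should
# expose the `voucher`, `vouchee` and `vouch` columns documented in get_vouches().
kwargs = inputs.get_pipeline_kwargs(criterion="largely_recommended")
print(kwargs["vouches"].columns.tolist())

# Or let the new Pipeline.run helper fetch everything from the input itself;
# `results` is whatever Pipeline.__call__ returns for this criterion.
pipeline = Pipeline()  # assumes the DefaultPipeline components are acceptable
results = pipeline.run(input=inputs, criterion="largely_recommended")

This mirrors what solidago/experiments/tournesol.py now does per criterion, with trust propagation re-run inside the loop on the users returned by get_pipeline_kwargs.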