
Commit

Merge pull request #158 from twitter/bradm/incorrect_filter
Augment incorrect filtering to improve correctness
bradmiller committed Oct 23, 2023
2 parents 0c32e5c + 696215a commit 4f64737
Showing 5 changed files with 101 additions and 29 deletions.
12 changes: 9 additions & 3 deletions sourcecode/scoring/constants.py
@@ -200,19 +200,21 @@ def rater_factor_key(i):
notHelpfulArgumentativeOrBiasedTagKey = "notHelpfulArgumentativeOrBiased"
notHelpfulHardToUnderstandKey = "notHelpfulHardToUnderstand"
notHelpfulNoteNotNeededKey = "notHelpfulNoteNotNeeded"
notHelpfulSourcesMissingOrUnreliableTagKey = "notHelpfulSourcesMissingOrUnreliable"
notHelpfulIrrelevantSourcesTagKey = "notHelpfulIrrelevantSources"

notHelpfulTagsAndTieBreakOrder = [
(0, notHelpfulOtherTagKey),
(8, notHelpfulIncorrectTagKey),
(2, "notHelpfulSourcesMissingOrUnreliable"),
(2, notHelpfulSourcesMissingOrUnreliableTagKey),
(4, "notHelpfulOpinionSpeculationOrBias"),
(5, "notHelpfulMissingKeyPoints"),
(12, "notHelpfulOutdated"),
(10, notHelpfulHardToUnderstandKey),
(7, notHelpfulArgumentativeOrBiasedTagKey),
(9, "notHelpfulOffTopic"),
(11, notHelpfulSpamHarassmentOrAbuseTagKey),
(1, "notHelpfulIrrelevantSources"),
(1, notHelpfulIrrelevantSourcesTagKey),
(3, "notHelpfulOpinionSpeculation"),
(6, notHelpfulNoteNotNeededKey),
]
@@ -235,12 +237,16 @@ def rater_factor_key(i):
]
ratingWeightKey = "ratingWeight"

incorrectFilterColumns = [
wideIncorrectFilterSuffix = "_wide"
_incorrectFilterColumns = [
"notHelpfulIncorrect_interval",
"p_incorrect_user_interval",
"num_voters_interval",
"tf_idf_incorrect_interval",
]
incorrectFilterColumns = _incorrectFilterColumns + [
f"{col}{wideIncorrectFilterSuffix}" for col in _incorrectFilterColumns
]

misleadingTags = [
"misleadingOther",
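The constants change above splits the filter columns into a base list plus derived "_wide" variants. As a quick illustration (not part of the commit; `base` is just a local name standing in for `_incorrectFilterColumns`), the comprehension yields eight column names:

# Illustration only: the combined column list produced by the comprehension above.
base = [
  "notHelpfulIncorrect_interval",
  "p_incorrect_user_interval",
  "num_voters_interval",
  "tf_idf_incorrect_interval",
]
print(base + [f"{col}_wide" for col in base])
# ['notHelpfulIncorrect_interval', 'p_incorrect_user_interval', 'num_voters_interval',
#  'tf_idf_incorrect_interval', 'notHelpfulIncorrect_interval_wide',
#  'p_incorrect_user_interval_wide', 'num_voters_interval_wide',
#  'tf_idf_incorrect_interval_wide']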
55 changes: 42 additions & 13 deletions sourcecode/scoring/incorrect_filter.py
@@ -1,5 +1,7 @@
"""Utilites for tag based scoring logic."""

from typing import List, Optional

from . import constants as c

import numpy as np
@@ -18,7 +20,7 @@ def _get_user_incorrect_ratio(nhTagRatings: pd.DataFrame) -> pd.DataFrame:
"""

user_incorrect = (
nhTagRatings[[c.raterParticipantIdKey, "notHelpfulIncorrect"]]
nhTagRatings[[c.raterParticipantIdKey, c.notHelpfulIncorrectTagKey]]
.groupby(c.raterParticipantIdKey)
.agg("sum")
)
@@ -34,7 +36,7 @@ def _get_user_incorrect_ratio(nhTagRatings: pd.DataFrame) -> pd.DataFrame:


def _get_incorrect_tfidf_ratio(
augmented_ratings: pd.DataFrame, user_filter: bool, suffix: str
augmented_ratings: pd.DataFrame, user_filter: Optional[bool], suffix: str
) -> pd.DataFrame:
"""Computes empirical p(incorrect | note) / p(incorrect | raters over all notes) subject to rater-note inclusion function.
@@ -47,8 +49,10 @@ def _get_incorrect_tfidf_ratio(
pd.DataFrame with one row for each note, with computed sum(tf_idf_incorrect) score for raters
included in filter
"""

ratings_w_user_totals = augmented_ratings[user_filter]
if user_filter is not None:
ratings_w_user_totals = augmented_ratings[user_filter]
else:
ratings_w_user_totals = augmented_ratings

note_nh_count = (
ratings_w_user_totals[[c.raterParticipantIdKey, c.noteIdKey]]
@@ -73,7 +77,7 @@ def _get_incorrect_tfidf_ratio(
rating_aggs = ratings_w_user_totals.groupby(c.noteIdKey).agg("sum").reset_index()
rating_aggs_w_cnt = rating_aggs.merge(note_nh_count, on=c.noteIdKey)

rating_aggs_w_cnt["tf_idf_incorrect"] = (rating_aggs_w_cnt["notHelpfulIncorrect"]) / np.log(
rating_aggs_w_cnt["tf_idf_incorrect"] = (rating_aggs_w_cnt[c.notHelpfulIncorrectTagKey]) / np.log(
1 + (rating_aggs_w_cnt["p_incorrect_user"])
) # p(incorrect over all rater ratings)
rating_aggs_w_cnt.drop(["notHelpfulIncorrect_total", "cnt"], inplace=True, axis=1)
@@ -84,45 +88,70 @@ def _get_incorrect_tfidf_ratio(


def get_incorrect_aggregates(
ratings: pd.DataFrame, noteParams: pd.DataFrame, raterParams: pd.DataFrame
ratingsOrig: pd.DataFrame,
noteParams: pd.DataFrame,
raterParams: pd.DataFrame,
applyFilter: bool = True,
extraCols: List[str] = [],
colSuffix: str = "",
) -> pd.DataFrame:
"""Computes non-helpful tag aggregates for each note.
Args:
ratings: initial input ratings DF containing all ratings
noteParams: MF results for notes
raterParams: MF results for raters
applyFilter: bool indicating whether to filter included ratings based on rater and note factor proximity
extraCols: list of tags to include along with notHelpfulIncorrect
colSuffix: str which will be added to the end of each column other than noteId
Returns:
pd.DataFrame containing one row per note that was scored during MF. Columns correspond to
aggregates for the Not-Helpful tags, including raw totals, totals adjusted based on the
distance between the rater and the note, and ratios based on the adjusted weight totals.
"""
# augment notHelpfulIncorrect with any additional columns
ratings = ratingsOrig.copy()
if extraCols:
for column in extraCols:
assert column is not c.notHelpfulIncorrectTagKey
ratings[c.notHelpfulIncorrectTagKey] += ratings[column]
ratings[c.notHelpfulIncorrectTagKey] = (
ratings[c.notHelpfulIncorrectTagKey].clip(0, 1).astype(np.int64)
)

# consider only ratings with some NH tag
notHelpfulTaggedRatings = ratings.loc[ratings[c.notHelpfulTagsTSVOrder].sum(axis=1) > 0]

# get per user incorrect term frequency
user_totals = _get_user_incorrect_ratio(notHelpfulTaggedRatings)
# add user and note factors
ratings_w_user_totals = (
notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey, "notHelpfulIncorrect"]]
notHelpfulTaggedRatings[[c.raterParticipantIdKey, c.noteIdKey, c.notHelpfulIncorrectTagKey]]
.merge(user_totals, on=c.raterParticipantIdKey, suffixes=(None, "_total"))
.merge(noteParams[[c.noteIdKey, c.internalNoteFactor1Key]], on=c.noteIdKey)
.merge(
raterParams[[c.raterParticipantIdKey, c.internalRaterFactor1Key]], on=c.raterParticipantIdKey
)
)

interval_filter = (
np.abs(
ratings_w_user_totals[c.internalRaterFactor1Key].clip(-0.4, 0.4)
- ratings_w_user_totals[c.internalNoteFactor1Key].clip(-0.4, 0.4)
interval_filter = None
if applyFilter:
interval_filter = (
np.abs(
ratings_w_user_totals[c.internalRaterFactor1Key].clip(-0.4, 0.4)
- ratings_w_user_totals[c.internalNoteFactor1Key].clip(-0.4, 0.4)
)
< c.intervalHalfWidth
)
< c.intervalHalfWidth
)

incorrectAggregates = _get_incorrect_tfidf_ratio(
ratings_w_user_totals, interval_filter, "_interval"
)

# apply column suffix
columns = incorrectAggregates.columns
cols = [f"{col}{colSuffix}" if col is not c.noteIdKey else col for col in columns]
incorrectAggregates.columns = cols

return incorrectAggregates
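The refactor above turns the incorrect filter into a reusable aggregation: the input ratings are copied, any extraCols tags are folded into notHelpfulIncorrect (clipped to 0/1 per rating), the interval filter becomes optional, and a suffix is appended to the output columns. Here is a minimal toy sketch of that arithmetic with made-up numbers, assuming p_incorrect_user is the per-note sum of each rater's overall incorrect ratio (the quantity the groupby-sum accumulates); it is an illustration, not repo code:

# Toy sketch (made-up data) of the extraCols augmentation and the tf_idf-style ratio.
import numpy as np
import pandas as pd

ratings = pd.DataFrame({
  "noteId": [7, 7, 7],
  "notHelpfulIncorrect": [1, 0, 0],
  "notHelpfulIrrelevantSources": [0, 1, 0],  # an extraCols tag folded into Incorrect
})
# Augmentation step: add the extra tag to notHelpfulIncorrect, clipped to 0/1 per rating.
ratings["notHelpfulIncorrect"] = (
  ratings["notHelpfulIncorrect"] + ratings["notHelpfulIrrelevantSources"]
).clip(0, 1)

notHelpfulIncorrect = ratings["notHelpfulIncorrect"].sum()  # 2 tagged ratings on this note
p_incorrect_user = 0.2 + 0.5 + 0.1  # assumed per-rater incorrect ratios, summed over the note's raters
tf_idf_incorrect = notHelpfulIncorrect / np.log(1 + p_incorrect_user)
print(round(tf_idf_incorrect, 2))  # ~3.4; raters who tag Incorrect broadly inflate the
                                   # denominator, down-weighting their votes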
5 changes: 0 additions & 5 deletions sourcecode/scoring/mf_base_scorer.py
@@ -35,7 +35,6 @@ def __init__(
crhThresholdLCBIntercept: float = 0.35,
crhSuperThreshold: float = 0.5,
inertiaDelta: float = 0.01,
weightedTotalVotes: float = 2.5,
useStableInitialization: bool = True,
):
"""Configure MatrixFactorizationScorer object.
@@ -70,7 +69,6 @@ def __init__(
repeated reason tags in not-helpful ratings to achieve CRH status.
inertiaDelta: Minimum amount which a note that has achieved CRH status must drop below the
applicable threshold to lose CRH status.
weightedTotalVotes: Minimum number of weighted incorrect votes required to lose CRH status.
useStableInitialization: whether to use a specific modeling group of users to stably initialize
"""
super().__init__(seed)
Expand All @@ -89,7 +87,6 @@ def __init__(
self._crhThresholdLCBIntercept = crhThresholdLCBIntercept
self._crhSuperThreshold = crhSuperThreshold
self._inertiaDelta = inertiaDelta
self._weightedTotalVotes = weightedTotalVotes
self._modelingGroupToInitializeForStability = 13 if useStableInitialization else None
self._mfRanker = MatrixFactorization()

@@ -305,7 +302,6 @@ def _score_notes_and_users(
crhThresholdLCBIntercept=self._crhThresholdLCBIntercept,
crhSuperThreshold=self._crhSuperThreshold,
inertiaDelta=self._inertiaDelta,
weightedTotalVotes=self._weightedTotalVotes,
)

# Determine "valid" ratings
@@ -378,7 +374,6 @@ def _score_notes_and_users(
crhThresholdLCBIntercept=self._crhThresholdLCBIntercept,
crhSuperThreshold=self._crhSuperThreshold,
inertiaDelta=self._inertiaDelta,
weightedTotalVotes=self._weightedTotalVotes,
finalRound=True,
)
# Takes raterParams from most recent MF run, but use the pre-computed
30 changes: 27 additions & 3 deletions sourcecode/scoring/note_ratings.py
@@ -379,7 +379,6 @@ def compute_scored_notes(
crhThresholdLCBIntercept: float,
crhSuperThreshold: float,
inertiaDelta: float,
weightedTotalVotes: float,
finalRound: bool = False,
# TODO: We might want to consider inputting only the series here, instead of the whole callable
is_crh_function: Callable[..., pd.Series] = is_crh,
@@ -412,7 +411,6 @@ def compute_scored_notes(
repeated reason tags in not-helpful ratings to achieve CRH status.
inertiaDelta: Minimum amount which a note that has achieved CRH status must drop below the
applicable threshold to lose CRH status.
weightedTotalVotes: Minimum number of weighted incorrect votes required to lose CRH status.
finalRound: If true, enable additional status assignment logic which is only applied when
determining final status. Given that these mechanisms add complexity we don't apply them
in earlier rounds.
@@ -485,6 +483,15 @@ def compute_scored_notes(
ratings, noteParams, raterParams
)
noteStats = noteStats.merge(incorrectAggregates, on=c.noteIdKey, how="outer")
incorrectAggregatesWide = incorrect_filter.get_incorrect_aggregates(
ratings,
noteParams,
raterParams,
applyFilter=False,
extraCols=[c.notHelpfulSourcesMissingOrUnreliableTagKey, c.notHelpfulIrrelevantSourcesTagKey],
colSuffix=c.wideIncorrectFilterSuffix,
)
noteStats = noteStats.merge(incorrectAggregatesWide, on=c.noteIdKey, how="outer")

# Add tag filtering and sticky scoring logic.
rules.extend(
@@ -528,7 +535,24 @@ def compute_scored_notes(
minRatingsNeeded,
),
scoring_rules.FilterIncorrect(
RuleID.INCORRECT_OUTLIER, {RuleID.TAG_OUTLIER}, c.needsMoreRatings, weightedTotalVotes
RuleID.INCORRECT_OUTLIER,
{RuleID.TAG_OUTLIER},
c.needsMoreRatings,
tagThreshold=2,
voteThreshold=3,
weightedTotalVotes=2.5,
superThreshold=None,
colSuffix="",
),
scoring_rules.FilterIncorrect(
RuleID.INCORRECT_OUTLIER_WIDE,
{RuleID.TAG_OUTLIER},
c.needsMoreRatings,
tagThreshold=4,
voteThreshold=5,
weightedTotalVotes=4.0,
superThreshold=0.5,
colSuffix=c.wideIncorrectFilterSuffix,
),
]
)
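Net effect in compute_scored_notes: the existing interval-filtered aggregates are joined by an unfiltered "wide" set (with the sources-missing and irrelevant-sources tags folded in), and a second FilterIncorrect rule runs over those columns with higher thresholds plus an intercept escape hatch. A toy check of the two conditions with hypothetical per-note stats (illustration only; `note` and its `intercept` key are made up, the latter standing in for the column behind c.internalNoteInterceptKey):

# Hypothetical per-note stats, for illustration only.
note = {
  "notHelpfulIncorrect_interval": 2, "num_voters_interval": 3, "tf_idf_incorrect_interval": 2.7,
  "notHelpfulIncorrect_interval_wide": 3, "num_voters_interval_wide": 6,
  "tf_idf_incorrect_interval_wide": 3.1,
  "intercept": 0.42,
}
narrow_hit = (
  note["notHelpfulIncorrect_interval"] >= 2
  and note["num_voters_interval"] >= 3
  and note["tf_idf_incorrect_interval"] >= 2.5
)  # True: INCORRECT_OUTLIER holds this note at needsMoreRatings
wide_hit = (
  note["notHelpfulIncorrect_interval_wide"] >= 4
  and note["num_voters_interval_wide"] >= 5
  and note["tf_idf_incorrect_interval_wide"] >= 4.0
  and note["intercept"] < 0.5  # superThreshold: high-intercept notes bypass the wide rule
)  # False: the wide rule needs more tags, more voters, and a larger weighted total to fire
print(narrow_hit, wide_hit)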
28 changes: 23 additions & 5 deletions sourcecode/scoring/scoring_rules.py
@@ -30,6 +30,7 @@ class RuleID(Enum):
ELEVATED_CRH_INERTIA = RuleAndVersion("ElevatedCRHInertia", "1.0", False)
LCB_INERTIA = RuleAndVersion("LcbCRHInertia", "1.0", False)
INCORRECT_OUTLIER = RuleAndVersion("FilterIncorrect", "1.0", False)
INCORRECT_OUTLIER_WIDE = RuleAndVersion("FilterIncorrectWide", "1.0", False)

# Rules used in _meta_score.
META_INITIAL_NMR = RuleAndVersion("MetaInitialNMR", "1.0", False)
@@ -280,20 +281,32 @@ def __init__(
ruleID: RuleID,
dependencies: Set[RuleID],
status: str,
weightedTotalVotes: float = 2.5,
tagThreshold: int,
voteThreshold: int,
weightedTotalVotes: float,
superThreshold: Optional[float],
colSuffix: str,
):
"""Filter CRH notes for outliers with high levels of incorrect tag from similar factor raters.
Args:
rule: enum corresponding to a namedtuple defining a rule name and version string for the ScoringRule.
dependencies: Rules which must run before this rule can run.
status: the status which each note should be set to (e.g. CRH, CRNH, NMR)
tagThreshold: threshold for the number of included raters who issued the tag
voteThreshold: threshold for the number of included raters (raters must have issued an NH tag to be included)
weightedTotalVotes: For the filter to trigger, the sum of weighted incorrect votes must
exceed this value.
superThreshold: if set, allow notes with an intercept above threshold to bypass the filter.
colSuffix: string suffix to apply to lookup columns
"""
super().__init__(ruleID, dependencies)
self._status = status
self.weightedTotalVotes = weightedTotalVotes
self._tagThreshold = tagThreshold
self._voteThreshold = voteThreshold
self._weightedTotalVotes = weightedTotalVotes
self._superThreshold = superThreshold
self._colSuffix = colSuffix

def score_notes(
self, noteStats: pd.DataFrame, currentLabels: pd.DataFrame, statusColumn: str
Expand All @@ -305,9 +318,14 @@ def score_notes(

# Identify impacted notes.
noteStatusUpdates = crhStats.loc[
(crhStats["notHelpfulIncorrect_interval"] >= 2)
& (crhStats["num_voters_interval"] >= 3)
& (crhStats["tf_idf_incorrect_interval"] >= self.weightedTotalVotes)
(crhStats[f"notHelpfulIncorrect_interval{self._colSuffix}"] >= self._tagThreshold)
& (crhStats[f"num_voters_interval{self._colSuffix}"] >= self._voteThreshold)
& (crhStats[f"tf_idf_incorrect_interval{self._colSuffix}"] >= self._weightedTotalVotes)
& (
True
if self._superThreshold is None
else crhStats[c.internalNoteInterceptKey] < self._superThreshold
)
][[c.noteIdKey]]

pd.testing.assert_frame_equal(noteStatusUpdates, noteStatusUpdates.drop_duplicates())
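One small detail in score_notes above: when _superThreshold is None the last conjunct is the Python literal True, which pandas broadcasts across the boolean mask, so the original rule behaves exactly as before while the wide rule adds the intercept check. A minimal sketch of that behavior with toy Series (illustrative names, not repo data):

# Toy demonstration of the mask construction, assuming plain boolean Series.
import pandas as pd

tag_mask = pd.Series([True, True, False])      # tag/vote/weight conditions already met?
assert ((tag_mask & True) == tag_mask).all()   # superThreshold is None: the extra term is a no-op
intercepts = pd.Series([0.30, 0.55, 0.20])
wide_mask = tag_mask & (intercepts < 0.5)      # superThreshold=0.5 for the wide rule
assert list(wide_mask) == [True, False, False] # the 0.55-intercept note escapes the filter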
