In [None]:
import numpy as np
import pandas as pd
from copy import deepcopy
from tqdm import tqdm
import ast
import re

from datasets import load_dataset, load_metric, Dataset, ClassLabel, set_caching_enabled
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Read the 15 content batch files and stack them into a single frame.
dfs = []
for batch_no in tqdm(range(1, 16)):
    dfs.append(pd.read_csv(f"../data/2024-04_content_batch_{batch_no}.csv"))
df = pd.concat(dfs).reset_index(drop=True)
print([len(batch) for batch in dfs])
print(len(df))

# Reverted revisions that also have `self_revert` or
# `reverting_revision_is_reverted_revision` set; collected here so that they
# can be excluded from the training data later.
reverted_mask = df.revision_is_identity_reverted == True  # noqa: E712
redundant_mask = (df.self_revert) | (df.reverting_revision_is_reverted_revision)
ids_to_drop = df[reverted_mask & redundant_mask]["revision_id"].values
print(len(ids_to_drop))

In [4]:
# Sanity check: the largest count should be 1 if revision ids are unique.
df["revision_id"].value_counts().head(2)

revision_id
1482024812    1
1665607121    1
Name: count, dtype: int64

In [4]:
# Flag which serialized diff columns are non-empty ("{}" is the empty-dict string).
for flag_name, diff_column in (
    ("is_add", "added"),
    ("is_remove", "removed"),
    ("is_change", "changed"),
    ("is_labels", "labels"),
    ("is_descriptions", "descriptions"),
):
    df[flag_name] = df[diff_column].apply(lambda value: value != "{}")

# Revert rate and row count for every add/remove/change combination.
df.groupby(["is_add", "is_remove", "is_change"])["revision_is_identity_reverted"].agg(["mean", "count"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean,count
is_add,is_remove,is_change,Unnamed: 3_level_1,Unnamed: 4_level_1
False,False,False,0.76436,61297
False,False,True,0.292103,1093665
False,True,False,0.350654,530317
False,True,True,0.205924,33692
True,False,False,0.10558,4603084
True,False,True,0.134176,60540
True,True,False,0.170537,8784
True,True,True,0.178896,19257


In [6]:
# Revert rate split by whether the editing user is anonymous.
df.groupby("event_user_is_anonymous")["revision_is_identity_reverted"].mean()

event_user_is_anonymous
False    0.117665
True     0.558898
Name: revision_is_identity_reverted, dtype: float64

In [6]:
# Revert rate and row count by whether labels and/or descriptions were touched.
df.groupby(["is_labels", "is_descriptions"])["revision_is_identity_reverted"].agg(["mean", "count"])

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
is_labels,is_descriptions,Unnamed: 2_level_1,Unnamed: 3_level_1
False,False,0.183386,4806467
False,True,0.102222,713262
True,False,0.110009,869145
True,True,0.381629,21762


In [None]:
# Holdout revisions (tab-separated, with `rev` and `label` columns) and the
# revision metadata table.
holdout_test = pd.read_csv("../data/holdout/test_holdout.csv", sep="\t")
df_cat = pd.read_csv("../data/2024-04_metadata.csv")
# df_cat = df_cat[df_cat.revision_id.isin(set(holdout_test.rev))]

# Holdout revision ids (excluded from the training data later) and their labels.
revs_check = set(holdout_test.rev)
dict_real = dict(zip(holdout_test.rev, holdout_test.label))
# tmp = df[df.revision_id.isin(revs_check)][["revision_id", "revision_is_identity_reverted"]]
# tmp["real"] = tmp.revision_id.map(dict_real)

# NOTE: a stray, indented second `pd.read_csv("data/2024-04_metadata.csv")` was
# removed here. It raised an IndentationError and pointed at a path that is
# inconsistent with the `../data/` prefix used by every other cell.


# Building train dataset: 

In [None]:
# Map entity/property ids to their English label, skipping ids without one.
labels = pd.read_csv("../data/full_labels_2024-04_text_en.csv")
labels_dict = {}
for entity_id, label_en in zip(labels["id"], labels["label_en"]):
    if pd.isna(label_en):
        continue
    labels_dict[entity_id] = label_en

In [9]:
# Preview the first rows, including the is_* flag columns added above.
df.head()

Unnamed: 0,page_title,event_timestamp,event_user_is_anonymous,label_en,revision_id,revision_parent_id,revision_is_identity_reverted,self_revert,reverting_revision_is_reverted_revision,batch_id,added,removed,changed,descriptions,labels,is_add,is_remove,is_change,is_labels,is_descriptions
0,Q108120268,2021-08-16 19:52:57,False,"""Richard D. Walk""",1482024812,1482024690,False,,,1,"{""root['claims']['P140'][0]['references']"": None}",{},{},{},{},True,False,False,False,False
1,Q23730171,2021-08-17 15:02:39,False,"""Santa Lucía""",1482702064,1482702008,False,,,1,"{""root['descriptions']['ceb']"": {'language': '...",{},{},"{'sv': {'language': 'sv', 'value': 'ort i Colo...",{},True,False,False,False,True
2,Q33773,2021-08-17 18:26:32,False,"""Convention of Moss""",1482803316,1482802553,False,,,1,"{""root['claims']['P710'][1]"": {'mainsnak': {'s...",{},{},{},{},True,False,False,False,False
3,Q268797,2021-08-19 15:11:32,False,"""Cloz""",1483872645,1483872610,False,,,1,{},"{""root['claims']['P31'][1]['qualifiers']"": Non...",{},{},{},False,True,False,False,False
4,Q23657383,2021-08-20 09:28:57,False,"""Timothy P. Trella""",1484377279,1484377260,False,,,1,"{""root['claims']['P734']"": [{'mainsnak': {'sna...",{},{},{},{},True,False,False,False,False


In [10]:
tmp_df = df.copy()

# Reserve 80% of the pages for the language model; the rest is left for the classifier.
np.random.seed(42)
unique_ids_all = list(df.page_title.unique())
n_lm_pages = int(0.8 * len(unique_ids_all))
ids_train_lm = list(np.random.choice(unique_ids_all, size=n_lm_pages, replace=False))

# Revisions after 2023-06-01 or before 2021-09-01 are collected so that they
# can be dropped from the training data later (timestamps are parsed once).
event_ts = pd.to_datetime(tmp_df["event_timestamp"])
too_late = event_ts > pd.to_datetime("2023-06-01")
too_early = event_ts < pd.to_datetime("2021-09-01")
ids_to_drop_date = tmp_df[too_late | too_early].revision_id

# Keep only the revisions of pages assigned to the language-model split.
tmp_df["is_lm_train"] = df.page_title.isin(ids_train_lm)
tmp_df = tmp_df[tmp_df.is_lm_train]

### Build sentences for remove, add and change actions:

In [11]:
# Regexes for property ids ("P<digits>") and item ids ("Q<digits>"), plus the
# placeholder used when a value cannot be resolved.
PATTERN_P = r"P\d+"
PATTERN_Q = r"Q\d+"
DEFAULT_VALUE = "unknown"


def check_id_pattern(string: str) -> bool:
    """Return True when `string` starts with a P-id or Q-id pattern.

    `re.match` anchors only at the beginning, so trailing characters after the
    digits are accepted as well.
    NOTE(review): if only exact ids should qualify, `re.fullmatch` would be
    needed — confirm the intended behaviour before changing it.
    """
    for id_pattern in (PATTERN_P, PATTERN_Q):
        if re.match(id_pattern, string):
            return True
    return False


def check_important_wording(string: str) -> bool:
    """Return True when `string` is one of the value-field names kept in change sentences."""
    important_words = (
        "amount",
        "unit",
        "time",
        "timezone",
        "latitude",
        "longitude",
        "altitude",
        "text",
    )
    return string in important_words


def process_key(key: str):
    """Extract every quoted bracket segment (``['...']``) of a diff path, in order.

    Unquoted segments such as ``[0]`` are not matched. Returns an empty list
    when the key contains no quoted segment.
    """
    quoted_segment = r"\[\'(.*?)\'\]"
    return list(re.findall(quoted_segment, key))


def remove_wikilink(link: str) -> str:
    """Stringify `link` and strip every occurrence of the Wikidata entity URL prefix."""
    entity_prefix = "http://www.wikidata.org/entity/"
    return str(link).replace(entity_prefix, "")


def get_value_by_type(json: dict, datatype: str):
    """Flatten a datavalue dict into ``[datatype, *fields]``.

    Args:
        json: datavalue dict; every handled datatype reads ``json["value"]``.
        datatype: the datavalue type name selecting which fields are read.

    Returns:
        A list starting with `datatype`, followed by the fields read for that
        type. Unhandled datatypes return ``[datatype]`` without reading `json`.

    Raises:
        KeyError / TypeError: for a malformed value of any type other than
            "wikibase-entityid", which falls back to DEFAULT_VALUE instead.
    """
    if datatype == "wikibase-entityid":
        try:
            return [datatype, json["value"]["id"]]
        except (KeyError, TypeError):
            # Only data-shape problems (missing "value"/"id", non-dict value)
            # are tolerated; a bare `except` here used to hide every other
            # error, including KeyboardInterrupt.
            # ToDo: fix this minor processing case
            return [datatype, DEFAULT_VALUE]
    elif datatype == "string":
        return [datatype, json["value"]]
    elif datatype == "globecoordinate":
        return [
            datatype,
            json["value"]["latitude"],
            json["value"]["longitude"],
            json["value"]["altitude"],
        ]
    elif datatype == "monolingualtext":
        return [datatype, json["value"]["text"]]
    elif datatype == "time":
        return [datatype, json["value"]["time"], json["value"]["timezone"]]
    elif datatype == "quantity":
        return [
            datatype,
            json["value"]["amount"],
            remove_wikilink(json["value"]["unit"]),
        ]
    else:
        return [datatype]


def process_sentence(items, labels_dict: dict) -> str:
    """Join `items` into one space-separated sentence, translating ids to labels.

    Each item is replaced by its entry in `labels_dict` when present. Otherwise
    an item that looks like a P/Q id becomes DEFAULT_VALUE and any other item
    is kept unchanged.
    """
    words = []
    for item in items:
        # The fallback is computed for every item, as in a plain dict.get call.
        fallback = DEFAULT_VALUE if check_id_pattern(item) else item
        words.append(labels_dict.get(item, fallback))
    return " ".join(words)


def process_alteration(
    left_q_id: str,
    alterations: str,
    action_type: str = "remove: ",
    labels_dict: dict = {},
):
    """Turn the serialized added/removed diff of one revision into sentences.

    Args:
        left_q_id: id of the edited page; DEFAULT_VALUE is used when it is NaN.
        alterations: string holding a dict literal (parsed with
            `ast.literal_eval`) that maps diff paths to the affected values.
        action_type: prefix token placed at the start of every sentence.
        labels_dict: id -> label mapping handed to `process_sentence`.

    Returns:
        A list of sentences. Paths containing "sitelinks" are skipped.
        Alias/label/description paths contribute their text value. Claim
        paths of total length <= 4 contribute the main value when its type is
        string, monolingualtext or wikibase-entityid. Everything else is
        ignored.
    """
    text_sections = ("aliases", "labels", "descriptions")
    claim_types = ("string", "monolingualtext", "wikibase-entityid")

    prefix = [action_type, DEFAULT_VALUE if pd.isna(left_q_id) else left_q_id]
    parsed = ast.literal_eval(alterations)
    sentences = []
    for key, value in parsed.items():
        path = prefix + process_key(key)
        if "sitelinks" in path:  # skipping for now
            continue

        is_text = any(section in path for section in text_sections)
        # The length limit skips deeper paths (qualifiers case).
        is_claim = ("claims" in path) and (len(path) <= 4)
        if not (is_text or is_claim):
            continue

        # A plain string value is appended as-is for both kinds of path.
        if isinstance(value, str):
            sentences.append(process_sentence(path + [value], labels_dict))
            continue

        # Treat a single element exactly like a one-element list.
        elements = value if isinstance(value, list) else [value]
        for element in elements:
            if is_text:
                tail = [element["value"]]
            else:
                datavalue = element["mainsnak"].get("datavalue")
                if not datavalue:
                    continue
                datatype = datavalue["type"]
                if datatype not in claim_types:
                    continue
                tail = get_value_by_type(datavalue, datatype)
            sentences.append(process_sentence(path + tail, labels_dict))
    return sentences


def process_change(left_q_id: str, changes: str, action_type: str = "change: ", labels_dict: dict = {}):
    """Turn the serialized "changed" diff of one revision into sentence pairs.

    Args:
        left_q_id: id of the edited page; DEFAULT_VALUE is used when it is NaN.
        changes: string holding a dict literal (parsed with `ast.literal_eval`)
            that maps diff paths to dicts with "old_value" and "new_value".
        action_type: prefix token placed at the start of the "old" sentence.
        labels_dict: id -> label mapping handed to `process_sentence`.

    Returns:
        A list of ``(old_sentence, new_sentence)`` tuples. The new sentence is
        built without the leading action token. Only alias/label/description
        paths and claim paths without "qualifiers" or "rank" produce a pair.
    """
    text_sections = ("aliases", "labels", "descriptions")

    prefix = [action_type, DEFAULT_VALUE if pd.isna(left_q_id) else left_q_id]
    parsed = ast.literal_eval(changes)
    sentences = []
    for key, diff in parsed.items():
        path = prefix + process_key(key)

        if any(section in path for section in text_sections):
            old_tokens = path + [diff["old_value"]]
            new_tokens = path + [diff["new_value"]]
        elif (
            ("claims" in path)
            and ("qualifiers" not in path)
            and ("rank" not in path)
        ):  # skipping qualifiers case
            # Every path reaching this branch contains "claims", so no further
            # datatype filtering is applied.
            head = path[:4]
            context = [
                part
                for part in path[4:]
                if check_id_pattern(part) or check_important_wording(part)
            ]
            if path[-1] == "numeric-id":
                # Numeric ids are turned back into Q-ids.
                new_value = f"Q{diff['new_value']}"
                old_value = f"Q{diff['old_value']}"
            else:
                new_value = remove_wikilink(diff["new_value"])
                old_value = remove_wikilink(diff["old_value"])
            old_tokens = [str(token) for token in head + context + [old_value]]
            new_tokens = [str(token) for token in head + context + [new_value]]
        else:
            continue

        sentences.append(
            (
                process_sentence(old_tokens, labels_dict),
                process_sentence(new_tokens[1:], labels_dict),
            )
        )
    return sentences

In [12]:
sentences, labels, revision_ids, action_types = [], [], [], []

# (action prefix, diff column, sentence builder), processed in this order so
# that all "remove" sentences come first, then "add", then "change".
passes = (
    ("remove: ", "removed", process_alteration),
    ("add: ", "added", process_alteration),
    ("change: ", "changed", process_change),
)

for action_type, diff_column, build_sentences in passes:
    rows = zip(
        tmp_df.page_title,
        tmp_df.revision_id.values,
        tmp_df[diff_column].values,
        tmp_df.revision_is_identity_reverted.values,
    )
    for q_id, rev_id, raw_diff, target in tqdm(rows):
        sentences_found = build_sentences(q_id, raw_diff, action_type, labels_dict)
        n_found = len(sentences_found)

        # Every sentence inherits the label, revision id and action of its revision.
        sentences += sentences_found
        labels += [target] * n_found
        revision_ids += [rev_id] * n_found
        action_types += [action_type] * n_found

5131944it [01:21, 62817.52it/s]
5131944it [04:44, 18013.36it/s]
5131944it [01:40, 51229.51it/s]


### Assemble, filter and balance the training data:

In [None]:
# "change" sentences are (old, new) tuples; the other actions are single
# strings and get a " " placeholder as their second text.
first_texts, second_texts = [], []
for sentence in sentences:
    if isinstance(sentence, tuple):
        first_texts.append(sentence[0])
        second_texts.append(sentence[1])
    else:
        first_texts.append(sentence)
        second_texts.append(" ")

df_text = pd.DataFrame(
    {
        "text_1": first_texts,
        "text_2": second_texts,
        "label": labels,
        "revision_id": revision_ids,
        "action_type": action_types,
    }
)

# filtering: holdout revisions, revisions outside the date window, and
# revisions with a redundant target
for excluded_ids in (revs_check, ids_to_drop_date, ids_to_drop):
    df_text = df_text[~df_text.revision_id.isin(excluded_ids)]

# balancing: keep all positives and sample the same number of negatives
data_ones = df_text[df_text.label == 1]
negatives = df_text[df_text.label == 0]
data_zeros = negatives.sample(len(data_ones), random_state=42)
data = pd.concat([data_ones, data_zeros]).reset_index(drop=True)

# Saving
data.to_csv("../data/mlm_training_data_full.csv", index=False)

print(len(data))

495966


### Training the BERT model

In [None]:
# Load the balanced data, cast the label to a 2-class ClassLabel so that the
# split can be stratified on it, then hold out 5% for evaluation.
training_dataset = Dataset.from_csv("../data/mlm_training_data_full.csv")
feat_class = ClassLabel(num_classes=2)
training_dataset = training_dataset.cast_column("label", feat_class)
training_dataset = training_dataset.train_test_split(
    test_size=0.05,
    stratify_by_column="label",
    shuffle=True,
    seed=42,
)

# tokenization:
model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

sentence1_key = "text_1"
sentence2_key = "text_2"


def preprocess_function(examples):
    """Tokenize a batch as (text_1, text_2) pairs, truncated to 512 tokens."""
    first_texts = examples[sentence1_key]
    second_texts = examples[sentence2_key]
    return tokenizer(first_texts, second_texts, truncation=True, max_length=512)


encoded_dataset = training_dataset.map(preprocess_function, batched=True)

num_labels = 2
metric_name = "accuracy"
batch_size = 8

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

Generating train split: 495966 examples [00:01, 327814.25 examples/s]
Casting the dataset: 100%|██████████| 495966/495966 [00:00<00:00, 1135724.81 examples/s]
Map: 100%|██████████| 471167/471167 [01:00<00:00, 7841.86 examples/s] 
Map: 100%|██████████| 24799/24799 [00:03<00:00, 7946.08 examples/s] 
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you 

In [None]:
# Evaluate and checkpoint once per epoch; the best checkpoint by
# `metric_name` is reloaded when training finishes.
args = TrainingArguments(
    output_dir="../models/bert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

# NOTE(review): `load_metric` is deprecated in recent `datasets` releases —
# confirm the pinned version still provides it.
metric = load_metric("glue", "mrpc")


def compute_metrics(eval_pred):
    """Score the argmax class of the model outputs against the reference labels."""
    logits, references = eval_pred
    predicted = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted, references=references)


trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

# Calculating text scores for other revisions (needed for later catboost training)

In [None]:
tmp_df = df.copy()

# Re-draw the same 80% page sample (same seed) that was used for LM training.
np.random.seed(42)
unique_ids_all = list(df.page_title.unique())
n_lm_pages = int(0.8 * len(unique_ids_all))
ids_train_lm = list(np.random.choice(unique_ids_all, size=n_lm_pages, replace=False))

# Keep revisions of pages outside the LM sample, plus any revision after 2023-06-01.
tmp_df["is_lm_train"] = df.page_title.isin(ids_train_lm)
after_cutoff = pd.to_datetime(tmp_df["event_timestamp"]) > pd.to_datetime("2023-06-01")
tmp_df = tmp_df[(~tmp_df.is_lm_train) | after_cutoff]


In [34]:
sentences, labels, revision_ids, action_types = [], [], [], []

# (action prefix, diff column, sentence builder), processed in this order so
# that all "remove" sentences come first, then "add", then "change".
passes = (
    ("remove: ", "removed", process_alteration),
    ("add: ", "added", process_alteration),
    ("change: ", "changed", process_change),
)

for action_type, diff_column, build_sentences in passes:
    rows = zip(
        tmp_df.page_title,
        tmp_df.revision_id.values,
        tmp_df[diff_column].values,
        tmp_df.revision_is_identity_reverted.values,
    )
    for q_id, rev_id, raw_diff, target in tqdm(rows):
        sentences_found = build_sentences(q_id, raw_diff, action_type, labels_dict)
        n_found = len(sentences_found)

        # Every sentence inherits the label, revision id and action of its revision.
        sentences += sentences_found
        labels += [target] * n_found
        revision_ids += [rev_id] * n_found
        action_types += [action_type] * n_found

2598093it [00:57, 45244.68it/s]
2598093it [03:14, 13342.99it/s]
2598093it [01:06, 38944.21it/s]


In [35]:
# "change" sentences are (old, new) tuples; the other actions are single
# strings and get a " " placeholder as their second text.
first_texts, second_texts = [], []
for sentence in sentences:
    if isinstance(sentence, tuple):
        first_texts.append(sentence[0])
        second_texts.append(sentence[1])
    else:
        first_texts.append(sentence)
        second_texts.append(" ")

df_text = pd.DataFrame(
    {
        "text_1": first_texts,
        "text_2": second_texts,
        "label": labels,
        "revision_id": revision_ids,
        "action_type": action_types,
    }
)

print(len(df_text))

2229877


In [None]:
from transformers import pipeline

def preds_processing(preds):
    """Return, per prediction, whether the score of its second entry exceeds 0.5."""
    return [pred[1]['score'] > 0.5 for pred in preds]

def preds_processing_prob(preds):
    """Return, per prediction, the score of its second entry."""
    return [pred[1]['score'] for pred in preds]

# Single sentences are passed as plain strings; (old, new) pairs are passed as
# a text / text_pair dict. " " marks a missing second text.
texts_to_process = [
    text_1 if text_2 == " " else {"text": text_1, "text_pair": text_2}
    for text_1, text_2 in zip(df_text.text_1.values, df_text.text_2.values)
]

# NOTE(review): this is a fixed step checkpoint, not necessarily the one picked
# by `load_best_model_at_end` during training — confirm it is the intended model.
checkpoint = "../models/bert/checkpoint-294480"

device = 0
batch_size = 32
tokenizer = AutoTokenizer.from_pretrained(checkpoint, truncation=True, max_length=512, device=device)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint).to(device)
clf = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=batch_size,
)

scores = []
tokenizer_kwargs = {'truncation': True, 'max_length': 512}
chunk_size = 500
print("Number of text to process: ", len(texts_to_process))
for start in tqdm(range(0, len(texts_to_process), chunk_size)):
    chunk = texts_to_process[start:start + chunk_size]
    # `return_all_scores=True` returns one score per class for every text.
    preds = clf(chunk, return_all_scores=True, **tokenizer_kwargs, batch_size=batch_size)
    scores += preds_processing_prob(preds)

print("Number of text scores: ", len(scores))

df_text["scores"] = scores

In [None]:
# Write the scored sentences to CSV without the index column.
df_text.to_csv("../data/mlm_text_features.csv", index=False)