# Data collection:

In [None]:
%%time
import glob
import joblib
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import altair as alt
from collections import Counter
import re
import json
pd.set_option('display.max_colwidth', None)

RANDOM_STATE = 42

extracted_data = joblib.load("../data/ru_wiki_extracted_pages.data")
error_data = joblib.load("../data/ru_wiki_error_pages.data")
final_data = pd.DataFrame(joblib.load("../data/ru_wiki_final_dataset_v2.data"))
with open('../data/ru_reveal_wiki_location.json') as user_file:
    ru_reveal_wiki_location = json.load(user_file)
locations_dict = dict()
for location in tqdm(ru_reveal_wiki_location):
    location_key = list(location.keys())[0]
    location_values = np.sort(list(location.values())[0])
    locations_dict[location_key] = "_".join(location_values)

with open('../data/ru_reveal_wiki_topic.json') as user_file:
    ru_reveal_wiki_topic = json.load(user_file)
topics_dict = {list(d.keys())[0]: [v['topic'] for v in list(d.values())[0]] for d in tqdm(ru_reveal_wiki_topic)}

# Processing data: 
changed_df = final_data[final_data.status.isin([3, 4])]
not_found_df = final_data[final_data.status.isin([1])]
action_features = pd.DataFrame(changed_df["actions"].to_list())
status_features = changed_df[["status"]].reset_index(drop=True)
n_added = changed_df.lines_added.apply(len).reset_index(drop=True)
n_removed = changed_df.lines_deleted.apply(len).reset_index(drop=True)
n_changed = changed_df.lines_changed.apply(len).reset_index(drop=True)

# Category
categories_added, categories_removed = [], []
categories_counter = Counter()
for wiki_features, ruwiki_features in \
    zip(changed_df["wiki_features"].to_list(), changed_df["ruwiki_features"].to_list()):
    categories_added.append(set(ruwiki_features["categories"]) - set(wiki_features["categories"]))
    categories_removed.append(set(wiki_features["categories"]) - set(ruwiki_features["categories"])) 
    categories_counter.update(categories_added[-1])
    categories_counter.update(categories_removed[-1])
    
# topic
topics = changed_df.page_name.apply(lambda d: list(np.sort(topics_dict.get(d, [])))).values
# location
locations = changed_df.page_name.apply(lambda d: locations_dict.get(d, "unknown")).values

  0%|          | 0/1035086 [00:00<?, ?it/s]

  0%|          | 0/1924975 [00:00<?, ?it/s]

CPU times: user 1min 21s, sys: 58.8 s, total: 2min 20s
Wall time: 2min 49s


In [31]:
from itertools import product
from fuzzywuzzy import fuzz  # type: ignore

def clean_text(text):
    if text is None:
        return text
    # Remove \n characters
    text = re.sub(r'\n', ' ', text)
    # Remove redundant spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def clean_template(text):
    return text

def get_template_parameters(templates):
    all_parameters = {}
    for t in templates:
        # Split the template string on the '|' character
        parts = t.split('|')

        if parts:
            # The first part is assumed to be the template name
            template_name = parts[0]

            # Initialize a dictionary for the current template
            template_parameters = {}

            # Process the remaining parts as parameter key-value pairs
            for param_pair in parts[1:]:
                param_split = param_pair.split('=')
                param_name, param_value = param_split[0], "".join(param_split[1:])
                # Create a combined key using the template name and parameter name
                combined_key = f"{template_name}+{param_name}"
                template_parameters[clean_text(combined_key)] = clean_text(param_value)

            # Add the template name and its parameters to the dictionary
            all_parameters.update(template_parameters)

    return all_parameters


def compare_templates(t1, t2): 
    params1 = get_template_parameters([t1])
    params2 = get_template_parameters([t2])
    changes = []
    for i in params1.keys():
        if params2.get(i) != params1.get(i):
            param_to_add = (i, params1.get(i), params2.get(i))
            if param_to_add not in changes:
                changes.append(param_to_add)
                
    for i in params2.keys():
        if params2.get(i) != params1.get(i):
            param_to_add = (i, params1.get(i), params2.get(i))
            if param_to_add not in changes:
                changes.append(param_to_add)
    
    return changes

In [32]:
error_count = 0
tem_added, tem_deleted, param_change = [], [], []
tem_added_counter, tem_deleted_counter, param_change_counter = Counter(), Counter(), Counter()
for wiki_features, ruwiki_features in \
    tqdm(zip(changed_df["wiki_features"].to_list(), changed_df["ruwiki_features"].to_list())):
    tem_added.append(set([clean_template(i) for i in ruwiki_features["templates"]]) - set([clean_template(i) for i in wiki_features["templates"]]) - set([None]))
    tem_deleted.append(set([clean_template(i) for i in wiki_features["templates"]]) - set([clean_template(i) for i in ruwiki_features["templates"]]) - set([None]))
    tem_added_counter.update(tem_added[-1])
    tem_deleted_counter.update(tem_deleted[-1])
    
    params_changed = []
    
    tem_added_tmp = set(ruwiki_features["templates"]) - set(wiki_features["templates"]) - set([None])
    tem_deleted_tmp = set(wiki_features["templates"]) - set(ruwiki_features["templates"]) - set([None])
    for t1, t2 in product(tem_added_tmp, tem_deleted_tmp):
        similarity = fuzz.ratio(t1, t2)
        if similarity > 60 and similarity < 100:
            try:
                # remove similar templates from the list (as canges will be added to the list)
                tem_added[-1] = tem_added[-1] - set([t1])
                tem_deleted[-1] = tem_deleted[-1] - set([t2])
                params_changed += compare_templates(t1, t2)
            except Exception as e:
                print(e)
                error_count += 1
                pass
    param_change.append(params_changed)
    param_change_counter.update(params_changed)

0it [00:00, ?it/s]

In [33]:
# Get all file names:
import glob
from tqdm.auto import tqdm
import json


def parse_json(text):
    text = text.replace('"desc": " ', '"desc": "')
    text = text.replace("\'Донецкой области\' на \'Донецкой Народной Республике\'", "Донецкой области на Донецкой Народной Республике")
    text = text.replace('"Радивоновка"', 'Радивоновка').replace('"Акимовском районе"', 'Акимовском районе').replace('"Мелитополе"', 'Мелитополе')
    text = text.replace('```json\n', '').replace('\n```', '').replace(',\n}', '}').replace('" "', ' empty double quotes ')
    text = text.replace('\"буржуек\"', '\\\"буржуек\\\"').replace(' " ', ' double quotes  ').replace("”", '"')
    text = text.replace('"будут изображены" to "изображены"', 'будут изображены to изображены')
    text = text.replace('55°47\'32"N 37°36\'15"E.', "").replace("\\\'Маньяк\\\'", "Маньяк").replace(")\n}", "\n}")
    text = text.replace('"нерешительное и дилетантское"', 'нерешительное и дилетантское')
    
    return json.loads(text)
    # except Exception as e:
    #     print(e)
    #     return None

files = glob.glob("../data/embed/output/summary/batch_*.jsonl")
responses = []
for output_file in files:
    # Read responses from jsonl file
    with open(output_file, "r") as f:
        for line in f:
            responses.append(json.loads(line))

summaries = []
ids = []
for response in tqdm(responses):
    summaries.append(parse_json(response["response"]["body"]["choices"][0]['message']["content"]))
    ids.append(response["custom_id"])

summary_dict = dict(zip(ids, summaries))

# Extracting requests and their ids:
jsonl_files_input = glob.glob("../data/embed/input/batch_requests_summary*.jsonl")

ids_input = []
texts_input = []
for file in jsonl_files_input:
    with open(file, "r") as f:
        for line in f:
            request = json.loads(line)
            ids_input.append(request["custom_id"])
            texts_input.append(request["body"]["messages"][1]["content"][1:-1])

# Create a dataframe with the embeddings and the ids
text_to_summary = {text: summary_dict.get(id, None)['desc'] for text, id in zip(texts_input, ids_input) if summary_dict.get(id, None) is not None}

  0%|          | 0/30599 [00:00<?, ?it/s]

In [34]:
import re
def extract_page_title(text):
    try:
        return re.findall(r'page title: (.*)\nchange type', text)[0]
    except Exception as e:
        print(e)
        print(text)
        return None

# Modify the dictionary to include the page title in keys: 
text_to_summary = {extract_page_title(k): v for k, v in text_to_summary.items()}

In [35]:
files = glob.glob("../data/embed/output/embedding_summary/batch_*.jsonl")
responses = []
for output_file in files:
    # Read responses from jsonl file
    with open(output_file, "r") as f:
        for line in f:
            responses.append(json.loads(line))

embeddings = []
ids = []
for response in responses:
    embeddings.append(response["response"]["body"]["data"][0]['embedding'])
    ids.append(response["custom_id"])

embeddings_dict = dict(zip(ids, embeddings))

In [36]:
# Extracting requests and their ids:
jsonl_files_input = glob.glob("../data/embed/input/batch_requests_embed*.jsonl")

ids_input = []
texts_input = []
for file in jsonl_files_input:
    with open(file, "r") as f:
        for line in f:
            request = json.loads(line)
            ids_input.append(request["custom_id"])
            texts_input.append(request["body"]["input"])

# Create a dataframe with the embeddings and the ids
text_to_embedding = {text: embeddings_dict[id] for text, id in zip(texts_input, ids_input)}

In [None]:
changed_df["n_added"] = n_added
changed_df["n_removed"] = n_removed
changed_df["n_changed"] = n_changed
changed_df["categories_added"] = categories_added
changed_df["categories_removed"] = categories_removed
changed_df["topics"] = topics
changed_df["locations"] = locations
changed_df["templates_added"] = tem_added
changed_df["templates_deleted"] = tem_deleted
changed_df["params_changed"] = param_change
changed_df["text_to_summary"] = changed_df.page_name.apply(lambda d: text_to_summary.get(d, None))
changed_df["text_to_embedding"] = changed_df.text_to_summary.apply(lambda d: text_to_embedding.get(d, None))



# Drop the column "parsing time", 'n_added', 'n_removed', 'n_changed'
changed_df = changed_df.drop(columns=['n_added', 'n_removed', 'n_changed', "parsing time"])
changed_df.to_csv("../data/rwfork_changed.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  changed_df["n_added"] = n_added
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  changed_df["n_removed"] = n_removed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  changed_df["n_changed"] = n_changed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_ind

In [None]:
display(changed_df.shape)
display(changed_df.columns)

(33664, 21)

Index(['page_name', 'status', 'lines_added', 'lines_deleted', 'lines_changed',
       'actions', 'wiki_features', 'ruwiki_features', 'parsing_time',
       'n_added', 'n_removed', 'n_changed', 'categories_added',
       'categories_removed', 'topics', 'locations', 'templates_added',
       'templates_deleted', 'params_changed', 'text_to_summary',
       'text_to_embedding'],
      dtype='object')

In [None]:
# Save to csv:
not_found_page_names = not_found_df.page_name.to_list() + [a["page_name"] for a in error_data]
pd.DataFrame(not_found_page_names, columns=["page_name"]).to_csv("../data/rwfork_not_found.csv", index=False)

# Dataset Documentation  

This dataset contains **33,664 rows** and **17 columns**. Each row represents a Wikipedia page where content differences exist between the Russian Wikipedia and the Russian Wikipedia Fork (RuWiki). Pages without any content differences were excluded. Below are the details of the columns:  

1. **`page_name`**:  
   The name of the Wikipedia page being analyzed.  

2. **`status`**:  
   Indicates the type of edit made to the page:  
   - **3**: Changes in metadata only (not visible in the content).  
     - The columns `lines_added`, `lines_deleted`, and `lines_changed` will be empty.  
   - **4**: Changes in content.  
     - At least one of the columns `lines_added`, `lines_deleted`, or `lines_changed` will contain data.  

3. **`lines_added`**:  
   A list of sentences (text pieces) that were added to RuWiki compared to Wikipedia.  

4. **`lines_deleted`**:  
   A list of sentences (text pieces) that were removed from RuWiki compared to Wikipedia.  

5. **`lines_changed`**:  
   A list of sentence pairs representing changes made in RuWiki compared to Wikipedia.  
   - The first element of each pair is the original sentence from Wikipedia.  
   - The second element is the modified sentence from RuWiki.  

6. **`actions`**:  
   A list of actions performed on the page to transition from Wikipedia to RuWiki.  
   - These actions were extracted automatically using the `mwedittypes` library.  

7. **`wiki_features`**:  
   Additional metadata about the page, extracted from Wikipedia.  

8. **`ruwiki_features`**:  
   Additional metadata about the page, extracted from RuWiki.  

9. **`categories_added`**:
      The list of categories added to the page in RuWiki compared to Wikipedia.

10. **`categories_removed`**:
      The list of categories removed from the page in RuWiki compared to Wikipedia.

11. **`topics`**:
      The list of topics related to the Russian Wikipedia page (not Ruwiki page). It is extracted using https://api.wikimedia.org/wiki/Lift_Wing_API/Reference/Get_articletopic_outlink_prediction API.

12. **`locations`**:
      The list of locations related to the Russian Wikipedia page (not Ruwiki page).

13. **`templates_added`**:
      The list of templates added to the page in RuWiki compared to Wikipedia.

14. **`templates_deleted`**:
      The list of templates deleted from the page in RuWiki compared to Wikipedia.

15. **`params_changed`**:
      The list of parameters changed in the templates of the page in RuWiki compared to Wikipedia.

16. **`text_to_summary`**:
      The summary of the edit made to transition from Wikipedia to RuWiki. It is created using OpenAI API feeding all the changes made to the page.

17. **`text_to_embedding`**:
      The embedding of the summary of the edit made to transition from Wikipedia to RuWiki. It is created using OpenAI API text of the summary. Can be used for search or clustering.

