In [1]:
%%time
import glob
import joblib
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import altair as alt
from collections import Counter
import re
import json
from fuzzywuzzy import fuzz  # type: ignore
from itertools import product

# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code. Only load these files when they come from a trusted source.
extracted_data = joblib.load("../data/ru_wiki_extracted_pages.data")
error_data = joblib.load("../data/ru_wiki_error_pages.data")
final_data = pd.DataFrame(joblib.load("../data/ru_wiki_final_dataset_v2.data"))

# Decode the JSON explicitly as UTF-8: without `encoding`, open() falls back to
# the platform locale, which can mis-decode non-ASCII (Cyrillic) text.
with open('../data/ru_reveal_wiki_location.json', encoding='utf-8') as user_file:
    ru_reveal_wiki_location = json.load(user_file)

# Only the first key/value pair of each entry is used (entries appear to be
# single-key dicts -- TODO confirm). The values are sorted before joining so
# that the same collection of values always produces the same "_"-joined string.
locations_dict = dict()
for location in tqdm(ru_reveal_wiki_location):
    location_key, location_values = next(iter(location.items()))
    location_values = np.sort(location_values)
    locations_dict[location_key] = "_".join(location_values)

  0%|          | 0/1035086 [00:00<?, ?it/s]

CPU times: user 1min 10s, sys: 31.5 s, total: 1min 42s
Wall time: 1min 52s


In [2]:
# Decode the JSON explicitly as UTF-8: without `encoding`, open() falls back to
# the platform locale, which can mis-decode non-ASCII (Cyrillic) text.
with open('../data/ru_reveal_wiki_topic.json', encoding='utf-8') as user_file:
    ru_reveal_wiki_topic = json.load(user_file)

# Only the first key/value pair of each entry is used (entries appear to be
# single-key dicts -- TODO confirm); the value is a list of records from which
# the 'topic' field is kept.
topics_dict = {}
for topic_entry in tqdm(ru_reveal_wiki_topic):
    topic_key, topic_records = next(iter(topic_entry.items()))
    topics_dict[topic_key] = [record['topic'] for record in topic_records]

  0%|          | 0/1924975 [00:00<?, ?it/s]

In [3]:
# Look-up from page name to the user stored in that page's revision details.
# If a page name occurs more than once, the last occurrence wins.
user_dict = {
    page["page_name"]: page["revision_details"]["user"]
    for page in extracted_data
}

In [4]:
def clean_text(text):
    """Collapse all whitespace in ``text`` to single spaces and trim the ends.

    Args:
        text: string to normalise, or None.

    Returns:
        The normalised string; None is passed through unchanged.
    """
    if text is not None:
        # '\s+' also matches newlines, so a single substitution handles both
        # the newline removal and the collapsing of repeated spaces.
        text = re.sub(r'\s+', ' ', text).strip()
    return text

def clean_template(text):
    """Return ``text`` unchanged (currently an identity function)."""
    return text

def get_template_parameters(templates):
    """Flatten the parameters of '|'-delimited template strings into one dict.

    Each template is split on every '|': the first piece is taken as the
    template name and each following piece as a ``name=value`` parameter.
    No attempt is made to recognise '|' nested inside a value.

    Args:
        templates: iterable of template strings.

    Returns:
        dict mapping ``clean_text("<template name>+<parameter name>")`` to
        ``clean_text(value)``. A piece without '=' is stored under its own
        text with an empty-string value. When the same combined key occurs
        more than once, the last value wins.
    """
    all_parameters = {}
    for template in templates:
        # str.split always returns at least one piece, so a name is always present.
        template_name, *param_pairs = template.split('|')

        for param_pair in param_pairs:
            # Split on the FIRST '=' only. Values may themselves contain '='
            # (e.g. URLs with query strings); those characters must be kept.
            param_name, _, param_value = param_pair.partition('=')
            combined_key = f"{template_name}+{param_name}"
            all_parameters[clean_text(combined_key)] = clean_text(param_value)

    return all_parameters


def compare_templates(t1, t2):
    """List the parameters whose values differ between two template strings.

    Args:
        t1: first template string.
        t2: second template string.

    Returns:
        list of ``(key, value_in_t1, value_in_t2)`` tuples, one per differing
        key; a key missing on one side has None in that slot. Keys from ``t1``
        come first in their own order, followed by keys found only in ``t2``.
    """
    params1 = get_template_parameters([t1])
    params2 = get_template_parameters([t2])

    # dict.fromkeys keeps first-seen order and collapses keys present on both
    # sides, so every key is examined exactly once.
    ordered_keys = dict.fromkeys([*params1, *params2])

    return [
        (key, params1.get(key), params2.get(key))
        for key in ordered_keys
        if params1.get(key) != params2.get(key)
    ]



# Clustering: 

In [None]:
# Work only on the rows whose status is 3; the index is reset so that every
# frame derived below lines up positionally.
changed_df = final_data[final_data.status.isin([3])].reset_index(drop=True)
action_features = pd.DataFrame(changed_df["actions"].to_list())
status_features = changed_df[["status"]].reset_index(drop=True)

# Boolean flags: does the row have any added / deleted / changed lines?
# (The per-row length counts that used to be assigned to these names first
# were overwritten immediately and never read, so they have been removed.)
n_added = changed_df.lines_added.apply(lambda x: len(x)>0).reset_index(drop=True)
n_removed = changed_df.lines_deleted.apply(lambda x: len(x)>0).reset_index(drop=True)
n_changed = changed_df.lines_changed.apply(lambda x: len(x)>0).reset_index(drop=True)



# Templates
# Similarity band (fuzz.ratio score) in which an added and a deleted template
# are treated as two versions of the same template: alike, but not identical.
MIN_TEMPLATE_SIMILARITY, MAX_TEMPLATE_SIMILARITY = 60, 100

tem_added, tem_deleted, param_change = [], [], []
tem_added_counter, tem_deleted_counter, param_change_counter = Counter(), Counter(), Counter()
feature_pairs = zip(changed_df["wiki_features"].to_list(), changed_df["ruwiki_features"].to_list())
# zip() has no length, so pass the total explicitly to get a real progress bar.
for wiki_features, ruwiki_features in tqdm(feature_pairs, total=len(changed_df)):
    # Build each set once per row instead of once per use.
    wiki_templates = set(wiki_features["templates"])
    ruwiki_templates = set(ruwiki_features["templates"])
    wiki_cleaned = {clean_template(t) for t in wiki_templates}
    ruwiki_cleaned = {clean_template(t) for t in ruwiki_templates}

    # Templates present only in the ruwiki version / only in the wiki version.
    tem_added.append(ruwiki_cleaned - wiki_cleaned - {None})
    tem_deleted.append(wiki_cleaned - ruwiki_cleaned - {None})
    tem_added_counter.update(tem_added[-1])
    tem_deleted_counter.update(tem_deleted[-1])

    params_changed = []

    tem_added_tmp = ruwiki_templates - wiki_templates - {None}
    tem_deleted_tmp = wiki_templates - ruwiki_templates - {None}
    # Pair every added template with every deleted one; near-duplicates are
    # compared parameter by parameter. Each resulting tuple is
    # (key, value in the added/ruwiki template, value in the deleted/wiki template).
    for t1, t2 in product(tem_added_tmp, tem_deleted_tmp):
        similarity = fuzz.ratio(t1, t2)
        if MIN_TEMPLATE_SIMILARITY < similarity < MAX_TEMPLATE_SIMILARITY:
            try:
                params_changed += compare_templates(t1, t2)
            except Exception as exc:
                # Best effort: report the offending template and carry on.
                # `Exception` (not a bare except) keeps Ctrl-C / kernel
                # interrupts working during this long loop.
                print(f"compare_templates failed for {t1!r}: {exc!r}")
    param_change.append(params_changed)
    param_change_counter.update(params_changed)

added_most_common = [i[0] for i in tem_added_counter.most_common(15)]
deleted_most_common = [i[0] for i in tem_deleted_counter.most_common(15)]

# Count every element of every change tuple: the parameter key and both values.
change_key = Counter()
for li in param_change:
    for change in li:
        change_key.update(change)

keys_most_common = [i[0] for i in change_key.most_common(15)]
tag_a_features = pd.DataFrame([{c: c in cat_list for c in added_most_common} for cat_list in tem_added]).reset_index(drop=True)
tag_d_features = pd.DataFrame([{c: c in cat_list for c in deleted_most_common} for cat_list in tem_deleted]).reset_index(drop=True)
# A row is flagged for `c` when `c` appears anywhere in any of its change
# tuples; flattening into a set first avoids three list scans per lookup.
changed_values = [{value for change in cat_list for value in change} for cat_list in param_change]
tag_c_features = pd.DataFrame([{c: c in values for c in keys_most_common}
                               for values in changed_values]).reset_index(drop=True)
changed_df["tag_cat"] = tem_added

# Category
categories_added, categories_removed, common_cat = [], [], []
categories_counter, common_categories_counter = Counter(), Counter()
for wiki_features, ruwiki_features in \
    zip(changed_df["wiki_features"].to_list(), changed_df["ruwiki_features"].to_list()):
    wiki_categories = set(wiki_features["categories"])
    ruwiki_categories = set(ruwiki_features["categories"])

    # Categories only in ruwiki, only in wiki, and shared by both versions.
    categories_added.append(ruwiki_categories - wiki_categories)
    categories_removed.append(wiki_categories - ruwiki_categories)
    common_cat.append(wiki_categories & ruwiki_categories)

    # One counter tracks every category that moved in either direction.
    categories_counter.update(categories_added[-1])
    categories_counter.update(categories_removed[-1])
    common_categories_counter.update(common_cat[-1])

changed_df["rm_cat"] = categories_removed
categories_most_common = [name for name, _ in categories_counter.most_common(5)]
# One boolean column per frequently moved category: was it removed in this row?
categories_features = pd.DataFrame(
    [{c: c in removed for c in categories_most_common} for removed in categories_removed]
).reset_index(drop=True)
    
# topic
# Sorted topic list per page; pages missing from topics_dict get an empty list.
topics = changed_df.page_name.apply(lambda d: list(np.sort(topics_dict.get(d, []))))
topic_counter = Counter(topic for page_topics in topics for topic in page_topics)
# One boolean column per topic seen anywhere, in first-seen order.
all_topics = list(topic_counter.keys())
topics_features = pd.DataFrame(
    [{c: c in page_topics for c in all_topics} for page_topics in topics]
).reset_index(drop=True)

def _one_hot_top_values(values, keep=20, other_label='uncommon'):
    """One-hot encode ``values``, pooling rare values into a single column.

    Args:
        values: pandas Series of labels.
        keep: number of most frequent labels that keep their own column.
        other_label: label substituted for every less frequent value.

    Returns:
        DataFrame of indicator columns with a fresh 0..n-1 index.
    """
    # value_counts() is sorted by frequency, so everything after the first
    # `keep` entries is the rare tail.
    rare_values = values.value_counts().iloc[keep:].index
    return pd.get_dummies(values.replace(rare_values, other_label)).reset_index(drop=True)


# location
locations = changed_df.page_name.apply(lambda d: locations_dict.get(d, "unknown")).reset_index(drop=True)
changed_df["locations"] = locations
locations_features = _one_hot_top_values(locations)

# user
# The raw per-page user gets its own name so that `user_features` only ever
# refers to the encoded frame.
users = changed_df.page_name.apply(lambda d: user_dict.get(d, "unknown")).reset_index(drop=True)
changed_df["user"] = users
user_features = _one_hot_top_values(users)

In [6]:
import re
from stop_words import get_stop_words
# Stop-word list for language code 'ru'; used below to drop such words from page-name tokens.
stop_words = get_stop_words('ru')

def clean(text):
    """Return ``text`` with bracket and punctuation characters removed.

    The characters removed are ``( ) [ ] { } < > . , ; ! ? : " ' -``;
    everything else, including whitespace, is left as is.
    """
    return re.sub(r'[()\[\]{}<>.,;!?:"\'-]', '', text)

# Tokenise every page name once: strip punctuation, split on whitespace and
# drop stop words. A set makes the stop-word test O(1) per token.
stop_word_set = set(stop_words)
page_name_tokens = changed_df.page_name.apply(
    lambda name: [token for token in clean(name).split() if token not in stop_word_set]
)

most_common_words = Counter()
for tokens in page_name_tokens:
    most_common_words.update(tokens)

common_words = [a[0] for a in most_common_words.most_common(50)]

# Flag a word only when it occurs as a whole token of the cleaned page name.
# Testing `w in raw_name` was a substring match: it fired inside longer words
# and could never match tokens whose punctuation had been stripped by clean().
words = page_name_tokens.apply(set).apply(
    lambda present: {w: w in present for w in common_words}
)
words_features = pd.DataFrame(words.to_list())

In [7]:
# Feature groups that make up the clustering matrix, joined side by side.
feature_blocks = [
    locations_features,
    action_features,
    tag_a_features,
    tag_d_features,
    tag_c_features,
    categories_features,
]
# Each group gets a fresh 0..n-1 index so that rows align by position.
features = pd.concat([block.reset_index(drop=True) for block in feature_blocks], axis=1)

# Column labels: spaces become underscores; falsy labels (such as None) are
# replaced by the string "None".
features.columns = [x.replace(" ", "_") if x else "None" for x in features.columns]

In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

# Fixed seed: when scikit-learn selects a randomized SVD solver the projection
# would otherwise change from run to run (the seed is ignored by exact solvers).
pca = PCA(n_components=30, random_state=0)

# Scale every feature to [0, 1] first, then project onto the principal components.
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(features)
transformed_data = pca.fit_transform(normalized_data)

# Share of the variance captured by each component, and by all of them together.
explained_variance_ratio = pca.explained_variance_ratio_
print("Explained Variance Ratio:", explained_variance_ratio)
# Distinct label: this line reports the sum, not the per-component ratios.
print("Total Explained Variance:", np.sum(explained_variance_ratio))

# The component vectors themselves (one row per component).
principal_components = pca.components_
print("Principal Components (Eigenvectors):")
print(principal_components)

Explained Variance Ratio: [0.21394143 0.15233972 0.1226794  0.08445715 0.05485501 0.05093407
 0.04617379 0.04013948 0.02790248 0.02640415 0.02287483 0.01569849
 0.00916852 0.00756126 0.00694566 0.00690997 0.00654279 0.0056737
 0.00513964 0.00482592 0.00435439 0.00401216 0.00386696 0.00352471
 0.00348072 0.00302859 0.00297924 0.00282755 0.00276381 0.00248979]
Explained Variance Ratio: 0.9444953945248958
Principal Components (Eigenvectors):
[[-3.28538535e-03 -1.27113476e-02 -2.63343314e-03 ...  1.95648839e-03
  -0.00000000e+00  1.65329319e-03]
 [-2.60715989e-03  3.47265307e-01 -1.91222951e-03 ...  6.57898912e-03
  -0.00000000e+00  7.18163448e-03]
 [-4.17538625e-03 -1.68015328e-01 -3.04979932e-03 ...  1.75122647e-03
  -0.00000000e+00  2.50503254e-04]
 ...
 [-8.99242682e-03  1.12868470e-02  2.12324292e-03 ... -1.46987810e-01
   0.00000000e+00 -9.11265334e-03]
 [ 8.34438355e-03  5.89916867e-03  2.62218498e-02 ...  8.37258908e-01
  -0.00000000e+00  1.11993891e-02]
 [ 9.29995392e-03 -1.583453

In [9]:
from numpy import unique
from sklearn.cluster import DBSCAN, AffinityPropagation, HDBSCAN, AgglomerativeClustering
# Cluster the PCA-reduced rows with DBSCAN.
model = DBSCAN(min_samples=10, eps=0.7)
# Fit and obtain one label per row of transformed_data.
yhat = model.fit_predict(transformed_data)
# Distinct labels that were assigned.
# NOTE(review): per the scikit-learn docs DBSCAN labels noise as -1, so this
# presumably includes the noise label -- confirm before reading it as a cluster count.
clusters = np.unique(yhat)

In [10]:
# Number of distinct labels returned by the clustering step.
len(clusters)

73

In [None]:
changed_df["cluster"] = yhat

# Report every cluster with more than 100 rows: label, size, status breakdown
# and ten example rows.
for i in unique(yhat):
    # Filter once per cluster instead of once per use.
    cluster_rows = changed_df[changed_df.cluster == i]
    if len(cluster_rows) > 100:
        print(i)
        print(len(cluster_rows))
        print(cluster_rows.status.value_counts())
        # More than 100 rows are guaranteed here, so sampling 10 is always
        # possible; the fixed seed makes the displayed examples reproducible.
        display(cluster_rows.sample(10, random_state=0))
        print("-"*50)

In [12]:
# Save the page name, location string, user and assigned cluster label of every row to CSV.
changed_df[["page_name", "locations", "user", "cluster"]].to_csv("clusters_notext_no_users.csv")