In [1]:
%%time
import glob
import joblib
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import altair as alt
import json
import seaborn as sns
from collections import Counter
import re



extracted_data = joblib.load("../data/ru_wiki_extracted_pages.data")
error_data = joblib.load("../data/ru_wiki_error_pages.data")
final_data = pd.DataFrame(joblib.load("../data/ru_wiki_final_dataset.data"))
with open('../data/ru_reveal_wiki_location.json') as user_file:
    ru_reveal_wiki_location = json.load(user_file)

locations_dict = dict()
for location in tqdm(ru_reveal_wiki_location):
    location_key = list(location.keys())[0]
    location_values = np.sort(list(location.values())[0])
    locations_dict[location_key] = "_".join(location_values)

  0%|          | 0/1035086 [00:00<?, ?it/s]

CPU times: user 1min 3s, sys: 12.5 s, total: 1min 16s
Wall time: 1min 18s


In [2]:
changed_pages = final_data[final_data.status.isin([3,4])].page_name.to_list()
error_pages = final_data[final_data.status.isin([1])].page_name.to_list() + [a["page_name"] for a in error_data]
unchanged_pages = [a["page_name"] for a in extracted_data if a["is_duplicate"]] + final_data[final_data.status.isin([2])].page_name.to_list()

In [57]:
len(changed_pages)

33664

In [58]:
len(unchanged_pages)

1874043

In [59]:
len(error_pages)

17745

In [60]:
len(changed_pages) + len(unchanged_pages) + len(error_pages)

1925452

In [118]:
1874043 / 1925452

0.973300295203412

In [61]:
locations_df = pd.DataFrame({
    "page_name": changed_pages + unchanged_pages + error_pages,
    "page_location": [locations_dict.get(p) for p in changed_pages] \
        + [locations_dict.get(p) for p in unchanged_pages] \
        + [locations_dict.get(p) for p in error_pages],
    "status": ["changed"] * len(changed_pages) + ["duplicated"] * len(unchanged_pages) + ["missing"] * len(error_pages)
})

In [62]:
print(locations_df.page_location.isna().mean())

0.4624192137742203


In [63]:
locations_df = locations_df[~locations_df.page_location.isna()]

In [64]:
group_res = locations_df.groupby(["page_location", "status"]) \
    .count().sort_values("page_name", ascending=False).reset_index()

top_15_changed_locations = group_res[group_res.status == "changed"].head(10).page_location.to_list()

locations_df["page_location"] = \
    locations_df["page_location"].apply(lambda x: x if x in top_15_changed_locations else "other")

group_res = locations_df.groupby(["page_location", "status"]) \
    .count().sort_values("page_name", ascending=False).reset_index()

couter_dict = group_res.groupby("status").page_name.sum().to_dict()
group_res["denominator"] = group_res.status.map(couter_dict)
group_res["location rate"] = group_res["page_name"] / group_res["denominator"] 

group_res['page_location_code'] = pd.Categorical(group_res.page_location, top_15_changed_locations)
group_res = group_res.sort_values("page_location_code").reset_index(drop=True)

In [65]:
renaming_dict = {
    "other": "other",
    "Russia": "RU",
    "United States of America": "US",
    "Ukraine": "UA",
    "France": "FR",
    "Germany": "DE",
    "United Kingdom": "UK",
    "Italy": "IT", 
    "Spain": "ES",
    "Japan": "JP",
    "Belarus": "BY",
    "Poland": "PL",
    "Canada": "CA",
    "Finland": "FI",
    "Russia_Ukraine": "UA+RU",
    "China": "CN"
}
group_res.page_location_code = group_res.page_location.map(renaming_dict)

In [None]:
# configurations
Blue = "#69b8d6"
Red = '#d53e4f'
Gray = '#5D646F'
Grid = '#5D646F'
background = '#F3F7F7'
font = 'Helvetica Neue'

title_color = Gray
text_color = Gray
text_size = 14
background = '#F3F7F7'


title_color = Gray
text_color = Gray
text_size = 14
background = '#F3F7F7'

# axes
axisY = alt.Axis(grid=True, 
                 domain=False, 
                 ticks=False,
                 labelAngle = 0,
                 labelColor = text_color,
                 labelFontSize = text_size - 2,
                 labelFontWeight = 400,
                 labelFont = font,
                 labelPadding = 5,
                 titleFont = font,
                 titleColor = text_color,
                 titleFontSize = text_size-2,
                 titleFontWeight = 400,
                 )
# axes
axisX = alt.Axis(grid=False, 
                 domain=False, 
                 ticks=False,
                 labels=False,
                 labelAngle = 30,
                 labelColor = text_color,
                 labelFontSize = text_size-2,
                 labelFontWeight = 400,
                 labelFont = font,
                 titleColor = text_color,
                 titleFontSize = text_size-2,
                 titleFontWeight = 400,
                 titleFont = font,
                 titleAngle=30,
                 labelPadding = 5,
                 titlePadding = -100
                 )

hist = alt.Chart(group_res, title="").mark_bar().encode(
    x=alt.X('status:O', axis=axisX, title="", sort=["changed", "duplicate", "error"]),
    y=alt.Y("location rate:Q", axis=axisY, title="Location rate within group", scale=alt.Scale(domain=[0, 0.4])),
    color = alt.Color('status:O', scale=alt.Scale(scheme='tableau10'), legend=alt.Legend(
        orient='none',
        title="Group:", titleAlign="right", titleFont=font, titleColor=text_color,
        legendX=200.5, legendY=4, fillColor="white", columns=1, strokeColor="gray", padding=5,
        direction='vertical',
        titleAnchor='middle', labelFontSize=text_size-2, labelFont=font, labelColor = text_color)),
    column = alt.Column('page_location_code:O', sort=group_res.page_location_code.unique(),
                        title="", 
                        spacing=7, 
                        header=alt.Header(labelAngle=0, labelAlign="center",  labelPadding=-165, 
                                          labelFontSize=text_size-4, labelFont=font, labelColor = text_color),)
).configure_view(
    stroke='gray', strokeOpacity=0.3,
).configure_axis(
    domainWidth=0.8
).configure_title(fontSize=20, 
                  color = Gray, 
                  font=font,
                  fontWeight = 600,
                  anchor = 'start'
).properties(width=20, height=140)

hist

In [13]:
1 - 0.4624

0.5376000000000001

In [14]:
3755 / (3755 + 1079 + 10)

0.7751857968620974

# Topics

In [67]:
with open('../data/ru_reveal_wiki_topic.json') as user_file:
    ru_reveal_wiki_topic = json.load(user_file)
topics_dict = {list(d.keys())[0]: [v['topic'] for v in list(d.values())[0]] for d in tqdm(ru_reveal_wiki_topic)}

  0%|          | 0/1924975 [00:00<?, ?it/s]

In [68]:
changed_pages = final_data[final_data.status.isin([3,4])].page_name.to_list()
error_pages = final_data[final_data.status.isin([1])].page_name.to_list() + [a["page_name"] for a in error_data]
unchanged_pages = [a["page_name"] for a in extracted_data if a["is_duplicate"]] + final_data[final_data.status.isin([2])].page_name.to_list()

In [69]:
counter_changed = Counter()
counter_error = Counter()
counter_unchanged = Counter()

n_errors = 0

for d in tqdm(changed_pages):
    topics = topics_dict.get(d, [])
    if len(topics) > 0:
        counter_changed.update([topics[0]])
    else:
        n_errors +=1
for d in tqdm(error_pages):
    topics = topics_dict.get(d, [])
    if len(topics) > 0:
        counter_error.update([topics[0]])
    else:
        n_errors +=1
for d in tqdm(unchanged_pages):
    topics = topics_dict.get(d, [])
    if len(topics) > 0:
        counter_unchanged.update([topics[0]])
    else:
        n_errors +=1

  0%|          | 0/33664 [00:00<?, ?it/s]

  0%|          | 0/17745 [00:00<?, ?it/s]

  0%|          | 0/1874043 [00:00<?, ?it/s]

In [72]:
names = list(set(counter_changed.keys()) | set(counter_error.keys()) | set(counter_unchanged.keys()))
ratios_changed = [counter_changed[k] / len(changed_pages) for k in tqdm(names)]
ratios_error = [counter_error[k] / len(error_pages) for k in tqdm(names)]
ratios_unchanged = [counter_unchanged[k] / len(unchanged_pages) for k in tqdm(names)]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

  0%|          | 0/64 [00:00<?, ?it/s]

In [73]:
df = pd.DataFrame(
    {
        "category": names*3, 
        "ratio": ratios_changed+ratios_error+ratios_unchanged, 
        "status": ["changed"]*len(names) + ["missing"]*len(names)+ ["duplicated"]*len(names)
    }
)
df

Unnamed: 0,category,ratio,status
0,Culture.Sports,0.018358,changed
1,Geography.Regions.Asia.North_Asia,0.041855,changed
2,Geography.Regions.Asia.Central_Asia,0.001456,changed
3,Geography.Regions.Africa.Eastern_Africa,0.000178,changed
4,Geography.Regions.Europe.Western_Europe,0.002792,changed
...,...,...,...
187,Culture.Media.Films,0.011932,duplicated
188,STEM.Mathematics,0.001208,duplicated
189,STEM.Space,0.008090,duplicated
190,STEM.Libraries_&_Information,0.000152,duplicated


In [74]:
df["main_category"] = df["category"].apply(lambda x: x.split(".")[0])
df["not_main_category"] = df["category"].apply(lambda x: x.split(".")[-1])

In [75]:
top_10_changed_topics = df[df.status == "changed"].sort_values("ratio", ascending=False).head(10).not_main_category.to_list()
df["not_main_category"] = \
    df["not_main_category"].apply(lambda x: x if x in top_10_changed_topics else "other")

df = df.groupby(["not_main_category", "status"]).sum().reset_index().sort_values("ratio", ascending=False)

df['not_main_category'] = pd.Categorical(df.not_main_category, top_10_changed_topics + ["other"])
df = df.sort_values("not_main_category").reset_index(drop=True)

In [None]:
# configurations
Blue = "#69b8d6"
Red = '#d53e4f'
Gray = '#5D646F'
Grid = '#5D646F'
background = '#F3F7F7'
font = 'Helvetica Neue'

title_color = Gray
text_color = Gray
text_size = 14
background = '#F3F7F7'


title_color = Gray
text_color = Gray
text_size = 14
background = '#F3F7F7'

# axes
axisY = alt.Axis(grid=True, 
                 domain=False, 
                 ticks=False,
                 labelAngle = 0,
                 labelColor = text_color,
                 labelFontSize = text_size - 2,
                 labelFontWeight = 400,
                 labelFont = font,
                 labelPadding = 5,
                 titleFont = font,
                 titleColor = text_color,
                 titleFontSize = text_size-2,
                 titleFontWeight = 400,
                 )
# axes
axisX = alt.Axis(grid=False, 
                 domain=False, 
                 ticks=False,
                 labels=False,
                 labelAngle = 30,
                 labelColor = text_color,
                 labelFontSize = text_size-2,
                 labelFontWeight = 400,
                 labelFont = font,
                 titleColor = text_color,
                 titleFontSize = text_size-2,
                 titleFontWeight = 400,
                 titleFont = font,
                 titleAngle=30,
                 labelPadding = 5,
                 titlePadding = -100
                 )

hist = alt.Chart(df, title="").mark_bar().encode(
    x=alt.X('status:O', axis=axisX, title="", sort=["changed", "duplicate", "error"]),
    y=alt.Y("ratio:Q", axis=axisY, title="Topic rate within group", scale=alt.Scale(domain=[0, 0.4])),
    color = alt.Color('status:O', scale=alt.Scale(scheme='tableau10'), legend=
                      alt.Legend(
                        orient='none',
                        title="Group:", titleAlign="right", titleFont=font, titleColor=text_color,
                        legendX=230, legendY=-4, fillColor="white", columns=1, strokeColor="gray", padding=5,
                        direction='vertical',
                        titleAnchor='middle', labelFontSize=text_size-2, labelFont=font, labelColor = text_color)
                     ),
    column = alt.Column('not_main_category:O', sort=group_res.page_location_code.unique(),
                        title="", 
                        spacing=7, 
                        header=alt.Header(labelAngle=20, labelAlign="left",  labelPadding=10, labelOrient="bottom",
                                          labelBaseline="bottom", 
                                          labelFontSize=text_size-4, labelFont=font, labelColor = text_color),)
).configure_view(
    stroke='gray', strokeOpacity=0.3,
).configure_axis(
    domainWidth=0.8
).configure_title(fontSize=20, 
                  color = Gray, 
                  font=font,
                  fontWeight = 600,
                  anchor = 'start'
).properties(width=20, height=140)

hist

# Categories: 
- top 5 added
- top 5 deleted categories


In [26]:
changed_df = final_data[final_data.status.isin([3,4])]

In [27]:
added_categories, removed_categories = Counter(), Counter()

for wiki_features, ruwiki_features in \
    zip(changed_df["wiki_features"].to_list(), changed_df["ruwiki_features"].to_list()):
    added_categories.update(set(ruwiki_features["categories"]) - set(wiki_features["categories"]))
    removed_categories.update(set(wiki_features["categories"]) - set(ruwiki_features["categories"])) 

In [28]:
added_categories.most_common(15)

[('Сёла Донецкой Народной Республики', 621),
 ('Посёлки Донецкой Народной Республики', 213),
 ('Посёлки городского типа Донецкой Народной Республики', 138),
 ('Воинские формирования России, участвовавшие в военных действиях на Украине (с 2022)',
  115),
 ('Посёлки городского типа Луганской Народной Республики', 109),
 ('Посёлки городского типа Донецкой области (до 2022)', 109),
 ('Населённые пункты Волновахского района (до 2022)', 83),
 ('Населённые пункты Артёмовского района', 78),
 ('Населённые пункты Амвросиевского района (исторического)', 77),
 ('Населённые пункты Шахтёрского района (исторического)', 71),
 ('Населённые пункты Бахмутского района (до 2022)', 71),
 ('Погибшие в ходе военных действий на Украине (Россия)', 68),
 ('Населённые пункты Добропольского района (исторического)', 67),
 ('Участники военных действий на Украине со стороны России', 66),
 ('Населённые пункты Александровского района (Донецкая Народная Республика)',
  62)]

In [29]:
removed_categories.most_common(15)

[('Статьи с эдитнотисом об осторожности при редактировании', 5042),
 ('Лица, подвергнутые санкциям в связи с конфликтом на Украине', 1412),
 ('Временно оккупированные территории Украины', 920),
 ('Компании, подвергнутые санкциям в связи с вторжением России на Украину',
  452),
 ('Посёлки городского типа Донецкой области', 129),
 ('Воинские формирования России, участвовавшие во вторжении России на Украину (2022)',
  123),
 ('Посёлки городского типа Луганской области', 109),
 ('Погибшие в ходе вторжения России на Украину (Россия)', 83),
 ('Лица, подвергнутые санкциям со стороны Украины в связи с конфликтом на её территории',
  81),
 ('Участники вторжения России на Украину', 80),
 ('Участники обороны Украины от вторжения России (2022)', 79),
 ('Здания и сооружения, разрушенные в результате российско-украинской войны',
  66),
 ('Населённые пункты Александровского района (Донецкая область)', 61),
 ('Населённые пункты Константиновского района (Донецкая область)', 60),
 ('Города Донецкой обла

In [30]:
len(removed_categories)

991

In [31]:
len(added_categories)

1056

## References

In [109]:
def extract_domain_names(texts):
    domains = []
    for text in texts:
        pattern = r'https?://(?:www\.)?([^/\r\n}]+)'
        domain_names = re.findall(pattern, text)
        domains += domain_names
    return domains

url_references_added = Counter()
url_references_deleted= Counter()

n_pages_added = 0
n_pages_deleted = 0

for wiki_features, ruwiki_features in \
    zip(changed_df["wiki_features"].to_list(), changed_df["ruwiki_features"].to_list()):
    references_deleted = list(
            set(extract_domain_names(wiki_features.get('references', []))) - 
            set(extract_domain_names(ruwiki_features.get('references', [])))
        )
    references_added = list(
            set(extract_domain_names(ruwiki_features.get('references', []))) - 
            set(extract_domain_names(wiki_features.get('references', [])))
        )
    url_references_deleted.update(references_deleted)
    url_references_added.update(references_added)
    
    n_pages_added += 1 if len(references_added) > 0 else 0
    n_pages_deleted += 1 if len(references_deleted) > 0 else 0

In [110]:
total_added = np.sum(list(url_references_added.values()))
print("Total number of added references: ", total_added)
print("Total number of pages with added references: ", n_pages_added)

print("Rating of added references:")
url_references_added.most_common(20)

Total number of added references:  14300
Total number of pages with added references:  7106
Rating of added references:


[('publication.pravo.gov.ru', 5220),
 ('lug-info.com', 783),
 ('zapgov.ru', 700),
 ('khogov.ru', 647),
 ('glavadnr.ru', 631),
 ('base.garant.ru', 409),
 ('crimea.gov.ru', 233),
 ('tass.ru', 231),
 ('kremlin.ru', 182),
 ('docs.cntd.ru', 149),
 ('ria.ru', 138),
 ('rbc.ru', 129),
 ('cyberleninka.ru', 127),
 ('kommersant.ru', 90),
 ('minjust.gov.ru', 78),
 ('rg.ru', 70),
 ('lenta.ru', 63),
 ('interfax.ru', 61),
 ('iz.ru', 52),
 ('gazeta.ru', 52)]

In [111]:
source = pd.DataFrame(url_references_added.items())
source.columns = ["web", "count"]
source = source[source.web != "web.archive.org"]
source["rate"] = source["count"] / 33664
source_top = source.sort_values("rate", ascending=False)[:10]
source_top

Unnamed: 0,web,count,rate
1,publication.pravo.gov.ru,5220,0.155062
16,lug-info.com,783,0.023259
14,zapgov.ru,700,0.020794
15,khogov.ru,647,0.019219
13,glavadnr.ru,631,0.018744
21,base.garant.ru,409,0.012149
110,crimea.gov.ru,233,0.006921
37,tass.ru,231,0.006862
19,kremlin.ru,182,0.005406
5,docs.cntd.ru,149,0.004426


In [112]:
source_top["classification"] = ["Russian government", "General resource",
                                "General resource","General resource",
                                "General resource", "General resource",
                                "Russian government", "General resource",
                                "Russian government", "General resource"]
source_top

Unnamed: 0,web,count,rate,classification
1,publication.pravo.gov.ru,5220,0.155062,Russian government
16,lug-info.com,783,0.023259,General resource
14,zapgov.ru,700,0.020794,General resource
15,khogov.ru,647,0.019219,General resource
13,glavadnr.ru,631,0.018744,General resource
21,base.garant.ru,409,0.012149,General resource
110,crimea.gov.ru,233,0.006921,Russian government
37,tass.ru,231,0.006862,General resource
19,kremlin.ru,182,0.005406,Russian government
5,docs.cntd.ru,149,0.004426,General resource


In [None]:
# axes
axisY = alt.Axis(grid=True, 
                 domain=False, 
                 ticks=False,
                 labelAngle = 0,
                 labelColor = text_color,
                 labelFontSize = text_size,
                 labelFontWeight = 400,
                 labelFont = font,
                 labelPadding = 5,
                 titleFont = font,
                 titleColor = text_color,
                 titleFontSize = text_size-2,
                 titleFontWeight = 400,
                 )
# axes
axisX = alt.Axis(grid=False, 
                 domain=False, 
                 ticks=False,
                 labelAngle =20,
                 labelColor = text_color,
                 labelFontSize = text_size-2,
                 labelFontWeight = 400,
                 labelFont = font,
                 titleColor = text_color,
                 titleFontSize = text_size-2,
                 titleFontWeight = 400,
                 titleFont = font,
                 labelPadding = 5,
                 titlePadding = 5
                 )
domain = ["Russian government", "General resource"]
range_ = ['#d1615d', '#5778a4']

hist_added = alt.Chart(source_top, title="").mark_bar().encode(
    x=alt.X('web:O', sort="-y", axis=axisX, title=""),
    y=alt.Y("rate:Q", axis=axisY, title="Rate"),
    color = alt.Color('classification:O', scale=alt.Scale(domain=domain, range=range_), legend=
                      alt.Legend(
                        orient='none',
                        title="Added pages types:", titleAlign="center", titleFont=font, titleColor=text_color,
                        legendX=290, legendY=-4, fillColor="white", columns=1, strokeColor="gray", padding=5,
                        direction='vertical',
                        titleAnchor='middle', labelFontSize=text_size-2, labelFont=font, labelColor = text_color)
                     ),
).configure_view(
    stroke='transparent'
).configure_axis(
    domainWidth=0.8
).configure_title(fontSize=30, 
                  color = Gray, 
                  font=font,
                  fontWeight = 600,
                  anchor = 'start'
).properties(width=450, height=100)

hist_added

In [114]:
total_deleted = np.sum(list(url_references_deleted.values()))
print("Total number of deleted references: ", total_deleted)
print("Total number of pages with deleted references: ", n_pages_deleted)

print("Rating of deleted references:")
url_references_deleted.most_common(21)

Total number of deleted references:  20891
Total number of pages with deleted references:  6095
Rating of deleted references:


[('gska2.rada.gov.ua', 1108),
 ('web.archive.org', 1040),
 ('sanctions.nazk.gov.ua', 662),
 ('dialog.ua', 386),
 ('meduza.io', 336),
 ('zakon2.rada.gov.ua', 325),
 ('archive.is', 322),
 ('bbc.com', 314),
 ('eur-lex.europa.eu', 243),
 ('korrespondent.net', 229),
 ('svoboda.org', 225),
 ('president.gov.ua', 177),
 ('rupep.org', 155),
 ('novayagazeta.ru', 154),
 ('dw.com', 153),
 ('zakon5.rada.gov.ua', 147),
 ('dnr-news.com', 144),
 ('echo.msk.ru', 136),
 ('currenttime.tv', 123),
 ('youtube.com', 113),
 ('zakon.rada.gov.ua', 108)]

In [115]:
source = pd.DataFrame(url_references_deleted.items())
source.columns = ["web", "count"]
source = source[source.web != "web.archive.org"]
source["rate"] = source["count"] / 33664
source_top = source.sort_values("rate", ascending=False)[:10]
source_top

source_top["classification"] = ["EU/Ukrainian government", "EU/Ukrainian government",
                                "General resource", "General resource", 
                                "EU/Ukrainian government", "General resource",
                                "General resource", "EU/Ukrainian government",
                                "General resource", "General resource"]
source_top

Unnamed: 0,web,count,rate,classification
2,gska2.rada.gov.ua,1108,0.032913,EU/Ukrainian government
42,sanctions.nazk.gov.ua,662,0.019665,EU/Ukrainian government
36,dialog.ua,386,0.011466,General resource
39,meduza.io,336,0.009981,General resource
1,zakon2.rada.gov.ua,325,0.009654,EU/Ukrainian government
3,archive.is,322,0.009565,General resource
73,bbc.com,314,0.009327,General resource
46,eur-lex.europa.eu,243,0.007218,EU/Ukrainian government
204,korrespondent.net,229,0.006803,General resource
165,svoboda.org,225,0.006684,General resource


In [None]:
domain = ["EU/Ukrainian government", "General resource"]
range_ = ['#e49444', '#5778a4']

hist_removed = alt.Chart(source_top, title="").mark_bar().encode(
    x=alt.X('web:O', sort="-y", axis=axisX, title=""),
    y=alt.Y("rate:Q", axis=axisY, title="Rate"),
    color = alt.Color('classification:O', scale=alt.Scale(domain=domain, range=range_), legend=
                      alt.Legend(
                        orient='none',
                        title="Removed pages types:", titleAlign="center", titleFont=font, titleColor=text_color,
                        legendX=290, legendY=-4, fillColor="white", columns=1, strokeColor="gray", padding=5,
                        direction='vertical',
                        titleAnchor='middle', labelFontSize=text_size-2, labelFont=font, labelColor = text_color)
                     ),
).configure_view(
    stroke='transparent'
).configure_axis(
    domainWidth=0.8
).configure_title(fontSize=30, 
                  color = Gray, 
                  font=font,
                  fontWeight = 600,
                  anchor = 'start'
).properties(width=450, height=100)

hist_removed

# Media Difference: 

In [535]:
added_media, removed_media = Counter(), Counter()

for wiki_features, ruwiki_features in \
    zip(changed_df["wiki_features"].to_list(), changed_df["ruwiki_features"].to_list()):
    added_media.update(set(ruwiki_features["media"]) - set(wiki_features["media"]))
    removed_media.update(set(wiki_features["media"]) - set(ruwiki_features["media"])) 

In [537]:
len(added_media)

499

In [538]:
len(removed_media)

1796

In [543]:
removed_media.most_common(15)

[('Почесна відзнака «Місто-рятівник».jpg', 7),
 ('Silicone glans ring.jpg', 4),
 ('Wiki-cunnilingus.png', 4),
 ('Lolicon Sample.png', 3),
 ('Origin-of-the-World.jpg', 3),
 ('Moscow Pride 2010 (Family).jpg', 3),
 ('Europe-Ukraine (disputed territory).svg', 3),
 ('Wiki-anal missionary.png', 3),
 ('Flag of Donetsk.svg', 3),
 ('Balanitis on an intact penis.jpg', 3),
 ('Lesson 1 Private Tutor.jpg', 2),
 ('BDSM-Paar - Top and Bottom - Europride 2002.jpg', 2),
 ('Russian plane with bombs shot down over Chernihiv (1).jpg', 2),
 ('Flag of Russia.svg', 2),
 ('Volodymyr Zelensky and Donald Trump.jpg', 2)]

In [544]:
added_media.most_common(10)

[('Russian Federation (orthographic).svg', 3),
 ('Платформа Ухтомская (август 2023) (02).jpg', 2),
 ('Общая энциклопедия наук и искусств в алфавитном порядке, составленная и изданная Й.С. Эршем и Й.Г. Грубером, 1838 год..jpg',
  2),
 ('Damages in Mariupol 2014 - 0136.jpg', 2),
 ('Mariupol locator map.svg', 2),
 ('Mariupol City Hall.jpg', 2),
 ('Flag of Mariupol.svg', 2),
 ('Damages in Mariupol 2014 - 0131.jpg', 2),
 ('Damages in Mariupol 2014 - 0129.jpg', 2),
 ('Mariupol 2007 (20).jpg', 2)]

In [545]:
final_data

Unnamed: 0,page_name,status,lines_added,lines_deleted,lines_changed,actions,wiki_features,ruwiki_features,parsing_time
0,Сююрю-Кая (пещера),3,[],[],[],"{'change_Argument': 0, 'insert_Argument': 0, '...",{'categories': ['География Бахчисарайского рай...,{'categories': ['География Бахчисарайского рай...,2023-09-19 21:14:12.417969
1,Новый Свет (Ленинский район),3,[],[],[],"{'change_Argument': 0, 'insert_Argument': 0, '...",{'categories': ['Исчезнувшие населённые пункты...,{'categories': ['Исчезнувшие населённые пункты...,2023-09-19 21:14:12.435584
2,Педофильское движение,3,[],[],[],"{'change_Argument': 0, 'insert_Argument': 0, '...","{'categories': ['Педофилия', 'Общественные дви...","{'categories': ['Педофилия', 'Общественные дви...",2023-09-19 21:14:12.443224
3,Снитовский сельсовет,3,[],[],[],"{'change_Argument': 0, 'insert_Argument': 0, '...",{'categories': ['Упразднённые сельсоветы Ивано...,{'categories': ['Упразднённые сельсоветы Ивано...,2023-09-19 21:14:12.450069
4,Подлесский сельсовет,3,[],[],[],"{'change_Argument': 0, 'insert_Argument': 0, '...",{'categories': ['Упразднённые сельсоветы Ляхов...,{'categories': ['Упразднённые сельсоветы Ляхов...,2023-09-19 21:14:12.457260
...,...,...,...,...,...,...,...,...,...
35906,Катунь,4,"[Катунь () — одна из крупнейших рек на Алтае, ...","[Длина реки — 688 км, Площадь бассейна — 60 90...",[],"{'change_Argument': 0, 'insert_Argument': 0, '...","{'categories': ['Катунь', 'Притоки Оби', 'Реки...","{'categories': ['Катунь', 'Притоки Оби', 'Реки...",2023-09-19 23:54:51.433589
35907,Это я — Эдичка,3,[],[],[],"{'change_Argument': 0, 'insert_Argument': 0, '...","{'categories': ['Романы Эдуарда Лимонова', 'Ма...","{'categories': ['Романы Эдуарда Лимонова', 'Ма...",2023-09-19 23:54:51.446008
35908,Склонность (фильм),3,[],[],[],"{'change_Argument': 0, 'insert_Argument': 0, '...","{'categories': ['Фильмы-драмы Великобритании',...","{'categories': ['Фильмы-драмы Великобритании',...",2023-09-19 23:54:51.461387
35909,"Алексеевка (Первомайский район, Крым)",4,[],[],"[(Алексе́евка (до 1945 года Эски́-Алике́ч ; , ...","{'change_Argument': 0, 'insert_Argument': 0, '...",{'categories': ['Населённые пункты Первомайско...,{'categories': ['Населённые пункты Первомайско...,2023-09-19 23:54:51.554371


In [547]:
final_data["wiki_features"].to_list()[0]

{'categories': ['География Бахчисарайского района',
  'Пещеры в известняках',
  'Пещеры Ай-Петринской яйлы'],
 'media': ['Сююрю-Кая (пещера).jpg'],
 'references': ['<ref>{{Cite web |url=http://www.rgo-speleo.ru/caves/cavelist1989.htm#x4 |title=Перечень классифицированных пещер |access-date=2017-02-28 |archive-date=2020-08-04 |archive-url=https://web.archive.org/web/20200804020253/http://www.rgo-speleo.ru/caves/cavelist1989.htm#x4 |deadlink=no }}</ref>']}