In [1]:
import base64
from urllib.parse import quote

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
from wekeypedia import WikipediaPage as Page

tqdm.pandas()

In [2]:
# Handle on the Wikipedia article studied in this notebook (title 'Mary_Shelley').
# NOTE(review): lang='fr' presumably selects the French-language edition — confirm against wekeypedia.
page = Page('Mary_Shelley', lang='fr')

In [3]:
# Build one row per revision of `page`, indexed by revid, with three derived columns:
#   diff    - result of page.get_diff(revid), missing values replaced by ''
#   deleted - the 'deleted' entry of page.extract_plusminus(diff)
#   added   - the 'added' entry of page.extract_plusminus(diff)
revisions = (
    pd
    .DataFrame
    .from_records(page.get_revisions_list())
    .assign(
        # One get_diff call per revision, with a tqdm progress bar
        # (the recorded run took about 6.5 minutes for 1106 revisions).
        diff = lambda df: df.revid.progress_apply(lambda x: page.get_diff(x)).fillna(''),
        # Parse every diff exactly once and reuse the result for both columns,
        # instead of calling extract_plusminus twice per row.
        _plusminus = lambda df: df['diff'].apply(lambda diff: page.extract_plusminus(diff)),
        deleted = lambda df: df['_plusminus'].apply(lambda parsed: parsed['deleted']),
        added = lambda df: df['_plusminus'].apply(lambda parsed: parsed['added']),
    )
    # The parsed intermediate is only scaffolding; drop it so the schema is unchanged.
    .drop(columns='_plusminus')
    .set_index('revid')
)

revisions

100%|████████████████████████████████████████████████████████████████████| 1106/1106 [06:30<00:00,  2.83it/s]


Unnamed: 0_level_0,parentid,user,userid,timestamp,size,sha1,comment,anon,diff,deleted,added
revid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
213909561,213909204,Punctilla,3894623,2024-04-02T17:55:56Z,117811,2bfc455f5a505c71d760f94665d8207b6abd14be,/* Lac Léman et Frankenstein */m syntaxe & typo,,"<tr>\n <td colspan=""2"" class=""diff-lineno"">Li...","[ Shelley, }}, , au {{s-|XVIII}}, je, sortis, ...","[|Shelley, . , du {{s-|XVIII}}, [elle], sorti..."
213909204,213907613,Punctilla,3894623,2024-04-02T17:41:19Z,117805,91d67258a1c6a26b25aff32edc9a3217cf749b3b,/* Percy Bysshe Shelley */Naissance de son fil...,,"<tr>\n <td colspan=""2"" class=""diff-lineno"">Li...","[de son, et, de, celui, d’Harriet]","[du, qu’il a, conçu, avec, Harriet]"
213907613,213907385,Jean-Christophe BENOIST,44331,2024-04-02T16:53:53Z,117803,6527512046bed02375da01bc143438af0ff9dc2c,Pourquoi ? Les sources notables ne l'évitent p...,,"<tr>\n <td colspan=""2"" class=""diff-lineno"">Li...",[auteur],[auteure]
213907385,213899110,Toise on dort,4662387,2024-04-02T16:46:41Z,117802,b50e097c57e23bc00fcb32442e5ccdf562da0cf4,à éviter,,"<tr>\n <td colspan=""2"" class=""diff-lineno"">Li...",[auteure],[auteur]
213899110,213899080,DarkVador79-UA,4553277,2024-04-02T11:38:11Z,117803,6527512046bed02375da01bc143438af0ff9dc2c,mef RI,,"<tr>\n <td colspan=""2"" class=""diff-lineno"">Li...","[date-, |, |1797|en, littérature, date-|1, |, ...","[(, ), Date de naissance, , 1797, Date de déc..."
...,...,...,...,...,...,...,...,...,...,...,...
36113,33447,Mgimpel,203,2003-03-05T18:13:35Z,427,bac7194b08a73d3906d8fd9a820d978764a16d07,lien vers articles anglais et esperanto,,"<tr>\n <td colspan=""2"" class=""diff-lineno"">Li...",[],[[[eo:Mary SHELLEY]][[en:Mary Shelley]]]
33447,15741,Emma,68,2002-12-03T09:10:40Z,387,8aa9b2b52ee8bc7ece4ebaf44803b18cfee3fd18,,,"<tr>\n <td colspan=""2"" class=""diff-lineno"">Li...",[],"[, [[Frankenstein]] fut écrit en [[1818]] et e..."
15741,4669,script de conversion,0,2002-10-31T10:12:05Z,290,dc2ab2e03c89649f77e3ba05d256feb9ad513fc2,Conversion automatique,,,[],[]
4669,4668,Youssefsan,8,2002-05-27T09:49:34Z,290,dc2ab2e03c89649f77e3ba05d256feb9ad513fc2,*,,"<tr>\n <td colspan=""2"" class=""diff-lineno"">Li...","[, , , , , ]",[[[science fiction]] | [[littérature anglaise]...


In [32]:
def count(text, marker):
    """Count case-insensitive, non-overlapping occurrences of `marker` in `text`.

    Both strings are lower-cased before matching; matching is by plain
    substring, so `marker` is also found inside longer words.
    """
    haystack = text.lower()
    needle = marker.lower()
    return haystack.count(needle)

def _added_occurrences(frame, marker):
    """Per revision, count `marker` in the space-joined `added` fragments."""
    return frame.added.apply(lambda fragments: count(' '.join(fragments), marker))


# Extend `revisions` with one tally column per spelling found in the added text.
# NOTE(review): matching is by substring, so a word such as 'hauteur' would also
# be tallied as 'auteur' — confirm this is acceptable for the analysis.
df = revisions.assign(
    auteure=lambda frame: _added_occurrences(frame, 'auteure'),
    # Every 'auteure' also matches 'auteur', so subtract the column assigned just
    # above to keep only the occurrences that are not 'auteure'.
    auteur=lambda frame: _added_occurrences(frame, 'auteur') - frame.auteure,
    autrice=lambda frame: _added_occurrences(frame, 'autrice'),
)

# Per-user totals of each spelling added since 2020-01-01, sorted so the users
# with the highest 'auteur' total come first.
(
    df
    # Timestamps are ISO-8601 strings (e.g. '2024-04-02T17:55:56Z'), so a plain
    # string comparison orders them chronologically.
    .query('timestamp >= "2020-01-01"')
    # No `year` column is derived here: the grouping is by user and only the
    # three tally columns are kept, so it would never be read.
    .groupby(['user'])
    [['auteur', 'auteure', 'autrice']]
    .sum()
    .sort_values('auteur', ascending=False)
)

Unnamed: 0_level_0,auteur,auteure,autrice
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sherwood6,22,7,0
Do not follow,4,1,0
Gnrc,2,0,0
91.160.179.105,2,0,0
Azurfrog,1,0,0
...,...,...,...
79.91.223.71,0,0,4
78.113.181.106,0,0,0
2A04:CEC0:11C7:6F4:68EB:C098:1EFC:A55E,0,0,0
2A02:8428:81A5:6501:212A:B9E5:2388:A778,0,0,0


In [26]:
# Yearly totals of each spelling added by user Sherwood6 since 2020-01-01.
(
    df
    .query('timestamp >= "2020-01-01"')
    .query('user=="Sherwood6"')
    # Derive the calendar year from the timestamp string to group on it.
    .assign(year=lambda recent: pd.to_datetime(recent['timestamp']).dt.year)
    .groupby('year')
    [['auteur', 'auteure', 'autrice']]
    .agg('sum')
)

Unnamed: 0_level_0,auteur,auteure,autrice
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022,0,3,0
2023,2,0,0
2024,20,4,0


In [28]:
def format_revid_index(revid, page, lang='fr'):
    """Return an HTML anchor linking a revision id to its diff on Wikipedia.

    Args:
        revid: Revision id; used as the `oldid` URL parameter and as link text.
        page: Article title; percent-encoded before being placed in the URL.
        lang: Wikipedia language subdomain. Defaults to 'fr'.

    Returns:
        A string of the form
        '<a href="https://<lang>.wikipedia.org/w/index.php?title=...&diff=prev&oldid=<revid>"><revid></a>'.
    """
    # Percent-encode the title so that spaces, '&', '#', '"' or accented
    # characters cannot break the query string or the href attribute.
    # Titles made only of letters, digits and '_.-~' come out unchanged.
    title = quote(str(page), safe='')
    return f'<a href="https://{lang}.wikipedia.org/w/index.php?title={title}&diff=prev&oldid={revid}">{revid}</a>'

In [30]:
# Sherwood6's revisions since 2020-01-01 with a positive `auteur` tally, shown
# with each revid in the index rendered as a link to that revision's diff.
(
    df
    .query('timestamp >= "2020-01-01"')
    .query('user=="Sherwood6"')
    .query('auteur > 0')
    .loc[:, ['user', 'timestamp']]
    .style
    .format_index(lambda rev: format_revid_index(rev, 'Mary_Shelley'))
)

Unnamed: 0_level_0,user,timestamp
revid,Unnamed: 1_level_1,Unnamed: 2_level_1
213884037,Sherwood6,2024-04-01T21:10:43Z
213867631,Sherwood6,2024-04-01T12:24:12Z
213845390,Sherwood6,2024-03-31T18:25:34Z
213845378,Sherwood6,2024-03-31T18:25:17Z
211725843,Sherwood6,2024-01-21T17:54:46Z
211587991,Sherwood6,2024-01-17T10:22:05Z
211410618,Sherwood6,2024-01-11T13:47:43Z
211286708,Sherwood6,2024-01-07T17:06:09Z
209316143,Sherwood6,2023-11-03T22:03:25Z
201919228,Sherwood6,2023-03-02T15:03:25Z
