Load processed conflict data for April 2020.  This includes the Schema:TwoColConflictConflict data joined with metadata about involved revisions.

In [1]:
# Read the pre-joined conflict/revision details and mark the DataFrame for
# caching, because the later cells query it more than once.
conflicts = (
    spark.read
    .parquet("/user/awight/edit-conflicts/conflict_rev_details")
    .cache()
)
# count() is a Spark action, so it populates the cache as well as reporting
# the number of rows.
conflicts.count()

47786

In [2]:
# Print the ten most recent conflicts; truncate=False keeps long page titles
# from being cut off in the rendered table.
(
    conflicts
    .sort(conflicts["conflict_timestamp"].desc())
    .select("conflict_timestamp", "page_namespace", "page_title")
    .show(n=10, truncate=False)
)

+-------------------+--------------+------------------------------------------------------+
|conflict_timestamp |page_namespace|page_title                                            |
+-------------------+--------------+------------------------------------------------------+
|2020-04-30 23:59:57|1             |Somaliere i Norge                                     |
|2020-04-30 23:59:07|0             |Mexborough                                            |
|2020-04-30 23:53:54|3             |Люба КБ                                               |
|2020-04-30 23:50:10|10            |Ballon d'Or recipients                                |
|2020-04-30 23:46:46|0             |Evidence based assessment/Rx4DxTx of bipolar in youths|
|2020-04-30 23:46:14|0             |List of Dog Man Episodes                              |
|2020-04-30 23:46:12|0             |Farfarello                                            |
|2020-04-30 23:46:05|0             |Farfarello                                  

Add some calculated columns:
* `next_edit_delta`: Number of seconds elapsed between entering the conflict workflow and the next revision on an article.
* `is_resolved`: True if a new revision is stored within 1 hour of entering the conflict workflow.  This is only a crude proxy for the actual success of the workflow.
* `is_talk`: True if the article namespace was a talk page (odd namespace ID) or the project namespace (ID = 4).
* `is_anon`: True when the user is anonymous (approximated here as `user_editcount == 0`).

In [4]:
import pandas as pd

# Pull the Spark DataFrame into local memory as a pandas frame.
c = conflicts.toPandas()

# Seconds between entering the conflict workflow and the next revision.
# Rows with a missing timestamp on either side end up as NaN.
c['next_edit_delta'] = (c['next_timestamp'] - c['conflict_timestamp']) / pd.Timedelta(1, unit='s')

# Resolved = a following revision exists AND it was saved within one hour.
# notna() is the real null check: `.ne(pd.NaT)` evaluates to True for every
# row, because missing values never compare equal to anything.
c['is_resolved'] = c['next_rev_id'].notna() & c['next_edit_delta'].lt(3600)

# Talk pages have odd namespace IDs; namespace 4 (project) is counted as
# discussion-like too.  Namespace 0 is even, so no separate `!= 0` guard is needed.
c['is_talk'] = c['page_namespace'].mod(2).eq(1) | c['page_namespace'].eq(4)

# NOTE(review): anonymity is inferred from a zero edit count; a registered
# account with no edits would also match -- confirm how user_editcount is
# populated for anonymous users.
c['is_anon'] = c['user_editcount'].eq(0)

In [5]:
# Share of resolved conflicts per (talk page, anonymous, two-column UI) group.
# Select the column before aggregating: averaging the whole frame does wasted
# work and fails on non-numeric columns in recent pandas releases.
c.groupby(['is_talk', 'is_anon', 'is_twocol'])['is_resolved'].mean()

is_talk  is_anon  is_twocol
False    False    False        0.844103
                  True         0.891847
         True     False        0.706459
                  True         0.685268
True     False    False        0.927507
                  True         0.973219
         True     False        0.855937
                  True         0.826531
Name: is_resolved, dtype: float64

In [6]:
# Number of conflicts behind each rate above.  Counting only the column we
# need avoids aggregating every other column just to discard the result.
c.groupby(['is_talk', 'is_anon', 'is_twocol'])['is_resolved'].count()

is_talk  is_anon  is_twocol
False    False    False        14067
                  True          4244
         True     False         9072
                  True           448
True     False    False        16443
                  True          1755
         True     False         1659
                  True            98
Name: is_resolved, dtype: int64

In [9]:
from datetime import datetime, timezone

from pyspark.sql.functions import col, unix_timestamp

# Cutoff for the dewiki subset: '2020-03-25 12:00:00' UTC -> 1585137600 epoch seconds.
# Deriving it from a datetime keeps the literal self-documenting and checkable.
# NOTE(review): the reason for this particular cutoff is not recorded here -- confirm.
DEWIKI_CUTOFF_EPOCH = int(datetime(2020, 3, 25, 12, 0, 0, tzinfo=timezone.utc).timestamp())

# Keep only dewiki rows whose conflict happened strictly after the cutoff.
is_dewiki = col('wiki') == 'dewiki'
after_cutoff = unix_timestamp(col('conflict_timestamp'), "yyyy-MM-dd HH:mm:ss") > DEWIKI_CUTOFF_EPOCH
dewiki_conflicts = conflicts.filter(is_dewiki & after_cutoff)
dewiki_conflicts.count()

4972

In [11]:
# Same derived columns as for the full dataset, computed on the dewiki subset.
d = dewiki_conflicts.toPandas()

# Seconds from entering the conflict workflow to the next revision (NaN when
# a timestamp is missing).
d['next_edit_delta'] = (d['next_timestamp'] - d['conflict_timestamp']) / pd.Timedelta(1, unit='s')

# notna() is the real null check: `.ne(pd.NaT)` is True for every row because
# missing values never compare equal to anything.
d['is_resolved'] = d['next_rev_id'].notna() & d['next_edit_delta'].lt(3600)

# Odd namespace IDs are talk pages; namespace 4 (project) is included as well.
d['is_talk'] = d['page_namespace'].mod(2).eq(1) | d['page_namespace'].eq(4)

# NOTE(review): a zero edit count is used as the anonymity signal -- confirm.
d['is_anon'] = d['user_editcount'].eq(0)

# Select the column before aggregating so only `is_resolved` is averaged;
# averaging the whole frame fails on non-numeric columns in recent pandas.
d.groupby(['is_talk', 'is_anon', 'is_twocol'])['is_resolved'].mean()

is_talk  is_anon  is_twocol
False    False    False        0.813333
                  True         0.880803
         True     True         0.718563
True     False    False        0.958294
                  True         0.980728
         True     False        0.908784
                  True         0.822917
Name: is_resolved, dtype: float64

In [12]:
# Group sizes for the dewiki subset.  Counting only the needed column avoids
# aggregating every other column just to discard the result.
d.groupby(['is_talk', 'is_anon', 'is_twocol'])['is_resolved'].count()

is_talk  is_anon  is_twocol
False    False    False          75
                  True         1594
         True     True          334
True     False    False        2110
                  True          467
         True     False         296
                  True           96
Name: is_resolved, dtype: int64

In [19]:
# Flag users with fewer than 100 edits as "new".
c["user_is_new"] = c["user_editcount"].lt(100)

# True for every row whose is_js is not exactly True; missing is_js values
# compare unequal to True, so they land in the no_js group as well.
c["no_js"] = c["is_js"].ne(True)

# Share of new users with and without JS.  Select the column before
# aggregating: averaging the whole frame fails on non-numeric columns in
# recent pandas releases.
c.groupby(['no_js'])['user_is_new'].mean()

no_js
False    0.377483
True     0.484655
Name: user_is_new, dtype: float64