In [1]:
import os
import pickle
from datetime import timezone
from datetime import datetime
from functools import reduce

import numpy as np
import pandas as pd
import pyspark.sql.functions as f
from dateutil import parser
from pyspark.sql.types import *
from pyspark.sql import Window
from pyspark.sql.functions import udf

In [2]:
# Load the per-wiki table from ores_rcfilters_cutoffs.csv. Only its wiki_db
# column is used in this part of the notebook (to choose which wikis to keep).
treated_wikis = pd.read_csv("data/ores_rcfilters_cutoffs.csv")

In [3]:
# Collect the distinct wiki_db names from the cutoffs table; this set is used
# below to restrict the history to those wikis. No cutoff values are read here.
wikis = set(treated_wikis.wiki_db)

In [4]:
# Read the MediaWiki history table through Spark, keeping only the 2019-09
# snapshot (the latest one at the time of writing) and only revision events.
# `spark` is not created in this notebook; it is expected to be supplied by
# the kernel/session.
wmhist = (
    spark.read.table("wmf.mediawiki_history")
    .filter(f.col("snapshot") == "2019-09")
    .filter(f.col("event_entity") == "revision")
)

In [5]:
# Narrow the history to the columns this analysis works with
# (revision, editor and page attributes), one column per line for readability.
wmhist = wmhist.select([
    'wiki_db',
    'event_timestamp',
    'event_comment',
    'revision_id',
    'revision_parent_id',
    'revision_text_bytes',
    'revision_text_bytes_diff',
    'revision_text_sha1',
    'revision_is_identity_reverted',
    'revision_first_identity_reverting_revision_id',
    'revision_is_identity_revert',
    'revision_tags',
    'event_user_id',
    'event_user_text',
    'event_user_is_anonymous',
    'event_user_creation_timestamp',
    'event_user_first_edit_timestamp',
    'event_user_revision_count',
    'event_user_seconds_since_previous_revision',
    'page_id',
    'page_title_historical',
    'page_title',
    'page_namespace',
    'page_is_redirect',
    'page_is_deleted',
    'page_revision_count',
    'page_seconds_since_previous_revision',
    "event_user_groups",
    "event_user_is_bot_by",
    "revision_deleted_parts",
    'revision_deleted_parts_are_suppressed',
])


In [6]:

# Keep only revisions from the wikis listed in the cutoffs table, and only
# pages in namespace 0.
wmhist = (
    wmhist
    .filter(f.col("wiki_db").isin(wikis))
    .filter(f.col("page_namespace") == 0)
)

In [7]:
# Bucket every revision by the start of its week. For convenience the
# intervention is treated as happening at the start of the week, ignoring
# that it actually took place mid-week in some cases.
wmhist = wmhist.withColumn(
    "week",
    f.date_trunc("week", f.col("event_timestamp")),
)

In [8]:
# Dictionary of values recorded along the way (e.g. 'max_time_to_revert_days'
# is stored in it further down). It is passed into, and returned from, the
# spark_functions helpers called below.
remember_dict = {}

In [9]:
# Discard early history: keep only weeks strictly after 2007-01-01
# (a week equal to that value does not pass the `>` comparison).
min_time = f.lit("2007-01-01")
wmhist = wmhist.where(wmhist["week"] > min_time)

In [10]:
# Build (or load from cache) the full wiki x week grid, so that wiki-weeks
# without any activity still get a row after the left joins further down.
#
# overwrite: set to a truthy value to ignore the cached CSV and rebuild the
#            grid from Spark (two distinct() jobs collected to pandas).
# Requires `os` (imported with the other imports at the top of the notebook).
overwrite = False
wiki_weeks_cache = "data/all_wiki_weeks.csv"

if os.path.exists(wiki_weeks_cache) and not overwrite:
    wiki_weeks = pd.read_csv(wiki_weeks_cache, parse_dates=['week'])
else:
    # Use dedicated names for the collected frames. `wikis` must remain the
    # set of wiki_db names used by the isin() filter above; rebinding it to a
    # DataFrame here would make the notebook state depend on which branch ran.
    distinct_wikis = wmhist.select("wiki_db").distinct().toPandas()
    distinct_weeks = wmhist.select("week").distinct().toPandas()

    # Cross join through a constant helper key, then drop the key again.
    wiki_weeks = (
        distinct_wikis.assign(key=1)
        .merge(distinct_weeks.assign(key=1), on='key')
        .drop('key', axis=1)
    )
    wiki_weeks.to_csv(wiki_weeks_cache, index=False)

In [11]:
# build_wmhist_step1 is a project helper from spark_functions (not shown in
# this notebook). It takes the history DataFrame and remember_dict and returns
# updated versions of both.
# NOTE(review): later cells use `role_type` and `anon_new_established` columns
# on wmhist that are not in the select above — presumably added by this
# helper; confirm against spark_functions.
from spark_functions import build_wmhist_step1
wmhist, remember_dict = build_wmhist_step1(wmhist, remember_dict)


In [12]:
# process_reverts is a project helper from spark_functions (not shown in this
# notebook). It receives the history DataFrame, the Spark session and
# remember_dict, and returns the `reverts` DataFrame plus the updated dict.
# NOTE(review): the cells below expect `reverts` to carry wiki_db, week,
# anon_new_established, reverted_revision_id and time_to_revert — confirm
# against the helper.
from spark_functions import process_reverts
reverts, remember_dict = process_reverts(wmhist, spark, remember_dict)

In [13]:
# Convert the pandas wiki x week grid into a Spark DataFrame for the joins
# below. This cell rebinds `wiki_weeks` from pandas to Spark, so the
# isinstance guard keeps it safe to re-run: createDataFrame does not accept
# an object that is already a Spark DataFrame.
if isinstance(wiki_weeks, pd.DataFrame):
    wiki_weeks = spark.createDataFrame(wiki_weeks)

In [16]:
# Drop reverts that took longer than 30 days to happen, and record that
# threshold in remember_dict.
# Threshold in seconds:
#   60 seconds/minute * 60 minutes/hour * 24 hours/day * 30 days = 2,592,000
remember_dict['max_time_to_revert_days'] = 30
reverts = reverts.where(f.col("time_to_revert") <= 60 * 60 * 24 * 30)

# Not used: windowed approximate median time-to-revert per wiki-week.
# reverts = reverts.withColumn("med_ttr", f.expr('percentile_approx(time_to_revert, 0.5,1)').over(Window.partitionBy(['wiki_db','week'])))

In [17]:
# Weekly revert statistics per wiki, pivoted into one set of columns per
# editor group (anonymous / newcomer / established):
#   <group>_N_reverted     number of reverted revisions
#   <group>_geom_mean_ttr  mean of log(time_to_revert), i.e. the logarithm of
#                          the geometric mean rather than the geometric mean
#                          itself
reverts_by_week = (
    reverts
    .groupBy(['wiki_db', 'week'])
    .pivot("anon_new_established", ['anonymous', 'newcomer', 'established'])
    .agg(
        f.count('reverted_revision_id').alias("N_reverted"),
        f.mean(f.log('time_to_revert')).alias('geom_mean_ttr'),
    )
)

In [18]:
# Weekly edit counts per wiki, one column per editor group, leaving out rows
# whose role_type is 'bot'. With a single aggregation the pivoted columns are
# named after the group values themselves; they are renamed to *_N_edits in
# the following cell.
edits_by_week = (
    wmhist
    .filter(f.col("role_type") != 'bot')
    .groupBy(['wiki_db', 'week'])
    .pivot("anon_new_established", ['anonymous', 'newcomer', 'established'])
    .agg(f.count("revision_id"))
)

In [19]:
# Give the pivoted edit-count columns explicit names
# (established -> established_N_edits, and likewise for the other groups) so
# they sit alongside the *_N_reverted columns after the join.
edits_by_week = reduce(
    lambda frame, group: frame.withColumnRenamed(group, group + "_N_edits"),
    ["established", "anonymous", "newcomer"],
    edits_by_week,
)

In [20]:
# Left-join the weekly revert aggregates onto the full wiki x week grid.
# Grid rows without a match get nulls; the count columns are zero-filled
# further down.
# NOTE: this rebinds wiki_weeks, so re-running the cell joins a second time
# onto the already-joined frame.
wiki_weeks = wiki_weeks.join(reverts_by_week,on=['wiki_db','week'],how='left_outer')

In [21]:
# Left-join the weekly edit counts onto the grid in the same way; wiki-weeks
# without edits get nulls here (zero-filled further down).
# NOTE: this rebinds wiki_weeks, so re-running the cell joins a second time
# onto the already-joined frame.
wiki_weeks = wiki_weeks.join(edits_by_week,on=['wiki_db','week'], how='left_outer')

In [22]:
# Wiki-weeks with no matching rows came out of the left joins as null; treat
# the missing counts as zero. The *_geom_mean_ttr columns are not listed here
# and therefore stay null.
wiki_weeks = wiki_weeks.fillna(
    0,
    subset=[
        'anonymous_N_reverted',
        'newcomer_N_reverted',
        'established_N_reverted',
        'anonymous_N_edits',
        'newcomer_N_edits',
        'established_N_edits',
    ],
)


In [24]:
# Collapse the result to a single partition so the CSV write below produces
# one part file.
wiki_weeks_out = wiki_weeks.repartition(1)

In [None]:
# Write the result as uncompressed CSV with a header row, replacing any
# previous output at this path. Spark creates a directory at the given path
# and places the part file(s) inside it.
wiki_weeks_out.write.csv(
    "/user/nathante/ores_bias_project/wiki_weeks_simplified.csv",
    header=True,
    compression=None,
    mode="overwrite",
)

In [36]:
# Leftover interactive help lookup (IPython `?` syntax, not plain Python).
# It has no effect on the analysis and can be deleted from the final notebook.
?wiki_weeks_out.write.csv

In [28]:
# Bare expression: displays the Spark DataFrame repr, i.e. the final column
# names and types of the wiki x week table.
wiki_weeks

DataFrame[wiki_db: string, week: timestamp, anonymous_N_reverted: bigint, anonymous_geom_mean_ttr: double, newcomer_N_reverted: bigint, newcomer_geom_mean_ttr: double, established_N_reverted: bigint, established_geom_mean_ttr: double, anonymous_N_edits: bigint, newcomer_N_edits: bigint, established_N_edits: bigint]