In [247]:
import pandas as pd
import numpy as np

import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.sql import Window
from functools import reduce
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.functions import udf


In [248]:
from datetime import datetime

In [249]:
spark.sparkContext.addPyFile("../.local/lib/python3.5/site-packages/mwcomments-0.3.3-py3.5.egg")
spark.sparkContext.addPyFile("../.local/lib/python3.5/site-packages/sortedcontainers-2.1.0-py3.5.egg")
spark.sparkContext.addPyFile("../.local/lib/python3.5/site-packages/python_dateutil-2.8.0-py3.5.egg")
spark.sparkContext.addPyFile("./spark_functions.py")

In [250]:
import mwcomments

In [251]:
from project_settings import *

In [252]:
remeber_dict = {}

In [253]:
cutoffs = pd.read_csv(os.path.join(data_dir,"ores_rcfilters_cutoffs.csv"))

In [254]:
# compare 14 days before and after the cutoff
# unless there's another cutoff less than 28 days away, in which case split the difference
by_wiki = cutoffs.groupby('wiki_db')

In [255]:
cutoffs['date'] = pd.to_datetime(cutoffs.deploy_dt)
cutoffs = cutoffs.drop("deploy_gap",1)
cutoffs = cutoffs.drop("deploy_dt",1)
cutoffs = cutoffs.drop("commit_dt",1)


In [256]:
def set_cutoff_period(df):
    df = df.sort_values(by=['date'])
    next_cutoff = df.shift(1)
    df['time_since_last_cutoff']  = df.date - df.shift(1).date
    df['time_till_next_cutoff']  = df.shift(-1).date - df.date
    df = df.reset_index()
    return df

In [257]:
cutoffs = cutoffs.groupby("wiki_db").apply(set_cutoff_period)

cutoffs = cutoffs.drop('wiki_db',1).reset_index()
cutoffs = cutoffs.drop("level_1",1)

select =[ 'wiki_db','has_ores','has_rcfilters','has_rcfilters_watchlist','time_since_last_cutoff','time_till_next_cutoff','date']

In [258]:
# We special case wikis where some issues lead to changes and deployments that we don't want to analyze. 
# fawiki: bug leads to cutoff disabling ores for 2 days. These won't show up in any other interval, so ignore them. 
cutoffs = cutoffs.loc[~((cutoffs.wiki_db == 'fawiki') & ( (cutoffs.date == pd.to_datetime("2017-12-09 11:19:00")) | (cutoffs.date == pd.to_datetime("2017-12-11 18:56:00"))))]

In [259]:
# etwiki, frwiki, and hewiki apparently turned on rcfilters 50 days after enabling ORES. This is OK. The periods overlap.

#frwiki and ruwiki experienced a bug on the deployment of rcfilters to watchlist. So let's ignore them for those messages.   

cutoffs = cutoffs.loc[~((cutoffs.wiki_db == 'frwiki') & (cutoffs.date >= pd.to_datetime("2017-11-09 14:35:00")))]

cutoffs = cutoffs.loc[~((cutoffs.wiki_db == 'ruwiki') & (cutoffs.date >= pd.to_datetime("2017-11-20 19:22:00") ))]

In [260]:
cutoffs.loc[(cutoffs.time_since_last_cutoff<=pd.Timedelta(120,'D')) | (cutoffs.time_till_next_cutoff<=pd.Timedelta(120,'D')), select]

Unnamed: 0,wiki_db,has_ores,has_rcfilters,has_rcfilters_watchlist,time_since_last_cutoff,time_till_next_cutoff,date
3,cswiki,True,False,False,NaT,76 days 23:21:00,2017-02-22 00:33:00
4,cswiki,True,True,False,76 days 23:21:00,308 days 17:24:00,2017-05-09 23:54:00
12,etwiki,True,False,False,NaT,50 days 05:26:00,2017-03-20 18:28:00
13,etwiki,True,True,False,50 days 05:26:00,308 days 17:24:00,2017-05-09 23:54:00
22,frwiki,False,False,False,NaT,55 days 07:02:00,2017-04-11 11:09:00
23,frwiki,True,True,False,55 days 07:02:00,156 days 20:24:00,2017-06-05 18:11:00
27,hewiki,True,False,False,NaT,29 days 10:38:00,2017-04-10 13:16:00
28,hewiki,True,True,False,29 days 10:38:00,308 days 17:24:00,2017-05-09 23:54:00
32,kowiki,False,True,False,NaT,0 days 00:00:00,2019-03-04 16:27:00
33,kowiki,True,True,True,0 days 00:00:00,NaT,2019-03-04 16:27:00


In [261]:
cutoffs.loc[cutoffs.wiki_db == 'arwiki']

Unnamed: 0,wiki_db,index,commitsha,damaging_hard,damaging_likelybad_max,damaging_likelybad_min,damaging_likelygood_max,damaging_likelygood_min,damaging_maybebad_max,damaging_maybebad_min,...,extension_status_default,rcfilters_watchlist_enabled_default,commitsha_default,has_extension,has_beta_extension,has_rcfilters,has_rcfilters_watchlist,date,time_since_last_cutoff,time_till_next_cutoff
0,arwiki,0,c19a7d1dd4f8b720ef99cd5a33a9853a0555fa1c,,1.0,maximum recall @ precision >= 0.45,maximum recall @ precision >= 0.997,0.0,,,...,,True,22f6ec8967a3b66019881f27bb37515eaae855bc,False,False,True,True,2018-05-09 18:04:00,NaT,NaT


In [262]:
# kowiki was enabled over two commits with the same deploy time
cutoffs = cutoffs.drop(32)

In [263]:
cutoffs.loc[cutoffs.wiki_db == 'wikidatawiki',select]

Unnamed: 0,wiki_db,has_ores,has_rcfilters,has_rcfilters_watchlist,time_since_last_cutoff,time_till_next_cutoff,date
63,wikidatawiki,True,False,False,NaT,321 days 08:41:00,2016-06-22 15:13:00
64,wikidatawiki,True,True,False,321 days 08:41:00,169 days 13:27:00,2017-05-09 23:54:00
65,wikidatawiki,False,False,False,169 days 13:27:00,3 days 23:58:00,2017-10-26 13:21:00
66,wikidatawiki,True,True,True,3 days 23:58:00,28 days 05:52:00,2017-10-30 13:19:00
67,wikidatawiki,False,False,False,28 days 05:52:00,0 days 00:00:00,2017-11-27 19:11:00
68,wikidatawiki,True,True,True,0 days 00:00:00,NaT,2017-11-27 19:11:00


In [264]:
# wikidatawiki had an issue on 2017-10-30 and 2017-11-27 with the move to default on watchlist so we'll ignore that cutoff

In [265]:
cutoffs = cutoffs.loc[~((cutoffs.wiki_db == 'wikidatawiki') & (cutoffs.date >= pd.to_datetime("2017-10-26 13:21:00")))]


In [266]:
cutoffs.loc[(cutoffs.time_since_last_cutoff<=pd.Timedelta(60,'D')) | (cutoffs.time_till_next_cutoff<=pd.Timedelta(60,'D')), select]

Unnamed: 0,wiki_db,has_ores,has_rcfilters,has_rcfilters_watchlist,time_since_last_cutoff,time_till_next_cutoff,date
12,etwiki,True,False,False,NaT,50 days 05:26:00,2017-03-20 18:28:00
13,etwiki,True,True,False,50 days 05:26:00,308 days 17:24:00,2017-05-09 23:54:00
22,frwiki,False,False,False,NaT,55 days 07:02:00,2017-04-11 11:09:00
23,frwiki,True,True,False,55 days 07:02:00,156 days 20:24:00,2017-06-05 18:11:00
27,hewiki,True,False,False,NaT,29 days 10:38:00,2017-04-10 13:16:00
28,hewiki,True,True,False,29 days 10:38:00,308 days 17:24:00,2017-05-09 23:54:00
33,kowiki,True,True,True,0 days 00:00:00,NaT,2019-03-04 16:27:00


In [267]:
first_cutoff = cutoffs.groupby(['wiki_db']).date.min()

In [268]:
# we'll look over the range of dates starting in June 2017 after thresholds in model info were released

In [270]:
remember_dict["min.first.cutoff"] = first_cutoff

In [271]:
min_first_cutoff = datetime(year=2017,month=6,day=1)

In [272]:
first_cutoff = first_cutoff.reset_index()

In [273]:
first_cutoff.loc[(first_cutoff.date < min_first_cutoff),"date"] = min_first_cutoff

In [274]:
first_cutoff = first_cutoff.reset_index()

In [275]:
first_cutoff = cutoffs.merge(first_cutoff,how='right')[['wiki_db','date']]

In [276]:
last_cutoff = cutoffs.groupby(['wiki_db']).date.max().reset_index()

In [277]:
last_cutoff = cutoffs.merge(last_cutoff,how='right')[['wiki_db','date']]

In [278]:
fl_cutoff = last_cutoff.merge(first_cutoff,on='wiki_db',suffixes=['_last','_first'])

In [284]:
rdd_period_length = pd.Timedelta(365,'d')

In [285]:
remember_dict['rdd_period_length'] = rdd_period_length

In [286]:
fl_cutoff['period2_end'] = fl_cutoff.date_last + rdd_period_length
fl_cutoff['period1_start'] = fl_cutoff.date_first - rdd_period_length

In [288]:
wmhist = spark.read.table("wmf.mediawiki_history")

In [289]:
# take a stratified sample of edits in the cutoffs
# stratify by wiki_db, is_newcomer, is_anon, is_reverted, and pre-ores and post-ores periods
wmhist = spark.read.table("wmf.mediawiki_history")

wmhist = wmhist.filter(f.col("snapshot") == "2019-10")
# ok we're ready to fire up spark and make a stratified sample
# we only need the latest snapshot

wmhist = wmhist.filter((f.col("event_entity") == "revision"))

In [290]:
import pyspark.sql.functions as f

In [291]:
from spark_functions import build_wmhist_step1, process_reverts, broadcast_match_comment, add_revert_types

In [292]:
broadcast_match_comment(spark.sparkContext)

<function spark_functions.broadcast_match_comment.<locals>.my_match_comment>

In [293]:
wmhist = wmhist.filter(wmhist.page_namespace == 0)

In [294]:
wmhist = build_wmhist_step1(wmhist)

TypeError: build_wmhist_step1() missing 1 required positional argument: 'remember_dict'

In [None]:
#wmhist.show()

In [None]:
reverts = process_reverts(wmhist,spark)

In [None]:
# select only the columns we need from reverts
reverts = reverts.select(['wiki_db_l','revert_timestamp','reverted_revision_id',f.col('role_type').alias("revert_role_type"),f.col('anon_new_established').alias('reverted_anon_new_established'),'is_damage','time_to_revert','revert_comment','revert_user_Nreverts_past_month','revert_user_text','revert_user_id'])


In [None]:
# for the time to revert analysis, we only want damaging ones, but for is_reverted we want all reverts
wmhist = wmhist.join(reverts, on =[wmhist.wiki_db == reverts.wiki_db_l, wmhist.revision_id == reverts.reverted_revision_id],how='left_outer')

In [None]:
#wmhist = wmhist.join(cutoffs, on=[wmhist.wiki_db==cutoffs.wiki_db_l, f.unix_timestamp(wmhist.event_timestamp) >= f.unix_timestamp(cutoffs.period_start), f.unix_timestamp(wmhist.event_timestamp) <= f.unix_timestamp(cutoffs.period_end)],how='right_outer')

In [None]:
#wmhist.show()

In [None]:
#wmhist = wmhist.withColumn("sec_to_cutoff", (f.unix_timestamp(f.col("event_timestamp")) - f.unix_timestamp(f.col("date"))) / 1000)

In [None]:
#wmhist = add_revert_types(wmhist, comment_column='revert_comment')

In [None]:
# instead of taking a sample around every cutoff. We're going to just look 3 months before the earliest cutoff and 3 months after the latest one.

In [None]:
fl_cutoff_df = spark.createDataFrame(fl_cutoff[['wiki_db','date_first','date_last','period1_start','period2_end']])
fl_cutoff_df = fl_cutoff_df.withColumnRenamed('wiki_db','wiki_db_l')
fl_cutoff_df = f.broadcast(fl_cutoff_df)

In [None]:
fl_cutoff_df

In [None]:
join_cond = (wmhist.wiki_db == fl_cutoff_df.wiki_db_l) & (f.unix_timestamp(wmhist.event_timestamp).between(f.unix_timestamp(fl_cutoff_df.period1_start),f.unix_timestamp(fl_cutoff_df.date_first)) | (f.unix_timestamp(wmhist.event_timestamp).between(f.unix_timestamp(fl_cutoff_df.date_last),f.unix_timestamp(fl_cutoff_df.period2_end))))

                                                          

In [None]:
#wmhist = wmhist.repartition(10000)

In [None]:
wmhist = wmhist.join(fl_cutoff_df,on=join_cond, how='inner')

In [None]:
# ores_cutoff_cond = None
# rcfilters_cutoff_cond = None
# rcfilters_watchlist_cutoff_cond = None
# for _, cutoff in cutoffs.iterrows():
#     cond = ((wmhist.wiki_db == cutoff.wiki_db) & (f.unix_timestamp(wmhist.event_timestamp) >= cutoff.period_start.timestamp()) & (f.unix_timestamp(wmhist.event_timestamp) <= cutoff.period_end.timestamp()))
            
#     if ores_cutoff_cond is None:
#         ores_cutoff_cond = (cond & f.lit(cutoff.ores_cutoff == True))
#         rcfilters_cutoff_cond = (cond & f.lit(cutoff.rcfilters_cutoff == True))
#         rcfilters_watchlist_cutoff_cond = (cond & f.lit(cutoff.watchlist_cutoff == True))
#     else:
#         ores_cutoff_cond = ores_cutoff_cond | (cond & f.lit(cutoff.ores_cutoff == True))
#         rcfilters_cutoff_cond = rcfilters_cutoff_cond | (cond & f.lit(cutoff.rcfilters_cutoff == True))
#         rcfilters_watchlist_cutoff_cond = rcfilters_watchlist_cutoff_cond | (cond & f.lit(cutoff.watchlist_cutoff == True))


In [None]:
# wmhist = wmhist.withColumn("cutoff_type", f.when(ores_cutoff_cond,'has_ores').otherwise(f.when(rcfilters_cutoff_cond,'has_rcfilters').otherwise(f.when(rcfilters_watchlist_cutoff_cond,'has_rcfilters_watchlist').otherwise(None))))

In [None]:
wmhist = wmhist.withColumn("period",f.when(f.unix_timestamp(wmhist.event_timestamp) <= f.unix_timestamp(wmhist.date_first),"period1").otherwise("period2"))

In [None]:
wmhist = wmhist.withColumn('strata',f.concat_ws('_',wmhist.wiki_db,wmhist.period,wmhist.revision_is_identity_reverted,wmhist.reverted_anon_new_established))

In [None]:
wmhist = wmhist.cache()

In [None]:
#wmhist.select(f.col("anon_new_established"),f.col("reverted_anon_new_established"),f.col("revision_id"),f.col("revert_role_type")).show()


In [None]:
wmhist = wmhist.withColumn("reverted_in_24h",(wmhist.revision_is_identity_reverted == True) & (wmhist.time_to_revert <= 60*60*24))

In [None]:
#wmhist = wmhist.repartition(1200,f.col("strata"))

In [None]:
wmhist.columns

In [None]:
wmhist_out = wmhist.select(['wiki_db','event_timestamp','page_id','page_title','user_id','user_text','event_user_isbot1','event_user_isbot2','revision_id','revision_is_identity_reverted','anon_new_established','event_user_is_newcomer','time_to_revert','reverted_in_24h','period','period1_start','period2_end','strata','date_first','date_last'])

In [295]:
wmhist_out = wmhist_out.repartition(1)

In [None]:
wmhist_out.write.csv("/user/nathante/ores_bias_data/cutoff_revisions_2periods.csv",mode='overwrite',compression="None",header=True)

In [None]:
wmhist = spark.read.csv("ores_bias_data/cutoff_revisions_2periods.csv", header=True)

In [None]:
wmhist = wmhist.repartition(500)

In [None]:
wmhist

In [None]:
# get the proportion of observations in each strata
strata_count = wmhist.groupby(f.col('strata')).count()
#all_count = wmhist.count()


In [None]:
strata_count = strata_count.withColumn("fraction", f.when( 20000 < f.col("count"),20000/f.col("count")).otherwise(1))
strata_count = strata_count.withColumn("weight",1/strata_count.fraction)


In [None]:
remember_dict['strata_sample_size'] = 20000

In [None]:
samp_design = strata_count.collect()

In [None]:
#fractions = samp_design.locj['strata','fraction']
fractions = {r.strata:r.fraction for r in samp_design}

In [None]:
sample = wmhist.sampleBy("strata",fractions=fractions)

In [None]:
sample = sample.join(strata_count,on='strata')

In [None]:
sample

In [None]:
## TODO Write the table. check if it exists and get the sample revids from there.

#sample.write.saveAsTable("nathante.cutoff_revisions_sample_simplestrata",mode='overwrite')

In [None]:
sample = sample.repartition(1)

In [None]:
sample.write.csv("/user/nathante/ores_bias_data/cutoff_revisions_sample_2periods.csv",mode='overwrite',compression='None',header=True)

In [None]:
strata_count.write.csv("/user/nathante/ores_bias_data/threshhold_strata_counts.csv",mode='overwrite',compression="None",header=True)

In [91]:
sample.filter(sample.wiki_db=='enwiki').select(f.mean(f.col('revision_is_identity_reverted').cast("long"))).show()

+--------------------------------------------------+
|avg(CAST(revision_is_identity_reverted AS BIGINT))|
+--------------------------------------------------+
|                                0.5336424575374225|
+--------------------------------------------------+



In [None]:
conf = spark.sparkContext.getConf()

In [None]:
conf.getAll()

In [None]:
edits2 = edits.select(['wiki_db','event_timestamp','event_user_is_anonymous','event_user_is_anonymous','revision_id','revision_is_identity_reverted','rcfilters_cutoff','week','sec_to_cutoff'])

In [None]:
edits2.show()

In [None]:
spark.catalog.listDatabases()

In [None]:
ores_scores = spark.read.table("ores.revision_score_public")

In [None]:
ores_scores = ores_scores.filter((f.col("model")=="damaging") & f.col("model_version") == "0.3.2")

In [None]:
edits = edits.join(ores_scores,on=[edits.wiki_db == ores_scores.wiki, edits.revision_id == ores_scores.rev_id])

In [None]:
edits2 = edits2.withColumn("wikiweek",f.concat_ws(' ', edits.wiki_db, f.date_format(edits.week,'MM-dd-yyyy')))
by_wiki_week = edits.groupby('wikiweek')

In [None]:
by_wiki_week = edits2.groupby(['wikiweek'])


In [None]:
# take a sample stratified of N = 5000 by wiki and week

samp_design = by_wiki_week.count()
samp_design = samp_design.withColumn("fraction", f.when( 5000 < f.col("count"),5000/f.col("count")).otherwise(1))
samp_design = samp_design.withColumn("weight",1/samp_design.fraction)


In [None]:
samp_design

In [None]:
fractions = samp_design.select(['wikiweek','fraction']).collect()

In [None]:
fractions = {r.wikiweek:r.fraction for r in fractions}

In [None]:
fractions

In [None]:
sample = edits2.sampleBy("wikiweek",fractions=fractions)

In [None]:
pddf_sample = sample.toPandas()