# conf.getAll()

In [1]:
import os
import pickle
from datetime import datetime
from datetime import timezone
from functools import reduce

import numpy as np
import pandas as pd
import pyspark.sql.functions as f
from dateutil import parser
from pyspark.sql import Window
from pyspark.sql.functions import udf
from pyspark.sql.types import *

In [2]:
# Register the locally installed eggs with the SparkContext so that tasks can
# import mwcomments, sortedcontainers and dateutil.
egg_dir = "../.local/lib/python3.5/site-packages/"
for egg_name in ("mwcomments-0.3.3-py3.5.egg",
                 "sortedcontainers-2.1.0-py3.5.egg",
                 "python_dateutil-2.8.0-py3.5.egg"):
    spark.sparkContext.addPyFile(egg_dir + egg_name)

In [3]:
import mwcomments

In [4]:
wtm = mwcomments.WikiToolMap.load_WikiToolMap()

In [5]:
broad_wtm = sc.broadcast(wtm)

In [6]:
treated_wikis = pd.read_csv("data/ores_rcfilters_cutoffs.csv")

In [7]:
# select canonical cutoffs
cutoffs = treated_wikis.groupby(treated_wikis.wiki_db).min()

In [8]:
cutoffs.timestamp = pd.to_datetime(cutoffs.deploy_dt)

In [9]:
cutoffs = cutoffs.reset_index()

In [10]:
# ok we're ready to fire up spark and make a stratified sample
# we only need revision events from the latest snapshot
wmhist = (
    spark.read.table("wmf.mediawiki_history")
    .filter(f.col("snapshot") == "2019-04")
    .filter(f.col("event_entity") == "revision")
)

In [11]:
wmhist

DataFrame[wiki_db: string, event_entity: string, event_type: string, event_timestamp: string, event_comment: string, event_user_id: bigint, event_user_text_historical: string, event_user_text: string, event_user_blocks_historical: array<string>, event_user_blocks: array<string>, event_user_groups_historical: array<string>, event_user_groups: array<string>, event_user_is_bot_by_historical: array<string>, event_user_is_bot_by: array<string>, event_user_is_created_by_self: boolean, event_user_is_created_by_system: boolean, event_user_is_created_by_peer: boolean, event_user_is_anonymous: boolean, event_user_registration_timestamp: string, event_user_creation_timestamp: string, event_user_first_edit_timestamp: string, event_user_revision_count: bigint, event_user_seconds_since_previous_revision: bigint, page_id: bigint, page_title_historical: string, page_title: string, page_namespace_historical: int, page_namespace_is_content_historical: boolean, page_namespace: int, page_namespace_is_cont

In [12]:
# Keep only the columns used downstream. 'revision_tags' used to be listed
# twice, which produced two identically named columns in the result; each
# column is now selected exactly once.
wmhist_columns = [
    'wiki_db',
    'event_timestamp',
    'event_comment',
    'revision_id',
    'revision_parent_id',
    'revision_text_bytes',
    'revision_text_bytes_diff',
    'revision_text_sha1',
    'revision_is_identity_reverted',
    'revision_first_identity_reverting_revision_id',
    'revision_is_identity_revert',
    'revision_tags',
    'event_user_id',
    'event_user_text',
    'event_user_is_anonymous',
    'event_user_creation_timestamp',
    'event_user_first_edit_timestamp',
    'event_user_revision_count',
    'event_user_seconds_since_previous_revision',
    'page_id',
    'page_title_historical',
    'page_title',
    'page_namespace',
    'page_is_redirect',
    'page_is_deleted',
    'page_revision_count',
    'page_seconds_since_previous_revision',
    "event_user_groups",
    "event_user_is_bot_by",
    "revision_deleted_parts",
    'revision_deleted_parts_are_suppressed',
]
wmhist = wmhist.select(wmhist_columns)


In [13]:
# Spot check: pull ten revisions whose comment mentions FastButtons and look
# at the comment next to the revision tags.
(
    wmhist
    .filter(f.col("event_comment").like("%FastButtons%"))
    .select(['wiki_db','event_timestamp','event_comment','revision_tags'])
    .limit(10)
    .collect()
)

[Row(wiki_db='ptwiki', event_timestamp='2014-10-28 04:32:30.0', event_comment='Aviso sobre a eliminação da página "[[Cur.to]]", usando [[WP:FastButtons|FastButtons]]', revision_tags=None),
 Row(wiki_db='cawiki', event_timestamp='2017-11-30 11:25:43.0', event_comment='Avís a usuari, usant [[:es:Wikipedia:FastButtons|FastButtons]]', revision_tags=None),
 Row(wiki_db='ptwiki', event_timestamp='2018-08-08 05:55:33.0', event_comment='Página marcada que existem referências sem formatação, usando [[WP:FastButtons|FastButtons]]', revision_tags=None),
 Row(wiki_db='ptwiki', event_timestamp='2015-08-25 16:28:07.0', event_comment='Página marcada como sem fontes, usando [[WP:FastButtons|FastButtons]]', revision_tags=None),
 Row(wiki_db='ptwiki', event_timestamp='2015-09-15 12:07:45.0', event_comment='Aviso sobre a eliminação da página "[[Sistema de alta pressão]]", usando [[WP:FastButtons|FastButtons]]', revision_tags=None),
 Row(wiki_db='ptwiki', event_timestamp='2019-01-30 14:36:29.0', event_com

In [14]:
# From here on `treated_wikis` is the set of wiki_db names rather than the
# DataFrame read from the cutoffs file.
treated_wikis = set(treated_wikis.wiki_db)

# Flag revisions made on a treated wiki, then keep only those.
is_treated = f.col("wiki_db").isin(treated_wikis)
wmhist = wmhist.withColumn("treated", is_treated)
wmhist = wmhist.filter(f.col("treated") == True)


In [15]:
# Revisions of deleted pages whose title contains "Undo-summary" or
# "Revertpage". The Spark query is only run when no cached CSV exists.
# (`os` comes from the notebook's import cell.)
if not os.path.exists("deleted_config_revisions_treated.csv"):
    is_config_page = (
        f.col("page_title").like("%Undo-summary%")
        | f.col("page_title").like("%Revertpage%")
    )
    missing_configs = (
        wmhist
        .filter(is_config_page)
        .filter(f.col("page_is_deleted"))
        .select('wiki_db','revision_id','page_id','page_title')
        .toPandas()
    )
    # index=False so that reloading the cache yields the same columns as the
    # freshly computed frame. A cache file written by an older run still
    # carries the index as an extra unnamed column; delete it to refresh.
    missing_configs.to_csv("deleted_config_revisions_treated.csv", index=False)
else:
    missing_configs = pd.read_csv("deleted_config_revisions_treated.csv")

In [16]:
import json

In [17]:
# build_wmhist_step1 lives in the project-local spark_functions module.
# NOTE(review): presumably adds derived columns to wmhist (e.g.
# event_user_is_newcomer, which a later cell reads) -- confirm in spark_functions.
from spark_functions import build_wmhist_step1
wmhist = build_wmhist_step1(wmhist)


In [18]:
# Build the reverts table from the revision history. Later cells read columns
# such as wiki_db_l, revert_timestamp, reverted_revision_id, is_damage and
# time_to_revert from the result.
from spark_functions import process_reverts, add_revert_types
reverts = process_reverts(wmhist, spark)

In [19]:
from spark_functions import add_revert_types, broadcast_match_comment

In [20]:
# Called for its effect on the SparkContext; the function it returns
# (shown in the cell output) is not kept.
# NOTE(review): presumably this must run before add_revert_types -- confirm.
broadcast_match_comment(spark.sparkContext)

<function spark_functions.broadcast_match_comment.<locals>.my_match_comment>

In [21]:
# Classify revisions by revert type, using the edit summary in event_comment.
# NOTE(review): the columns added are defined in spark_functions -- confirm there.
wmhist = add_revert_types(wmhist, comment_column='event_comment')

In [22]:
reverts = reverts.select(['wiki_db_l','revert_timestamp','reverted_revision_id',f.col('role_type').alias("revert_role_type"),f.col('anon_new_established').alias('reverted_anon_new_established'),'is_damage','time_to_revert','revert_comment','revert_user_Nreverts_past_month','revert_user_text','revert_user_id'])

In [23]:
wmhist = wmhist.join(reverts, on =[wmhist.wiki_db == reverts.wiki_db_l, wmhist.revision_id == reverts.reverted_revision_id],how='left_outer')

In [24]:
# testpd = wmhist.filter( (f.col("wiki_db") == 'enwiki') & 
#               (f.array_contains(col="revision_tags",value="mw-undo")) & 
#               (f.array_contains(col="revert_tools_match",value="undo"))).select(['event_comment',
#                                                                             'revert_tools_match',
#                                                                              'revision_tags',
#                                                                             'revision_id',
#                                                                             'event_timestamp']).limit(30).toPandas()

In [25]:
wmhist = wmhist.filter(f.col("wiki_db").isin(list(treated_wikis)))

In [26]:
# for convenience we'll use the start of the week as the time of intervention and ignore the fact that it was actually mid-week in some cases
wmhist = wmhist.withColumn("week",f.date_trunc("week",wmhist.event_timestamp))

In [27]:
min_time = f.lit("2007-01-01")
wmhist = wmhist.filter(f.col('week') > min_time)

In [28]:
# Number of main-namespace (namespace 0) revisions per wiki, week and user id.
mainspace_edits = wmhist.filter(f.col("page_namespace") == 0)
edits_by_user_week = (
    mainspace_edits
    .groupBy(['wiki_db','week','event_user_id'])
    .agg(f.count(wmhist.revision_id).alias("wiki_week_counts"))
)

In [29]:
wikis = wmhist.select("wiki_db").distinct().toPandas()
weeks = wmhist.select("week").distinct().toPandas()

In [30]:
wikis.wiki_db.to_csv("all_wikis.csv",index=False)

In [31]:
wikis = wikis.assign(key=1)
weeks = weeks.assign(key=1)
wiki_weeks = wikis.merge(weeks,on='key')
wiki_weeks = wiki_weeks.drop('key',axis=1)

In [32]:
wiki_weeks = spark.createDataFrame(wiki_weeks)

In [33]:
# spot check
#wmhist.filter((wmhist.event_user_text == "AaronSw") & (wmhist.wiki_db == "enwiki")).select(['event_user_id','week','wiki_db','event_user_text']).show()

In [34]:
# checks out
###edits_by_user_week.filter((edits_by_user_week.event_user_id==20842) & (edits_by_user_week.wiki_db=="enwiki")).orderBy(edits_by_user_week.week).show()

In [35]:
edits_by_user_week = edits_by_user_week.withColumn("week_unix", f.unix_timestamp(edits_by_user_week.week))

In [36]:
tmin_d28_in_s = (28+7)*24*60*60
tmin_d7_in_s =  (7)*24*60*60

active_window = Window.partitionBy(['wiki_db','event_user_id']).orderBy(['week_unix']).rangeBetween(-1 * tmin_d28_in_s,-1*tmin_d7_in_s)

edits_by_user_week = edits_by_user_week.withColumn("user_edits_last_month",f.sum("wiki_week_counts").over(active_window))
edits_by_user_week = edits_by_user_week.withColumn("is_active", (~ f.isnull(edits_by_user_week.user_edits_last_month)) &   (edits_by_user_week.user_edits_last_month >= 5))

In [37]:
# spot check
#edits_by_user_week.filter((edits_by_user_week.event_user_id==20842) & (edits_by_user_week.wiki_db=='enwiki')).show()

In [38]:
# Roll the per-user rows up to one row per wiki and week: total edits and the
# number of users flagged as active.
total_edits = f.sum("wiki_week_counts").alias("total_edits")
active_editors = f.sum(f.col("is_active").cast(IntegerType())).alias("active_editors")
edits_by_week = edits_by_user_week.groupBy(['wiki_db','week']).agg(total_edits, active_editors)

In [39]:
# break reverts out into: undo, huggle, bot 

In [40]:
#wmhist.filter((f.col("wiki_db")=="aawiki") & (f.col("week") == "2007-11-19 00:00:00") &( wmhist.revision_is_identity_revert == True)).show()

In [41]:
# NOTE(review): at this point wiki_weeks only holds the wiki_db and week
# columns of the wiki x week grid; mean_user_reverts and n_reverts are only
# joined in further down, so this fill has no column to act on here.
wiki_weeks = wiki_weeks.fillna(0,["mean_user_reverts","n_reverts"])

In [42]:
#wiki_weeks.filter(f.isnull(f.col("user_week_sd"))).show()

In [43]:
reverts = reverts.withColumnRenamed("wiki_db_l","wiki_db")

In [44]:
reverts = reverts.withColumn("week",f.date_trunc("week",f.col('revert_timestamp')))

In [45]:
# we only want to look at the damaging reverts for measuring our vandalism fighting outcomes

reverts = reverts.filter(f.col("is_damage") == True)      
# # exclude reverts with ttr > 30 days = 60 seconds * 60 minutes / second * 24hours / day * 30 days           
reverts = reverts.filter(f.col("time_to_revert") <= 30*24*60*60)                                                                                                                                               
reverts = reverts.withColumn("med_ttr", f.expr('percentile_approx(time_to_revert, 0.5,1)').over(Window.partitionBy(['wiki_db','week'])))                                                                       

In [46]:
# crap the tags only exist starting in 2018

In [47]:
#wmhist.filter( (f.col("revert_tool")=="undo") & f.col("wiki_db") == "enwiki").select(f.min(f.col("week"))).show()

In [48]:
# the spike in reverts is real

In [49]:
# for the distribution of reverting activity, we need the counts of reverts by user, wiki, and week
reverts_by_user_week = reverts.groupby(['wiki_db',"revert_user_text","week"]).agg(f.sum(f.when( (f.col("is_damage") == True),1).otherwise(0)).alias("user_week_reverts"))
                                                               

In [50]:
gb = reverts_by_user_week.groupby(['wiki_db','week']).agg(f.sum('user_week_reverts').alias("wiki_week_reverts"),
                                                         f.mean('user_week_reverts').alias("mean_user_reverts"))

In [51]:
reverts_by_user_week = reverts_by_user_week.join(gb,on=["wiki_db","week"])

In [52]:
reverts_by_user_week = reverts_by_user_week.withColumn("p_reverts",f.col("user_week_reverts")/f.col("wiki_week_reverts"))

reverts_by_user_week = reverts_by_user_week.withColumn("p_reverts_sq",f.col("p_reverts") * f.col("p_reverts"))

In [53]:
reverts_by_user_week = reverts_by_user_week.withColumn("user_week_deviance", reverts_by_user_week.user_week_reverts - reverts_by_user_week.mean_user_reverts)

reverts_by_user_week = reverts_by_user_week.withColumn("user_week_var_part", reverts_by_user_week.user_week_deviance * reverts_by_user_week.user_week_deviance)

In [54]:
reverts_by_week = reverts_by_user_week.groupby(["wiki_db","week"]).agg(f.mean("user_week_reverts"). \
                                                                       alias("mean_user_reverts"),
                                                                       f.sum("user_week_reverts"). \
                                                                       alias("n_reverts"),
                                                                       f.mean("user_week_var_part"). \
                                                                       alias("user_week_var"),
                                                                       f.sum("p_reverts_sq"). \
                                                                       alias("revert_hhi"))

In [55]:
reverts_by_week = reverts_by_week.withColumn("user_week_sd",f.sqrt(reverts_by_week.user_week_var))

In [56]:
reverts_by_week = reverts_by_week.withColumn("user_week_revert_cv",reverts_by_week.user_week_sd / reverts_by_week.mean_user_reverts)

In [57]:
wiki_weeks = wiki_weeks.join(reverts_by_week,on=['wiki_db','week'],how='left_outer')

In [58]:
wiki_weeks = wiki_weeks.join(edits_by_week,on=['wiki_db','week'], how='left_outer')

In [59]:
# spot check
#reverts.filter(f.col("is_damage") == False).filter(f.col("wiki_db")=="enwiki").show()

In [60]:
time_to_revert_gb = reverts.groupby(['wiki_db','week'])
time_to_revert = time_to_revert_gb.agg(
    f.mean(f.col("time_to_revert")).alias("mean_ttr"),
    f.stddev(f.col("time_to_revert")).alias("sd_ttr"),
    f.exp(f.mean(f.log(f.col("time_to_revert")))).alias("geom_mean_ttr"),
    f.first(f.col("med_ttr")).alias("med_ttr"),
    f.count(f.col("week")).alias("N_revert"))



In [61]:
wiki_weeks = wiki_weeks.join(time_to_revert,on=['wiki_db','week'],how='left_outer')

In [62]:
# so we see that the count in the main table can be wrong. Why?

In [63]:
# Zero-fill the count columns for wiki-weeks without matching rows. The revert
# count joined in from time_to_revert is named N_revert; the previous subset
# entry "N_reverteds" matched no column of wiki_weeks, so that count was never
# filled.
wiki_weeks = wiki_weeks.fillna(0,["N_revert","active_editors","total_edits"])

In [64]:
#wiki_weeks.filter(f.isnull(f.col('geom_mean_ttr'))).show()

In [65]:
# role_type was already aliased to revert_role_type when `reverts` was
# narrowed with select() earlier, so this rename finds no such column and
# leaves the frame unchanged.
reverts = reverts.withColumnRenamed("role_type","revert_role_type")


In [66]:
# wiki_db_l was already renamed to wiki_db in an earlier cell; also a no-op.
reverts = reverts.withColumnRenamed("wiki_db_l","wiki_db")

In [67]:
# Copy the revert time into an event_timestamp column.
# NOTE(review): presumably add_revert_types reads event_timestamp -- confirm.
reverts = reverts.withColumn("event_timestamp",f.col("revert_timestamp"))

In [68]:
# Classify each revert from its own edit summary (revert_comment). The pivots
# that follow read a revert_tool column from the result.
reverts = add_revert_types(reverts, comment_column='revert_comment')

In [69]:
wiki_week_tools = reverts.groupby(["wiki_db","week"]).pivot("revert_tool",['undo','rollback','huggle','twinkle','otherTool','LiveRC','fastbuttons']).agg(f.mean(f.col("time_to_revert")).alias("mean_ttr"),
    f.stddev(f.col("time_to_revert")).alias("sd_ttr"),
    f.exp(f.mean(f.log(f.col("time_to_revert")))).alias("geom_mean_ttr"),
    f.first(f.col("med_ttr")).alias("med_ttr"),
    f.count("reverted_revision_id").alias("N_reverts"))

In [70]:

wiki_week_roles = reverts.groupby(["wiki_db","week"]).pivot("revert_role_type",['admin','bot','other','patroller']).agg(f.mean(f.col("time_to_revert")).alias("mean_ttr"),
    f.stddev(f.col("time_to_revert")).alias("sd_ttr"),
    f.exp(f.mean(f.log(f.col("time_to_revert")))).alias("geom_mean_ttr"),
    f.first(f.col("med_ttr")).alias("med_ttr"),
    f.count("reverted_revision_id").alias("N_reverts"))

In [71]:
wiki_week_editortypes = reverts.groupby(["wiki_db","week"]).pivot("reverted_anon_new_established",['anonymous','newcomer','established']).agg(f.mean(f.col("time_to_revert")).alias("mean_ttr"),
    f.stddev(f.col("time_to_revert")).alias("sd_ttr"),
    f.exp(f.mean(f.log(f.col("time_to_revert")))).alias("geom_mean_ttr"),
    f.first(f.col("med_ttr")).alias("med_ttr"),
    f.count("reverted_revision_id").alias("N_reverts"))

In [72]:
wiki_weeks = wiki_weeks.join(wiki_week_tools, on=['wiki_db','week'],how='left_outer')

In [73]:
wiki_weeks = wiki_weeks.join(wiki_week_roles, on=['wiki_db','week'],how='left_outer')

In [74]:
wiki_weeks = wiki_weeks.join(wiki_week_editortypes, on=['wiki_db','week'],how='left_outer')

In [75]:
reverts = reverts.withColumn('editortype_x_reverttool', f.concat_ws('_',f.col('reverted_anon_new_established'),f.col("revert_tool")))

In [76]:
pivot_values = [ed+'_'+tool for ed in ['anonymous','newcomer','established'] for tool in ['undo','rollback','huggle','twinkle','otherTool','fastbuttons','LiveRC']]

In [77]:
wiki_week_editortypesxtools = reverts.groupby(["wiki_db","week"]).pivot("editortype_x_reverttool",pivot_values).agg(
    f.exp(f.mean(f.log(f.col("time_to_revert")))).alias("geom_mean_ttr"),
    f.first(f.col("med_ttr")).alias("med_ttr"),
    f.count("reverted_revision_id").alias("N_reverts"))

In [78]:
wiki_weeks = wiki_weeks.join(wiki_week_editortypesxtools, on=['wiki_db','week'],how='left_outer')

In [79]:
interesting_namespaces = [0,1,2,3,4]

In [None]:
tm_vars = wmhist.withColumn("user_new_anon",f.when(wmhist.event_user_is_anonymous,"anon").otherwise(
    f.when(wmhist.event_user_is_newcomer,"newcomer").otherwise("non_anon_newcomer")))

tm_vars = tm_vars.filter(tm_vars.page_namespace.isin(interesting_namespaces)).withColumn("user_new_anon_namespace",f.concat_ws('_',tm_vars.page_namespace,tm_vars.user_new_anon))


In [None]:
tm_vars = tm_vars.groupby(['wiki_db','week']) \
                .pivot("user_new_anon_namespace") \
                .agg(f.count(wmhist.revision_id).alias("N_edits"),
                     f.countDistinct(wmhist.event_user_text).alias("N_editors"))

In [None]:
#tm_vars.show()

In [None]:
wiki_weeks = wiki_weeks.join(tm_vars,on=['wiki_db','week'],how='left_outer')

In [None]:
wiki_weeks = wiki_weeks.withColumn("year",f.year(f.col('week')))
wiki_weeks = wiki_weeks.withColumn("month",f.month(f.col('week')))

In [None]:
#wiki_weeks.show()

In [None]:
#wiki_weeks.write.parquet("/user/nathante/ores_bias/wiki_weeks.parquet",partitionBy=["wiki_db",'year','month'],mode='overwrite')

In [None]:
import re
import requests
from itertools import chain
# Fetch the site matrix from the MediaWiki API on meta.wikimedia.org.
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() makes an HTTP error fail here instead of surfacing later
# as a confusing JSON/KeyError.
response = requests.get(
    "https://meta.wikimedia.org/w/api.php",
    params={"action":"sitematrix","formatversion":2,"format":"json","maxage":3600,"smaxage":3600},
    timeout=60,
)
response.raise_for_status()
sitematrix = response.json()["sitematrix"]

In [None]:
# Captures the host part before ".org" of a site URL, e.g. "en.wikipedia" from
# "https://en.wikipedia.org". The dot before "org" is escaped so it only
# matches a literal "." rather than any character.
projname_re = re.compile(r"https?://(.*)\.org")

In [None]:
mapping = {}
rev_mapping = {}
for _,v in sitematrix.items():
    try: 
        site = v['site']
        dbname = site[0]['dbname']
        url = site[0]['url']
        projname = projname_re.findall(url)[0]
        mapping[projname]=dbname
        rev_mapping[dbname]=projname
    except Exception as e:
        continue

In [None]:
spark_mapping = f.create_map([f.lit(x) for x in chain(*mapping.items())])

In [None]:
pageviews = spark.read.table("wmf.projectview_hourly")

# Views by agent_type 'user' only.
pageviews = pageviews.filter(pageviews.agent_type == 'user').select(['project','year','month','day','hour','view_count'])
pageviews = pageviews.withColumn("date", f.concat_ws("-",pageviews.year,pageviews.month,pageviews.day))
pageviews = pageviews.withColumn("week", f.date_trunc('week',pageviews.date))
pageviews = pageviews.withColumn("wiki_db",spark_mapping.getItem(pageviews.project))
# Aggregate to one row per project and week. Summing only to the day, as
# before, left up to seven rows per week, and joining those on
# (wiki_db, week) duplicated every wiki-week row of the target table.
# `project` stays in the grouping so unmapped projects can still be inspected.
pageviews = pageviews.groupBy(['project','wiki_db','week']).agg(f.sum('view_count').alias("view_count"))

In [None]:
#missing_in_mapping = pageviews.filter(f.isnull(f.col("wiki_db"))).select(['project','wiki_db']).distinct().collect()

In [None]:
wiki_weeks = wiki_weeks.join(pageviews,on=['wiki_db','week'],how='left_outer')

In [None]:
#wiki_weeks.filter(f.col("wiki_db") == "enwiki").show()

In [None]:
missing_views = wiki_weeks.groupBy('wiki_db').agg(f.mean(f.isnull(f.col("view_count")).cast(IntegerType())).alias("p_no_views"))

In [None]:
# missing_views = missing_views.filter(f.col("p_no_views") == 1).toPandas()

In [None]:
# Earliest week present in the filtered revision history.
earliest_week_rows = wmhist.agg(f.min("week").alias("week0")).collect()
week0 = earliest_week_rows[0]["week0"]

In [None]:
# we only need revisions during our time period
# let's assume that pages don't get deleted for now
page_history = (
    spark.read.table("wmf.mediawiki_page_history")
    .filter(f.col("snapshot") == "2019-04")
    .withColumn("page_creation_week", f.date_trunc("week","page_creation_timestamp"))
)

In [None]:
# Rows of page_history created before min_time, counted per wiki and
# namespace of interest.
created_before_start = page_history.page_creation_week < min_time
n_pages_baseline = (
    page_history
    .filter(created_before_start)
    .groupBy(['wiki_db'])
    .pivot("page_namespace",interesting_namespaces)
    .count()
    .alias("n_pages_baseline")
)

# Rename the pivoted namespace columns to n_pages_baseline_ns_<namespace>.
cols = ["wiki_db"] + ["n_pages_baseline_ns_{0}".format(i) for i in n_pages_baseline.columns[1:]]
n_pages_baseline = n_pages_baseline.toDF(*cols)

In [None]:
n_new_pages_by_week = page_history \
.filter((page_history.page_creation_week >= min_time)) \
.groupBy(["wiki_db","page_creation_week"]) \
.pivot("page_namespace",interesting_namespaces)\
.agg(f.count("page_id"))

In [None]:
cols = ["wiki_db",'page_creation_week']
cols.extend("n_pages_created_ns_{0}".format(i) for i in n_new_pages_by_week.columns[2:])
n_new_pages_by_week = n_new_pages_by_week.toDF(* cols)

n_pages = n_new_pages_by_week.join(n_pages_baseline, on=['wiki_db'], how='full_outer')

#n_pages = n_pages.withColumn("n_pages_ns",n_pages.n_pages_baseline + n_pages.n_created_pages)
n_pages = n_pages.withColumnRenamed('page_creation_week','week')

In [None]:
wiki_weeks = wiki_weeks.join(n_pages,on=['wiki_db','week'],how='full_outer')

In [None]:
# newcomer survival



In [None]:
wiki_weeks = wiki_weeks.withColumn("year",f.year(f.col("week")))
wiki_weeks = wiki_weeks.withColumn("month",f.month(f.col("week")))

In [None]:
wiki_weeks

# Change below to use new format

In [None]:
wiki_weeks_out = wiki_weeks.repartition(1)

In [None]:
wiki_weeks_out.write.csv("/user/nathante/ores_bias_data/wiki_weeks.csv",mode='overwrite',compression="none",header=True)

In [118]:
?wiki_weeks_out.write.csv

In [None]:
# Spot check: list the wikis present in the final table.
wiki_weeks.select("wiki_db").distinct().collect()

In [None]:
# NOTE(review): none of the pivots visible in this notebook produces a column
# named other_huggle_N_reverts (the tool pivot yields huggle_N_reverts, the
# role pivot other_N_reverts) -- confirm the column name before running this.
wiki_weeks.filter(f.col("wiki_db")=='enwiki').select(['other_huggle_N_reverts','week']).show()

In [None]:
# Write the table as parquet partitioned by wiki, year and month, replacing
# any previous output at this path.
wiki_weeks.write.parquet("/user/nathante/ores_bias/wiki_weeks.parquet",partitionBy=["wiki_db","year","month"],mode='overwrite')

In [None]:
# Switch the session's current database and list its tables.
spark.catalog.setCurrentDatabase("nathante")

In [None]:
spark.catalog.listTables()

In [None]:
# Re-read the parquet output written above; from here on wiki_weeks refers to
# the stored copy rather than the computed DataFrame.
wiki_weeks = spark.read.parquet("/user/nathante/ores_bias/wiki_weeks.parquet")

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq

In [None]:

import os 
# NOTE(review): presumably set so that pyarrow's HDFS client can locate a JVM
# -- confirm; the path is specific to this host.
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'


In [None]:
# Open an HDFS connection through pyarrow with default connection settings.
hdfs = pa.hdfs.connect()

# hdfs_file = pa.HdfsFile("/user/nathante/ores_bias/wiki_weeks.parquet")

In [None]:
wiki_weeks_out

In [None]:
# Open the parquet output as a pyarrow dataset, read through the HDFS connection.
dataset = pq.ParquetDataset("/user/nathante/ores_bias/wiki_weeks.parquet",filesystem=hdfs)

In [None]:
?pd.read_parquet(dataset)

In [None]:
# try pulling just the columns you really need. 

In [None]:
# ParquetDataset.read_pandas() returns a pyarrow Table (read together with any
# stored pandas metadata), not a pandas DataFrame. Convert it so that ww_pdf
# supports the DataFrame.to_csv export at the end of the notebook.
ww_pdf = dataset.read_pandas().to_pandas()

In [None]:
# Alternative read of the same parquet path through the HDFS client; the
# result is not used by the cells below.
file = hdfs.read_parquet("/user/nathante/ores_bias/wiki_weeks.parquet")

In [None]:
# Rebinds `dataset`, this time without passing the HDFS filesystem.
dataset = pq.ParquetDataset("/user/nathante/ores_bias/wiki_weeks.parquet")

In [None]:
# Collect the single-partition Spark frame to pandas on the driver.
# NOTE(review): the result is bound to ww_pddf, while the export below writes
# ww_pdf -- confirm which frame is meant to be exported.
ww_pddf = wiki_weeks_out.toPandas()

In [None]:
ww_pdf.to_csv("ores_bias_data/wiki_weeks2.csv",index=False)