In [1]:
import org.apache.spark.sql.functions._

In [2]:
// Read the three Parquet datasets used by the cells below. Each DataFrame is
// marked with cache() so repeated actions on it can reuse the loaded data.
val conflicts =
  spark.read
    .parquet("/user/awight/clean_conflicts")
    .cache()

val exits =
  spark.read
    .parquet("/user/awight/exits")
    .cache()

val linked_exits =
  spark.read
    .parquet("/user/awight/linked_exits")
    .cache()

conflicts = [conflict_timestamp: timestamp, wiki: string ... 11 more fields]
exits = [exit_timestamp: timestamp, start_time_ts_ms: bigint ... 10 more fields]
linked_exits = [conflict_timestamp: timestamp, wiki: string ... 14 more fields]


[conflict_timestamp: timestamp, wiki: string ... 14 more fields]

In [3]:
// Number of linked exits for each exit_action value.
val exit_counts = linked_exits.groupBy("exit_action").count()

// exit_counts already holds exactly one row per exit_action, so it does not
// need to be grouped and summed a second time. The percentage is each row's
// count divided by the grand total; the empty over() window spans every row
// of the (small) result, which yields that total on each row.
exit_counts
  .withColumn("percent", round(lit(100) * $"count" / sum("count").over(), 1))
  .show()

+-----------+-----+-------+
|exit_action|count|percent|
+-----------+-----+-------+
|     cancel|   65|    2.9|
|    unknown|  961|   43.3|
|       save| 1191|   53.7|
+-----------+-----+-------+



exit_counts = [exit_action: string, count: bigint]


[exit_action: string, count: bigint]

In [4]:
// Keep only the rows whose conflict_chunks value is positive.
val with_chunks = linked_exits.where(col("conflict_chunks") > 0)

// Overall summary statistics (count / mean / stddev / min / max) for the
// selected numeric columns.
with_chunks
  .describe("user_editcount", "conflict_chunks", "conflict_chars")
  .show()

// Per-exit_action row count plus mean and standard deviation of the
// conflict size columns, each rounded to one decimal place.
with_chunks
  .groupBy(col("exit_action"))
  .agg(
    count(col("*")).as("count"),
    round(avg(col("conflict_chunks")), 1).as("mean_overlapping_chunks"),
    round(stddev(col("conflict_chunks")), 1).as("std_overlapping_chunks"),
    round(avg(col("conflict_chars")), 1).as("mean_overlapping_chars"),
    round(stddev(col("conflict_chars")), 1).as("std_overlapping_chars")
  )
  .show()

+-------+-----------------+------------------+------------------+
|summary|   user_editcount|   conflict_chunks|    conflict_chars|
+-------+-----------------+------------------+------------------+
|  count|             1902|              1902|              1902|
|   mean|33114.45688748686|1.4668769716088328|1060.2917981072555|
| stddev|62829.38282481591|1.9352231152078023|  1959.28883584537|
|    min|                0|                 1|                 0|
|    max|           741811|                30|             22700|
+-------+-----------------+------------------+------------------+

+-----------+-----+-----------------------+----------------------+----------------------+---------------------+
|exit_action|count|mean_overlapping_chunks|std_overlapping_chunks|mean_overlapping_chars|std_overlapping_chars|
+-----------+-----+-----------------------+----------------------+----------------------+---------------------+
|     cancel|   55|                    1.4|                   1.6|   

with_chunks = [conflict_timestamp: timestamp, wiki: string ... 14 more fields]


[conflict_timestamp: timestamp, wiki: string ... 14 more fields]

In [5]:
// Elapsed-time statistics per exit_action, restricted to rows that have an
// exit_timestamp. Every statistic is rounded to a whole number.
// NOTE(review): the output recorded for this cell shows negative
// min_elapsed_s values; confirm how elapsed_s is derived upstream.
linked_exits
  .where(col("exit_timestamp").isNotNull)
  .groupBy(col("exit_action"))
  .agg(
    round(avg(col("elapsed_s"))).as("mean_elapsed_s"),
    round(stddev(col("elapsed_s"))).as("std_elapsed_s"),
    round(min(col("elapsed_s"))).as("min_elapsed_s"),
    round(max(col("elapsed_s"))).as("max_elapsed_s")
  )
  .show()

+-----------+--------------+-------------+-------------+-------------+
|exit_action|mean_elapsed_s|std_elapsed_s|min_elapsed_s|max_elapsed_s|
+-----------+--------------+-------------+-------------+-------------+
|     cancel|         116.0|        413.0|          -80|         3016|
|    unknown|         100.0|        693.0|        -1248|        16267|
|       save|         110.0|        768.0|        -2876|        25658|
+-----------+--------------+-------------+-------------+-------------+

