In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session, running locally on a single
# worker thread.
spark = (
    SparkSession.builder
    .appName("lakeFSNotebook")
    .master("local[1]")
    .getOrCreate()
)

# Read the challenge instructions as a one-column ("value") text DataFrame.
instructions = spark.read.text("lakefs://oreilly-challenge/main/instructions.txt")

# Print every line of the file, without truncating long lines.
instruction_line_total = instructions.count()
instructions.show(n=instruction_line_total, truncate=False)

+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                  |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|# Welcome to the lakeFS data challenge! You've found the first dataset to explore.                                                                                     |
|                                                                                                                                                                       |
|In order to succeed you'd need to know the following:                                                                                                

In [2]:
# Load the two challenge datasets from the `main` ref of the
# `oreilly-challenge` lakeFS repository.
# Plain string literals are used: the paths contain no placeholders, so an
# f-string prefix would be redundant.
users = spark.read.parquet("lakefs://oreilly-challenge/main/datasets/users/*.parquet")
# The glob matches the parquet files under every `EventType=...` directory.
events = spark.read.parquet("lakefs://oreilly-challenge/main/datasets/user_events/EventType=*/*.parquet")

In [3]:
from pyspark.sql import functions as F

# One row per UID with the number of event rows recorded for that UID.
countEventsDF = events.groupBy("UID").count()
countEventsDF.show()

# Aggregate expressions applied to the per-user counts: the largest count.
exprs = [F.max("count")]
countEventsDF.agg(*exprs).show()

+-----+-----+
|  UID|count|
+-----+-----+
|42852|   89|
|43367|   99|
|44342|    7|
|44901|   91|
|45726|   25|
|48280|    2|
|48603|    6|
|48899|    3|
|49326|    1|
|49586|   25|
|49754|   91|
|49967|    2|
|50124|   99|
|50287|   89|
|51056|    6|
|52001|    7|
|52051|   99|
|52611|    6|
|52743|    6|
|52910|    6|
+-----+-----+
only showing top 20 rows

+----------+
|max(count)|
+----------+
|        99|
+----------+



In [4]:
# Keep only the users whose event count is strictly below 50.
events_per_user = countEventsDF["count"]
less_than_50 = countEventsDF.where(events_per_user < 50)
less_than_50.show()

+-----+-----+
|  UID|count|
+-----+-----+
|44342|    7|
|45726|   25|
|48280|    2|
|48603|    6|
|48899|    3|
|49326|    1|
|49586|   25|
|49967|    2|
|51056|    6|
|52001|    7|
|52611|    6|
|52743|    6|
|52910|    6|
|53294|    3|
|53721|    6|
|54039|    6|
|54415|    1|
|54536|   25|
|54660|    2|
|55426|    6|
+-----+-----+
only showing top 20 rows



In [5]:
# Per-user event counts computed from the `improvedDataJune2020` ref.
# The result gets its own name instead of rebinding `events`: `events` holds the
# raw event rows loaded earlier, and overwriting it with an aggregated frame
# would make the cell that groups `events` by UID give different results when
# re-run after this one.
improved_event_counts = (
    spark.read
    .parquet("lakefs://oreilly-challenge/improvedDataJune2020/datasets/user_events/EventType=*/*.parquet")
    .groupBy("UID")
    .count()
)

# Keep only the users whose event count is strictly above 50.
more_than_50 = improved_event_counts.filter(improved_event_counts["count"] > 50)
more_than_50.show()
# Trailing expression: displays the DataFrame's schema summary as the cell output.
more_than_50

+-----+-----+
|  UID|count|
+-----+-----+
|15194|   91|
|15322|   89|
|17048|   99|
|18147|   91|
|18196|   91|
|18295|   89|
|19158|   89|
|19907|   91|
|21223|   89|
|21342|   89|
|  964|   89|
| 1697|   91|
| 1806|   91|
| 2250|   89|
| 2927|   89|
| 4590|   99|
| 5385|   89|
|37310|   89|
|38287|   91|
|38543|   99|
+-----+-----+
only showing top 20 rows



DataFrame[UID: bigint, count: bigint]

In [6]:
# Inner-join the two cohorts on UID: users with more than 50 events in one
# frame and fewer than 50 events in the other. Because the join uses an
# explicit equality condition, the result keeps the UID and count columns
# from both sides.
same_uid = more_than_50["UID"] == less_than_50["UID"]
joined = more_than_50.join(less_than_50, on=same_uid, how="inner")
joined.show()

+---+-----+---+-----+
|UID|count|UID|count|
+---+-----+---+-----+
|  0|   99|  0|    6|
|  1|   89|  1|    2|
|  9|   91|  9|    7|
| 14|   99| 14|    6|
| 21|   89| 21|    3|
| 28|   99| 28|    2|
| 31|   91| 31|    2|
| 33|   99| 33|    7|
| 35|   91| 35|    2|
| 53|   91| 53|    6|
| 54|   89| 54|    1|
| 82|   89| 82|    1|
|102|   99|102|    2|
|113|   91|113|    1|
|114|   99|114|    6|
|120|   99|120|   25|
|130|   89|130|    2|
|157|   91|157|    6|
|160|   99|160|    1|
|180|   91|180|   25|
+---+-----+---+-----+
only showing top 20 rows



In [7]:
# Number of rows in the join, i.e. UIDs present in both cohorts.
joined_row_total = joined.count()
joined_row_total

103351

In [8]:
# Load the treasure-chest table from the `prizes` ref.
prizes = spark.read.parquet("lakefs://oreilly-challenge/prizes/datasets/treasure_chests/*.parquet")

# The chest key is the number of rows in `joined`. Compute it here instead of
# pasting the number from an earlier cell's output (it was 103351 when this
# notebook was last run), so the lookup stays correct if the data changes.
chest_key = joined.count()
prizes.filter(prizes["Key"] == chest_key).show(truncate=False)

+----------------------+------+
|Chest                 |Key   |
+----------------------+------+
|https://bit.ly/3QbH1Rn|103351|
+----------------------+------+

