In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import argparse

# Build (or reuse) a Hive-enabled session. Both settings are applied on the
# builder before the session is obtained: dynamic partition overwrite for Spark
# sources and non-strict dynamic partitioning for Hive.
session_builder = SparkSession.builder
for setting_name, setting_value in (
    ('spark.sql.sources.partitionOverwriteMode', 'dynamic'),
    ('hive.exec.dynamic.partition.mode', 'nonstrict'),
):
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.enableHiveSupport().getOrCreate()

In [20]:
# ! hdfs dfs -ls /data/stackoverflow/parquet

In [21]:
# spark.catalog.createTable('stackoverflow.tags', path='/data/stackoverflow/parquet/Tags')

In [23]:
def _iso_day(value):
    """Validate a ``--date`` value and return it normalized to ``YYYY-MM-DD``.

    Raises ``argparse.ArgumentTypeError`` (reported by argparse as a usage
    error) when ``value`` is not a real calendar date in year-month-day form.
    """
    from datetime import datetime  # local import keeps this cell self-contained

    try:
        return datetime.strptime(value, '%Y-%m-%d').date().isoformat()
    except ValueError:
        raise argparse.ArgumentTypeError(
            'expected a date as YYYY-MM-DD, got {!r}'.format(value))


# --date is mandatory and validated up front. Without it DAY would be None, and
# a malformed value would not cast to a date; in both cases the
# `CreationDate.cast('date') == DAY` filters below compare against NULL and
# match no rows, so the job would "succeed" while producing nothing.
parser = argparse.ArgumentParser()
parser.add_argument('--date', required=True, type=_iso_day,
                    help='day to compute statistics for, as YYYY-MM-DD')
args = parser.parse_args()

# Kept as a string: it is compared against date-cast columns and also written
# verbatim into the string-typed `dt` partition column.
DAY = args.date

In [5]:
# Source table of posts; the cells below tell questions and answers apart by
# PostTypeId.
posts = spark.read.table('stackoverflow.posts')

In [6]:
# posts.groupBy('PostTypeId').count().show()

In [7]:
# PostTypeId 1 marks a question, according to
# https://ia800107.us.archive.org/27/items/stackexchange/readme.txt
is_question = col('PostTypeId') == 1

# Questions created on DAY, exploded to one row per (question, tag) and then
# counted per tag. explode() implies Tags is an array-like column.
created_on_day = col('CreationDate').cast('date') == DAY
questions_on_day = posts.where(is_question & created_on_day)
tag_popularity_in_questions = (
    questions_on_day
    .select('Id', explode('Tags').alias('tag'))
    .groupBy('tag')
    .count()
)

In [8]:
# tag_popularity_in_questions.orderBy(desc('count')).show()

In [9]:
# Counterpart of is_question: PostTypeId 2 is treated as "answer"
# (presumably per the same Stack Exchange readme — TODO confirm).
is_answer = col('PostTypeId') == 2

In [10]:
# Answers posted on DAY are matched to their parent question (of any date) and
# counted under that question's tags.
posted_on_day = col('CreationDate').cast('date') == DAY
answers_today = posts.where(is_answer & posted_on_day).alias('a')
all_questions = posts.where(is_question).alias('q')

answer_to_parent = col('a.ParentId') == col('q.Id')
parent_tags = answers_today.join(all_questions, answer_to_parent).select('q.Tags')
tag_popularity_in_answers = (
    parent_tags
    .select(explode('Tags').alias('tag'))
    .groupBy('tag')
    .count()
)

In [11]:
# Votes cast on DAY, restricted to VoteTypeId 2 and 3 (presumably up- and
# down-votes — verify against the Stack Exchange dump readme). Only the id of
# the voted-on post is kept.
votes = spark.read.table('stackoverflow.votes')
wanted_vote_type = col('VoteTypeId').isin(2, 3)
cast_on_day = col('CreationDate').cast('date') == DAY
votes_today = votes.where(wanted_vote_type & cast_on_day).select('PostId')

In [12]:
# spark.conf.set('spark.sql.autoBroadcastJoinThreshold', '1000000')
# https://jaceklaskowski.gitbooks.io/mastering-spark-sql/spark-sql-joins-broadcast.html

In [13]:
# Aliased views for the vote join below: 'p' is every post, 'q' is the
# question-only subset with the PostTypeId column dropped.
all_posts = posts.alias('p')
questions_only = posts.where(is_question)
all_questions = questions_only.drop('PostTypeId').alias('q')

In [14]:
# One row per vote cast on DAY (the small votes_today side is broadcast), then
# a left join to the parent question. coalesce() prefers the parent question's
# tags and falls back to the post's own tags when no parent row matched.
voted_posts = all_posts.join(broadcast(votes_today), col('p.Id') == votes_today.PostId)
voted_with_parent = voted_posts.join(
    all_questions, col('p.ParentId') == col('q.Id'), how='left')
tag_popularity_in_votes = (
    voted_with_parent
    .select(coalesce('q.Tags', 'p.Tags').alias('valid_tags'))
    .select(explode('valid_tags').alias('tag'))
    .groupBy('tag')
    .count()
)

In [15]:
# Give each per-tag count its own column name so the three frames can be
# joined on 'tag' without clashing 'count' columns.
questions_points = tag_popularity_in_questions.select('tag', col('count').alias('questions'))
answers_points = tag_popularity_in_answers.select('tag', col('count').alias('answers'))
votes_points = tag_popularity_in_votes.select('tag', col('count').alias('votes'))

In [16]:
# Full joins keep every tag that appears in at least one of the three measures;
# a measure missing for a tag becomes 0. The processed day is stamped on as 'dt'.
point_columns = ['questions', 'answers', 'votes']
all_tag_points = (
    questions_points
    .join(answers_points, on='tag', how='full')
    .join(votes_points, on='tag', how='full')
)
results = all_tag_points.fillna(0, subset=point_columns).withColumn('dt', lit(DAY))

In [17]:
# Target table for the per-tag daily statistics. `if not exists` makes this a
# no-op once the table is there. The table is partitioned by `dt`, and the
# column order declared here (tag, questions, answers, votes, then the dt
# partition column) is the order a positional insertInto() write has to match.
create_table_statement = '''
create table if not exists stackoverflow.techstats (
  tag string,
  questions bigint,
  answers bigint,
  votes bigint
) 
partitioned by (dt string)
'''
spark.sql(create_table_statement)

DataFrame[]

In [18]:
# insertInto() matches columns by position, not by name, so the order is pinned
# to the table definition: data columns first, the `dt` partition column last.
#
# overwrite=True makes a re-run for the same DAY replace that day's partition
# instead of appending a second copy of its rows. The session is configured for
# dynamic partition overwrite / non-strict dynamic partitioning, so only the
# partitions present in `results` are rewritten; other days stay untouched.
(results
    .select('tag', 'questions', 'answers', 'votes', 'dt')
    .write
    .insertInto('stackoverflow.techstats', overwrite=True))