In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install pyspark



In [2]:
# Mount Google Drive at /content/drive; the parquet inputs read later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from pyspark.sql import SparkSession

# Create the Spark session used by every later cell (or reuse one that is
# already running in this kernel).
spark = (
    SparkSession.builder
    .appName("ReadParquetFiles")
    .getOrCreate()
)

In [11]:
# Load the four parquet inputs from Drive in one pass. The order of the paths
# matches the order of the names on the left-hand side.
qtb_df, date_df, question_df, tags_df = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "/content/drive/MyDrive/spark/tags/bridge tags..9904f03e6cb7-c000.snappy.parquet",
        "/content/drive/MyDrive/spark/tags/date...bfe2-ee860b19f8da-c000.snappy.parquet",
        "/content/drive/MyDrive/spark/tags/factttttttt_question2c273d075e62-c000.snappy.parquet",
        "/content/drive/MyDrive/spark/tags/tagggggs9336-b386ef5735ec-c000.snappy.parquet",
    )
)


In [12]:
# Print a preview of the date DataFrame to check its columns (DateKey, Month, Year, ...).
date_df.show()

+----------+--------+----------+-----+----+---------+---------+----------+---------+-------+----------+
|  FullDate| DateKey|DayOfMonth|Month|Year|DayOfWeek|  DayName|WeekOfYear|MonthName|Quarter|FiscalYear|
+----------+--------+----------+-----+----+---------+---------+----------+---------+-------+----------+
|2010-01-01|20100101|         1|    1|2010|        6|   Friday|        53|  January|      1|      2010|
|2010-01-02|20100102|         2|    1|2010|        7| Saturday|        53|  January|      1|      2010|
|2010-01-03|20100103|         3|    1|2010|        1|   Sunday|        53|  January|      1|      2010|
|2010-01-04|20100104|         4|    1|2010|        2|   Monday|         1|  January|      1|      2010|
|2010-01-05|20100105|         5|    1|2010|        3|  Tuesday|         1|  January|      1|      2010|
|2010-01-06|20100106|         6|    1|2010|        4|Wednesday|         1|  January|      1|      2010|
|2010-01-07|20100107|         7|    1|2010|        5| Thursday| 

In [13]:
# Expose each DataFrame to Spark SQL under the short name the query uses.
# Registrations are independent of each other; listed in load order.
qtb_df.createOrReplaceTempView("qtb")            # tag <-> question bridge rows
date_df.createOrReplaceTempView("date")          # calendar rows keyed by DateKey
question_df.createOrReplaceTempView("question")  # question rows
tags_df.createOrReplaceTempView("tags")          # tag rows (tag_sk, tagname)

In [29]:
# For every tag and calendar month: how many joined tag/question rows were
# created in that month (monthly_count), plus the running total for that tag up
# to and including that month (monthly_cumulative_count).
#
# The counts are aggregated straight to month level. An intermediate per-day
# aggregate that is immediately summed per month yields the same numbers but
# costs an extra aggregation stage.
result4 = spark.sql("""
WITH monthly_counts AS (
    -- One row per (tagname, Year, Month).
    SELECT
        d.Month,
        d.Year,
        t.tagname,
        COUNT(*) AS monthly_count
    FROM tags t
    JOIN qtb q ON t.tag_sk = q.TagID
    JOIN question qn ON q.Questions_Desc_SK = qn.Questions_Desc_FK
    JOIN date d ON qn.CreationDate_FK = d.DateKey
    GROUP BY d.Month, d.Year, t.tagname
),
with_cumulative AS (
    -- Running total per tag, in chronological order.
    SELECT
        mc.Month,
        mc.Year,
        mc.tagname,
        mc.monthly_count,
        SUM(mc.monthly_count) OVER (
            PARTITION BY mc.tagname
            ORDER BY mc.Year, mc.Month
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS monthly_cumulative_count
    FROM monthly_counts mc
)
SELECT
    Month,
    Year,
    tagname,
    monthly_count,
    monthly_cumulative_count
FROM with_cumulative
ORDER BY tagname, Year, Month
""")

In [30]:
# Print a preview of the per-tag monthly counts and their running totals.
result4.show()

+-----+----+--------------+-------------+------------------------+
|Month|Year|       tagname|monthly_count|monthly_cumulative_count|
+-----+----+--------------+-------------+------------------------+
|   10|2010|absolute-value|            2|                       2|
|   11|2010|absolute-value|            1|                       3|
|    1|2011|absolute-value|            2|                       5|
|    2|2011|absolute-value|            1|                       6|
|    3|2011|absolute-value|            4|                      10|
|    5|2011|absolute-value|            1|                      11|
|    8|2011|absolute-value|            3|                      14|
|    9|2011|absolute-value|            2|                      16|
|   10|2011|absolute-value|            2|                      18|
|   11|2011|absolute-value|            3|                      21|
|   12|2011|absolute-value|            2|                      23|
|    1|2012|absolute-value|            3|                     

In [35]:
# Persist the query result to Drive as parquet.
output_path = "/content/drive/MyDrive/spark/output/result4.parquet"
# Write `result4`, the DataFrame produced by the SQL cell above. The name
# `result` is never defined in this notebook, so on a fresh kernel it would
# raise NameError.
# coalesce(1) reduces the data to a single partition before writing.
# NOTE(review): with the writer's default save mode, re-running this cell is
# expected to fail if output_path already exists — confirm whether
# .mode("overwrite") is wanted.
result4.coalesce(1).write.parquet(output_path)