In [1]:
from pyspark.sql import SparkSession
import getpass
# Current OS user; used below to build a per-user warehouse directory.
username = getpass.getuser()

# Build (or reuse) a Hive-enabled SparkSession that runs on YARN.
spark = (
    SparkSession.builder
    .config('spark.ui.port', '0')
    .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
    .enableHiveSupport()
    .master('yarn')
    .getOrCreate()
)

In [2]:
# Small in-memory sample of (log level, timestamp string) pairs.
# Month and day are not zero-padded in these strings.
logs_data = [
    ("DEBUG", "2014-6-22 21:30:49"),
    ("WARN", "2013-12-6 17:54:15"),
    ("DEBUG", "2017-1-12 10:47:02"),
    ("DEBUG", "2016-6-25 11:06:42"),
    ("ERROR", "2015-6-28 19:25:05"),
    ("DEBUG", "2012-6-24 01:06:37"),
    ("INFO", "2014-12-9 09:53:54"),
    ("DEBUG", "2015-11-8 19:20:08"),
    ("INFO", "2017-12-21 18:34:18"),
]

In [3]:
# Build a DataFrame from the sample tuples; column types are inferred from
# the data and the two columns are named loglevel and logtime.
log_df = spark.createDataFrame(logs_data, schema=['loglevel', 'logtime'])

In [4]:
# Print the sample rows to confirm both columns were populated.
log_df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|   DEBUG| 2014-6-22 21:30:49|
|    WARN| 2013-12-6 17:54:15|
|   DEBUG| 2017-1-12 10:47:02|
|   DEBUG| 2016-6-25 11:06:42|
|   ERROR| 2015-6-28 19:25:05|
|   DEBUG| 2012-6-24 01:06:37|
|    INFO| 2014-12-9 09:53:54|
|   DEBUG| 2015-11-8 19:20:08|
|    INFO|2017-12-21 18:34:18|
+--------+-------------------+



In [5]:
# Inspect the inferred column types (logtime is still a plain string here).
log_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: string (nullable = true)



In [6]:
from pyspark.sql.functions import *

In [7]:
# Replace the string logtime column with its timestamp-typed parse.
# No explicit format is passed to to_timestamp.
new_log_df = log_df.withColumn("logtime", to_timestamp("logtime"))

In [8]:
# Confirm that logtime is now of type timestamp.
new_log_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: timestamp (nullable = true)



In [9]:
# Register the converted sample data as temp view "serverlogs" so it can be
# queried with spark.sql.
new_log_df.createOrReplaceTempView("serverlogs")

In [10]:
# Sanity check: read every row back through the serverlogs view.
spark.sql("select * from serverlogs").show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|   DEBUG|2014-06-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-12-21 18:34:18|
+--------+-------------------+



In [11]:
# Count entries per (loglevel, full month name) in the serverlogs view.
# Adjacent string literals are concatenated into one SQL statement.
spark.sql(
    "select loglevel, date_format(logtime, 'MMMM') as month, "
    "count(*) as total_occurence "
    "from serverlogs "
    "group by loglevel,month"
).show()

+--------+--------+---------------+
|loglevel|   month|total_occurence|
+--------+--------+---------------+
|    WARN|December|              1|
|   ERROR|    June|              1|
|   DEBUG| January|              1|
|   DEBUG|November|              1|
|   DEBUG|    June|              3|
|    INFO|December|              2|
+--------+--------+---------------+



In [12]:
# DDL-style schema string: loglevel as string, logtime as timestamp.
logschema = "loglevel string, logtime timestamp"

In [13]:
# Load the CSV file using the explicit schema in logschema.
# NOTE: this rebinds log_df, which previously held the in-memory sample data.
log_df = (
    spark.read
    .format("csv")
    .schema(logschema)
    .load("/public/trendytech/datasets/logdata1m.csv")
)

In [14]:
# Preview the loaded CSV data (show() prints only the first 20 rows).
log_df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
|   DEBUG|2014-04-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-07-21 18:34:18|
|   DEBUG|2014-12-26 06:38:42|
+--------+-------------------+
only showing top 20 rows



In [15]:
# Total number of rows loaded from the CSV file.
log_df.count()

1000000

In [16]:
# Point the "serverlogs" view at the CSV-backed DataFrame. This replaces the
# earlier view of the same name that was built from the sample data.
log_df.createOrReplaceTempView("serverlogs")

In [17]:
# Sanity check: the view now returns rows from the CSV data.
spark.sql("select * from serverlogs").show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
|   DEBUG|2014-04-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-07-21 18:34:18|
|   DEBUG|2014-12-26 06:38:42|
+--------+-------------------+
only showing top 20 rows



In [18]:
# Project each row to (loglevel, full month name); 'MMMM' renders the month
# as text such as "August".
spark.sql("select loglevel, date_format(logtime, 'MMMM') as month from serverlogs").show()

+--------+--------+
|loglevel|   month|
+--------+--------+
|    INFO|  August|
|    WARN| January|
|    INFO|    June|
|    INFO| January|
|   DEBUG|    July|
|    INFO|February|
|    INFO|    July|
|    INFO|   April|
|   DEBUG|November|
|    INFO|  August|
|   DEBUG|   April|
|    WARN|December|
|   DEBUG| January|
|   DEBUG|    June|
|   ERROR|    June|
|   DEBUG|    June|
|    INFO|December|
|   DEBUG|November|
|    INFO|    July|
|   DEBUG|December|
+--------+--------+
only showing top 20 rows



In [19]:
# Count entries per (loglevel, full month name). No ordering is requested,
# so the row order of the result is arbitrary.
spark.sql(
    "select loglevel, date_format(logtime, 'MMMM') as month, "
    "count(*) as total_occurences "
    "from serverlogs "
    "group by loglevel,month"
).show()

+--------+---------+----------------+
|loglevel|    month|total_occurences|
+--------+---------+----------------+
|    WARN|     June|            8191|
|    INFO|     June|           29143|
|   ERROR| November|            3389|
|   FATAL|  January|              94|
|    WARN| December|            8328|
|    WARN|    March|            8165|
|   DEBUG|     July|           42085|
|   ERROR|    April|            4107|
|   ERROR|  January|            4054|
|   FATAL|September|              81|
|   FATAL|    April|              83|
|    INFO|September|           29038|
|   FATAL| November|           16797|
|   FATAL|  October|              92|
|    INFO| February|           28983|
|    WARN|    April|            8277|
|   DEBUG| December|           41749|
|   FATAL| December|              94|
|    WARN|      May|            8403|
|   ERROR|     June|            4059|
+--------+---------+----------------+
only showing top 20 rows



In [20]:
# Same aggregation, ordered by the month name. Because month is text, the
# ordering is alphabetical (April, August, December, ...), not calendar order.
spark.sql(
    "select loglevel, date_format(logtime, 'MMMM') as month, "
    "count(*) as total_occurences "
    "from serverlogs "
    "group by loglevel,month "
    "order by month"
).show()

+--------+--------+----------------+
|loglevel|   month|total_occurences|
+--------+--------+----------------+
|   FATAL|   April|              83|
|    INFO|   April|           29302|
|    WARN|   April|            8277|
|   ERROR|   April|            4107|
|   DEBUG|   April|           41869|
|    INFO|  August|           28993|
|   FATAL|  August|              80|
|   ERROR|  August|            3987|
|   DEBUG|  August|           42147|
|    WARN|  August|            8381|
|    WARN|December|            8328|
|   ERROR|December|            4106|
|   DEBUG|December|           41749|
|    INFO|December|           28874|
|   FATAL|December|              94|
|    WARN|February|            8266|
|   FATAL|February|              72|
|   DEBUG|February|           41734|
|    INFO|February|           28983|
|   ERROR|February|            4013|
+--------+--------+----------------+
only showing top 20 rows



In [21]:
# Add a month number via pattern 'M' and order by it. The value is a string,
# so the ordering is lexicographic: 1, 10, 11, 12, 2, ...
spark.sql(
    "select loglevel, date_format(logtime, 'MMMM') as month, "
    "date_format(logtime, 'M') as month_num, "
    "count(*) as total_occurences "
    "from serverlogs "
    "group by loglevel,month, month_num "
    "order by month_num"
).show()

+--------+--------+---------+----------------+
|loglevel|   month|month_num|total_occurences|
+--------+--------+---------+----------------+
|   DEBUG| January|        1|           41961|
|   ERROR| January|        1|            4054|
|    INFO| January|        1|           29119|
|    WARN| January|        1|            8217|
|   FATAL| January|        1|              94|
|    WARN| October|       10|            8226|
|   ERROR| October|       10|            4040|
|   FATAL| October|       10|              92|
|   DEBUG| October|       10|           41936|
|    INFO| October|       10|           29018|
|   DEBUG|November|       11|           33366|
|   ERROR|November|       11|            3389|
|    INFO|November|       11|           23301|
|   FATAL|November|       11|           16797|
|    WARN|November|       11|            6616|
|    INFO|December|       12|           28874|
|   ERROR|December|       12|            4106|
|   FATAL|December|       12|              94|
|    WARN|Dec

In [22]:
# Cast the month number to int so that ordering is numeric (1, 2, 3, ...).
spark.sql(
    "select loglevel, date_format(logtime, 'MMMM') as month, "
    "int(date_format(logtime, 'M')) as month_num, "
    "count(*) as total_occurences "
    "from serverlogs "
    "group by loglevel,month, month_num "
    "order by month_num"
).show()

+--------+--------+---------+----------------+
|loglevel|   month|month_num|total_occurences|
+--------+--------+---------+----------------+
|    WARN| January|        1|            8217|
|   FATAL| January|        1|              94|
|   DEBUG| January|        1|           41961|
|    INFO| January|        1|           29119|
|   ERROR| January|        1|            4054|
|    INFO|February|        2|           28983|
|    WARN|February|        2|            8266|
|   DEBUG|February|        2|           41734|
|   ERROR|February|        2|            4013|
|   FATAL|February|        2|              72|
|    INFO|   March|        3|           29095|
|   FATAL|   March|        3|              70|
|   DEBUG|   March|        3|           41652|
|    WARN|   March|        3|            8165|
|   ERROR|   March|        3|            4122|
|   DEBUG|   April|        4|           41869|
|   FATAL|   April|        4|              83|
|    INFO|   April|        4|           29302|
|    WARN|   

In [23]:
# Alternative to casting: the zero-padded pattern 'MM' (01, 02, ...) sorts in
# calendar order even though the value is still a string.
spark.sql(
    "select loglevel, date_format(logtime, 'MMMM') as month, "
    "date_format(logtime, 'MM') as month_num, "
    "count(*) as total_occurences "
    "from serverlogs "
    "group by loglevel,month, month_num "
    "order by month_num"
).show()

+--------+--------+---------+----------------+
|loglevel|   month|month_num|total_occurences|
+--------+--------+---------+----------------+
|   DEBUG| January|       01|           41961|
|    INFO| January|       01|           29119|
|    WARN| January|       01|            8217|
|   ERROR| January|       01|            4054|
|   FATAL| January|       01|              94|
|    WARN|February|       02|            8266|
|   FATAL|February|       02|              72|
|   DEBUG|February|       02|           41734|
|   ERROR|February|       02|            4013|
|    INFO|February|       02|           28983|
|   ERROR|   March|       03|            4122|
|   DEBUG|   March|       03|           41652|
|   FATAL|   March|       03|              70|
|    WARN|   March|       03|            8165|
|    INFO|   March|       03|           29095|
|    WARN|   April|       04|            8277|
|   FATAL|   April|       04|              83|
|   DEBUG|   April|       04|           41869|
|    INFO|   

In [24]:
# Keep month_num out of the grouping keys by taking it with first(). Every row
# in a (loglevel, month) group shares the same month number, so any row's
# value is the right one.
spark.sql(
    "select loglevel, date_format(logtime, 'MMMM') as month, "
    "first(date_format(logtime, 'MM')) as month_num, "
    "count(*) as total_occurences "
    "from serverlogs "
    "group by loglevel,month "
    "order by month_num"
).show()

+--------+--------+---------+----------------+
|loglevel|   month|month_num|total_occurences|
+--------+--------+---------+----------------+
|    INFO| January|       01|           29119|
|   ERROR| January|       01|            4054|
|   FATAL| January|       01|              94|
|    WARN| January|       01|            8217|
|   DEBUG| January|       01|           41961|
|   ERROR|February|       02|            4013|
|   DEBUG|February|       02|           41734|
|   FATAL|February|       02|              72|
|    INFO|February|       02|           28983|
|    WARN|February|       02|            8266|
|   ERROR|   March|       03|            4122|
|    WARN|   March|       03|            8165|
|    INFO|   March|       03|           29095|
|   DEBUG|   March|       03|           41652|
|   FATAL|   March|       03|              70|
|    WARN|   April|       04|            8277|
|   ERROR|   April|       04|            4107|
|   FATAL|   April|       04|              83|
|    INFO|   

In [25]:
# Per-(loglevel, month) counts ordered by zero-padded month number, kept as a
# DataFrame for further processing. first() is safe here because all rows in
# a group share the same month number.
result_df = spark.sql(
    "select loglevel, date_format(logtime, 'MMMM') as month, "
    "first(date_format(logtime, 'MM')) as month_num, "
    "count(*) as total_occurences "
    "from serverlogs "
    "group by loglevel,month "
    "order by month_num"
)

In [26]:
# Preview the ordered aggregation (first 20 rows).
result_df.show()

+--------+--------+---------+----------------+
|loglevel|   month|month_num|total_occurences|
+--------+--------+---------+----------------+
|    WARN| January|       01|            8217|
|   FATAL| January|       01|              94|
|    INFO| January|       01|           29119|
|   ERROR| January|       01|            4054|
|   DEBUG| January|       01|           41961|
|    WARN|February|       02|            8266|
|   DEBUG|February|       02|           41734|
|   FATAL|February|       02|              72|
|    INFO|February|       02|           28983|
|   ERROR|February|       02|            4013|
|   ERROR|   March|       03|            4122|
|   FATAL|   March|       03|              70|
|   DEBUG|   March|       03|           41652|
|    WARN|   March|       03|            8165|
|    INFO|   March|       03|           29095|
|   ERROR|   April|       04|            4107|
|   FATAL|   April|       04|              83|
|    WARN|   April|       04|            8277|
|    INFO|   

In [27]:
# month_num was used by result_df's query for ordering; remove it from the
# presented columns.
final_df = result_df.drop("month_num")

In [28]:
# Preview the final result without the helper month_num column.
final_df.show()

+--------+--------+----------------+
|loglevel|   month|total_occurences|
+--------+--------+----------------+
|   FATAL| January|              94|
|    WARN| January|            8217|
|   ERROR| January|            4054|
|   DEBUG| January|           41961|
|    INFO| January|           29119|
|   ERROR|February|            4013|
|   FATAL|February|              72|
|    INFO|February|           28983|
|    WARN|February|            8266|
|   DEBUG|February|           41734|
|   ERROR|   March|            4122|
|    INFO|   March|           29095|
|    WARN|   March|            8165|
|   DEBUG|   March|           41652|
|   FATAL|   March|              70|
|    WARN|   April|            8277|
|   ERROR|   April|            4107|
|    INFO|   April|           29302|
|   DEBUG|   April|           41869|
|   FATAL|   April|              83|
+--------+--------+----------------+
only showing top 20 rows



In [29]:
# Row-level (loglevel, month name) projection; this is the input shape used
# by the pivot cells that follow.
spark.sql("select loglevel,date_format(logtime,'MMMM') as month from serverlogs").show()

+--------+--------+
|loglevel|   month|
+--------+--------+
|    INFO|  August|
|    WARN| January|
|    INFO|    June|
|    INFO| January|
|   DEBUG|    July|
|    INFO|February|
|    INFO|    July|
|    INFO|   April|
|   DEBUG|November|
|    INFO|  August|
|   DEBUG|   April|
|    WARN|December|
|   DEBUG| January|
|   DEBUG|    June|
|   ERROR|    June|
|   DEBUG|    June|
|    INFO|December|
|   DEBUG|November|
|    INFO|    July|
|   DEBUG|December|
+--------+--------+
only showing top 20 rows



In [30]:
# Pivot month names into columns: one row per loglevel, one count per month.
# With no explicit value list, the month columns appear in alphabetical order.
(
    spark.sql("select loglevel,date_format(logtime,'MMMM') as month from serverlogs")
    .groupBy('loglevel')
    .pivot('month')
    .count()
    .show()
)

+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|loglevel|April|August|December|February|January| July| June|March|  May|November|October|September|
+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+
|    INFO|29302| 28993|   28874|   28983|  29119|29300|29143|29095|28900|   23301|  29018|    29038|
|   ERROR| 4107|  3987|    4106|    4013|   4054| 3976| 4059| 4122| 4086|    3389|   4040|     4161|
|    WARN| 8277|  8381|    8328|    8266|   8217| 8222| 8191| 8165| 8403|    6616|   8226|     8352|
|   DEBUG|41869| 42147|   41749|   41734|  41961|42085|41774|41652|41785|   33366|  41936|    41433|
|   FATAL|   83|    80|      94|      72|     94|   98|   78|   70|   60|   16797|     92|       81|
+--------+-----+------+--------+--------+-------+-----+-----+-----+-----+--------+-------+---------+



In [31]:
# Pivot on the zero-padded month number instead, so that the columns
# (01 .. 12) come out in calendar order.
(
    spark.sql("select loglevel,date_format(logtime,'MM') as month from serverlogs")
    .groupBy('loglevel')
    .pivot('month')
    .count()
    .show()
)

+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|loglevel|   01|   02|   03|   04|   05|   06|   07|   08|   09|   10|   11|   12|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|    INFO|29119|28983|29095|29302|28900|29143|29300|28993|29038|29018|23301|28874|
|   ERROR| 4054| 4013| 4122| 4107| 4086| 4059| 3976| 3987| 4161| 4040| 3389| 4106|
|    WARN| 8217| 8266| 8165| 8277| 8403| 8191| 8222| 8381| 8352| 8226| 6616| 8328|
|   DEBUG|41961|41734|41652|41869|41785|41774|42085|42147|41433|41936|33366|41749|
|   FATAL|   94|   72|   70|   83|   60|   78|   98|   80|   81|   92|16797|   94|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+



In [32]:
# Full month names in calendar order, used as explicit pivot values.
month_list = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

In [33]:
# Pivot on month name, passing month_list as the explicit pivot values so the
# columns follow the order of that list.
(
    spark.sql("select loglevel,date_format(logtime,'MMMM') as month from serverlogs")
    .groupBy('loglevel')
    .pivot('month', month_list)
    .count()
    .show()
)

+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|loglevel|January|February|March|April|  May| June| July|August|September|October|November|December|
+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|    INFO|  29119|   28983|29095|29302|28900|29143|29300| 28993|    29038|  29018|   23301|   28874|
|   ERROR|   4054|    4013| 4122| 4107| 4086| 4059| 3976|  3987|     4161|   4040|    3389|    4106|
|    WARN|   8217|    8266| 8165| 8277| 8403| 8191| 8222|  8381|     8352|   8226|    6616|    8328|
|   FATAL|     94|      72|   70|   83|   60|   78|   98|    80|       81|     92|   16797|      94|
|   DEBUG|  41961|   41734|41652|41869|41785|41774|42085| 42147|    41433|  41936|   33366|   41749|
+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+



In [34]:
# NOTE(review): the first entry is 'Jan', which does not match the full month
# name ('January') that the 'MMMM' pattern produces. This looks like a
# deliberate demonstration of a non-matching pivot value; confirm. It also
# overwrites the earlier, fully spelled-out month_list.
month_list = [
    'Jan', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

In [35]:
# Pivot using month_list as currently defined. A pivot value that never occurs
# in the month column (such as 'Jan') yields a column of nulls, and rows whose
# month is not in the list are not counted in any column.
(
    spark.sql("select loglevel,date_format(logtime,'MMMM') as month from serverlogs")
    .groupBy('loglevel')
    .pivot('month', month_list)
    .count()
    .show()
)

+--------+----+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|loglevel| Jan|February|March|April|  May| June| July|August|September|October|November|December|
+--------+----+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|    INFO|null|   28983|29095|29302|28900|29143|29300| 28993|    29038|  29018|   23301|   28874|
|   ERROR|null|    4013| 4122| 4107| 4086| 4059| 3976|  3987|     4161|   4040|    3389|    4106|
|    WARN|null|    8266| 8165| 8277| 8403| 8191| 8222|  8381|     8352|   8226|    6616|    8328|
|   FATAL|null|      72|   70|   83|   60|   78|   98|    80|       81|     92|   16797|      94|
|   DEBUG|null|   41734|41652|41869|41785|41774|42085| 42147|    41433|  41936|   33366|   41749|
+--------+----+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+

