In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import * #for window

# Build (or fetch the already-running) Hive-enabled SparkSession on YARN,
# pointing the SQL warehouse at the user's HDFS warehouse directory.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("window functions")
    .enableHiveSupport()
    .config("spark.sql.warehouse.dir", "/user/itv009490/warehouse")
    .getOrCreate()
)

# Bare last expression so the notebook renders the session summary.
spark

In [9]:
# Sample server-log records as (level, timestamp-string) pairs.
# Month and day are not zero-padded in the raw strings (e.g. "2015-8-8").
log_list = [
    ("INFO", "2015-8-8 20:49:22"),
    ("WARN", "2015-1-14 20:05:00"),
    ("INFO", "2017-6-14 00:08:35"),
    ("INFO", "2016-1-18 11:50:14"),
    ("DEBUG", "2017-7-1 12:55:02"),
    ("INFO", "2014-2-26 12:34:21"),
    ("INFO", "2015-7-12 11:13:47"),
    ("INFO", "2017-4-15 01:20:18"),
    ("DEBUG", "2016-11-2 20:19:23"),
    ("INFO", "2012-8-20 10:09:44"),
    ("DEBUG", "2014-4-22 21:30:49"),
    ("WARN", "2013-12-6 17:54:15"),
    ("DEBUG", "2017-1-12 10:47:02"),
    ("DEBUG", "2016-6-25 11:06:42"),
    ("ERROR", "2015-6-28 19:25:05"),
    ("DEBUG", "2012-6-24 01:06:37"),
]

In [10]:
# Turn the in-memory records into a DataFrame, naming the two tuple fields.
log_df = spark.createDataFrame(log_list, schema=["loglevel", "logtime"])

In [11]:
# Preview the raw rows exactly as they were supplied in log_list.
log_df.show()

+--------+------------------+
|loglevel|           logtime|
+--------+------------------+
|    INFO| 2015-8-8 20:49:22|
|    WARN|2015-1-14 20:05:00|
|    INFO|2017-6-14 00:08:35|
|    INFO|2016-1-18 11:50:14|
|   DEBUG| 2017-7-1 12:55:02|
|    INFO|2014-2-26 12:34:21|
|    INFO|2015-7-12 11:13:47|
|    INFO|2017-4-15 01:20:18|
|   DEBUG|2016-11-2 20:19:23|
|    INFO|2012-8-20 10:09:44|
|   DEBUG|2014-4-22 21:30:49|
|    WARN|2013-12-6 17:54:15|
|   DEBUG|2017-1-12 10:47:02|
|   DEBUG|2016-6-25 11:06:42|
|   ERROR|2015-6-28 19:25:05|
|   DEBUG|2012-6-24 01:06:37|
+--------+------------------+



In [12]:
# Inspect the column types Spark inferred for log_df.
log_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: string (nullable = true)



In [15]:
from pyspark.sql.functions import *
# Overwrite the string "logtime" column with its parsed timestamp value,
# then print the schema to confirm the column type changed.
new_df = log_df.withColumn("logtime", to_timestamp(log_df["logtime"]))
new_df.printSchema()

root
 |-- loglevel: string (nullable = true)
 |-- logtime: timestamp (nullable = true)



In [16]:
# Most recent log timestamp for each log level. The dict form of agg()
# names the aggregate by string, so the result column is still
# "max(logtime)" without depending on a star-imported max().
new_df.groupBy("loglevel").agg({"logtime": "max"}).show()

+--------+-------------------+
|loglevel|       max(logtime)|
+--------+-------------------+
|    INFO|2017-06-14 00:08:35|
|   ERROR|2015-06-28 19:25:05|
|    WARN|2015-01-14 20:05:00|
|   DEBUG|2017-07-01 12:55:02|
+--------+-------------------+



In [17]:
# Expose new_df to Spark SQL under the temporary view name "serverlogs".
new_df.createOrReplaceTempView("serverlogs")

In [18]:
# Sanity check: read every row back out of the "serverlogs" view.
spark.sql("select * from serverlogs").show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
|   DEBUG|2014-04-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
+--------+-------------------+



In [23]:
# date_format(..., 'MMMM') renders each timestamp as its full month name.
spark.sql("""select loglevel,date_format(logtime,'MMMM') as month  from serverlogs""").show()

+--------+--------+
|loglevel|   month|
+--------+--------+
|    INFO|  August|
|    WARN| January|
|    INFO|    June|
|    INFO| January|
|   DEBUG|    July|
|    INFO|February|
|    INFO|    July|
|    INFO|   April|
|   DEBUG|November|
|    INFO|  August|
|   DEBUG|   April|
|    WARN|December|
|   DEBUG| January|
|   DEBUG|    June|
|   ERROR|    June|
|   DEBUG|    June|
+--------+--------+



In [24]:
# Count log entries per (log level, month name). The GROUP BY refers to the
# "month" alias defined in the SELECT list rather than repeating the expression.
spark.sql("""select loglevel,date_format(logtime,'MMMM') as month, count(*) as total_occurence
from serverlogs
group by loglevel,month""").show()

+--------+--------+---------------+
|loglevel|   month|total_occurence|
+--------+--------+---------------+
|    INFO|    June|              1|
|    WARN|December|              1|
|   DEBUG|    July|              1|
|    INFO|February|              1|
|   ERROR|    June|              1|
|    WARN| January|              1|
|   DEBUG| January|              1|
|    INFO|  August|              2|
|   DEBUG|November|              1|
|    INFO|   April|              1|
|   DEBUG|    June|              2|
|    INFO| January|              1|
|   DEBUG|   April|              1|
|    INFO|    July|              1|
+--------+--------+---------------+



In [31]:
# Explicit DDL schema for the log file: a level string and a timestamp.
schema = "loglevel string, logtime timestamp"

# Load the headerless CSV with the schema above. The former
# inferSchema=true option was dropped: when a schema is supplied Spark
# uses it as-is and never runs inference, so the option had no effect
# and only suggested an extra pass over the file that does not happen.
df = (
    spark.read.format("csv")
    .schema(schema)
    .load("/public/trendytech/datasets/logdata1m.csv")
)

In [32]:
# Preview the first rows of the loaded CSV (show() prints 20 by default).
df.show()

+--------+-------------------+
|loglevel|            logtime|
+--------+-------------------+
|    INFO|2015-08-08 20:49:22|
|    WARN|2015-01-14 20:05:00|
|    INFO|2017-06-14 00:08:35|
|    INFO|2016-01-18 11:50:14|
|   DEBUG|2017-07-01 12:55:02|
|    INFO|2014-02-26 12:34:21|
|    INFO|2015-07-12 11:13:47|
|    INFO|2017-04-15 01:20:18|
|   DEBUG|2016-11-02 20:19:23|
|    INFO|2012-08-20 10:09:44|
|   DEBUG|2014-04-22 21:30:49|
|    WARN|2013-12-06 17:54:15|
|   DEBUG|2017-01-12 10:47:02|
|   DEBUG|2016-06-25 11:06:42|
|   ERROR|2015-06-28 19:25:05|
|   DEBUG|2012-06-24 01:06:37|
|    INFO|2014-12-09 09:53:54|
|   DEBUG|2015-11-08 19:20:08|
|    INFO|2017-07-21 18:34:18|
|   DEBUG|2014-12-26 06:38:42|
+--------+-------------------+
only showing top 20 rows



In [34]:
# Point the "serverlogs" view at the CSV-backed DataFrame. Because this is
# createOrReplace, it replaces the view registered from new_df earlier.
df.createOrReplaceTempView("serverlogs")
# Cache the view; presumably so the repeated queries against "serverlogs"
# in later cells do not re-read the CSV each time.
spark.sql("""cache table serverlogs""")

In [35]:
# Count log entries per (log level, month name) over the full dataset.
# No ORDER BY is given, so the row order of the result is unspecified.
df2 = spark.sql("""select loglevel,date_format(logtime,'MMMM') as month, count(*) as total_occurence
from serverlogs
group by loglevel,month""")
df2.show()

+--------+---------+---------------+
|loglevel|    month|total_occurence|
+--------+---------+---------------+
|    WARN|     June|           8191|
|    INFO|     June|          29143|
|   ERROR| November|           3389|
|   FATAL|  January|             94|
|    WARN| December|           8328|
|    WARN|    March|           8165|
|   DEBUG|     July|          42085|
|   ERROR|    April|           4107|
|   ERROR|  January|           4054|
|   FATAL|September|             81|
|   FATAL|    April|             83|
|    INFO|September|          29038|
|   FATAL| November|          16797|
|   FATAL|  October|             92|
|    INFO| February|          28983|
|    WARN|    April|           8277|
|   DEBUG| December|          41749|
|   FATAL| December|             94|
|    WARN|      May|           8403|
|   ERROR|     June|           4059|
+--------+---------+---------------+
only showing top 20 rows



In [36]:
# Total number of rows in the CSV-backed DataFrame.
df.count()

1000000

In [37]:
# Number of rows in the aggregate, i.e. distinct (loglevel, month) groups.
df2.count()

60

In [44]:
# Monthly counts per level in calendar order. The month number is selected
# only as a sort key (cast to int so it sorts numerically) and is dropped
# again once the rows are ordered.
df2 = (
    spark.sql("""select loglevel,date_format(logtime,'MMMM') as month, 
cast(date_format(logtime,'M') as int) as month_num,
count(*) as total_occurence
from serverlogs
group by loglevel,month,month_num order by month_num""")
    .drop("month_num")
)
df2.show()

+--------+--------+---------------+
|loglevel|   month|total_occurence|
+--------+--------+---------------+
|    WARN| January|           8217|
|   DEBUG| January|          41961|
|   FATAL| January|             94|
|    INFO| January|          29119|
|   ERROR| January|           4054|
|   DEBUG|February|          41734|
|    WARN|February|           8266|
|    INFO|February|          28983|
|   ERROR|February|           4013|
|   FATAL|February|             72|
|   FATAL|   March|             70|
|   ERROR|   March|           4122|
|   DEBUG|   March|          41652|
|    WARN|   March|           8165|
|    INFO|   March|          29095|
|    INFO|   April|          29302|
|   FATAL|   April|             83|
|    WARN|   April|           8277|
|   DEBUG|   April|          41869|
|   ERROR|   April|           4107|
+--------+--------+---------------+
only showing top 20 rows



In [45]:
# Same calendar-ordered counts, but the sort key is the zero-padded 'MM'
# string: padding makes the string order agree with the numeric order,
# so no cast is needed. The helper column is dropped after sorting.
df2 = (
    spark.sql("""select loglevel,date_format(logtime,'MMMM') as month, 
date_format(logtime,'MM') as month_num,
count(*) as total_occurence
from serverlogs
group by loglevel,month,month_num order by month_num""")
    .drop("month_num")
)
df2.show()

+--------+--------+---------------+
|loglevel|   month|total_occurence|
+--------+--------+---------------+
|    INFO| January|          29119|
|   DEBUG| January|          41961|
|    WARN| January|           8217|
|   FATAL| January|             94|
|   ERROR| January|           4054|
|   FATAL|February|             72|
|   ERROR|February|           4013|
|   DEBUG|February|          41734|
|    WARN|February|           8266|
|    INFO|February|          28983|
|   ERROR|   March|           4122|
|   DEBUG|   March|          41652|
|   FATAL|   March|             70|
|    WARN|   March|           8165|
|    INFO|   March|          29095|
|    WARN|   April|           8277|
|   ERROR|   April|           4107|
|    INFO|   April|          29302|
|   FATAL|   April|             83|
|   DEBUG|   April|          41869|
+--------+--------+---------------+
only showing top 20 rows



In [46]:
# Variant that keeps month_num out of the GROUP BY: first() picks the
# month number from any row of the group, which is safe here because all
# rows sharing a month name share the same 'MM' value. The helper column
# is dropped after sorting.
df2 = (
    spark.sql("""select loglevel,date_format(logtime,'MMMM') as month, 
first(date_format(logtime,'MM')) as month_num,
count(*) as total_occurence
from serverlogs
group by loglevel,month order by month_num""")
    .drop("month_num")
)
df2.show()

+--------+--------+---------------+
|loglevel|   month|total_occurence|
+--------+--------+---------------+
|   DEBUG| January|          41961|
|   FATAL| January|             94|
|    INFO| January|          29119|
|   ERROR| January|           4054|
|    WARN| January|           8217|
|    WARN|February|           8266|
|   ERROR|February|           4013|
|   DEBUG|February|          41734|
|   FATAL|February|             72|
|    INFO|February|          28983|
|   ERROR|   March|           4122|
|    WARN|   March|           8165|
|    INFO|   March|          29095|
|   DEBUG|   March|          41652|
|   FATAL|   March|             70|
|   ERROR|   April|           4107|
|    WARN|   April|           8277|
|   FATAL|   April|             83|
|    INFO|   April|          29302|
|   DEBUG|   April|          41869|
+--------+--------+---------------+
only showing top 20 rows



In [48]:
# Alias only: final_df and df2 refer to the same DataFrame object.
final_df = df2

In [49]:
# Preview the month-ordered counts. Only the month is sorted, so the order
# of log levels within a month may differ from one run to the next.
final_df.show()

+--------+--------+---------------+
|loglevel|   month|total_occurence|
+--------+--------+---------------+
|   FATAL| January|             94|
|   DEBUG| January|          41961|
|    INFO| January|          29119|
|   ERROR| January|           4054|
|    WARN| January|           8217|
|   ERROR|February|           4013|
|    WARN|February|           8266|
|   DEBUG|February|          41734|
|   FATAL|February|             72|
|    INFO|February|          28983|
|   ERROR|   March|           4122|
|    WARN|   March|           8165|
|   DEBUG|   March|          41652|
|   FATAL|   March|             70|
|    INFO|   March|          29095|
|    WARN|   April|           8277|
|   ERROR|   April|           4107|
|   FATAL|   April|             83|
|    INFO|   April|          29302|
|   DEBUG|   April|          41869|
+--------+--------+---------------+
only showing top 20 rows



In [56]:
# One row per log level, one column per zero-padded month number, each
# cell holding the count of log entries for that (level, month) pair.
df3 = (
    spark.sql("""select loglevel,date_format(logtime,'MM') as month
from serverlogs
""")
    .groupBy("loglevel")
    .pivot('month')
    .count()
)

df3.show()

+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|loglevel|   01|   02|   03|   04|   05|   06|   07|   08|   09|   10|   11|   12|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|    INFO|29119|28983|29095|29302|28900|29143|29300|28993|29038|29018|23301|28874|
|   ERROR| 4054| 4013| 4122| 4107| 4086| 4059| 3976| 3987| 4161| 4040| 3389| 4106|
|    WARN| 8217| 8266| 8165| 8277| 8403| 8191| 8222| 8381| 8352| 8226| 6616| 8328|
|   DEBUG|41961|41734|41652|41869|41785|41774|42085|42147|41433|41936|33366|41749|
|   FATAL|   94|   72|   70|   83|   60|   78|   98|   80|   81|   92|16797|   94|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+



In [57]:
# Same pivot, using abbreviated month names ('MMM') as the column labels.
# With no explicit value list the pivoted columns come out sorted
# alphabetically (Apr, Aug, ...), not in calendar order.
df3 = (
    spark.sql("""select loglevel,date_format(logtime,'MMM') as month
from serverlogs
""")
    .groupBy("loglevel")
    .pivot('month')
    .count()
)

df3.show()

+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|loglevel|  Apr|  Aug|  Dec|  Feb|  Jan|  Jul|  Jun|  Mar|  May|  Nov|  Oct|  Sep|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|    INFO|29302|28993|28874|28983|29119|29300|29143|29095|28900|23301|29018|29038|
|   ERROR| 4107| 3987| 4106| 4013| 4054| 3976| 4059| 4122| 4086| 3389| 4040| 4161|
|    WARN| 8277| 8381| 8328| 8266| 8217| 8222| 8191| 8165| 8403| 6616| 8226| 8352|
|   FATAL|   83|   80|   94|   72|   94|   98|   78|   70|   60|16797|   92|   81|
|   DEBUG|41869|42147|41749|41734|41961|42085|41774|41652|41785|33366|41936|41433|
+--------+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+-----+



In [58]:
# Transposed layout: one row per abbreviated month, one column per log
# level, each cell holding the entry count.
df3 = (
    spark.sql("""select loglevel,date_format(logtime,'MMM') as month
from serverlogs
""")
    .groupBy("month")
    .pivot('loglevel')
    .count()
)

df3.show()

+-----+-----+-----+-----+-----+----+
|month|DEBUG|ERROR|FATAL| INFO|WARN|
+-----+-----+-----+-----+-----+----+
|  Oct|41936| 4040|   92|29018|8226|
|  Sep|41433| 4161|   81|29038|8352|
|  Dec|41749| 4106|   94|28874|8328|
|  Aug|42147| 3987|   80|28993|8381|
|  May|41785| 4086|   60|28900|8403|
|  Jun|41774| 4059|   78|29143|8191|
|  Feb|41734| 4013|   72|28983|8266|
|  Nov|33366| 3389|16797|23301|6616|
|  Mar|41652| 4122|   70|29095|8165|
|  Jan|41961| 4054|   94|29119|8217|
|  Apr|41869| 4107|   83|29302|8277|
|  Jul|42085| 3976|   98|29300|8222|
+-----+-----+-----+-----+-----+----+



In [59]:
# Full month names in calendar order. Handing this list to pivot() fixes
# both which columns are produced and their left-to-right order.
months_list = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]

In [62]:
# Pivot full month names into columns, one row per log level. The explicit
# months_list pins the pivot columns to calendar order instead of the
# default alphabetical order.
df3 = (
    spark.sql("""select loglevel,date_format(logtime,'MMMM') as month
from serverlogs
""")
    .groupBy("loglevel")
    .pivot('month', months_list)
    .count()
)

df3.show()

+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|loglevel|January|February|March|April|  May| June| July|August|September|October|November|December|
+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+
|    INFO|  29119|   28983|29095|29302|28900|29143|29300| 28993|    29038|  29018|   23301|   28874|
|   ERROR|   4054|    4013| 4122| 4107| 4086| 4059| 3976|  3987|     4161|   4040|    3389|    4106|
|    WARN|   8217|    8266| 8165| 8277| 8403| 8191| 8222|  8381|     8352|   8226|    6616|    8328|
|   FATAL|     94|      72|   70|   83|   60|   78|   98|    80|       81|     92|   16797|      94|
|   DEBUG|  41961|   41734|41652|41869|41785|41774|42085| 42147|    41433|  41936|   33366|   41749|
+--------+-------+--------+-----+-----+-----+-----+-----+------+---------+-------+--------+--------+

