In [13]:
import os
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext,Row
from pyspark.sql.functions import col,split,regexp_extract
import re
import logging

In [25]:
logger=logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s:%(created)f:%(filename)s:%(message)s:%(message)s')
file_handler=logging.FileHandler('log_analysis.txt')
file_handler.setLevel(logging.INFO)
file_handler.setFormatter(formatter)

In [15]:
conf=SparkConf().setAppName('e5')
sc=SparkContext(conf=conf)

In [16]:
sql=SQLContext(sc)

In [17]:
log=sql.read.text('hdfs://nameservice1/user/edureka_37986/access_old.txt')

In [18]:
split_df=log.select(regexp_extract('value',r'(\d.*) - -',1).alias('host'),
                    regexp_extract('value',r' - - \[([0-9]{2}/[A-Z][a-z]{2}/[0-9]{4})',1).alias('date'),
                   regexp_extract('value',r'HTTP/[0-9]{1}.[0-9]{1} ([0-9]{3}) [0-9]{1,5} ',1).alias('status'),
                   regexp_extract('value',r' HTTP/[0-9]{1}.[0-9]{1} [0-9]{3} ([0-9]{1,5}) ',1).alias('fourth'),
                   regexp_extract('value',r'(http://[A-Za-z-.]+)',1).alias('url'))

In [28]:
split_df.show(20,truncate=False)

+---------------+-----------+------+------+-------------------------+
|host           |date       |status|fourth|url                      |
+---------------+-----------+------+------+-------------------------+
|109.169.248.247|12/Dec/2015|200   |4263  |                         |
|109.169.248.247|12/Dec/2015|200   |4494  |http://almhuette-raith.at|
|46.72.177.4    |12/Dec/2015|200   |4263  |                         |
|46.72.177.4    |12/Dec/2015|200   |4494  |http://almhuette-raith.at|
|83.167.113.100 |12/Dec/2015|200   |4263  |                         |
|83.167.113.100 |12/Dec/2015|200   |4494  |http://almhuette-raith.at|
|95.29.198.15   |12/Dec/2015|200   |4263  |                         |
|95.29.198.15   |12/Dec/2015|200   |4494  |http://almhuette-raith.at|
|109.184.11.34  |12/Dec/2015|200   |4263  |                         |
|109.184.11.34  |12/Dec/2015|200   |4494  |http://almhuette-raith.at|
|91.227.29.79   |12/Dec/2015|200   |4263  |                         |
|91.227.29.79   |12/

In [19]:
split_df.cache()

DataFrame[host: string, date: string, status: string, fourth: string, url: string]

In [35]:
split_df.columns

['host', 'date', 'status', 'fourth', 'url']

In [37]:
bad_rows=split_df.filter(split_df['host'].isNull()|split_df['date'].isNull()|split_df['status'].isNull()|split_df['fourth'].isNull()|split_df['url'].isNull())

In [38]:
bad_rows.count()

0

In [20]:
one=split_df.na.fill({'host':'unknown','date':'unknown','status':'unknown','fourth':'unknown','url':'unknown'})

# Find out how many 404 HTTP codes are in access logs

In [21]:
one.registerTempTable('play')

In [29]:
p1=sql.sql('select status,count(status)as Count from play group by status having status == "404" ')

In [30]:
logger.info(p1)

In [31]:
p1.show(10,truncate=False)

+------+------+
|status|Count |
+------+------+
|404   |137767|
+------+------+



# Find out which URLs are broke

In [32]:
p2=sql.sql('select distinct url from play where status =="404"')

In [33]:
logger.info(p2)

In [46]:
p2.show(10,truncate=False)

+-------------------------------+
|url                            |
+-------------------------------+
|http://www.pluscolleg.de       |
|http://www.federacia.by        |
|http://www.semrush.com         |
|http://www.rinkebyfolketshus.se|
|http://www.                    |
|http://www.bing.com            |
|http://vsmart-extensions.com   |
|http://www.googlebot.com       |
|http://www.wimvandiem.nl       |
|http://github.com              |
+-------------------------------+
only showing top 10 rows



# Describe which HTTP status values appear in data and how many

In [34]:
p3=sql.sql('select distinct status,count(status)as Frequency from play group by status order by frequency desc')

In [35]:
logger.info(p3)

In [55]:
p3.show(4,truncate=False)

+------+---------+
|status|Frequency|
+------+---------+
|200   |1041039  |
|206   |474352   |
|404   |137767   |
|      |64071    |
+------+---------+
only showing top 4 rows

