In [1]:
import pyspark.sql.functions as F
import os
from pyspark.sql import SparkSession, SQLContext
from pyspark import SparkContext, SparkConf

from pyspark.sql.functions import explode
from pyspark.sql.functions import countDistinct, avg
from pyspark.sql.functions import dayofmonth,dayofyear,year,month,hour,weekofyear,date_format
from pyspark.sql.functions import col as func_col
from pyspark.sql.functions import lit
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
import re

In [2]:
#Spark session is being created
#Configuration of the Spark Session
app_name="apple_logs"
conf = SparkConf()  # create the configuration
conf.set('spark.driver.extraClassPath', "/usr/share/cmf/common_jars/mysql-connector-java-5.1.15.jar")  # set the spark.jars
conf.set('spark.executor.extraClassPath', "/usr/share/cmf/common_jars/mysql-connector-java-5.1.15.jar")

#Spark Session object
spark = SparkSession.builder.config(conf=conf).appName(app_name).getOrCreate()

In [3]:
#Reading the file from the hdfs wherein the file has the data not comma separated but the tab separated
data=spark.read.text("/user/edureka_960126/access.clean.log")
data.show(20,truncate=False)
data.printSchema()

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                                                                                                        |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|109.169.248.247 - - [12/Dec/2015:18:25:11 +0100] GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -                                                 |
|109.169.248.247 - - [12/Dec/2015:18:25:11 +0100] POST /administrator/index.php HTTP/1.1 200 4494 http://almhuette-raith.at/administrator/ Mozilla/5.0 (Windows NT 6.0; 

In [None]:
#Now we need to parse this data to convert into relevant tabular form in order to work out various queries over it

# Data Wrangling

#data displayed above is in Common Log Format.


#remotehost----rfc931----authuser ------[date] "request" status bytes


#remotehost   Remote hostname


#rfc931   The remote logname of the user if at all it is present.


#authuser The username of the remote user after authentication by the HTTP server.


#[date]  Date and time of the request.

#“request” The request, exactly as it came from the browser or client.

#status The HTTP status code the server sent back to the client.

#bytes The number of bytes (Content-Length) transferred to the client.








In [4]:
sample_logs = [item['value'] for item in data.take(15)]
sample_logs

[u'109.169.248.247 - - [12/Dec/2015:18:25:11 +0100] GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -',
 u'109.169.248.247 - - [12/Dec/2015:18:25:11 +0100] POST /administrator/index.php HTTP/1.1 200 4494 http://almhuette-raith.at/administrator/ Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -',
 u'46.72.177.4 - - [12/Dec/2015:18:31:08 +0100] GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -',
 u'46.72.177.4 - - [12/Dec/2015:18:31:08 +0100] POST /administrator/index.php HTTP/1.1 200 4494 http://almhuette-raith.at/administrator/ Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -',
 u'83.167.113.100 - - [12/Dec/2015:18:31:25 +0100] GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -',
 u'83.167.113.100 - - [12/Dec/2015:18:31:25 +0100] POST /administrator/index.php HTTP/1.1 200 4494 ht

In [5]:
#finding of the Host IP addresses from the log files

host_pattern = r'(^\S+\.[\S+\.]+[\S+\.]+\S)+\s'

hosts = [re.search(host_pattern, item).group(1)
           if re.search(host_pattern, item)
           else 'no match'
           for item in sample_logs]
hosts

[u'109.169.248.247',
 u'109.169.248.247',
 u'46.72.177.4',
 u'46.72.177.4',
 u'83.167.113.100',
 u'83.167.113.100',
 u'95.29.198.15',
 u'95.29.198.15',
 u'109.184.11.34',
 u'109.184.11.34',
 u'91.227.29.79',
 u'91.227.29.79',
 u'90.154.66.233',
 u'90.154.66.233',
 u'95.140.24.131']

In [6]:
#estimating datetime from the text files
datetime_pattern=r'\[(\d+/[A-Z][a-z]{2}/[0-9]{4}:[0-9]{2}:[0-9]{2}:[0-9]{2}\s\+\d{4})]'

datetime = [re.search(datetime_pattern, item).group(1)
           if re.search(datetime_pattern, item)
           else 'no match'
           for item in sample_logs]
datetime

#r'(GET | POST)\s+/[a-z]+/[a-z.]+\sHTTP/1.1\s[0-9]{3}\s[0-9]{4}'



[u'12/Dec/2015:18:25:11 +0100',
 u'12/Dec/2015:18:25:11 +0100',
 u'12/Dec/2015:18:31:08 +0100',
 u'12/Dec/2015:18:31:08 +0100',
 u'12/Dec/2015:18:31:25 +0100',
 u'12/Dec/2015:18:31:25 +0100',
 u'12/Dec/2015:18:32:10 +0100',
 u'12/Dec/2015:18:32:11 +0100',
 u'12/Dec/2015:18:32:56 +0100',
 u'12/Dec/2015:18:32:56 +0100',
 u'12/Dec/2015:18:33:51 +0100',
 u'12/Dec/2015:18:33:52 +0100',
 u'12/Dec/2015:18:36:16 +0100',
 u'12/Dec/2015:18:36:16 +0100',
 u'12/Dec/2015:18:38:42 +0100']

In [7]:
method=r'(GET | POST)\s*'

#(GET | POST)\s*

#/[a-z]+/[a-z.]+\sHTTP/1.1\s[0-9]{3}\s[0-9]{4}

method_result = [re.search(method, item).group(1)
               if re.search(method, item)
               else 'no match'
              for item in sample_logs]

method_result

[u'GET ',
 u' POST',
 u'GET ',
 u' POST',
 u'GET ',
 u' POST',
 u'GET ',
 u' POST',
 u'GET ',
 u' POST',
 u'GET ',
 u' POST',
 u'GET ',
 u' POST',
 u'GET ']

In [8]:
sample_logs



#(GET | POST)\s*

#/[a-z]+/[a-z.]+\sHTTP/1.1\s[0-9]{3}\s[0-9]{4}



[u'109.169.248.247 - - [12/Dec/2015:18:25:11 +0100] GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -',
 u'109.169.248.247 - - [12/Dec/2015:18:25:11 +0100] POST /administrator/index.php HTTP/1.1 200 4494 http://almhuette-raith.at/administrator/ Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -',
 u'46.72.177.4 - - [12/Dec/2015:18:31:08 +0100] GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -',
 u'46.72.177.4 - - [12/Dec/2015:18:31:08 +0100] POST /administrator/index.php HTTP/1.1 200 4494 http://almhuette-raith.at/administrator/ Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -',
 u'83.167.113.100 - - [12/Dec/2015:18:31:25 +0100] GET /administrator/ HTTP/1.1 200 4263 - Mozilla/5.0 (Windows NT 6.0; rv:34.0) Gecko/20100101 Firefox/34.0 -',
 u'83.167.113.100 - - [12/Dec/2015:18:31:25 +0100] POST /administrator/index.php HTTP/1.1 200 4494 ht

In [9]:
method_url=r']\s*(GET | POST)\s*(/\S*)\s*HTTP/\S*\s([0-9]{3})\s([0-9]*)'

method = [re.search(method_url, item).group(1) if re.search(method_url, item) else 'no match' for item in sample_logs]
url = [re.search(method_url, item).group(2) if re.search(method_url, item) else 'no match' for item in sample_logs]



status= [re.search(method_url, item).group(3) if re.search(method_url, item) else 'no match' for item in sample_logs]

content_size_response=[re.search(method_url, item).group(4) if re.search(method_url, item) else 'no match' for item in sample_logs]

print(method)
print(url)
print(status)
print(content_size_response)

[u'GET ', u' POST', u'GET ', u' POST', u'GET ', u' POST', u'GET ', u' POST', u'GET ', u' POST', u'GET ', u' POST', u'GET ', u' POST', u'GET ']
[u'/administrator/', u'/administrator/index.php', u'/administrator/', u'/administrator/index.php', u'/administrator/', u'/administrator/index.php', u'/administrator/', u'/administrator/index.php', u'/administrator/', u'/administrator/index.php', u'/administrator/', u'/administrator/index.php', u'/administrator/', u'/administrator/index.php', u'/administrator/']
[u'200', u'200', u'200', u'200', u'200', u'200', u'200', u'200', u'200', u'200', u'200', u'200', u'200', u'200', u'200']
[u'4263', u'4494', u'4263', u'4494', u'4263', u'4494', u'4263', u'4494', u'4263', u'4494', u'4263', u'4494', u'4263', u'4494', u'4263']


In [38]:
#Now we have extracted the required fields for our analysis let us put them in Dataframe and then perform various queries


#1.Load data into Spark DataFrame --- Note that the relevant columns to the queries have been extracted using regex 
#And those are put into the dataframe

from pyspark.sql.functions import trim

from pyspark.sql.functions import regexp_extract

logs_df = data.select(regexp_extract('value', host_pattern, 1).alias('host'),
                         regexp_extract('value',datetime_pattern , 1).alias('timestamp'),
                         regexp_extract('value', method_url , 1).alias('method'),
                         regexp_extract('value', method_url, 2).alias('endpoint'),
                         regexp_extract('value', method_url, 3).cast('integer').alias('status'),
                         regexp_extract('value', method_url, 4).cast('integer').alias('content_size'))

logs_df.show(1000, truncate=False)

logs_df=logs_df.withColumn("method",trim(logs_df["method"]))

print((logs_df.count(), len(logs_df.columns)))





+---------------+--------------------------+------+-----------------------------------------------------------------------------+------+------------+
|host           |timestamp                 |method|endpoint                                                                     |status|content_size|
+---------------+--------------------------+------+-----------------------------------------------------------------------------+------+------------+
|109.169.248.247|12/Dec/2015:18:25:11 +0100|GET   |/administrator/                                                              |200   |4263        |
|109.169.248.247|12/Dec/2015:18:25:11 +0100| POST |/administrator/index.php                                                     |200   |4494        |
|46.72.177.4    |12/Dec/2015:18:31:08 +0100|GET   |/administrator/                                                              |200   |4263        |
|46.72.177.4    |12/Dec/2015:18:31:08 +0100| POST |/administrator/index.php                         

(2338006, 6)


In [39]:
#2 Find out how many 404 HTTP codes are in access logs. 

logs_df.createOrReplaceTempView("logs")

spark.sql('Select count(*) as count_404 From logs where status=404').show(truncate=False)





+---------+
|count_404|
+---------+
|186787   |
+---------+



In [50]:
#3 Find out which URLs are broken. 

#Here are some examples of error codes that a web server may present for a broken link:
#404 Page Not Found: the page/resource doesn’t exist on the server
#400 Bad Request: the host server cannot understand the URL on your page
#Bad host: Invalid host name: the server with that name doesn’t exist or is unreachable
#Bad URL: Malformed URL (e.g. a missing bracket, extra slashes, wrong protocol, etc.)
#Bad Code: Invalid HTTP response code: the server response violates HTTP spec
#Empty: the host server returns “empty” responses with no content and no response code
#Timeout: Timeout: HTTP requests constantly timed out during the link check
#Reset: the host server drops connections. It is either misconfigured or too busy


spark.sql('Select endpoint  From logs where  status=404 or status=400 ').show(truncate=False)

#below are the url with status code showing the broken link


+----------------------------------+
|endpoint                          |
+----------------------------------+
|/templates/_system/css/general.css|
|/templates/_system/css/general.css|
|/favicon.ico                      |
|/icons/text.gif                   |
|/templates/_system/css/general.css|
|/apache-log/error.log.44.gz       |
|/apache-log/access.log.69.gz      |
|/apache-log/error.log.55.gz       |
|/templates/_system/css/general.css|
|/favicon.ico                      |
|/templates/_system/css/general.css|
|/templates/_system/css/general.css|
|/templates/_system/css/general.css|
|/templates/_system/css/general.css|
|/templates/_system/css/general.css|
|/favicon.ico                      |
|/favicon.ico                      |
|/templates/_system/css/general.css|
|/templates/_system/css/general.css|
|/templates/_system/css/general.css|
+----------------------------------+
only showing top 20 rows



In [41]:
#4Verify there are no null columns in the original dataset. 

(data.filter(data['value'].isNull()).count())


#so there were no null rows in our data which we loaded from the access logs text file

#Let us check the null values in our regex parsed to check any null columns 

null_df = logs_df.filter(logs_df['content_size'].isNull()  )
null_df.count()

#so the count below shows that there are the null values spread in our various columns as 66366 rows are having null values


66366

In [42]:
#check column wise null values

from pyspark.sql.functions import col
from pyspark.sql.functions import sum as spark_sum

def count_null(col_name):
    return spark_sum(col(col_name).isNull().cast('integer')).alias(col_name)

# Build up a list of column expressions, one per column.
exprs = [count_null(col_name) for col_name in logs_df.columns]

# Run the aggregation. The *exprs converts the list of expressions into
# variable function arguments.
logs_df.agg(*exprs).show()

+----+---------+------+--------+------+------------+
|host|timestamp|method|endpoint|status|content_size|
+----+---------+------+--------+------+------------+
|   0|        0|     0|       0| 60032|       66366|
+----+---------+------+--------+------+------------+



In [43]:
#5. Replace null values with constants such as 0 
from pyspark.sql.functions import when

#changed only these two column as they only have null values

logs_df=logs_df.withColumn('status' , when(logs_df['status'].isNull(), 0 ).otherwise(logs_df['status']))

logs_df=logs_df.withColumn('content_size' , when(logs_df['content_size'].isNull(), 0 ).otherwise(logs_df['content_size']))





In [44]:
#checking whether all null are removed or not

bad_rows_df = logs_df.filter(logs_df['host'].isNull()| 
                             logs_df['timestamp'].isNull() | 
                             logs_df['method'].isNull() |
                             logs_df['endpoint'].isNull() |
                             logs_df['status'].isNull() |
                             logs_df['content_size'].isNull())
bad_rows_df.count()



0

In [45]:
#6. Parse timestamp to readable date. 

from pyspark.sql.functions import udf

month_map = {
  'Jan': 1, 'Feb': 2, 'Mar':3, 'Apr':4, 'May':5, 'Jun':6, 'Jul':7,
  'Aug':8,  'Sep': 9, 'Oct':10, 'Nov': 11, 'Dec': 12
}

def time_formatter(text):

    return "{0:04d}-{1:02d}-{2:02d} {3:02d}:{4:02d}:{5:02d}".format(
      int(text[7:11]),
      month_map[text[3:6]],
      int(text[0:2]),
      int(text[12:14]),
      int(text[15:17]),
      int(text[18:20])
    )

udf_parse_time = udf(time_formatter)

logs_df = logs_df.select('*', udf_parse_time(logs_df['timestamp']).cast('timestamp').alias('time'))
           
           
#logs_df = logs_df.drop()

logs_df=logs_df.drop('timestamp')
logs_df=logs_df.withColumnRenamed('time','timestamp')

logs_df.show()





+---------------+------+--------------------+------+------------+--------------------+
|           host|method|            endpoint|status|content_size|           timestamp|
+---------------+------+--------------------+------+------------+--------------------+
|109.169.248.247|   GET|     /administrator/|   200|        4263|2015-12-12 18:25:...|
|109.169.248.247|  POST|/administrator/in...|   200|        4494|2015-12-12 18:25:...|
|    46.72.177.4|   GET|     /administrator/|   200|        4263|2015-12-12 18:31:...|
|    46.72.177.4|  POST|/administrator/in...|   200|        4494|2015-12-12 18:31:...|
| 83.167.113.100|   GET|     /administrator/|   200|        4263|2015-12-12 18:31:...|
| 83.167.113.100|  POST|/administrator/in...|   200|        4494|2015-12-12 18:31:...|
|   95.29.198.15|   GET|     /administrator/|   200|        4263|2015-12-12 18:32:...|
|   95.29.198.15|  POST|/administrator/in...|   200|        4494|2015-12-12 18:32:...|
|  109.184.11.34|   GET|     /administrator

In [46]:
#7. Describe which HTTP status values appear in data and how many. 

spark.sql('select status , count(status) as status_count from logs group by status').show()





+------+------------+
|status|status_count|
+------+------------+
|   412|          19|
|   406|          53|
|  null|           0|
|   206|      939908|
|   500|        3225|
|   301|         609|
|   400|          20|
|   403|        2217|
|   404|      186787|
|   200|     1138403|
|   303|         244|
|   304|        6329|
|   405|           7|
|   401|         153|
+------+------------+



In [47]:
#8. How many unique hosts are there in the entire log and their average request 


spark.sql('select host , count(host) as ind_req from logs group by host').show()


spark.sql('select avg(a.ind_req) as average_request from (select host , count(host) as ind_req from logs group by host) as a').show()

+--------------+-------+
|          host|ind_req|
+--------------+-------+
|   46.72.177.4|      8|
| 194.48.218.78|      2|
| 31.181.253.16|      2|
|  37.112.46.76|      2|
| 95.107.90.225|      2|
|  5.138.58.118|      2|
|95.188.228.228|      2|
|  66.7.119.112|      1|
| 145.255.2.176|      4|
| 176.59.208.95|      2|
| 62.133.162.65|      4|
| 95.29.129.235|      2|
|  66.249.64.64|     41|
| 207.46.13.165|      6|
| 180.76.15.162|     75|
|  37.139.52.40|     16|
| 89.144.209.67|     26|
|23.106.216.107|      3|
|  195.20.125.6|     18|
| 92.113.63.101|      6|
+--------------+-------+
only showing top 20 rows

+-----------------+
|  average_request|
+-----------------+
|57.25355078851993|
+-----------------+



In [48]:
#9. Create a spark-submit application for the same and print the findings in the log

#Jupyter Notebook has been submitted for all of the above tasks





