## NASA logs and PySpark

In [29]:
# imports
import pandas as pd
import matplotlib
from pyspark.sql import SparkSession

In [30]:
# Create the SparkSession used by the cells below; getOrCreate() hands back an
# already-active session when there is one instead of starting a second.
session = SparkSession.builder.getOrCreate()

In [31]:
# Read the raw access log as plain text: one row per log line, held in a single
# string column named `value` (see the schema printed below).
dfLog = session.read.text("data/NASA_access_log_Jul95.gz")

In [32]:
dfLog.printSchema()

root
 |-- value: string (nullable = true)



In [33]:
dfLog.count()

1891715

### Different ways to observe data

In [34]:
dfLog.show(5)

+--------------------+
|               value|
+--------------------+
|199.72.81.55 - - ...|
|unicomp6.unicomp....|
|199.120.110.21 - ...|
|burger.letters.co...|
|199.120.110.21 - ...|
+--------------------+
only showing top 5 rows



In [None]:
dfLog.show(truncate=False)

In [36]:
# Widen pandas' column display so long log lines are not cut off in toPandas()
# previews. The full option key is used: abbreviated keys are resolved by
# pattern matching and can become ambiguous when pandas adds new options.
pd.set_option('display.max_colwidth', 200)

In [None]:
dfLog.limit(20).toPandas()

### Parse Data

### Lesson on RegEx Functions
The **re** module offers a set of functions that allows us to search a string for a match:

**Function Description**  
findall - Returns a list containing all matches  
search - Returns a Match object if there is a match anywhere in the string  
split - Returns a list where the string has been split at each match  
sub - Replaces one or many matches with a string

**Metacharacters**
Metacharacters are characters with a special meaning:

**Character	Description	Example	Try it**  
[]	- A set of characters	"[a-m]"	
\	- Signals a special sequence (can also be used to escape special characters)	"\d"	
.	- Any character (except newline character)	"he..o"	
\^	- Starts with	\"\^hello\"	 
\\$	- Ends with	"world\\$"	
\*	- Zero or more occurrences	"aix*"	
\+	- One or more occurrences	"aix+"	
{}	- Exactly the specified number of occurrences	"al{2}"	
|	- Either or	"falls|stays"	
()	- Capture and group	 	 
Special Sequences - 
A special sequence is a \\ followed by one of the characters in the list below, and has a special meaning:

**Character	Description	Example	Try it:**  
\A	- Returns a match if the specified characters are at the beginning of the string	"\AThe"	
\b	- Returns a match where the specified characters are at the beginning or at the end of a word	r"\bain"
r"ain\b"  	
\B	- Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word	r"\Bain"
r"ain\B" - 	
\d	- Returns a match where the string contains digits (numbers from 0-9)	"\d"	
\D	- Returns a match where the string DOES NOT contain digits	"\D"	
\s	- Returns a match where the string contains a white space character	"\s"	
\S	- Returns a match where the string DOES NOT contain a white space character	"\S"	
\w	- Returns a match where the string contains any word characters (letters a-z and A-Z, digits 0-9, and the underscore _ character)	"\w"	
\W	- Returns a match where the string DOES NOT contain any word characters	"\W"	
\Z	- Returns a match if the specified characters are at the end of the string	"Spain\Z"	
Sets - A set is a set of characters inside a pair of square brackets [] with a special meaning:

**Set	Description	Try it**
[arn]	Returns a match where one of the specified characters (a, r, or n) are present	
[a-n]	Returns a match for any lower case character, alphabetically between a and n	
[^arn]	Returns a match for any character EXCEPT a, r, and n	
[0123]	Returns a match where any of the specified digits (0, 1, 2, or 3) are present	
[0-9]	Returns a match for any digit between 0 and 9	
[0-5][0-9]	Returns a match for any two-digit number from 00 to 59	
[a-zA-Z]	Returns a match for any character alphabetically between a and z, lower case OR upper case	
[+]	In sets, +, *, ., |, (), $, {} have no special meaning, so [+] means: return a match for any + character in the string	


In [38]:
from pyspark.sql.functions import udf
from pyspark.sql.types import MapType, StringType
import re
# 199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
@udf(MapType(StringType(), StringType()))
def parseUDF(line):
    """Parse one raw access-log line into a map of named string fields.

    Example input:
        199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245

    Args:
        line: The raw log line, or None.

    Returns:
        A dict with the keys ``host``, ``client_id``, ``user_id``,
        ``date_time``, ``method``, ``endpoint``, ``protocol``,
        ``response_code`` and ``content_size``. Every value is a string, to
        match the declared ``MapType(StringType(), StringType())``; a content
        size logged as ``-`` is reported as ``'0'``.
        Returns None (a null map) when ``line`` is None or does not match the
        expected layout, so callers can drop unparseable rows with a null
        check.
    """
    # Raw string so the regex escapes (\S, \d, \[ ...) reach `re` untouched.
    # Groups: 1 host, 2 client id, 3 user id, 4 timestamp with offset,
    #         5 method, 6 endpoint, 7 protocol (may be empty), 8 status, 9 size.
    PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s+-\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'
    if line is None:
        return None
    match = re.search(PATTERN, line)
    if match is None:
        # A null map, not a tuple: the declared return type is a map.
        return None
    size_field = match.group(9)
    # '-' is logged when no body was sent; normalise it to a numeric string so
    # a later cast to int succeeds.
    size = '0' if size_field == '-' else size_field
    return {
        "host": match.group(1),
        "client_id": match.group(2),
        "user_id": match.group(3),
        "date_time": match.group(4),
        "method": match.group(5),
        "endpoint": match.group(6),
        "protocol": match.group(7),
        "response_code": match.group(8),
        "content_size": size
    }

In [None]:
# Add a `parsed` map column by applying parseUDF to each raw line in `value`,
# then preview the first 10 rows as a pandas DataFrame.
dfParsed = dfLog.withColumn("parsed", parseUDF("value"))
dfParsed.limit(10).toPandas()

In [40]:
dfParsed.printSchema()

root
 |-- value: string (nullable = true)
 |-- parsed: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [None]:
dfParsed.selectExpr("parsed['host'] as host", "parsed['date_time'] as datetime").limit(5).show(5)

In [42]:
# Keys to lift out of the `parsed` map, in the order the columns should appear.
fields = [
    "host",
    "client_id",
    "user_id",
    "date_time",
    "method",
    "endpoint",
    "protocol",
    "response_code",
    "content_size",
]
# One SQL projection per key, e.g. "parsed['host'] as host".
projection_template = "parsed['{}'] as {}"
exprs = [projection_template.format(name, name) for name in fields]

In [None]:
# Flatten the `parsed` map into one column per field, then keep only the rows
# for which a date_time value was extracted.
dfFlat = dfParsed.selectExpr(*exprs)
dfClean = dfFlat.where(dfFlat["date_time"].isNotNull())
dfClean.limit(5).toPandas()

In [None]:
from pyspark.sql.functions import desc
dfClean.groupBy("host").count().orderBy(desc("count")).limit(10).toPandas()

In [None]:
from pyspark.sql.functions import expr
dfClean.groupBy("endpoint").count().orderBy(desc("count")).limit(10).toPandas()

#### Casting: Method 1

In [None]:
# Expose dfClean to SQL, then list distinct (endpoint, size) pairs from the
# largest response down. The cast is aliased and the ORDER BY uses that alias,
# so rows are sorted by the integer value rather than by the original string
# column (where '9' would sort above '10000').
dfClean.createOrReplaceTempView("dfCleanTable")
session.sql("""
            SELECT DISTINCT endpoint, cast(content_size as int) AS content_size_int
            FROM dfCleanTable
            ORDER BY content_size_int DESC
            """).show()

#### Casting: Method 2

In [50]:
from pyspark.sql.functions import expr
# Add `content_size_int`, an integer cast of content_size; the original
# content_size column is kept alongside it.
dfCleanTyped = dfClean.withColumn("content_size_int", expr("cast(content_size as int)"))

In [None]:
# Re-register the view name "dfCleanTable" so it now points at the typed frame,
# then show endpoints ordered by the integer size column, largest first.
dfCleanTyped.createOrReplaceTempView("dfCleanTable")
typed_sizes_sql = """
            SELECT endpoint, content_size_int 
            FROM dfCleanTable
            ORDER BY content_size_int DESC
            """
session.sql(typed_sizes_sql).show()

In [91]:
from pyspark.sql.functions import udf
# 01/Jul/1995:00:00:01 -0400
@udf
def parseDate(line):
    """Rewrite a log timestamp as ``dd-MM-yyyy HH:mm:ss``.

    Example: ``01/Jul/1995:00:00:01 -0400`` -> ``01-07-1995 00:00:01``.
    The trailing UTC offset is discarded, not applied.

    Args:
        line: Timestamp text in the form ``dd/Mon/yyyy:HH:mm:ss <offset>``,
            or None.

    Returns:
        The reformatted string, or None when ``line`` is None, is blank, lacks
        the ``date:time`` separator, does not have three ``/``-separated date
        parts, or names a month that is not a three-letter English
        abbreviation.
    """
    month_dict = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    if line is None:
        return None
    # The first whitespace-separated token is the date and time; whatever
    # follows (the UTC offset) is ignored.
    tokens = line.split()
    if not tokens:
        return None
    # Split on the first ':' only: it separates the date from the clock time,
    # while the later colons belong to HH:mm:ss.
    date_text, separator, clock = tokens[0].partition(':')
    date_fields = date_text.split('/')
    if not separator or len(date_fields) != 3:
        return None
    day, month_name, year = date_fields
    month = month_dict.get(month_name)
    if month is None:
        return None
    return "{}-{}-{} {}".format(day, month, year, clock)

In [None]:
# Add `date_part`: date_time rewritten by parseDate
# (e.g. "01/Jul/1995:00:00:01 -0400" -> "01-07-1995 00:00:01"),
# then preview the first 10 rows.
dfClean2 = dfClean.withColumn("date_part", parseDate("date_time"))
dfClean2.limit(10).toPandas()

In [None]:
from pyspark.sql.functions import unix_timestamp
# Parse date_part with the pattern produced by parseDate and preview 20 rows,
# ordered by the resulting timestamp value. limit() runs before orderBy(), so
# only those 20 rows are sorted.
parsed_timestamp = unix_timestamp(dfClean2["date_part"], 'dd-MM-yyyy HH:mm:ss')
(
    dfClean2
    .withColumn("date_formatted", parsed_timestamp)
    .limit(20)
    .orderBy("date_formatted")
    .toPandas()
)