## NASA logs and PySpark

In [1]:
# imports
import matplotlib
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import MapType, StringType

In [2]:
# Create the Spark session, or reuse one that already exists (getOrCreate).
session = SparkSession.builder.getOrCreate()

In [11]:
# Load the gzip-compressed July 1995 NASA access log as a text DataFrame:
# one row per log line, held in a single string column named `value`.
dfLog = session.read.text("data/NASA_access_log_Jul95.gz")

In [12]:
# Print the DataFrame schema (column names, types and nullability).
dfLog.printSchema()

root
 |-- value: string (nullable = true)



In [13]:
# Count the rows in the DataFrame (one row per log line).
dfLog.count()

1891715

### Different ways to observe data

In [14]:
# Preview the first 5 rows; long values are cut off with "..." in this view.
dfLog.show(5)

+--------------------+
|               value|
+--------------------+
|199.72.81.55 - - ...|
|unicomp6.unicomp....|
|199.120.110.21 - ...|
|burger.letters.co...|
|199.120.110.21 - ...|
+--------------------+
only showing top 5 rows



In [15]:
# Show rows with truncation disabled so each log line is printed in full.
dfLog.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------+
|value                                                                                                                         |
+------------------------------------------------------------------------------------------------------------------------------+
|199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245                                        |
|unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] "GET /shuttle/countdown/ HTTP/1.0" 200 3985                             |
|199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] "GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0" 200 4085          |
|burger.letters.com - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/countdown/liftoff.html HTTP/1.0" 304 0                      |
|199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] "GET /shuttle/missions/sts-73/sts-73-patch-small

In [16]:
# Widen pandas' column display so whole log lines stay readable when a Spark
# DataFrame is converted with toPandas().
# The fully-qualified option name is used on purpose: abbreviated names are
# resolved by pattern matching and can become ambiguous if pandas adds options.
pd.set_option('display.max_colwidth', 200)

In [18]:
# Take at most 20 rows and convert them to a pandas DataFrame so the notebook
# renders them as a table.
dfLog.limit(20).toPandas()

Unnamed: 0,value
0,"199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] ""GET /history/apollo/ HTTP/1.0"" 200 6245"
1,"unicomp6.unicomp.net - - [01/Jul/1995:00:00:06 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985"
2,"199.120.110.21 - - [01/Jul/1995:00:00:09 -0400] ""GET /shuttle/missions/sts-73/mission-sts-73.html HTTP/1.0"" 200 4085"
3,"burger.letters.com - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/countdown/liftoff.html HTTP/1.0"" 304 0"
4,"199.120.110.21 - - [01/Jul/1995:00:00:11 -0400] ""GET /shuttle/missions/sts-73/sts-73-patch-small.gif HTTP/1.0"" 200 4179"
5,"burger.letters.com - - [01/Jul/1995:00:00:12 -0400] ""GET /images/NASA-logosmall.gif HTTP/1.0"" 304 0"
6,"burger.letters.com - - [01/Jul/1995:00:00:12 -0400] ""GET /shuttle/countdown/video/livevideo.gif HTTP/1.0"" 200 0"
7,"205.212.115.106 - - [01/Jul/1995:00:00:12 -0400] ""GET /shuttle/countdown/countdown.html HTTP/1.0"" 200 3985"
8,"d104.aa.net - - [01/Jul/1995:00:00:13 -0400] ""GET /shuttle/countdown/ HTTP/1.0"" 200 3985"
9,"129.94.144.152 - - [01/Jul/1995:00:00:13 -0400] ""GET / HTTP/1.0"" 200 7074"


### Parse Data

### Lesson on RegEx Functions
The **re** module offers a set of functions that allow us to search a string for a match:

**Function Description**  
findall - Returns a list containing all matches  
search - Returns a Match object if there is a match anywhere in the string  
split - Returns a list where the string has been split at each match  
sub - Replaces one or many matches with a string

**Metacharacters**
Metacharacters are characters with a special meaning:

**Character	Description	Example	Try it**  
[]	- A set of characters	"[a-m]"	
\	- Signals a special sequence (can also be used to escape special characters)	"\d"	
.	- Any character (except newline character)	"he..o"	
\^	- Starts with	\"\^hello\"	 
\\$	- Ends with	"world\\$"	
\*	- Zero or more occurrences	"aix*"	
\+	- One or more occurrences	"aix+"	
{}	- Exactly the specified number of occurrences	"al{2}"	
|	- Either or	"falls|stays"	
()	- Capture and group	 	 
Special Sequences - 
A special sequence is a \\ followed by one of the characters in the list below, and has a special meaning:

**Character	Description	Example	Try it:**  
\A	- Returns a match if the specified characters are at the beginning of the string	"\AThe"	
\b	- Returns a match where the specified characters are at the beginning or at the end of a word	r"\bain"
r"ain\b"  	
\B	- Returns a match where the specified characters are present, but NOT at the beginning (or at the end) of a word	r"\Bain"
r"ain\B" - 	
\d	- Returns a match where the string contains digits (numbers from 0-9)	"\d"	
\D	- Returns a match where the string DOES NOT contain digits	"\D"	
\s	- Returns a match where the string contains a white space character	"\s"	
\S	- Returns a match where the string DOES NOT contain a white space character	"\S"	
\w	- Returns a match where the string contains any word characters (letters a-z and A-Z, digits 0-9, and the underscore _ character)	"\w"	
\W	- Returns a match where the string DOES NOT contain any word characters	"\W"	
\Z	- Returns a match if the specified characters are at the end of the string	"Spain\Z"	
Sets - A set is a set of characters inside a pair of square brackets [] with a special meaning:

**Set	Description	Try it**
[arn]	Returns a match where one of the specified characters (a, r, or n) are present	
[a-n]	Returns a match for any lower case character, alphabetically between a and n	
[^arn]	Returns a match for any character EXCEPT a, r, and n	
[0123]	Returns a match where any of the specified digits (0, 1, 2, or 3) are present	
[0-9]	Returns a match for any digit between 0 and 9	
[0-5][0-9]	Returns a match for any two-digit number from 00 to 59	
[a-zA-Z]	Returns a match for any character alphabetically between a and z, lower case OR upper case	
[+]	In sets, +, *, ., |, (), $, {} have no special meaning, so [+] means: return a match for any + character in the string	


In [None]:
from pyspark.sql.functions import udf
import re

@udf(MapType(StringType(), StringType()))
def parseUDF(line):
    """Parse one web-server access-log line into a map of named fields.

    Args:
        line: A raw log line such as
            '199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245'.
            May be None, because the source `value` column is nullable.

    Returns:
        A dict of string keys to string values, matching the declared
        MapType(StringType(), StringType()) return type, with the keys
        host, client_identd, user_id, date_time, method, endpoint,
        protocol, response_code and content_size. A content size logged
        as '-' is reported as '0'.
        None when `line` is None or does not match the expected layout, so
        unparseable rows show up as nulls instead of failing the job.
    """
    # Raw string keeps the regex escapes (\S, \d, \[ ...) away from Python's own
    # string-escape processing, which otherwise warns about invalid escapes.
    PATTERN = r'^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+)\s*(\S*)" (\d{3}) (\S+)'

    # re.search raises TypeError on None, so guard against null input rows.
    if line is None:
        return None

    match = re.search(PATTERN, line)
    if match is None:
        # A tuple here would contradict the declared map return type.
        return None

    # The size field is '-' when no body was sent; normalise it to '0' and keep
    # it as a string so every map value has the declared StringType.
    size_field = match.group(9)
    size = '0' if size_field == '-' else size_field

    return {
        "host": match.group(1),
        "client_identd": match.group(2),
        "user_id": match.group(3),
        "date_time": match.group(4),
        "method": match.group(5),
        "endpoint": match.group(6),
        "protocol": match.group(7),
        "response_code": match.group(8),
        "content_size": size,
    }