# Get Set Up

## Import Libraries

In [0]:
# Pandas provides an extremely useful data structure
import pandas as pd

# RE provides regular expression pattern matching
import re

# datetime provides a datetime object class and conversion utilities
from datetime import datetime

# Web file access
from urllib.request import urlopen

# Math provides additional math functions
import math


## Define Some Functions

In [0]:
def log_ReadFile(logfile):
  """Read a local log file and return its lines as a list of strings.

  Args:
    logfile: path of the text file to read.

  Returns:
    A list with one entry per line of the file, each with leading and
    trailing whitespace (including the newline) removed.
  """
  with open(logfile) as log_handle:
    # iterating the handle yields the same lines readlines() would return
    return [raw_line.strip() for raw_line in log_handle]

In [0]:
def log_ReadURL(logfile):
  """Download a log file from a URL and return its lines as a list of strings.

  Args:
    logfile: URL of the log file (passed straight to urlopen).

  Returns:
    A list with one entry per line, each decoded from bytes and stripped of
    leading and trailing whitespace.
  """
  # the context manager closes the response even if reading raises
  with urlopen(logfile) as response:
    raw_lines = response.readlines()
  # bytes.decode() with no argument uses UTF-8
  return [raw_line.decode().strip() for raw_line in raw_lines]

In [0]:
def log_Parser(log_list, regx_obj, col_list):
  """Split log lines into column values using a compiled regex.

  Args:
    log_list: iterable of log line strings.
    regx_obj: compiled pattern; it is applied with .match(), i.e. anchored
      at the start of each line.
    col_list: group names (or indexes) to extract, in output column order.

  Returns:
    A tuple (parsed_rows, unparsed_lines): one list of group values for each
    line that matched, and the raw lines that did not match.
  """
  parsed_rows, unparsed_lines = [], []

  for entry in log_list:
    hit = regx_obj.match(entry)
    if not hit:
      # keep the raw line so parse failures can be inspected later
      unparsed_lines.append(entry)
      continue
    parsed_rows.append([hit.group(name) for name in col_list])

  return parsed_rows, unparsed_lines

## Load Data

In [0]:
# URL of source data file (raw web server access log hosted on GitHub)
access_url = "https://raw.githubusercontent.com/urbansec/ds101/master/access.log.2019-03-22"

# Download the log and read it into a list of strings, one per line,
# with surrounding whitespace already stripped by log_ReadURL
access_logs = log_ReadURL(access_url)

In [0]:
# IP reputation indicators: a '#'-delimited file with no header row, so the
# column names are supplied here; the trailing 'unknown' column is dropped
intel_url = "https://raw.githubusercontent.com/urbansec/ds101/master/av_ip_reputation_2019-04-07.csv"
intel_cols = ['ip', 'risk', 'reliability', 'activity', 'country', 'city', 'lat_lon', 'unknown']
intel_df = (
  pd.read_csv(intel_url, sep='#', header=None, names=intel_cols)
  .drop(columns=['unknown'])
)

In [0]:
# display first 5 lines in list
#display(access_logs[:5])

## Parse Data

In [0]:
# define a regex pattern to parse lines into fields
# sample line:
# 54.36.148.18 - - [22/Mar/2019:01:58:55 -0700] "GET /self.logs/error.log.2016-04-07.gz HTTP/1.1" 404 284 "-" "Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)"
#
# Each fragment is a raw string (r'...') so regex escapes such as \S, \s, \d
# and \[ reach the regex engine untouched instead of being read as (invalid)
# Python string escapes.
#
# NOTE(review): 'something_else' is presumably the Referer field of the
# combined log format -- confirm; the name is kept because the DataFrame
# columns are built from it.
web_access_pattern = re.compile(r'(?P<client_ip>\S+)'
                                r'\s+(?P<identity>\S+)'
                                r'\s+(?P<username>\S+)'
                                r'\s+\[(?P<date>[^\]]+)\]'
                                r'\s+"(?P<request>[^"]+)"'
                                r'\s+(?P<http_response>\d+)'
                                r'\s+(?P<bytes>\d+)'
                                r'\s+"(?P<something_else>[^"]+)"'
                                # lazy match up to an optional closing quote at
                                # end of line, so the captured user agent no
                                # longer carries the trailing '"'; lines without
                                # a closing quote still match
                                r'\s+"(?P<user_agent>.*?)"?$')

# define list of columns to use
access_column_list = ['date', 'client_ip', 'request', 'http_response', 'bytes', 'something_else', 'user_agent']


# call parser: the first result holds the extracted column values for each
# line that matched the pattern, the second holds the raw lines that did not
access_logs_parsed, access_logs_parsefail = log_Parser(access_logs, web_access_pattern, access_column_list)


### Troubleshooting Only

In [0]:
# test parsing
#access_logs_parsed[:5]

In [0]:
# did any lines fail to parse?
#access_logs_parsefail[:5]

## Data Frame

In [46]:
# Build the DataFrame from the parsed rows and convert the 'date' strings to
# pandas datetimes in one chain, then show the first rows.
# The ' -0700' offset is matched as literal text rather than parsed as a
# timezone, so the format only fits lines carrying exactly that offset.
access_logs_df = (
  pd.DataFrame.from_records(access_logs_parsed, columns=access_column_list)
  .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d/%b/%Y:%H:%M:%S -0700'))
)
display(access_logs_df.head())

Unnamed: 0,date,client_ip,request,http_response,bytes,something_else,user_agent
0,2019-03-22 01:55:15,101.89.29.92,GET / HTTP/1.1,200,46424,-,Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like...
1,2019-03-22 01:57:37,54.36.148.43,GET /self.logs/2015/access.log.2015-05-08.gz H...,200,4813,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
2,2019-03-22 01:58:55,54.36.148.18,GET /self.logs/error.log.2016-04-07.gz HTTP/1.1,404,284,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
3,2019-03-22 02:04:26,54.36.148.62,GET /self.logs/access.log.2016-10-30.gz HTTP/1.1,404,284,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
4,2019-03-22 02:04:51,54.36.149.57,GET /self.logs/2016/error.log.2016-05-19.gz HT...,200,867,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...


# Instructions

In this lab, you will combine techniques from the previous labs to explore an additional data set.  Namely, you will look at the web access logs from the same server as the error logs we looked at in Lab #1.

* To save time, the logs have been parsed already.  They are stored in a Pandas DataFrame called "access_logs_df".

* Conduct exploratory data analysis to gain an understanding of these logs, and answer the questions defined below.  We recommend inserting text blocks to organize your work, if helpful.  Several collapsible "header" sections have been provided to guide your analysis.

* The IP reputation data used in the previous labs is also available as a DataFrame.  It is stored in the variable "intel_df".

**To begin, choose "Runtime -> Run All" from the menu options.**

# Exploratory Analysis: Web Access Logs

To get started, here are two easy steps provided for you.  Take a look at a sample of the data.  Then examine some basic summary statistics for the entire dataframe (you should still examine individual columns more carefully).

## Examine the Data

In [47]:
# preview the first rows of the parsed access logs
access_logs_df.head()

Unnamed: 0,date,client_ip,request,http_response,bytes,something_else,user_agent
0,2019-03-22 01:55:15,101.89.29.92,GET / HTTP/1.1,200,46424,-,Mozilla/5.0 (iPhone; CPU iPhone OS 10_2_1 like...
1,2019-03-22 01:57:37,54.36.148.43,GET /self.logs/2015/access.log.2015-05-08.gz H...,200,4813,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
2,2019-03-22 01:58:55,54.36.148.18,GET /self.logs/error.log.2016-04-07.gz HTTP/1.1,404,284,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
3,2019-03-22 02:04:26,54.36.148.62,GET /self.logs/access.log.2016-10-30.gz HTTP/1.1,404,284,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
4,2019-03-22 02:04:51,54.36.149.57,GET /self.logs/2016/error.log.2016-05-19.gz HT...,200,867,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...


## Summary Statistics

In [48]:
# include='all' summarizes every column, not only the numeric ones
access_logs_df.describe(include='all')

Unnamed: 0,date,client_ip,request,http_response,bytes,something_else,user_agent
count,1305,1305,1305,1305.0,1305.0,1305,1305
unique,974,576,503,5.0,380.0,67,135
top,2019-03-23 01:52:10,222.186.59.44,GET / HTTP/1.1,200.0,12511.0,-,Mozilla/5.0 (compatible; AhrefsBot/6.1; +http:...
freq,32,23,306,837.0,168.0,886,241
first,2019-03-22 01:55:15,,,,,,
last,2019-03-23 02:08:53,,,,,,
