In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from datetime import datetime
import seaborn as sns
%matplotlib inline

## Text Processing and Dates

In [2]:
# Preview the beginning of the raw access log using the shell's `head` command.
!head log.txt

169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"
193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"
169.237.46.240 - "" [3/Feb/2006:10:18:37 -0800] "GET /stat141/homework/Solutions/hw1Sol.pdf HTTP/1.1"


In [3]:
# Read the whole log into memory, one string per line (trailing '\n' kept).
# The `with` block closes the file handle as soon as the read is done instead
# of leaving it open until garbage collection.
with open('log.txt') as log_file:
    lines = log_file.readlines()

# Keep the first entry around as the running example.
first = lines[0]
first

'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'

**How can I extract the date and time?**

In [4]:
# Use the first log entry (the same string as `first`) for the examples below.
line = lines[0]
line

'169.237.46.168 - - [26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n'

In [5]:
# Splitting on ' [' separates the leading host/identity fields from the
# timestamp and everything after it.
line.split(' [')

['169.237.46.168 - -',
 '26/Jan/2014:10:47:58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585 "http://anson.ucdavis.edu/courses/"\n']

In [6]:
# Keep what follows the opening ' [' and then what precedes the closing '] ':
# that leaves just the timestamp with its UTC offset.
line.split(' [')[1].split('] ')[0]

'26/Jan/2014:10:47:58 -0800'

Works, but what if I want the individual components? E.g.

    ['26', 'Jan', '2014', '10', '47', '58', '-0800']

In [8]:
# Isolate the bracketed timestamp, then peel the fields off one separator at a
# time: '/' for day and month, ':' for year/hour/minute, ' ' for the offset.
day, month, rest = line.split(' [')[1].split('] ')[0].split('/')
year, hour, minute, rest = rest.split(':')
sec, timezone = rest.split(' ')

(day, month, year, hour, minute, sec, timezone)

('26', 'Jan', '2014', '10', '47', '58', '-0800')

Putting it together

In [9]:
# Cut at the first '[' and then at the first space after it, which drops the
# UTC offset and leaves 'day/month/year:hour:minute:second'.
time_str = (
    first
    .split('[', maxsplit=1)[1]
    .split(' ', maxsplit=1)[0]
)
day, month, rest = time_str.split('/')
year, hour, minute, second = rest.split(':')
(year, month, day, hour, minute, second)

('2014', 'Jan', '26', '10', '47', '58')

**How to do it with regular expressions.**

In [10]:
# Match the exact timestamp text. The brackets are regex metacharacters, so
# they are escaped, and the pattern is a raw string so that `\[` reaches the
# regex engine as written (in a normal string it is an invalid escape).
re.findall(r'\[26/Jan/2014:10:47:58 -0800\]', line)

['[26/Jan/2014:10:47:58 -0800]']

In [11]:
# Same literal match, but each component is wrapped in a capture group so
# `findall` returns the pieces as a tuple. Raw string keeps `\[` / `\]` intact.
re.findall(r'\[(26)/(Jan)/(2014):(10):(47):(58) (-0800)\]', line)

[('26', 'Jan', '2014', '10', '47', '58', '-0800')]

The `.` character is a wildcard: it matches any single character except a newline.

In [12]:
# Replace each literal character with `.` so any timestamp with the same field
# widths matches. Raw string keeps the escaped brackets intact.
re.findall(r'\[(..)/(...)/(....):(..):(..):(..) (.....)\]', line)

[('26', 'Jan', '2014', '10', '47', '58', '-0800')]

In [13]:
# Apply the fixed-width pattern to every line; entries whose fields have a
# different width produce an empty result list.
[re.findall(r'\[(..)/(...)/(....):(..):(..):(..) (.....)\]', line)
 for line in lines]

[[('26', 'Jan', '2014', '10', '47', '58', '-0800')], [], []]

In [14]:
# Only one digit for the day (and for the seconds) here, so the fixed-width
# pattern above cannot match this line.
lines[1]

'193.205.203.3 - - [2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/dim.html HTTP/1.0" 404 302 "http://eeyore.ucdavis.edu/stat141/Notes/session.html"\n'

In [15]:
# `.+` (one or more of any character) lets every field have variable width, so
# all lines match now. Raw string keeps the escaped brackets intact.
[re.findall(r'\[(.+)/(.+)/(.+):(.+):(.+):(.+) (.+)\]', line)
 for line in lines]

[[('26', 'Jan', '2014', '10', '47', '58', '-0800')],
 [('2', 'Feb', '2005', '17', '23', '6', '-0800')],
 [('3', 'Feb', '2006', '10', '18', '37', '-0800')]]

**Question:** What happens if we remove the brackets?

In [16]:
# Without the literal brackets nothing anchors the match, so the greedy `.+`
# groups at both ends swallow text from before and after the timestamp.
re.findall(
    '(.+)/(.+)/(.+):(.+):(.+):(.+) (.+)',
    line,
)

[('169.237.46.168 - - [26',
  'Jan',
  '2014',
  '10',
  '47',
  '58 -0800] "GET /stat141/Winter04/ HTTP/1.1" 200 2585',
  '"http://anson.ucdavis.edu/courses/"')]

In [17]:
# Restrict each group to the characters it may actually contain:
#   \d+        one or more digits (day, year, hour, minute, second)
#   [a-zA-Z]+  month abbreviation, ASCII letters only
#   [+-]\d+    UTC offset: an explicit sign followed by digits
# The upper bound of the letter class must be `Z`: the range `A-z` would also
# admit the punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `).
# The pattern is a raw string so the regex escapes reach `re` untouched.
[re.findall(
    r'\[(\d+)/([a-zA-Z]+)/(\d+):(\d+):(\d+):(\d+) ([+-]\d+)\]',
    line
) for line in lines]

[[('26', 'Jan', '2014', '10', '47', '58', '-0800')],
 [('2', 'Feb', '2005', '17', '23', '6', '-0800')],
 [('3', 'Feb', '2006', '10', '18', '37', '-0800')]]

**How to do it with pandas?**

In [18]:
# Split every line once at the first '[' and keep the right-hand part, which
# starts with the timestamp. `n` is passed by keyword because recent pandas
# releases no longer accept it positionally.
pd.Series(lines).str.split('[', n=1, expand=True)[1]

0    26/Jan/2014:10:47:58 -0800] "GET /stat141/Wint...
1    2/Feb/2005:17:23:6 -0800] "GET /stat141/Notes/...
2    3/Feb/2006:10:18:37 -0800] "GET /stat141/homew...
Name: 1, dtype: object

In [19]:
# Vectorised version of the plain-string approach:
#   1. cut each line at the first '[' and keep what follows,
#   2. cut that at the first space to drop the UTC offset.
# `n` is passed by keyword because recent pandas releases no longer accept it
# positionally.
time_strs = (
    pd.Series(lines)
    .str.split('[', n=1, expand=True)[1]
    .str.split(' ', n=1, expand=True)[0]
)

# 'day/month/rest' -> three columns; 'rest' is then split on ':' into
# year, hour, minute and second.
day_month_rest = time_strs.str.split('/', expand=True)

# Put day and month next to the four time columns. Both pieces keep their own
# integer labels, so the labels 0 and 1 appear twice in the result; pass
# ignore_index=True to pd.concat if unique labels are needed.
pd.concat([day_month_rest.loc[:, 0:1],
           day_month_rest[2].str.split(':', expand=True)], axis=1)

Unnamed: 0,0,1,0.1,1.1,2,3
0,26,Jan,2014,10,47,58
1,2,Feb,2005,17,23,6
2,3,Feb,2006,10,18,37


In [20]:
# One capture group per timestamp component: day, month name, year, then
# hour, minute and second. `re.search` finds the first place this fits.
pattern = r'(\d+)/(\w+)/(\d+):(\d+):(\d+):(\d+)'
(day, month, year,
 hour, minute, second) = re.search(pattern, first).groups()
(year, month, day, hour, minute, second)

('2014', 'Jan', '26', '10', '47', '58')

In [21]:
# Apply the same pattern to every line at once; each capture group becomes a
# column of the resulting DataFrame.
pd.Series(lines).str.extract(pattern, expand=True)

Unnamed: 0,0,1,2,3,4,5
0,26,Jan,2014,10,47,58
1,2,Feb,2005,17,23,6
2,3,Feb,2006,10,18,37


**Date parsing using the `datetime` module.**

In [22]:
# Parse the timestamp string (without the UTC offset) into a datetime object.
# Format codes: %d day, %b abbreviated month name, %Y four-digit year,
# %H:%M:%S hour, minute and second.
ts = datetime.strptime(time_str, '%d/%b/%Y:%H:%M:%S')
ts

datetime.datetime(2014, 1, 26, 10, 47, 58)

In [23]:
# Format the datetime back into text using the same layout it was parsed with.
ts.strftime('%d/%b/%Y:%H:%M:%S')

'26/Jan/2014:10:47:58'

In [24]:
# The original string, for comparison: the formatted output above is identical.
time_str

'26/Jan/2014:10:47:58'

In [25]:
# Pull the timestamp out of the brackets and parse the whole column in one
# vectorised call. The trailing UTC offset may be any signed four-digit value
# (not only -0800); it is dropped, so the results are naive datetimes. Lines
# without a bracketed timestamp yield NaT instead of raising.
pd.to_datetime(
    pd.Series(lines).str.extract(r'\[(.*?) [+-]\d{4}\]')[0],
    format='%d/%b/%Y:%H:%M:%S',
)

0   2014-01-26 10:47:58
1   2005-02-02 17:23:06
2   2006-02-03 10:18:37
Name: 0, dtype: datetime64[ns]