In [1]:
import re
import pandas as pd
from datetime import datetime

In [2]:
# Location of the raw log, resolved against the notebook's working directory.
log_file_path = "adubovyk.txt"

# Pull the whole file into memory as bytes; turning it into text happens in a later cell.
with open(log_file_path, mode="rb") as fin:
    data = fin.read()

In [3]:
# Timestamp shaped like DD-Mon-YYYY HH:MM:SS, e.g. "15-Aug-2017 11:56:50".
ts_regex = r'\d\d-\w\w\w-\d\d\d\d \d\d:\d\d:\d\d'
# A bracketed word followed by " - " and two capital letters, e.g. " [hrk_cv_co] - RE".
prjct_regex = r' \[\w+\] \- [A-Z][A-Z]'

In [4]:
# Decode the raw bytes as UTF-8 (the codec bytes.decode() uses by default) and cut at every "\n".
lines = data.decode("utf-8").split("\n")

In [5]:
# Quick probe: does the third line of the log start with a timestamp?
re.match(ts_regex, lines[2])

In [6]:
# Keep only the lines that open with a timestamp; every other line is treated as irrelevant.
# Each kept line becomes a pair:
#   - the first 20 characters parsed as a datetime,
#   - the text from index 22 onwards with surrounding whitespace removed.
lines_relevant = [
    (datetime.strptime(entry[:20], "%d-%b-%Y %H:%M:%S"), entry[22:].strip())
    for entry in filter(lambda text: re.match(ts_regex, text), lines)
]

## 1. Parse the file into two columns: timestamp and event.

In [7]:
# Preview the parsed entries as a two-column table (first five rows).
pd.DataFrame(data=lines_relevant, columns=['time_stamp', 'event']).head(n=5)

Unnamed: 0,time_stamp,event
0,2017-08-15 11:56:50,needToCreateFontConfig = true
1,2017-08-15 11:56:50,Loading configuration from: D:\axiom\client\pr...
2,2017-08-15 11:56:51,Frame opened: Controller View [hrk_cv_co]
3,2017-08-15 11:57:02,Connecting to 192.168.138.17:2020
4,2017-08-15 11:57:03,Connection type was overrided to UNIDIRECTIONA...


## 2. Extract two entities: Project and Branch

In [8]:
def _project_record(stamp, event):
    """Turn one parsed log entry into a (date, project, branch) tuple.

    Args:
        stamp: ``datetime`` of the log entry.
        event: Event text of the log entry.

    Returns:
        ``(date, project, branch)`` when ``event`` contains the project tag
        described by ``prjct_regex``; ``None`` otherwise.
    """
    tag = re.search(prjct_regex, event)
    if tag is None:
        return None
    # Read the project name from the matched tag itself, so an unrelated "[...]"
    # earlier in the event text cannot be mistaken for the project. The pattern
    # is a raw string: '\[' in a plain string is an invalid escape sequence.
    project = re.search(r'\[(.*?)\]', tag.group(0)).group(1)
    # Branch = text after the last hyphen, stripped, with any '.trunk' removed.
    # NOTE(review): a branch name that itself contains '-' would be cut short.
    branch = event.split('-')[-1].strip().replace('.trunk', '')
    return (stamp.date(), project, branch)


# One tuple per log entry that carries a project tag; other entries are skipped.
project_lines = [
    record
    for record in (_project_record(stamp, event) for stamp, event in lines_relevant)
    if record is not None
]

In [9]:
# Tabulate the extracted tuples; the column order mirrors the tuple layout.
project_df = pd.DataFrame(
    data=project_lines,
    columns=['date', 'project', 'branch'],
)
project_df.head(n=5)

Unnamed: 0,date,project,branch
0,2017-08-15,hrk_cv_co,REPORTE_TRANSACCIONES_EFECTIVO
1,2017-08-15,hrk_cv_co,REPORTE_TRANSACCIONES_EFECTIVO
2,2017-08-15,hrk_cv_co,REPORTE_TRANSACCIONES_EFECTIVO
3,2017-08-15,hrk_cv_co,REPORTE_TRANSACCIONES_EFECTIVO
4,2017-08-15,hrk_cv_co,REPORTE_TRANSACCIONES_EFECTIVO


## 3. Simple report that counts the number of events per day per project per branch.

In [10]:
# Count the rows for every (project, branch, date) combination.
(
    project_df
    .groupby(['project', 'branch', 'date'])['date']
    .count()
)

project    branch                             date      
hrk_cv_co  CVT                                2017-08-28      1
           FATCA_DATA                         2017-08-16      1
           FATCA_DATA.2016_v02                2017-08-16      1
           FORMATO_322_323                    2017-08-16      3
                                              2017-09-05      9
           FORMATO_473                        2017-09-04      2
           FORMATO_531                        2017-09-06      4
           REPORTE_DEP_INDIV_FOGAFIN          2017-09-05    201
                                              2017-09-06    126
                                              2017-09-08     51
           REPORTE_DEP_INDIV_FOGAFIN *        2017-09-05      1
           REPORTE_DEP_INDIV_FOGAFIN1         2017-09-06     26
           REPORTE_TRANSACCIONES_EFECTIVO     2017-08-15     51
                                              2017-08-16     44
                                              2