This notebook is being used to experiment with and develop analysis tools and visualisations for crawl processes. The production version will generate HTML outputs.

In [25]:
import re
import json

# Display names of the hourly series; they double as the keys of `lines`
# and as the trace names in the plots.
ALL = 'All'
S200 = '200 Found'
SM9998 = '-9998 Blocked by robots.txt'
CAPPED = 'Hit data cap'
DEDUP = 'De-duplicated responses'

# `hours` holds the time label of each BY-HOUR record, in file order.
# `lines` holds one list per series, index-aligned with `hours`.
hours = []
lines = {ALL: [], S200: [], SM9998: [], CAPPED: [], DEDUP: []}

summary_file = '../tasks/process/extract/test-data/weekly-20170220090024-crawl-logs-12.analysis.tsjson'

# Counter families left out of the sample print-out below.
NOISY_PREFIXES = ('source:', 'ip:', 'content_type:', 'status_code:', 'host:')
# Zero-based index of the BY-HOUR record whose remaining counters are printed
# as a sample of what else is available in the data.
SAMPLE_HOUR_INDEX = 10

# Each record in the summary file is "<key>\t<json object>"; the hourly
# records have keys of the form "BY-HOUR <time label>".
with open(summary_file) as f:
    for line in f:
        key, tab, json_str = line.partition('\t')
        # Skip other record types, and lines with no tab at all (e.g. a blank
        # trailing line), which used to abort the whole cell.
        if not tab or not key.startswith("BY-HOUR"):
            continue
        hour = key.partition(' ')[2]
        stats = json.loads(json_str)
        if len(hours) == SAMPLE_HOUR_INDEX:
            for stat in stats:
                if not stat.startswith(NOISY_PREFIXES):
                    print("%s %i" % (stat, stats[stat]))
        hours.append(hour)
        lines[ALL].append(stats['lines'])
        # A counter is simply absent from the record when nothing was counted
        # in that hour, so read every optional counter with .get(); the
        # resulting None keeps each series aligned with `hours`.
        lines[S200].append(stats.get('status_code:200'))
        lines[SM9998].append(stats.get('status_code:-9998'))
        lines[CAPPED].append(stats.get('Q:serverMaxSuccessKb'))
        lines[DEDUP].append(stats.get('duplicate:digest'))
            

# And plot:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as FF
init_notebook_mode(connected=True)

# One stand-alone figure for each of the two series that are easier to read
# on their own axis.
single_series_figures = (
    (CAPPED, 'URLs dropped due to site downloads cap'),
    (DEDUP, 'URLs deduplicated'),
)
for series_name, figure_title in single_series_figures:
    py.iplot({
        'data': [go.Scatter(x=hours, y=lines[series_name])],
        'layout': {'title': figure_title},
    })

# Combined figure: one named trace per collected series.
data = [
    go.Scatter(x=hours, y=series, name=label)
    for label, series in lines.items()
]
py.iplot({
    'data': data,
    'layout': {'title': 'Status codes over time (whole crawl)'},
})




tries:5t 110
extractorSWFRelToBase 5
unsatisfiableCharsetInHeader:ISO 8859-1 1
tries:4t 11
duplicate:digest 130434
usingCharsetInHTML:windows-1252 1375
Q:serverMaxSuccessKb 38
unsatisfiableCharsetInHeader:binary 268
extractorSWFRelToBoth 2
hop:I 655
hop:L 365853
hop:E 105773
hop:X 13330
hop:R 20620
hop:P 764
unsatisfiableContentEncoding:utf8 1
lines 506995
usingCharsetInHTML:UTF-8 1525
unsatisfiableContentEncoding: 1
extractorSWFRelToVia 4
tries:3t 421
timeTrunc 3
tries:2t 1764


In [26]:
# Number of rows shown in the table below.
TOP_N_SOURCES = 20

# `sources` maps the label that follows "BY-HOST " in the record key to that
# record's full stats dict; `order_by_lines` maps the same label to its
# 'lines' count so it can be used as a sort key.
sources = {}
order_by_lines = {}
with open(summary_file) as f:
    for line in f:
        key, tab, json_str = line.partition('\t')
        # Skip other record types, and lines with no tab at all (e.g. a blank
        # trailing line), which used to abort the whole cell.
        if not tab or not key.startswith("BY-HOST"):
            continue
        source = key.partition(' ')[2]
        stats = json.loads(json_str)
        sources[source] = stats
        order_by_lines[source] = stats['lines']

# Extract the entries with the most crawl-log lines:
print("TOP...")
data_matrix = [['Seed', 'Total Hits']]
top_sources = sorted(order_by_lines, key=order_by_lines.get, reverse=True)[:TOP_N_SOURCES]
# Dump the full stats of the last entry that made the cut, as a sample of the
# record structure. Indexing from the end (rather than a fixed [19]) keeps
# this working when the file holds fewer than TOP_N_SOURCES entries.
if top_sources:
    print(sources[top_sources[-1]])
for source in top_sources:
    data_matrix.append([source, sources[source]['lines']])

table = FF.create_table(data_matrix)
py.iplot(table)




TOP...
{u'status_code:1': 6, u'status_code:0': 1, u'host:descrier.co.uk': 27423, u'content_type:unknown': 11, u'source:http://descrier.co.uk/': 27420, u'tries:2t': 21, u'content_type:text/css': 12, u'content_type:application/xml': 1, u'hop:I': 3, u'content_type:text/dns': 6, u'ip:54.209.222.62': 27411, u'hop:L': 8132, u'tries:4t': 1, u'source:http://www.1000londoners.com/': 3, u'status_code:405': 2, u'status_code:404': 42, u'hop:E': 17137, u'hop:X': 1752, u'content_type:image/x-icon': 1, u'hop:R': 388, u'hop:P': 10, u'content_type:application/javascript': 24, u'hop:-': 1, u'Q:serverMaxSuccessKb': 1, u'content_type:application/json': 5092, u'content_type:text/plain': 6, u'status_code:301': 840, u'content_type:image/gif': 16, u'duplicate:digest': 18914, u'content_type:text/html': 9689, u'tries:3t': 4, u'status_code:200': 25425, u'content_type:image/vnd.microsoft.icon': 2, u'WrenderedURL': 1, u'tries:5t': 1, u'content_type:image/png': 202, u'content_type:text/xml': 5092, u'content_type:im

In [16]:
# Prefix of the per-seed counters inside a stats record.
SOURCE_PREFIX = 'source:'

# `targets` maps the ID that follows "BY-TARGET " in the record key to that
# record's full stats dict; `order_by_lines` maps the same ID to its 'lines'
# count so it can be used as a sort key.
targets = {}
order_by_lines = {}
with open(summary_file) as f:
    for line in f:
        key, tab, json_str = line.partition('\t')
        # Skip other record types, and lines with no tab at all (e.g. a blank
        # trailing line), which used to abort the whole cell.
        if not tab or not key.startswith("BY-TARGET"):
            continue
        tid = key.partition(' ')[2]
        stats = json.loads(json_str)
        targets[tid] = stats
        order_by_lines[tid] = stats['lines']

print("TOP...")
data_matrix = [['Target ID', 'Seeds', 'Total Hits']]
# NOTE(review): the sort is ascending, so despite the "TOP" heading and the
# variable name this selects the ten targets with the FEWEST lines. Kept as
# is because it may be deliberate -- confirm, and flip `reverse` if the
# largest targets were intended.
top_targets = sorted(order_by_lines, key=order_by_lines.get, reverse=False)[:10]
for tid in top_targets:
    # Collect the values of the "source:<value>" counters of this target,
    # sorted so the cell renders identically on every run.
    hosts = sorted(
        name[len(SOURCE_PREFIX):]
        for name in targets[tid]
        if name.startswith(SOURCE_PREFIX)
    )
    # NOTE(review): plotly text usually needs "<br>" for a line break, so the
    # "\n" separator may not render as one in the table -- confirm.
    data_matrix.append([tid, "\n".join(hosts), targets[tid]['lines']])

table = FF.create_table(data_matrix)
py.iplot(table)

# Dump the full stats of the first listed target as a sample of the record
# structure; guarded so a file without BY-TARGET records does not raise.
if top_targets:
    print(targets[top_targets[0]])


TOP...


{u'WrenderedURL': 1, u'hop:-': 1, u'status_code:404': 1, u'lines': 1, u'content_type:unknown': 1, u'host:twitter.com': 1, u'source:https://twitter.com/alanbissett': 1}
