In [None]:
This notebook is being used to experiment with and develop analysis tools and visualisations for crawl processes. The production version will generate HTML outputs.

In [1]:
import re
import sys
import json
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as FF
init_notebook_mode(connected=True)

sys.path.append('..')
from crawl.reports.plotto import *

summary_file = '../tasks/process/test-data/weekly-20170220090024-crawl-logs-14.analysis.tsjson.sorted'


hours, stats = load_timeline(summary_file)

py.iplot(plot_status_codes_timeline(hours, stats))

for plot in plot_single_timelines(hours,stats):
    py.iplot(plot)

py.iplot(plot_dedup_timeline(hours,stats))

In [48]:
sources = {}
order_by_lines = {}
lines = []
with open(summary_file) as f:
    for line in f:
        key, json_str = re.split('\t', line, maxsplit=1)
        if key.startswith("BY-HOST"):
            tag, source = re.split(' ', key, maxsplit=1)
            stats = json.loads(json_str)
            sources[source] = stats
            order_by_lines[source] = stats['lines']
            if stats.get('lines', '-') != '-' and stats['lines'] != 0:
                lines.append(stats['lines'])
                        
#            data_matrix = [
#                        ['Seed', source],
#                        ['Total Crawl Events', stats['lines']],
#                        ['Deduplicated', stats.get('duplicate:digest','-')]
#                       ]
#            table = FF.create_table(data_matrix)
#            table.layout.update({'title': 'Summary statistics for %s' % source})
#            py.iplot(table, filename='simple_table')
#            if len(sources) == 3:
#                break

data = [go.Histogram(x=lines)]

layout = go.Layout(
    xaxis=dict(
        type='log',
        autorange=True
    ),
    yaxis=dict(
        type='log',
        autorange=True
    )
)
fig = go.Figure(data=data)#, layout=layout)

py.iplot(fig)

# Extract top fives:
print("TOP...")
data_matrix = []
data_matrix.append(['Seed', 'Total Hits'])
top_sources = sorted(order_by_lines, key=order_by_lines.get, reverse=False)[:20]
print(sources[top_sources[19]])
for source in top_sources:
    data_matrix.append([source,sources[source]['lines']])

table = FF.create_table(data_matrix)
py.iplot(table)




TOP...
{u'tries:5t': 1, u'source:http://www.theshoppie.com/': 2, u'lines': 2, u'status_code:-6': 1, u'content_type:unknown': 1, u'hop:E': 1, u'sum:content_length:-': 2, u'tries:2t': 1, u'host:zev.lacounty.gov': 2, u'status_code:-1': 1, u'content_type:text/dns': 1, u'hop:P': 1}


In [44]:
def load_targets(summary_file):
    targets = {}
    with open(summary_file) as f:
        for line in f:
            key, json_str = re.split('\t', line, maxsplit=1)
            if key.startswith("BY-TARGET"):
                tag, tid = re.split(' ', key, maxsplit=1)
                stats = json.loads(json_str)
                targets[tid] = stats
    return targets

def pie_for(stats, prefix, title, label_max_length=40, max_slices=20):
    labels=[]
    values=[]
    total = 0
    for key in stats:
        if key.startswith(prefix):
            label = key[len(prefix):]
            if len(label) > label_max_length:
                label = "...%s" % label[-label_max_length:]
            if total < max_slices:
                labels.append(label)
                values.append(stats[key])
            total += 1
            
    layout = go.Layout(
        legend=dict(
            bgcolor='rgba(1.0,1.0,1.0,0.25)',
            borderwidth=0
        ),
        title="%s (%i in total)" %(title, total),
        height=400
    )

    trace=go.Pie(labels=labels,values=values, textinfo='none')
    fig = go.Figure(data=[trace], layout=layout)
    return fig
    
def annotations(stats):
    skip_prefixes = [ 'ip:', 'host:', 'status_code:', 'hop:', 'content_type:', 'source:']
    annots = {}
    for key in stats:
        skip = False
        for skip_prefix in skip_prefixes:
            if key.startswith(skip_prefix):
                skip = True
                break
        if not skip:
            annots[key] = stats[key]
    print(annots)
    
targets = load_targets(summary_file)
tid = '43611'
py.iplot(pie_for(targets[tid], 'status_code:', 'Status Codes'))
py.iplot(pie_for(targets[tid], 'tries:', 'Retries'))
py.iplot(pie_for(targets[tid], 'hop:', 'Crawl Hops'))
py.iplot(pie_for(targets[tid], 'host:', 'Hosts'))
py.iplot(pie_for(targets[tid], 'ip:', 'IP Addresses'))
py.iplot(pie_for(targets[tid], 'content_type:', 'Content Types'))
annotations(targets[tid])



{u'usingCharsetInHTML:UTF-8': 20, u'sum:content_length': 8316642946, u'WrenderedURL': 1, u'tries:5t': 72, u'tries:4t': 6, u'Q:serverMaxSuccessKb': 15, u'lines': 126992, u'sum:content_length:-': 18198, u'tries:2t': 1452, u'duplicate:digest': 61718, u'tries:3t': 721, u'unsatisfiableContentEncoding:utf8': 1}
