add log analysis scripts
For internal use only

Change-Id: I0c1303b6eae1f1bae6e94bca1baa4c2893f51ba4
smalyshev committed Sep 22, 2015
1 parent ce786b8 commit 3cd1600
Showing 5 changed files with 173 additions and 0 deletions.
91 changes: 91 additions & 0 deletions log-analysis/analyze.py
@@ -0,0 +1,91 @@
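# Analyze an updater log: split it into per-thread part files with split.pl,
# compute batch and query timing statistics via shell pipelines and the external
# ~/bin/r summary helper, and write a report to <logfile>.out.md.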
import sys
import os
import subprocess
import os.path
import re

def getOut(cmd, file=None):
    if file is None:
        file = mainfile
    return subprocess.check_output(cmd % file, shell=True).rstrip('0').replace('V1', '')

def getSummary(cmd, file=None):
    return getOut("cat %%s | %s | ~/bin/r summary -" % cmd, file)

meanre = re.compile(r'\s*Mean\s*:\s*([\d.]+)', re.MULTILINE)
def getAvg(summary):
    m = meanre.search(summary)
    #print m, summary
    return float(m.group(1))

logfile = sys.argv[1]
outfile = open(logfile + ".out.md", "w")
mainfile = logfile + ".part"

if not os.path.exists(mainfile):
    print "Splitting %s into parts" % logfile
    os.system("perl split.pl %s" % logfile)

print ("Main file %s " % mainfile)

batches = int(getOut("cat %s | grep 'changes in batch' | wc -l"))
print "%d batches in file" % batches
outfile.write("Logfile %s contains %d batches\n\n" % (logfile, batches))
outfile.write("= Batch Stats =\n")

modified_triples = getSummary("grep 'and modified' | cut -d ' ' -f 13")
modified_avg = getAvg(modified_triples)

changes_per_batch = getSummary("grep 'changes in batch' | cut -d ' ' -f 6")
print "%f changes per batch" % getAvg(changes_per_batch)
outfile.write("Changes per batch:\n%s\n" % changes_per_batch)

filtered_per_batch = getSummary("grep 'Filtered' | cut -d ' ' -f 9")
print "%f real changes per batch" % getAvg(filtered_per_batch)
outfile.write("Filtered changes per batch:\n%s\n" % filtered_per_batch)
filter_avg = getAvg(filtered_per_batch)

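# Wall-clock milliseconds between consecutive 'Polled up to' log lines, i.e. time spent per batch.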
time_per_batch = getSummary("grep 'Polled up to' | cut -d ' ' -f 1 | perl -ne '/(\d+):(\d+):(\d+)\.(\d+)/; $ms = $4+1000*($3+60*($2+60*$1)); if($lastms) { print $ms-$lastms, chr(0xA); } $lastms = $ms;'")
outfile.write("Time per batch (ms):\n%s\n" % time_per_batch)
time_avg = getAvg(time_per_batch)

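# Seconds by which the polled-up-to timestamp advances per batch, i.e. how far the change stream moves.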
advance_per_batch = getSummary("grep 'Polled up to' | cut -d ' ' -f 10 | perl -ne '/T(\d+):(\d+):(\d+)Z/; $ms = ($3+60*($2+60*$1)); if($lastms) { print $ms-$lastms,chr(0xA); } $lastms = $ms;'")
outfile.write("Advance per batch (s):\n%s\n" % advance_per_batch)
adv_avg = getAvg(advance_per_batch)

outfile.write("Modified triples:\n%s\n" % modified_triples)

outfile.write("Modified triples - %f per entity\n" % (modified_avg/filter_avg))

outfile.write("= Processing Stats =\n")

outfile.write("Processing time per change: %f ms, %f updates/second\n" %(time_avg/filter_avg, filter_avg/time_avg*1000))
print ("Processing time per change: %f ms, %f updates/second\n" %(time_avg/filter_avg, filter_avg/time_avg*1000))

print "Stream is %f updates/s\n" % (filter_avg/adv_avg)
outfile.write("\nStream is %f updates/s\n" % (filter_avg/adv_avg))
outfile.write("\nUpdate/stream ratio is %f\n\n" % (adv_avg/time_avg*1000))

update_preparation = getSummary("grep 'Preparing update' | cut -d ' ' -f 10")
outfile.write("Update preparation time (ms):\n%s\n" % update_preparation)

sparqlname = logfile + ".q"
if not os.path.exists(sparqlname):
    os.system("cat %s | perl getsparql.pl | sort > %s" % (logfile, sparqlname))
print "Queries in %s" % sparqlname

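# Per-query timings, taken from the sorted <logfile>.q file produced by getsparql.pl.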
filter_times = getSummary("grep 'SELECT DISTINCT ?s WHERE' | cut -d ' ' -f 7", sparqlname)
update_times = getSummary("grep 'Clear' | cut -d ' ' -f 9", sparqlname)
tsupdate_times = getSummary("grep 'DELETE {' | cut -d ' ' -f 4", sparqlname)
values_times = getSummary("grep 'SELECT DISTINCT ?entity ?s' | cut -d ' ' -f 6", sparqlname)

outfile.write("== Query Stats ==\n")
outfile.write("FILTER query (ms):\n%s\n" % filter_times)
outfile.write("SELECT values query (ms):\n%s\n" % values_times)
outfile.write("UPDATE query (ms):\n%s\n" % update_times)
outfile.write("Timestamp UPDATE query (ms):\n%s\n" % tsupdate_times)

update_avg = getAvg(update_times)

outfile.write("Update query speed is %f entities/s, %f triples/s\n" % (filter_avg/update_avg*1000, modified_avg/update_avg*1000))
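# Copy the finished report to the clipboard (macOS pbcopy).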
os.system("cat %s.out.md | pbcopy" % logfile)
7 changes: 7 additions & 0 deletions log-analysis/datediff.pl
@@ -0,0 +1,7 @@
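# Print the difference in milliseconds between consecutive HH:MM:SS.mmm timestamps in the input.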
while(<>) {
    if(/(\d+):(\d+):(\d+)\.(\d+)/) {
        $ms = $4+1000*($3+60*($2+60*$1));
        if($lastms) { print $ms-$lastms, "\n"; }
        $lastms = $ms;
    }
}
32 changes: 32 additions & 0 deletions log-analysis/getq.pl
@@ -0,0 +1,32 @@
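# Classify each SPARQL query in the log by shape (version, fetch, refs, values, update)
# and print its completion time in ms, its type and its line count.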
$qtype = undef;
while(<>) {
    if(/Running SPARQL/) {
        $qtype = 'other';
        $cnt = 1;
    }
    if(/Completed in (\d+)/) {
        if($qtype) {
            print "$1 $qtype $cnt\n";
        }
        $qtype = undef;
        next;
    }
    next unless $qtype;
    $cnt++;
    if(/ASK {/) {
        $qtype = 'version';
    }
    if(/SELECT DISTINCT \?s/) {
        $qtype = 'fetch';
    }
    if($qtype eq 'fetch' && m|<http://www.w3.org/ns/prov#wasDerivedFrom>|) {
        $qtype = 'refs';
    }
    if($qtype eq 'refs' && /} UNION {/) {
        $qtype = 'values';
    }
    if(/INSERT {/) {
        $qtype = 'update';
    }
}

15 changes: 15 additions & 0 deletions log-analysis/getsparql.pl
@@ -0,0 +1,15 @@
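# Print "<query> => <completion time in ms>" for every SPARQL query found in the log.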
$sparql = 0;
$name = "wtf";
while(<>) {
    if(!$sparql) {
        if(/Running SPARQL: (.+)/) {
            $name = $1;
            $sparql = 1;
        }
    } else {
        if(/- Completed in (\d+) ms/) {
            $sparql = 0;
            print "$name => $1\n";
        }
    }
}
28 changes: 28 additions & 0 deletions log-analysis/split.pl
@@ -0,0 +1,28 @@
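# Split a log into per-thread part files based on the [main] / [update N] marker:
# the main thread goes to <log>.part, update thread N to <log>.partN.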

$f = $ARGV[0];

open $in, $ARGV[0] or die $!;
%FILES = ();
$out = '';
$outf = STDOUT;

while(<$in>) {
    if(/\[(main|update (\d+))\]/) {
        $newout = $1;
        if($out ne $newout) {
            $outf = switchout($newout, $2);
            $out = $newout;
        }
    }
    print $outf $_;
}

sub switchout {
    my ($newout, $name) = @_;
    if(!defined($FILES{$newout})) {
        my $f = $ARGV[0].".part".$name;
        print "Opening $f for $newout\n";
        open $FILES{$newout}, ">", $f;
    }
    return $FILES{$newout};
}
