In [None]:
# Copyright 2016 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################
#
# Notebook to do various data slicing after collecting ratings.

In [None]:
%matplotlib notebook
from __future__ import division

import bs4
import collections
import csv
import itertools
import jsonpickle
import math
import numpy as np
import os
import os.path
import sys

import sklearn.cross_validation

from pyclick.click_models.PBM import PBM as pyclick_PBM
from pyclick.search_session.SearchResult import SearchResult as pyclick_SearchResult
from pyclick.search_session.SearchSession import SearchSession as pyclick_SearchSession

sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname('__file__'), os.path.pardir)))

from logs_processing.click_model import *
from logs_processing.create_tasks import Action, LogItem
from logs_processing.fields import orig_query, rel_column

In [None]:
# Directory containing every input file named below; each file name is
# joined onto this path before being opened.
CF = '<DIRECTORY_WITH_EXPORTED_CROWD_FLOWER_RESULT_CSVs>'
# Text files listing one worker id per line. Ratings from these workers are
# counted in the totals but do not contribute relevance labels.
SPAMMER_FILENAMES = ['spammer1.txt', 'spammer2.txt']
# CSV export holding the 'D' ratings (read as a single file).
RESULTS_D = 'f789260_D.full.csv'
# CSV exports holding the 'R' ratings; all parts are read in list order.
RESULTS_AR = ['f842336_A+R.part1.csv', 'f845369_A+R.part2.csv', 'f845808_A+R.part3.csv', 'f846814_A+R.part4.csv']
# CSV with the SERP items; the columns read from it are `log_id`,
# `sat_feedback`, `actions`, `snippet` and the original-query column.
TASK_FILE = 'task.csv'

In [None]:
# When True, each rating is weighted by the `_trust` value of its CSV row;
# when False, every rating gets weight 1.
USE_CF_TRUST = True

### Read the spammers data

In [None]:
spammers = set()
for s_file_name in SPAMMER_FILENAMES:
    with open(os.path.join(CF, s_file_name)) as f:
        for worker_id in f:
            spammers.add(worker_id.rstrip())

print '%d spammers' % len(spammers)

### Read the ratings

In [None]:
log_id_to_rel = collections.defaultdict(RelContainer)
log_id_to_query = {}
good_worker_ratings = 0
total_ratings = 0
d_workers = set()
with open(os.path.join(CF, RESULTS_D)) as results_D:
    for row in csv.DictReader(results_D):
        worker_id = row['_worker_id']
        d_workers.add(worker_id)
        total_ratings += 1
        if worker_id not in spammers:
            good_worker_ratings += 1
            trust = float(row['_trust']) if USE_CF_TRUST else 1
            log_id = row['log_id']
            RelContainer.add_rel(log_id_to_rel[log_id].Ds, row[rel_column['D']], trust)
            log_id_to_query[log_id] = row[orig_query['D']]
print '(D) %.1f%% ratings form spammers' % (100 - 100 * good_worker_ratings / total_ratings)
print '(D) workers', len(d_workers)   
print '(D) ratings', total_ratings
    
good_worker_ratings = 0
total_ratings = 0
r_workers = set()
for result_AR in RESULTS_AR:
    with open(os.path.join(CF, result_AR)) as results_AR:
        for row in csv.DictReader(results_AR):
            worker_id = row['_worker_id']
            r_workers.add(worker_id)
            total_ratings += 1
            if worker_id not in spammers:
                good_worker_ratings +=1
                trust = float(row['_trust']) if USE_CF_TRUST else 1
                log_id = row['log_id']
                RelContainer.add_rel(log_id_to_rel[log_id].Rs, row[rel_column['R']], trust)
                query = row[orig_query['R']]
                old_query = log_id_to_query.setdefault(log_id, query)
                if old_query != query:
                    print >>sys.stderr, ('The same log_id '
                            '(%s) maps to two different queries: [%s] and [%s]' % (
                                    log_id, old_query, query))
                    sys.exit(1)

print '%d items with complete relevance' % sum(
        1 for r in log_id_to_rel.itervalues() if r)

print '%d queries with at least one completely judged document' % len(set(
        log_id_to_query[k] for k, r in log_id_to_rel.iteritems() if r))

print '(R) workers', len(r_workers)
print '(R) ratings', total_ratings

print '(R) %.1f%% ratings form spammers' % (100 - 100 * good_worker_ratings / total_ratings)

### Read the SERPs

In [None]:
data = []
with open(os.path.join(CF, TASK_FILE)) as task_file:
    sat_labels = []
    num_skipped = 0
    num_sat_true = 0
    num_total = 0
    reader = csv.DictReader(task_file)
    for key, query_rows_iter in itertools.groupby(reader,
                    key=lambda row: (row['log_id'].split('_')[:-1], # SERP id
                                     row[orig_query['query']],
                                     row['sat_feedback'])):
        sat = key[2]
        if sat == 'undefined':
            print >>sys.stderr, 'Undefined sat label for query [%s]' % query
        sat_labels.append(sat)
        sat = parse_sat(sat)
        if sat is None:
            num_skipped += 1
            continue
        elif sat:
            num_sat_true += 1
        data_row = {'query': key[1], 'sat': sat, 'session': [], 'serp': []}
        for row in query_rows_iter:
            data_row['session'].append(jsonpickle.decode(row['actions']))
            data_row['serp'].append(
                    bs4.BeautifulSoup(row['snippet'], 'html.parser').li)
        data.append(data_row)
        num_total += 1
    print collections.Counter(sat_labels)
    print 'Skipped %d rows out of %d' % (num_skipped, num_total + num_skipped)
    print '%.1f%% of SAT labels in the data' % (num_sat_true / num_total * 100)

### Examine CSS classes and geometry

In [None]:
all_classes = []
min_geo_features = sys.maxint * np.ones(4)
max_geo_features = -1* np.ones(4)
offsets_left = []
for i, d in enumerate(data):
    for s in d['serp']:
        all_classes.append(frozenset(s['class']))
        geo_features = [int(f) for f in s['emup'].split(';')[1:]]
        offsets_left.append(geo_features[0])
        min_geo_features = np.minimum(min_geo_features, geo_features)
        max_geo_features = np.maximum(max_geo_features, geo_features)
print 'classes:', collections.Counter(all_classes)
print 'min:', min_geo_features
print 'max:', max_geo_features
print 'offsets left:', collections.Counter(offsets_left)

### Examine a session

In [None]:
all_classes = []
one_classes = []
for i, d in enumerate(data):
    for s in d['session']:
        print s.actions
        break
    break