# Filter evaluations by precision-recall analysis

## 1 Setup

Flags and settings.

In [1]:
# Sample size reported by both evaluations below (quotes first, then clusters).
SAMPLE_SIZE = 100

Imports and database setup.

In [2]:
from random import sample
from textwrap import indent, fill

import numpy as np

# NOTE(review): the settings module is imported while the working directory is
# the parent folder, then the notebook moves back into `notebooks` —
# presumably the import resolves something relative to the working directory;
# confirm before reordering these lines.
%cd -q ..
from brainscopypaste.conf import settings
%cd -q notebooks
from brainscopypaste.db import Cluster, Quote
from brainscopypaste.utils import init_db, session_scope, langdetect
from brainscopypaste.filter import filter_quote_offset
# Keep whatever init_db() returns at notebook level; the cells below only go
# through session_scope() and never reference `engine` directly.
engine = init_db()

## 2 Evaluate language filtering

In [3]:
with session_scope() as session:
    # Draw the sample inside this cell so that it runs on a fresh kernel
    # (Restart & Run All) instead of relying on a `quote_ids` variable left
    # over from an earlier session. random.sample raises ValueError if fewer
    # than SAMPLE_SIZE unfiltered quotes are available.
    unfiltered_quote_ids = [
        quote_id for (quote_id,)
        in session.query(Quote.id).filter(Quote.filtered == False)  # noqa: E712 (SQL expression)
    ]
    quote_ids = sample(unfiltered_quote_ids, SAMPLE_SIZE)
    # Read the strings while the session is still open.
    strings = [session.query(Quote).get(quote_id).string for quote_id in quote_ids]
# Pair every sampled string with the language langdetect() reports for it.
strings_langs = [(string, langdetect(string)) for string in strings]

In [4]:
# Count the sampled quotes whose detected language is anything other than 'en'.
non_english_flags = [lang != 'en' for _, lang in strings_langs]
rejected_quote_count = np.sum(non_english_flags)
language_summary = "Over a sample of {}, {} quotes are rejected because their detected language is not English"
print(language_summary.format(SAMPLE_SIZE, rejected_quote_count))

Over a sample of 100, 17 quotes are rejected because their detected language is not English


Here are the individual strings and their detected languages.

In [5]:
left_margin = ' ' * 5
for position, (quote_string, detected_lang) in enumerate(strings_langs, start=1):
    # Right-align "<position> / <sample size>" on an 80-column dashed rule.
    header = ' {} / {}'.format(position, SAMPLE_SIZE)
    print(header.rjust(80, '-'))
    print('Language:', detected_lang)
    print()
    # Wrap the quote with textwrap's default width, then indent the result.
    print(indent(fill(quote_string), left_margin))
    print()

------------------------------------------------------------------------ 1 / 100
Language: en

     they want to set an example for harlow nicole and joel are doing this
     to seal their love legally they want the ceremony to coincide with
     harlow's first birthday and be one big joyous occasion for everybody

------------------------------------------------------------------------ 2 / 100
Language: en

     i cannot believe what i have seen in the last 36 hours i have seen
     dead bodies blood everywhere and only heard gunshots

------------------------------------------------------------------------ 3 / 100
Language: en

     but i have to reiterate once again that we only have one president at
     a time and i want to be very careful that we are sending the right
     signals to the world as a whole that i am not the president and i
     won't be until january 20th

------------------------------------------------------------------------ 4 / 100
Language: en

     i extracte

## 3 Evaluate full cluster filtering

In [6]:
with session_scope() as session:
    # Draw the sample inside this cell so that it runs on a fresh kernel
    # (Restart & Run All) instead of relying on a `cluster_ids` variable left
    # over from an earlier session. random.sample raises ValueError if fewer
    # than SAMPLE_SIZE unfiltered clusters are available.
    unfiltered_cluster_ids = [
        cluster_id for (cluster_id,)
        in session.query(Cluster.id).filter(Cluster.filtered == False)  # noqa: E712 (SQL expression)
    ]
    cluster_ids = sample(unfiltered_cluster_ids, SAMPLE_SIZE)
    clusters = [session.query(Cluster).get(cluster_id) for cluster_id in cluster_ids]
    # One entry per cluster: ([(quote string, quote kept?), ...], cluster kept?)
    strings_kepts = []
    for c in clusters:
        # Filter each cluster exactly once and reuse the result for both the
        # per-quote flags and the cluster-level flag.
        fcluster = c.filter()
        if fcluster is not None:
            # Quote ids in the filtered cluster are shifted back by
            # filter_quote_offset() so they can be compared with the ids of
            # the original cluster's quotes.
            kept_quote_ids = {q.id - filter_quote_offset() for q in fcluster.quotes}
        else:
            kept_quote_ids = set()
        strings_kepts.append(([(q.string, q.id in kept_quote_ids) for q in c.quotes],
                              fcluster is not None))

In [7]:
# A cluster counts as rejected when its "kept" flag is False.
cluster_kept_flags = [kept for _, kept in strings_kepts]
rejected_cluster_count = SAMPLE_SIZE - np.sum(cluster_kept_flags)
cluster_summary = "Over a sample of {}, {} clusters are rejected by the cluster filter"
print(cluster_summary.format(SAMPLE_SIZE, rejected_cluster_count))

Over a sample of 100, 35 clusters are rejected by the cluster filter


Here are the individual cluster strings and their respective rejected/kept status.

In [8]:
quote_margin = ' ' * 5
for position, (cluster_strings, cluster_kept) in enumerate(strings_kepts, start=1):
    # Right-align "<position> / <sample size>" on an 80-column dashed rule.
    header = ' {} / {}'.format(position, SAMPLE_SIZE)
    print(header.rjust(80, '-'))
    if cluster_kept:
        print('Kept:', 'yes')
    else:
        print('Kept:', 'no')
    print()
    for quote_string, quote_kept in cluster_strings:
        wrapped = indent(fill(quote_string), quote_margin)
        if not quote_kept:
            # Overwrite the second column with an 'x' to flag a rejected quote.
            wrapped = wrapped[0] + 'x' + wrapped[2:]
        print(wrapped)
        print()

------------------------------------------------------------------------ 1 / 100
Kept: no

 x   it was actually the first time that i cried since the whole incident
     started

 x   the enormity of the situation

 x   i thought i was in the end zone

 x   i thought i was in the clear

 x   and it was the first time that i thought i was in the mafia

 x   just last night actually for the first time i saw some of the
     reenactments about how the airplane actually had to land on the water
     with the nose high and i think for the first time kind of the enormity
     of the situation really hit me and it was actually the first time that
     i cried since the whole incident started

------------------------------------------------------------------------ 2 / 100
Kept: yes

 x   everyone is entitled to an informed opinion

     everyone is entitled to an opinion and so is obama and his staff then
     again you know what they say about opinions

     you know what they say about it

