In [None]:
import csv
import random
from math import log
from pathlib import Path

import spacy
from spacy.util import minibatch, decaying, compounding

# --- Configuration ---
# When True, load_data() reads each row's 'title' column; otherwise its 'text' column.
use_titles = True
# When True, main() passes every text through clean_text() before training.
do_clean_text = True
# Name of the spaCy model that main() loads with spacy.load().
model = 'en_core_web_sm'
#model = 'en_core_web_lg'

# Module-level collectors that evaluate() appends (label, text) pairs to for
# false positives / false negatives; main() writes them to CSV after training.
fp_texts = []
fn_texts = []

## Load data

In [None]:
def load_data(path, use_title=None):
    """Read one labelled CSV file into parallel lists of texts and labels.

    A row whose ``labels`` cell holds several ``;``-separated labels yields one
    (text, label) pair per label, so the same text can appear several times.
    Blank label fragments (an empty cell, a trailing ``;``) are skipped so they
    never turn into an empty-string class.

    Args:
        path: ``pathlib.Path`` of a comma-delimited CSV file whose header row
            contains ``labels`` plus the ``title`` or ``text`` column in use.
        use_title: ``True`` to read the ``title`` column, ``False`` to read
            ``text``. ``None`` (the default) falls back to the module-level
            ``use_titles`` setting.

    Returns:
        ``(texts, true_labels)``: two lists of equal length.

    Raises:
        KeyError: if the selected text column or ``labels`` is not in the header.
    """
    if use_title is None:
        use_title = use_titles
    column = 'title' if use_title else 'text'

    texts = []
    true_labels = []
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain line breaks are parsed correctly.
    with path.open('r', newline='') as file_:
        for row in csv.DictReader(file_, delimiter=','):
            text = row[column]
            # A short row leaves the cell as None; treat it like an empty cell.
            for label in (row['labels'] or '').split(';'):
                label = label.strip()
                if not label:
                    continue
                texts.append(text)
                true_labels.append(label)
    return texts, true_labels


def format_data_for_spacy(texts, labels, all_labels):
    """Pair each text with a spaCy-style ``{'cats': {...}}`` annotation.

    For every gold label a score dictionary is built in which each entry of
    ``all_labels`` is 0.0 and the gold label itself is 1.0 (the gold label is
    added to the dictionary even when it is not part of ``all_labels``).

    Returns:
        A list of ``(text, {'cats': scores})`` tuples, as long as the shorter
        of ``texts`` and ``labels``.
    """
    def one_hot(gold):
        # every known label starts as a negative, then the gold one is flipped
        scores = dict.fromkeys(all_labels, 0.0)
        scores[gold] = 1.0
        return {'cats': scores}

    annotations = [one_hot(gold) for gold in labels]
    return list(zip(texts, annotations))

## Clean text

In [None]:
import re
from itertools import groupby
# Matches strings that begin with "na" or "nr" followed by one or more digits
# and then any number of lowercase letters; normalize_word() drops tokens
# whose lemma matches.
regex = re.compile(r"n[ar]\d+[a-z]*") # e.g: na18020,nr18030ml

def clean_text(nlp, text):
    """Return a normalized, space-joined version of ``text``.

    ``text`` is processed with ``nlp``; every token is mapped through
    ``normalize_word`` and tokens mapped to ``None`` are discarded. Runs of
    identical consecutive terms are then collapsed to one occurrence
    (e.g. ``DATE DATE`` becomes ``DATE``). Entity spans are not merged.
    """
    normalized = (normalize_word(token) for token in nlp(text))
    kept = (word for word in normalized if word is not None)
    # groupby produces one key per run of equal neighbours
    collapsed = [key for key, _run in groupby(kept)]
    return ' '.join(collapsed)

def normalize_word(t):
    """Map a token to its normalized term, or ``None`` when it should be dropped.

    Checks are applied in this order:
      1. a token tagged with one of the listed entity types is replaced by
         that entity type name;
      2. number-, email- and URL-like tokens become ``LIKE_NUM``,
         ``LIKE_EMAIL`` and ``LIKE_URL`` respectively;
      3. punctuation, and tokens whose lemma matches the module-level
         ``regex`` at its start, are dropped (``None``);
      4. anything else is returned as its lemma.
    """
    placeholder_entities = {
        'DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL',
    }
    if t.ent_type_ in placeholder_entities:
        return t.ent_type_

    # lexical flags, tested in priority order
    flag_placeholders = (
        ('like_num', 'LIKE_NUM'),
        ('like_email', 'LIKE_EMAIL'),
        ('like_url', 'LIKE_URL'),
    )
    for flag, placeholder in flag_placeholders:
        if getattr(t, flag):
            return placeholder

    if t.is_punct or regex.match(t.lemma_):
        return None
    return t.lemma_

## Train

In [None]:
def train(nlp, texts, labels, split=0.8, n_iter=10):
    """Train the pipeline's text classifier and print per-epoch scores.

    The examples are shuffled once and divided into a training part and an
    evaluation part. After every epoch the classifier is scored on the
    evaluation part with ``evaluate`` (using the optimizer's averaged
    parameters) and one row of a LOSS / P / R / F table is printed.

    Args:
        nlp: loaded spaCy pipeline; a 'textcat' pipe is appended when the
            pipeline does not have one yet.
        texts: list of input strings.
        labels: list of gold labels, parallel to ``texts``.
        split: fraction of the shuffled examples used for training; the
            remainder is used for evaluation.
        n_iter: number of passes over the training examples.

    Returns:
        None. Progress is reported through ``print``.
    """
    # add the text classifier to the pipeline if it doesn't exist;
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add labels
    label_set = set(labels)
    for label in label_set:
        textcat.add_label(label)

    # split train/eval (the split fraction is kept intact; the cut index gets
    # its own name)
    data = format_data_for_spacy(texts, labels, label_set)
    random.shuffle(data)
    n_train = int(len(data) * split)
    train_data = data[:n_train]
    eval_data = data[n_train:]
    dev_texts = [text for text, _ in eval_data]
    dev_cats = [annotation for _, annotation in eval_data]

    print("Using {} examples ({} training, {} evaluation)"
          .format(len(texts), len(train_data), len(eval_data)))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for _ in range(n_iter):
            losses = {}
            # present the examples in a fresh order every epoch
            random.shuffle(train_data)
            # batch up the examples using spaCy's minibatch
            for batch in minibatch(train_data, size=128):
                # distinct names: do not shadow the `texts` parameter
                batch_texts, batch_annotations = zip(*batch)
                nlp.update(batch_texts, batch_annotations, sgd=optimizer,
                           losses=losses, drop=0.3)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev examples split off above
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)

            # losses has no 'textcat' entry when there was nothing to train on
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses.get('textcat', 0.0), scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

def evaluate(tokenizer, textcat, texts, cats):
    """Score ``textcat`` on ``texts`` using a 0.5 decision threshold.

    Every (document, label) pair whose label appears in the gold annotation is
    counted as a true positive, false positive or false negative; labels
    missing from the gold annotation are ignored.

    Side effect: the module-level ``fp_texts`` and ``fn_texts`` lists are
    emptied and then refilled with the ``(label, text)`` pairs misclassified
    during THIS call, so they always describe the most recent evaluation
    instead of accumulating across epochs or notebook re-runs.

    Args:
        tokenizer: callable turning a text into a document for ``textcat``.
        textcat: object whose ``pipe(docs)`` yields documents carrying a
            ``cats`` mapping of label -> score.
        texts: indexable sequence of input strings.
        cats: sequence parallel to ``texts`` of ``{'cats': {label: score}}``
            gold annotations.

    Returns:
        dict with keys ``'textcat_p'``, ``'textcat_r'`` and ``'textcat_f'``
        (precision, recall and F-score).
    """
    # start from a clean slate so earlier evaluations do not leak into the lists
    fp_texts.clear()
    fn_texts.clear()

    docs = (tokenizer(text) for text in texts)
    # counters start at a tiny epsilon so the ratios below never divide by zero
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]['cats']
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            predicted = score >= 0.5
            expected = gold[label] >= 0.5
            if predicted and expected:
                tp += 1.
            elif predicted:
                fp += 1.
                fp_texts.append((label, texts[i]))
            elif expected:
                fn += 1.
                fn_texts.append((label, texts[i]))
            # true negatives are not needed for precision / recall / F
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

## Main

In [None]:
def _write_label_text_csv(csv_path, pairs):
    """Write ``(label, text)`` pairs to ``csv_path`` under a label/text header."""
    # newline='' is what the csv module expects for files it writes; without it
    # platforms that translate line endings end up with blank rows.
    with open(csv_path, 'w', newline='') as file_:
        writer = csv.writer(file_)
        writer.writerow(['label', 'text'])
        writer.writerows(pairs)


def main(data_dir = '../../../data/lapd.labeled'):
    """Run the whole experiment: load, optionally clean, train, export errors.

    Steps:
      1. load the spaCy model named by the module-level ``model``;
      2. read ``lapd_news_<year>.csv`` from ``data_dir`` for each listed year;
      3. when ``do_clean_text`` is set, normalize every text with
         ``clean_text`` (with the parser disabled);
      4. train the text classifier with ``train``;
      5. write the collected ``fp_texts`` / ``fn_texts`` pairs to
         ``../../../data/fp.csv`` and ``../../../data/fn.csv``.

    Args:
        data_dir: directory holding the per-year CSV files.
    """
    print("loading model '{}'...".format(model))
    nlp = spacy.load(model)
    print('model loaded.')

    print()
    print("loading data from '{}'...".format(data_dir))
    data_dir = Path(data_dir)
    texts = []
    labels = []
    for year in ['2010', '2011', '2012', '2013', '2014', '2015', '2016', '2018']:
        year_texts, year_labels = load_data(data_dir / 'lapd_news_{}.csv'.format(year))
        # extend in place rather than rebuilding the lists for every file
        texts.extend(year_texts)
        labels.extend(year_labels)

    if do_clean_text:
        print()
        print('cleaning texts...')
        with nlp.disable_pipes('parser'):
            clean_texts = [clean_text(nlp, text) for text in texts]
            print('clean texts completed.')
    else:
        clean_texts = texts

    print()
    train(nlp, clean_texts, labels)

    print()
    print('Done.')

    # export the misclassified examples collected during evaluation
    _write_label_text_csv('../../../data/fp.csv', fp_texts)  # false positives
    _write_label_text_csv('../../../data/fn.csv', fn_texts)  # false negatives

# Run the whole pipeline with the default data directory when this cell executes.
main()