In [1]:
import os
import csv
import random
from math import log
from pathlib import Path

import spacy
from spacy.util import minibatch, decaying, compounding

# When True, load_data() reads each row's 'title' column instead of its 'text' column.
use_titles = False
# When True, main() runs clean_text() over every text before training.
do_clean_text = False
# Name of the spaCy model that main() passes to spacy.load().
model = 'en_core_web_sm'

# Default number of training iterations used by train().
n_iter=30
# Dropout rate passed to nlp.update() inside train().
drop=0.3
# Minimum score at which evaluate() treats a label as predicted.
score_treshold=0.6

# Module-level accumulators: evaluate() appends (label, text) pairs for false
# positives / false negatives when asked to, and main() writes them to CSV.
fp_texts = []
fn_texts = []

## Load data

In [2]:
def load_data(path, text_column=None):
    """Read texts and their labels from a labelled CSV file.

    Args:
        path: a ``pathlib.Path`` pointing at a comma-delimited CSV file with a
            header row. It must contain a ``labels`` column and the column
            named by ``text_column``.
        text_column: name of the column holding the input text. When omitted,
            the module-level ``use_titles`` flag selects ``'title'`` or
            ``'text'`` (the original behaviour).

    Returns:
        A ``(texts, true_labels)`` tuple of equal-length lists. ``true_labels[i]``
        is the list of labels for ``texts[i]``; it is empty when the row has no
        label.
    """
    if text_column is None:
        text_column = 'title' if use_titles else 'text'

    texts = []
    true_labels = []
    # newline='' is what the csv module expects, so that quoted fields
    # containing line breaks are parsed correctly on every platform.
    with path.open('r', newline='') as file_:
        for row in csv.DictReader(file_, delimiter=','):
            texts.append(row[text_column])
            # A row may carry several labels separated by '|'.
            labels = [label.strip() for label in row['labels'].split('|')]
            # Drop blanks so an empty cell does not create a bogus '' label.
            true_labels.append([label for label in labels if label])

    return texts, true_labels


def format_data_for_spacy(texts, labels, all_labels):
    """Pair each text with a ``{'cats': {...}}`` annotation dict.

    For every entry of ``labels`` a category dict is built in which each label
    from ``all_labels`` maps to 0.0 and each label of that entry maps to 1.0.

    Returns:
        A list of ``(text, {'cats': cats})`` tuples, zipped in input order.
    """
    def as_annotation(positives):
        # Start with every known label switched off, then switch on the true ones.
        cats = dict.fromkeys(all_labels, 0.0)
        for name in positives:
            cats[name] = 1.0
        return {'cats': cats}

    annotations = [as_annotation(positives) for positives in labels]
    return list(zip(texts, annotations))

## Clean text

In [3]:
import re
from itertools import groupby
# Pattern used by normalize_word() to discard lemmas; it is applied with
# regex.match(), so it only has to match at the start of the lemma.
regex = re.compile(r"n[ar]\d+[a-z]*") # e.g: na18020,nr18030ml

def clean_text(nlp, text):
    """Return a normalized, space-joined version of ``text``.

    The text is run through ``nlp``; every token is mapped through
    normalize_word(), tokens mapped to None are dropped, and runs of identical
    consecutive words are collapsed to a single occurrence.
    """
    # normalize every token, keeping only those that were not filtered out
    normalized = (normalize_word(token) for token in nlp(text))
    kept = [word for word in normalized if word is not None]

    # collapse consecutive duplicates (e.g: DATE DATE... -> DATE)
    collapsed = [key for key, _run in groupby(kept)]

    return ' '.join(collapsed)

def normalize_word(t):
    """Map a token to its normalized form, or to None when it should be dropped.

    Checks are applied in this order:
      1. tokens whose entity type is one of the listed numeric/temporal types
         are replaced by that entity type name;
      2. number-like, email-like and URL-like tokens are replaced by the
         placeholders 'LIKE_NUM', 'LIKE_EMAIL' and 'LIKE_URL';
      3. punctuation is dropped (None);
      4. lemmas matched by the module-level ``regex`` are dropped (None);
      5. anything else is returned as its lemma.
    """
    placeholder_entities = ('DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL')
    entity = t.ent_type_
    if entity in placeholder_entities:
        return entity

    # Lexical flags, checked in priority order; the first one set wins.
    flag_placeholders = (
        ('like_num', 'LIKE_NUM'),
        ('like_email', 'LIKE_EMAIL'),
        ('like_url', 'LIKE_URL'),
    )
    for flag, placeholder in flag_placeholders:
        if getattr(t, flag):
            return placeholder

    if t.is_punct:
        return None

    lemma = t.lemma_
    return None if regex.match(lemma) else lemma

## Train

In [4]:
def train(nlp, texts, labels, split=0.8, n_iter=n_iter):
    """Train the pipeline's 'textcat' component and print per-iteration scores.

    Args:
        nlp: a loaded spaCy pipeline; a 'textcat' pipe is added if missing.
        texts: list of input texts.
        labels: list (parallel to ``texts``) of label lists.
        split: fraction of the shuffled data used for training; the rest is
            used for evaluation.
        n_iter: number of passes over the training data.

    Prints one line per iteration with the textcat loss and the precision,
    recall and F-score returned by evaluate(). On the last iteration evaluate()
    is asked to record the false positives / false negatives.
    """
    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # Register labels in sorted order: iterating a bare set of strings gives an
    # order that can differ from one interpreter run to the next.
    all_labels = sorted({label for row_labels in labels for label in row_labels})
    for label in all_labels:
        textcat.add_label(label)

    # split train/eval
    data = format_data_for_spacy(texts, labels, all_labels)
    random.shuffle(data)
    n_train = int(len(data) * split)
    train_data = data[:n_train]
    eval_data = data[n_train:]
    dev_texts = [text for text, _ in eval_data]
    dev_cats = [annotation for _, annotation in eval_data]

    print("Using {} examples ({} training, {} evaluation)"
          .format(len(data), len(train_data), len(eval_data)))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # present the examples in a new order on every pass
            random.shuffle(train_data)
            # batch up the examples using spaCy's minibatch
            for batch in minibatch(train_data, size=128):
                batch_texts, batch_annotations = zip(*batch)
                nlp.update(batch_texts, batch_annotations, sgd=optimizer,
                           losses=losses, drop=drop)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the held-out split made above; only the final
                # iteration records the misclassified texts
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats, i + 1 == n_iter)

            # losses has no 'textcat' entry when there were no training batches
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses.get('textcat', 0.0), scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

def evaluate(tokenizer, textcat, texts, cats, do_fp_fn):
    """Score ``textcat`` predictions for ``texts`` against the gold ``cats``.

    Each text is tokenized with ``tokenizer`` and piped through ``textcat``.
    For every predicted label that also appears in the gold annotation, the
    score is compared with the module-level ``score_treshold`` and the gold
    value with 0.5 to classify the pair as a true/false positive/negative.

    When ``do_fp_fn`` is true, ``(label, text)`` pairs for false positives and
    false negatives are appended to the module-level ``fp_texts`` / ``fn_texts``.

    Returns:
        dict with keys 'textcat_p', 'textcat_r' and 'textcat_f'.
    """
    # Counters start at a tiny non-zero value so the divisions below are
    # always defined, even when nothing was counted.
    counts = {'tp': 1e-8, 'fp': 1e-8, 'fn': 1e-8, 'tn': 1e-8}

    tokenized = (tokenizer(text) for text in texts)
    for idx, doc in enumerate(textcat.pipe(tokenized)):
        expected = cats[idx]['cats']
        for label, score in doc.cats.items():
            if label not in expected:
                continue
            truth = expected[label]
            if score >= score_treshold:
                # predicted positive
                if truth >= 0.5:
                    counts['tp'] += 1.
                elif truth < 0.5:
                    counts['fp'] += 1.
                    if do_fp_fn:
                        fp_texts.append((label, texts[idx]))
            elif score < score_treshold:
                # predicted negative
                if truth < 0.5:
                    counts['tn'] += 1
                elif truth >= 0.5:
                    counts['fn'] += 1
                    if do_fp_fn:
                        fn_texts.append((label, texts[idx]))

    tp, fp, fn = counts['tp'], counts['fp'], counts['fn']
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

## Main

In [5]:
def _write_label_text_csv(csv_path, pairs):
    """Write de-duplicated (label, text) pairs to ``csv_path`` under a header row."""
    # newline='' is what the csv module expects for files handed to csv.writer;
    # without it extra blank lines can appear on some platforms.
    with open(csv_path, 'w', newline='') as _file:
        writer = csv.writer(_file)
        writer.writerow(['label', 'text'])
        # sorted() makes the row order reproducible; set() order is not.
        writer.writerows(sorted(set(pairs)))


def main(data_dir = '../../../data/lapd.labeled', years=None):
    """Load the labelled CSV files, train the classifier and dump its mistakes.

    Args:
        data_dir: directory containing files named ``lapd_news_<year>.csv``.
        years: iterable of years to load; defaults to 2010 through 2018.
            Years whose file does not exist are skipped.

    Side effects:
        Prints progress, trains via train(), and writes the false positives and
        false negatives recorded during the final evaluation to
        '../../../data/fp.csv' and '../../../data/fn.csv'.

    Raises:
        FileNotFoundError: if none of the requested year files yields any row.
    """
    if years is None:
        years = [str(year) for year in range(2010, 2019)]

    # Start from empty accumulators so re-running this cell does not mix in
    # misclassified texts from a previous run.
    fp_texts.clear()
    fn_texts.clear()

    print("loading model '{}'...".format(model))
    nlp = spacy.load(model)

    print()
    print("loading data from '{}'...".format(data_dir))
    data_dir = Path(data_dir)
    texts = []
    labels = []
    for year in years:
        csv_path = data_dir / 'lapd_news_{}.csv'.format(year)
        if csv_path.exists():
            year_texts, year_labels = load_data(csv_path)
            texts.extend(year_texts)
            labels.extend(year_labels)

    if not texts:
        raise FileNotFoundError(
            "no labelled rows found in '{}' for years {}".format(data_dir, list(years)))

    if do_clean_text:
        print()
        print('cleaning texts...')
        with nlp.disable_pipes('parser'):
            clean_texts = [clean_text(nlp, text) for text in texts]
            print('clean texts completed.')
    else:
        clean_texts = texts

    print()
    train(nlp, clean_texts, labels)

    print()
    print('Done.')

    # FP TEXTS
    _write_label_text_csv('../../../data/fp.csv', fp_texts)

    # FN TEXTS
    _write_label_text_csv('../../../data/fn.csv', fn_texts)

# Run the whole pipeline with the default data directory: load, (optionally) clean, train, dump FP/FN.
main()

loading model 'en_core_web_sm'...

loading data from '../../../data/lapd.labeled'...

Using 4879 examples (3903 training, 976 evaluation)
Training the model...
LOSS 	  P  	  R  	  F  
53.666	0.845	0.433	0.573
18.561	0.862	0.430	0.574
16.091	0.873	0.462	0.604
14.715	0.866	0.479	0.617
14.119	0.859	0.509	0.639
12.518	0.858	0.518	0.646
11.197	0.852	0.546	0.665
10.585	0.849	0.558	0.674
10.097	0.850	0.567	0.680
10.021	0.851	0.570	0.682
9.685	0.845	0.567	0.679
9.655	0.841	0.575	0.683
9.561	0.843	0.569	0.679
9.571	0.839	0.573	0.681
9.782	0.841	0.564	0.675
10.129	0.848	0.562	0.676
10.139	0.845	0.567	0.679
11.125	0.840	0.566	0.676
10.239	0.840	0.569	0.678
9.237	0.841	0.575	0.683
8.586	0.838	0.578	0.684
7.950	0.833	0.579	0.683
7.712	0.830	0.582	0.684
7.474	0.830	0.587	0.688
7.528	0.829	0.597	0.694
7.680	0.832	0.599	0.697
7.274	0.833	0.599	0.697
7.172	0.834	0.603	0.700
7.089	0.832	0.602	0.699
7.128	0.829	0.606	0.700

Done.
