In [1]:
import csv
import random
from math import log

from pathlib import Path
import spacy
from spacy.util import minibatch, decaying, compounding
import spacy.about

In [8]:
def load_data(path):
    """Read a labelled CSV file into parallel lists of texts and labels.

    The file must have a header row with at least the columns ``text`` and
    ``labels``.  The ``labels`` cell may hold several labels separated by
    ``;``; the row's text is emitted once per label, so a row with two labels
    contributes two (text, label) pairs.

    Args:
        path: a ``pathlib.Path``-like object exposing ``open``.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists.

    Raises:
        KeyError: if a row lacks the ``text`` or ``labels`` column.
    """
    texts = []
    labels = []
    # newline='' hands line-ending handling to the csv module; without it,
    # line breaks embedded in quoted fields are rewritten before csv sees them.
    with path.open('r', newline='') as file_:
        for row in csv.DictReader(file_, delimiter=','):
            text = row['text']
            # there may be multiple labels per row
            for raw_label in row['labels'].split(';'):
                label = raw_label.strip()
                if not label:
                    # a trailing or doubled ';' yields an empty piece; it is
                    # not a real label and must not become a category
                    continue
                texts.append(text)
                labels.append(label)
    return texts, labels


def format_data_for_spacy(texts, labels, all_labels):
    """Pair every text with a one-hot ``{'cats': {...}}`` annotation dict.

    For each entry of ``labels`` a score dict is built that maps every member
    of ``all_labels`` to 0.0 and the entry itself to 1.0 (the entry is added
    to the dict even when it is not part of ``all_labels``).

    Returns:
        A list of ``(text, {'cats': scores})`` tuples, as long as the shorter
        of ``texts`` and ``labels``.
    """
    annotations = [
        {'cats': {**dict.fromkeys(all_labels, 0.0), gold: 1.}}
        for gold in labels
    ]
    return list(zip(texts, annotations))

In [3]:
import re
from itertools import groupby
regex = re.compile(r"n[ar]\d+[a-z]*") # e.g: na18020,nr18030ml

def normalize_word(t):
    """Map a token to its normalized string, or ``None`` to drop it.

    Checks run in this order and the first hit wins:

    1. entity type is one of the date/time/numeric types -> the entity type
    2. ``like_num`` / ``like_email`` / ``like_url`` -> a ``LIKE_*`` placeholder
    3. punctuation -> ``None``
    4. lemma begins with the ``regex`` code pattern -> ``None``
    5. otherwise -> the lemma

    Stop words and short lemmas are not filtered out.
    """
    entity = t.ent_type_
    if entity in ('DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'):
        return entity

    # token flag -> placeholder emitted in place of the token
    placeholders = (
        ('like_num', 'LIKE_NUM'),
        ('like_email', 'LIKE_EMAIL'),
        ('like_url', 'LIKE_URL'),
    )
    for flag, placeholder in placeholders:
        if getattr(t, flag):
            return placeholder

    # regex.match anchors at the start of the lemma only
    discard = t.is_punct or regex.match(t.lemma_) is not None
    return None if discard else t.lemma_

def clean_text(nlp, text):
    """Pass-through cleaner: hand ``text`` back unchanged.

    ``nlp`` is accepted but never used, so this function takes the same
    arguments as ``clean_text2``.
    """
    return text

def clean_text2(nlp, text):
    """Return ``text`` rewritten as a space-joined string of normalized tokens.

    The text is parsed with ``nlp``; each token goes through
    ``normalize_word`` and tokens mapped to ``None`` are dropped.  Runs of
    identical consecutive terms are then collapsed to a single occurrence
    (e.g. ``DATE DATE`` -> ``DATE``).  Entities are not merged beforehand.
    """
    normalized = (normalize_word(token) for token in nlp(text))
    kept = [term for term in normalized if term is not None]

    # groupby without a key function clusters equal neighbours, so taking
    # each group's key keeps one term per run
    collapsed = [term for term, _run in groupby(kept)]

    return ' '.join(collapsed)

In [4]:
def evaluate_model(nlp, eval_data, eps=1e-15):
    """Score a text categorizer on held-out data.

    Each item of ``eval_data`` is a ``(text, gold)`` pair where
    ``gold['cats']`` maps labels to truthy/falsy values.  The prediction for
    a document is its highest-scoring category (ties go to the label that
    sorts last).  The "truth" used for the loss is the first truthy label in
    ``gold['cats']``.

    Args:
        nlp: pipeline object exposing ``pipe(data, as_tuples=True)`` that
            yields ``(doc, gold)`` pairs, ``doc.cats`` being a label->score
            dict.
        eval_data: iterable of ``(text, gold)`` pairs.
        eps: lower bound applied to the true label's score before taking the
            logarithm, so that a score of exactly 0.0 gives a large finite
            penalty instead of a ``math domain error``.

    Returns:
        ``(accuracy, loss)`` where ``loss`` is the mean negative log of the
        score assigned to the true label.

    Raises:
        ZeroDivisionError: if ``eval_data`` is empty.
        IndexError: if a gold annotation has no truthy label.

    Side effects:
        Prints the right/wrong counts.
    """
    right = 0.
    wrong = 0.
    loss = 0.
    for doc, gold in nlp.pipe(eval_data, as_tuples=True):
        score, guess = max((score, label) for label, score in doc.cats.items())
        if gold['cats'].get(guess):
            right += 1
        else:
            wrong += 1
        truth = [a for a, true in gold['cats'].items() if true][0]
        # clamp: log(0.0) is undefined and would abort the whole evaluation
        loss += log(max(doc.cats[truth], eps))
    loss /= (right + wrong)
    print(right, wrong)
    return right / (right + wrong), -loss

In [5]:
def augment_data(train_data):
    """Augmentation hook: returns ``train_data`` itself, unmodified.

    No copy is made and no augmentation is applied.
    """
    return train_data

def train(nlp, texts, labels, models_dir=None, use_default_model=True,
          n_iter=10, batch_size=128, drop=0.3, eval_size=1000):
    """Add a ``textcat`` pipe to ``nlp``, train it and keep the best epoch.

    The examples are shuffled, the first ``eval_size`` are held out for
    evaluation and the rest are used for training.  After every epoch the
    model is evaluated with the optimizer's averaged parameters; the
    serialized pipeline with the lowest evaluation loss is restored at the
    end.

    Args:
        nlp: spaCy pipeline; it is modified in place (a ``textcat`` pipe is
            added and trained).
        texts: list of input texts.
        labels: list of labels, parallel to ``texts``.
        models_dir: accepted but not used by this function.
        use_default_model: when false, the pipe's model is replaced by one
            from ``build_text_classifier``.
        n_iter: number of passes over the training data.
        batch_size: number of examples per update.
        drop: dropout rate passed to ``nlp.update``.
        eval_size: number of shuffled examples held out for evaluation.
            If the dataset has no more than ``eval_size`` examples, nothing
            is left to train on.

    Returns:
        ``nlp``, loaded with the best-scoring weights.
    """
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat)
    label_set = set(labels)
    for label in label_set:
        textcat.add_label(label)
    examples = format_data_for_spacy(texts, labels, label_set)
    random.shuffle(examples)
    eval_data = examples[:eval_size]
    train_data = examples[len(eval_data):]
    optimizer = nlp.begin_training()
    if not use_default_model:
        # This name was never imported, so the branch raised NameError.
        # NOTE(review): assumes spaCy 2.x, where spacy._ml provides
        # build_text_classifier - confirm against the installed version.
        from spacy._ml import build_text_classifier
        # NOTE(review): the class count is hard-coded to 3 rather than taken
        # from label_set - confirm this is intended.
        textcat.model = build_text_classifier(3, width=64, pretrained_dims=300)
    best = None
    for _ in range(n_iter):
        losses = {}
        augmented = augment_data(train_data)
        for j, batch in enumerate(minibatch(augmented, size=batch_size)):
            # distinct names: do not shadow the ``texts`` parameter
            batch_texts, batch_annots = zip(*batch)
            nlp.update(batch_texts, batch_annots, sgd=optimizer, losses=losses, drop=drop)
            if j % 10 == 0: # Pretty basic progress reporting
                if j:
                    print(j, len(batch), 'loss=', losses['textcat'])
                # restart the accumulator so each report covers ~10 batches
                losses = {}
        with nlp.use_params(optimizer.averages):
            acc, loss = evaluate_model(nlp, eval_data)
            if best is None or loss < best[0]:
                # snapshot while the averaged parameters are active
                best = (loss, acc, nlp.to_bytes())
                print('Dev acc', loss, acc, '(new best)')
            else:
                print('Dev acc', loss, acc, '(best: %.2f)' % best[0])
    if best is not None:
        nlp.from_bytes(best[-1]) # Load our best weights back in
    return nlp



In [10]:
def main(data_dir='../../../data/lapd.labeled',
         years=('2018', '2017', '2015', '2014', '2013')):
    """Load the model and the labelled CSV files, then train a classifier.

    Args:
        data_dir: directory holding one ``lapd_news_<year>.csv`` file per
            requested year.
        years: year strings used to build the file names; the files are read
            in the given order.

    Progress is reported with ``print``; nothing is returned.
    """
    print('loading model...')
    nlp = spacy.load('en_core_web_sm')
    print('model loaded')

    print()
    print("loading data from '{}'...".format(data_dir))
    data_path = Path(data_dir)
    texts = []
    labels = []
    for year in years:
        year_texts, year_labels = load_data(data_path / 'lapd_news_{}.csv'.format(year))
        # extend in place instead of rebuilding the lists on every file
        texts.extend(year_texts)
        labels.extend(year_labels)

    print('cleaning texts...')
    clean_texts = [clean_text(nlp, text) for text in texts]
    print('clean texts completed.')
    print('training...')
    train(nlp, clean_texts, labels)

    print('Done.')

main()

loading model...
model loaded

loading data from '../../../data/lapd.labeled'...
cleaning texts...
clean texts completed.
training...
10 128 loss= 20.61522912979126
660.0 340.0
Dev acc 2.121617211163293 0.66 (new best)
10 128 loss= 6.149322986602783


KeyboardInterrupt: 