In [1]:
import csv
import random
from math import log

from pathlib import Path
import spacy
from spacy.util import minibatch, decaying, compounding
import spacy.about

In [2]:
def read_data(path):
    """Load texts and their labels from a labelled CSV file.

    The file must start with a header row naming at least the ``title`` and
    ``labels`` columns. The ``labels`` cell holds one or more comma-separated
    labels; a row carrying several labels is emitted once per label, so the
    two returned lists always have the same length and stay index-aligned.

    Args:
        path: ``pathlib.Path`` (or any object with a compatible ``open``)
            pointing at the CSV file.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists of strings.
        Labels are stripped of surrounding whitespace.

    Raises:
        KeyError: if the header has no ``title`` or ``labels`` column.
    """
    texts = []
    labels = []
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with path.open('r', newline='') as file_:
        for row in csv.DictReader(file_, delimiter=','):
            # The title is the input text (an earlier variant of this
            # notebook read row['text'] instead).
            text = row['title']
            # There may be multiple labels per row: one sample per label.
            for label in row['labels'].split(','):
                texts.append(text)
                labels.append(label.strip())
    return texts, labels


def format_data_for_spacy(texts, labels, all_labels):
    """Pair each text with a one-hot ``{'cats': {...}}`` annotation dict.

    Args:
        texts: sequence of input texts.
        labels: sequence of gold labels, index-aligned with ``texts``.
        all_labels: iterable of every known label; each one becomes a key of
            the ``cats`` dict with score 0.0 unless it is the gold label.

    Returns:
        A list of ``(text, {'cats': {label: score}})`` tuples. The gold label
        scores 1.0; a gold label absent from ``all_labels`` is added to the
        dict with score 1.0.
    """
    def one_hot(gold):
        # Fresh dict per sample so annotations never share state.
        scores = dict.fromkeys(all_labels, 0.0)
        scores[gold] = 1.
        return {'cats': scores}

    annotations = [one_hot(gold) for gold in labels]
    return list(zip(texts, annotations))

In [3]:
import re
from itertools import groupby
# Pattern: "n", then "a" or "r", then one or more digits, then any number of
# lowercase letters. No flags are set, so it is case-sensitive. It is applied
# with .match() in normalize_word, i.e. it only has to match at the start of
# the lemma.
regex = re.compile(r"n[ar]\d+[a-z]*") # e.g: na18020,nr18030ml

def normalize_word(t):
    """Map a spaCy token to a normalised term, or ``None`` to drop it.

    Checks are applied in order; the first one that matches wins:

    1. Tokens inside a DATE, TIME, PERCENT, MONEY, QUANTITY, ORDINAL or
       CARDINAL entity are replaced by the entity type name.
    2. Number-, e-mail- and URL-like tokens become the placeholders
       ``LIKE_NUM``, ``LIKE_EMAIL`` and ``LIKE_URL``.
    3. Punctuation and whitespace tokens are dropped.
    4. Tokens whose lemma matches the module-level ``regex`` at its start are
       dropped.
    5. Anything else is returned as its lemma. Stop words and short lemmas
       are not filtered.

    Args:
        t: a token exposing ``ent_type_``, ``like_num``, ``like_email``,
            ``like_url``, ``is_punct``, ``is_space`` and ``lemma_``.

    Returns:
        The normalised term as a string, or ``None`` if the token should be
        discarded.
    """
    if t.ent_type_ in ('DATE', 'TIME', 'PERCENT', 'MONEY', 'QUANTITY', 'ORDINAL', 'CARDINAL'):
        return t.ent_type_
    if t.like_num:
        return 'LIKE_NUM'
    if t.like_email:
        return 'LIKE_EMAIL'
    if t.like_url:
        return 'LIKE_URL'
    # Whitespace tokens carry no content; letting them through leaves runs of
    # blanks in the cleaned text, so they are dropped alongside punctuation.
    if t.is_punct or t.is_space:
        return None
    if regex.match(t.lemma_):
        return None

    return t.lemma_

def clean_text(nlp, text):
    """Run ``text`` through ``nlp`` and return a normalised, space-joined string.

    Each token is passed to ``normalize_word``; tokens it maps to ``None`` are
    discarded. Runs of identical consecutive terms are then collapsed to a
    single occurrence (e.g. ``DATE DATE`` -> ``DATE``).

    Args:
        nlp: callable that turns a string into an iterable of tokens.
        text: the raw text to clean.

    Returns:
        The surviving terms joined by single spaces.
    """
    normalized = (normalize_word(token) for token in nlp(text))
    kept = [term for term in normalized if term is not None]

    # groupby yields one (key, group) pair per run of equal neighbours;
    # keeping only the key removes consecutive duplicates.
    collapsed = [term for term, _run in groupby(kept)]

    return ' '.join(collapsed)

In [4]:
def main(data_dir = '../../../data/lapd.labeled', n_preview = 5):
    """Load the labelled news CSV, clean every text and print a short preview.

    Args:
        data_dir: directory that contains ``lapd_news_2018.csv``.
        n_preview: maximum number of samples to print; fewer are printed when
            the data set is smaller. Negative values are treated as 0.

    For each previewed sample the label, the raw text and the cleaned text
    are printed, followed by a blank line.
    """
    data_dir = Path(data_dir)
    texts, labels = read_data(data_dir / 'lapd_news_2018.csv')
    nlp = spacy.load('en')
    clean_texts = [clean_text(nlp, text) for text in texts]

    # Iterate over slices rather than indexing range(5), so a data set with
    # fewer samples than requested does not raise IndexError.
    n_preview = max(0, n_preview)
    preview = zip(labels[:n_preview], texts[:n_preview], clean_texts[:n_preview])
    for label, text, cleaned in preview:
        print(label)
        print(text)
        print(cleaned)
        print()
main()

MURDER
City Wide Annual Homicide Report - 2017    NA18022rh
city wide annual homicide report LIKE_NUM    

NONE
Operations-Valley Bureau Human Trafficking Task Force Enforcement Update    NA18020dm
operations valley bureau human traffic task force enforcement update     dm

THEFT
Grand Theft by Trickery      NR18030ml  
grand theft by trickery        

MURDER
Attempt Murder on a Police Officer Suspect arrested in Southwest Division NR18026bm
attempt murder on a police officer suspect arrest in southwest division

NONE
LAPD & “Go Be” Welcome Home Kit Donation NA18016dm
lapd go be welcome home kit donation dm

