In [1]:
import _common

In [2]:
import erica
from erica.core import *
from erica.model import *

In [3]:
import random
import math
import time

from sqlalchemy import distinct, and_, desc
from sqlalchemy.sql import func
import janome.tokenizer
import gensim
from gensim.corpora import Dictionary

unable to import 'smart_open.gcs', disabling that module


In [4]:
class Constant:
    """Notebook-wide constants: the tag vocabulary and the tag file location."""

    # Tag names used to bucket annotated words (see `separate`).
    irex_tags = [
        "location",
        "organization",
        "person",
        "date",
        "artifact",
        "time",
        "money",
        "percent",
    ]

    # "<corpus.root>/<corpus.tags>", both read from the project Config.
    tags_path = "/".join((Config.get("corpus.root"), Config.get("corpus.tags")))

In [5]:
class Vector:
    """Sparse numeric vector backed by a ``{key: value}`` dict.

    Keys that are absent from ``data`` are treated as having the value 0 by
    every operation in this class.
    """

    def __init__(self, tuples = None, dictionary = None):
        """Build a vector from ``(key, value)`` pairs or from an existing dict.

        Args:
            tuples: optional iterable of ``(key, value)`` pairs. Later pairs
                overwrite earlier ones with the same key.
            dictionary: optional dict used directly as the backing store.
                It takes precedence over ``tuples`` and is NOT copied, so the
                vector shares it with the caller.
        """
        if dictionary is not None:
            self.data = dictionary
            return

        self.data = {}

        if tuples is None:
            return

        for k, v in tuples:
            self.data[k] = v

    def __add__(self, other):
        """Return the element-wise sum of two vectors as a new Vector."""
        data = {}
        for k in self.data.keys() | other.data.keys():
            data[k] = self.data.get(k, 0) + other.data.get(k, 0)

        return Vector(None, dictionary = data)

    def __truediv__(self, scalar):
        """Return a new Vector with every stored value divided by ``scalar``."""
        return Vector(None, dictionary = {k: v / scalar for k, v in self.data.items()})

    def norm(self):
        """Return the Euclidean (L2) norm of the vector."""
        return math.sqrt(sum(v * v for v in self.data.values()))

    def normalized(self):
        """Return a new Vector scaled to unit length.

        A zero vector has no direction, so it is returned as an unscaled copy
        instead of raising ZeroDivisionError. This matches what already
        happened for a vector with no entries at all.
        """
        norm = self.norm()
        if norm == 0:
            return Vector(None, dictionary = dict(self.data))

        return Vector(None, dictionary = {k: v / norm for k, v in self.data.items()})

    @classmethod
    def distance(cls, v1, v2):
        """Return the Euclidean distance between ``v1`` and ``v2``."""
        keys = v1.data.keys() | v2.data.keys()
        return math.sqrt(sum((v1.data.get(k, 0) - v2.data.get(k, 0)) ** 2 for k in keys))

    @classmethod
    def center(cls, vs):
        """Return the arithmetic mean of the vectors in ``vs``.

        Args:
            vs: any iterable of vectors (a list or a generator). It is
                consumed in a single pass.

        Returns:
            A new Vector; an empty Vector when ``vs`` yields nothing.
        """
        # Accumulate into one dict instead of chaining __add__, which would
        # allocate a fresh dict and two key sets for every element.
        sums = {}
        count = 0
        for v in vs:
            count += 1
            for k, value in v.data.items():
                sums[k] = sums.get(k, 0) + value

        if count == 0:
            return Vector()

        return Vector(None, dictionary = {k: total / count for k, total in sums.items()})

In [6]:
# Single janome tokenizer instance, created once here and reused by
# document_to_bow for every document.
global_tokenizer = janome.tokenizer.Tokenizer()

In [7]:
def separate(records):
    """Bucket record words by the one tag from ``Constant.irex_tags`` they carry.

    Args:
        records: iterable of items indexable by ``"tags"`` and ``"word"``.
            A tag counts as matching when ``tag in record["tags"]`` is true.
            NOTE(review): that is list membership if ``tags`` is a list but a
            substring test if it is a string; confirm the YAML schema.

    Returns:
        Dict mapping every tag in ``Constant.irex_tags`` to the list of words
        whose record matched exactly that one tag. Records matching zero or
        several tags are left out.
    """
    known_tags = Constant.irex_tags

    grouped = {}
    for name in known_tags:
        grouped[name] = []

    for item in records:
        matched = [name for name in known_tags if name in item["tags"]]

        # Only unambiguous records (exactly one known tag) are kept.
        if len(matched) != 1:
            continue

        grouped[matched[0]].append(item["word"])

    return grouped

In [8]:
def sample_entries(n):
    """Return a random sample of up to ``n`` ``(Entry, PlainText)`` rows.

    Candidate ids are drawn uniformly from ``1..max(Entry.id)`` and then
    looked up. The result can hold fewer than ``n`` rows: ids that do not
    exist, and entries with no matching PlainText row, are dropped by the
    inner join.

    Args:
        n: number of candidate ids to draw. Values larger than the id range
            are clamped to it.

    Returns:
        List of ``(Entry, PlainText)`` tuples; empty when the Entry table is
        empty.
    """
    max_id = Session.query(func.max(Entry.id).label("max_id")).one().max_id

    # MAX() over an empty table yields NULL/None; nothing to sample from.
    if max_id is None:
        return []

    # random.sample raises ValueError when asked for more than the population.
    sample_size = min(n, max_id)
    sample_id_list = random.sample(range(1, max_id + 1), sample_size)

    return Session.query(Entry, PlainText)\
        .join(PlainText, Entry.id == PlainText.entry_id)\
        .filter(Entry.id.in_(sample_id_list))\
        .all()

In [9]:
def document_to_bow(dictionary, title, text):
    """Tokenize ``text`` and convert it to a bag-of-words via ``dictionary``.

    The text is split on ``"\\n"`` and each line is tokenized with the shared
    ``global_tokenizer`` in wakati mode. The resulting token list is first
    registered with ``dictionary.add_documents`` (so the dictionary is
    mutated), then passed to ``dictionary.doc2bow``.

    Args:
        dictionary: object providing ``add_documents`` and ``doc2bow``
            (a gensim ``Dictionary`` in this notebook).
        title: unused by this function; kept for the callers' signature.
        text: document body as one string.

    Returns:
        Whatever ``dictionary.doc2bow`` returns for the token list.
    """
    tokens = []
    for line in text.split("\n"):
        tokens.extend(global_tokenizer.tokenize(line, wakati = True))

    dictionary.add_documents([tokens])
    return dictionary.doc2bow(tokens)

In [10]:
def initialize_clusters(dictionary, annotated_clusters, *, debug = False):
    """Build the seed clusters from already-tagged entity titles.

    For every tag, the entries whose title is among that tag's entities are
    fetched together with their plain text, each one is turned into a
    normalized bag-of-words Vector, and the mean of those vectors becomes the
    cluster center. Tags with no matching entry get no cluster.

    Args:
        dictionary: vocabulary passed on to ``document_to_bow`` (and extended
            by it).
        annotated_clusters: mapping of tag -> collection of entity titles.
        debug: when true, print each matched title and its text length.

    Returns:
        ``(clusters, document_map)`` where ``clusters[tag]`` is a dict with
        keys ``"entities"`` (list of titles) and ``"center"`` (Vector), and
        ``document_map`` maps title -> normalized Vector.
    """
    seeded = {}
    vectors_by_title = {}

    for label, names in annotated_clusters.items():
        query = Session.query(Entry, PlainText)
        query = query.join(PlainText, Entry.id == PlainText.entry_id)
        rows = query.filter(Entry.title.in_(names)).all()

        # No entry found for this tag: leave it out of the result.
        if not rows:
            continue

        for article, body in rows:
            if debug:
                print(article.title, len(body.text))
            bow = document_to_bow(dictionary, article.title, body.text)
            vectors_by_title[article.title] = Vector(bow).normalized()

        titles = [article.title for article, _ in rows]
        seeded[label] = {
            "entities": titles,
            "center": Vector.center([vectors_by_title[name] for name in titles]),
        }

    return seeded, vectors_by_title

In [11]:
def expand_clusters(dictionary, document_map, clusters, records, *, debug = False):
    """Assign each record to its nearest cluster and update that cluster.

    Every ``(entry, plain_text)`` record is vectorized, attached to the
    cluster whose center is closest (Euclidean distance; the first cluster
    wins on ties), and that cluster's center is recomputed from all of its
    members. ``document_map`` and ``clusters`` are modified in place.

    Entries whose title already belongs to a cluster are skipped. Appending
    them again would list the title twice and double its weight in the
    center.

    Args:
        dictionary: vocabulary passed on to ``document_to_bow``.
        document_map: mapping of title -> normalized Vector; extended here.
        clusters: mapping of tag -> ``{"entities": [...], "center": Vector}``.
        records: iterable of ``(entry, plain_text)`` pairs exposing
            ``entry.title`` and ``plain_text.text``.
        debug: when true, print each assignment and the largest component of
            the updated center.

    Raises:
        ValueError: if there is a record to assign but ``clusters`` is empty.
    """
    # Titles that are already members of some cluster.
    assigned = {title for cluster in clusters.values() for title in cluster["entities"]}

    for entry, plain_text in records:
        # Checked inside the loop so that an empty `records` stays a no-op.
        if not clusters:
            raise ValueError("expand_clusters requires at least one initial cluster")

        entity = entry.title
        if entity in assigned:
            continue

        word_vector = Vector(document_to_bow(dictionary, entity, plain_text.text)).normalized()
        document_map[entity] = word_vector

        # min() returns the first minimal tag, matching a strict '<' scan.
        min_tag = min(clusters, key = lambda tag: Vector.distance(clusters[tag]["center"], word_vector))

        members = clusters[min_tag]["entities"]
        members.append(entity)
        assigned.add(entity)
        clusters[min_tag]["center"] = Vector.center([document_map[member] for member in members])
        if debug:
            print(entity, min_tag, max(clusters[min_tag]["center"].data.values()))

In [12]:
# Load the tag records from the YAML file at Constant.tags_path and group the
# words by tag; these groups are the seeds passed to initialize_clusters.
pre_clusters = separate(File.load_yaml(Constant.tags_path))

In [13]:
# Empty gensim Dictionary; document_to_bow adds each document's tokens to it
# as documents are vectorized.
dictionary = Dictionary()

In [14]:
# Build the seed clusters and the title -> vector map from the tagged entities.
clusters, document_map = initialize_clusters(dictionary, pre_clusters)

In [15]:
# Draw entries for 300 randomly chosen ids (fewer rows may come back).
# NOTE(review): no random seed is set in the visible cells, so the sample
# presumably differs between runs - confirm _common does not seed it.
records = sample_entries(300)

In [16]:
# Assign every sampled entry to its nearest cluster; this mutates `clusters`
# and `document_map` in place, so re-running the cell is not idempotent.
expand_clusters(dictionary, document_map, clusters, records)

In [None]:
# Display the resulting clusters as the cell output.
clusters