In [1]:
from date_selection import get_date_graph, pagerank, get_dates_perso
from textrank import summarize

In [2]:
import os
import codecs

import _pickle as cPickle
from datetime import datetime, timedelta
import random
import math
import numpy
import timeit

from tilse.data import timelines
from tilse.evaluation import rouge
from joblib import Parallel, delayed
from gensim.summarization.textcleaner import clean_text_by_word as _clean_text_by_word
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

In [3]:
# Root directory of the crisis dataset from http://l3s.de/~gtran/timeline/
# Placeholder: set it before running. `get_groundtruth` reads the reference
# timelines from `{DATA_PATH}/<topic>/public/timelines`, and the topics
# themselves are discovered with `os.listdir(DATA_PATH)`.
DATA_PATH = None

# Directory holding the temporally tagged sentences (tagged by HeidelTime) used
# for date selection. Placeholder: set it before running.
# `get_timeline` loads one cPickle file per topic, `{SENT_PATH}/<topic>.dated_sents`
# (e.g. egypt.dated_sents), containing one list per sentence:
#     [[(pub_date_sent1, sent1), (ref_date1_sent1, sent1), (ref_date2_sent1, sent1)],
#      [(pub_date_sent2, sent2)],
#      ...]
SENT_PATH = None

# Sentence corpus used for extractive summarization: exactly the same sentences
# per timeline as TILSE uses, produced by `sentence_preprocessing.ipynb`.
# Placeholder: assign the loaded content of the cPickle file
# `tssf_crisis.filtered_sents` before running; `get_dt_sents` indexes it by topic.
# Expected layout:
#     {'egypt': {'date1': [sents], 'date2': [sents], ...},
#      'syria': {'date1': [sents], 'date2': [sents], ...},
#      ...}
filtered_dated_sents = None

def get_groundtruth(tl):
    """Load every reference timeline of topic `tl` into one GroundTruth.

    Each entry of `{DATA_PATH}/{tl}/public/timelines` is opened as UTF-8
    (undecodable bytes are ignored) and parsed with
    `timelines.Timeline.from_file`.

    Args:
        tl: topic directory name below `DATA_PATH`.

    Returns:
        A `timelines.GroundTruth` wrapping the parsed timelines, in the order
        returned by `os.listdir`.
    """
    timeline_dir = f'{DATA_PATH}/{tl}/public/timelines'
    reference_timelines = []
    for filename in os.listdir(timeline_dir):
        timeline_path = f'{timeline_dir}/{filename}'
        with codecs.open(timeline_path, 'r', "utf-8", "ignore") as handle:
            parsed = timelines.Timeline.from_file(handle)
            reference_timelines.append(parsed)
    return timelines.GroundTruth(reference_timelines)

def get_dt_sents(tl):
    """Return the entry of the module-level `filtered_dated_sents` for topic `tl`.

    Raises whatever the lookup raises when `tl` is missing (or when
    `filtered_dated_sents` is still the `None` placeholder).
    """
    sentences_for_topic = filtered_dated_sents[tl]
    return sentences_for_topic

def get_daily_summarization(dat, sents, SENT_NUM):
    """Summarize the sentences of one date with `summarize`.

    Args:
        dat: the date the sentences belong to; passed through unchanged.
        sents: iterable of sentence strings for that date.
        SENT_NUM: number of sentences requested; also the hard cap applied
            to whatever `summarize` returns.

    Returns:
        A `(dat, summary)` tuple where `summary` holds at most `SENT_NUM` items.
    """
    # Sentences are separated by a blank line before being handed to the summarizer.
    document = '\n\n'.join(sents)
    summary = summarize(document, num=SENT_NUM, split=True, rerank=False)
    capped = summary[:SENT_NUM]
    return (dat, capped)

def tokenize_sents(dated_sent):
    """Tokenize the sentence carried by the first (date, sentence) pair.

    Args:
        dated_sent: sequence of `(date, sentence)` pairs; only the first pair
            is used and its date is discarded.

    Returns:
        The `.token` attribute of every unit produced by
        `_clean_text_by_word`, in that mapping's iteration order.
    """
    _, text = dated_sent[0]
    units = _clean_text_by_word(text)
    tokens = []
    for word in units:
        tokens.append(units[word].token)
    return tokens

def _cos(v1, v2):
    v1 = dict(v1)
    v2 = dict(v2)
    norm1 = numpy.sqrt(sum([v1[i] ** 2 for i in v1]))
    norm2 = numpy.sqrt(sum([v2[i] ** 2 for i in v2]))
    return sum([v1[i] * v2[i] for i in v1 if i in v2]) / norm1 / norm2

def get_timeline(tl, timeline, perso=False, postprocess=False, n_jobs=23):
    """Predict a timeline for topic `tl` and pair it with one reference timeline.

    Dates are selected from a date graph built over the temporally tagged
    sentences of the topic; each selected date is then summarized from the
    sentences returned by `get_dt_sents(tl)`.

    Args:
        tl: topic name. Used to locate `{SENT_PATH}/{tl}.dated_sents` and to
            look up the filtered sentence corpus.
        timeline: the reference timeline. It supplies the evaluated date range
            (min/max of `timeline.get_dates()`), the number of dates to select
            (`len(timeline)`) and the number of sentences per date (floor of
            the mean reference summary length).
        perso: if True, `get_dates_perso` is run for 1000 beta values
            (0, 1e-5, ..., 9.99e-3) and the dates of the result with the
            smallest first element are kept. Otherwise `pagerank` is run once.
        postprocess: if True, ten times more candidate sentences are requested
            per date and near-duplicates (TF-IDF cosine similarity >= 0.5 to
            any sentence already kept, across all dates) are filtered out.
        n_jobs: number of joblib workers for every parallel step. Defaults to
            23, the value that used to be hard-coded.

    Returns:
        A `(predicted_timeline, ground_truth)` tuple: a `timelines.Timeline`
        and a `timelines.GroundTruth` wrapping only `timeline`.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager closes the handle, which the bare open() never did.
    with open(f'{SENT_PATH}/{tl}.dated_sents', 'rb') as f:
        dated_sents = cPickle.load(f)

    dts = timeline.get_dates()
    date_range = (min(dts), max(dts))

    # Widen the candidate range so it also covers the first date of every
    # tagged sentence, but only when that date is a full 'YYYY-MM-DD' string.
    pred_date_range = list(date_range)
    for i in dated_sents:
        if len(i[0][0]) == 10:
            day = datetime.strptime(i[0][0], '%Y-%m-%d').date()
            pred_date_range[0] = min(pred_date_range[0], day)
            pred_date_range[1] = max(pred_date_range[1], day)

    # Every calendar day of the widened range, as 'YYYY-MM-DD' strings.
    potential_dates = set()
    st = pred_date_range[0]
    while st <= pred_date_range[1]:
        potential_dates.add(st.strftime('%Y-%m-%d'))
        st += timedelta(days=1)

    G = get_date_graph(dated_sents, potential_dates)
    dt_sents = get_dt_sents(tl)
    DATE_NUM = len(timeline)
    SENT_NUM = math.floor(numpy.mean([len(timeline[i]) for i in timeline]))

    if perso:
        betas = [0.00001 * _beta for _beta in range(1000)]
        res = Parallel(n_jobs=n_jobs, backend='multiprocessing')(
            delayed(get_dates_perso)(G, beta, pred_date_range, date_range, dt_sents, DATE_NUM)
            for beta in betas
        )
        # Each result is indexed as (score, dates); keep the dates of the lowest score.
        dts = min(res, key=lambda x: x[0])[1]
    else:
        res = pagerank(G)
        # NOTE(review): `get_dates` is not among the names this notebook imports
        # from `date_selection`; unless it is defined elsewhere this branch
        # raises NameError -- confirm and import it.
        dts = get_dates(res, date_range, dt_sents, DATE_NUM)

    # Keep only selected dates inside the reference range that have sentences.
    tmp = {}
    for dt in dts:
        t = dt.strftime('%Y-%m-%d')
        if date_range[0] <= dt <= date_range[1] and t in dt_sents:
            # De-duplicate while keeping the corpus order. list(set(...)) would
            # iterate strings in a hash-seed-dependent order, so the summarizer
            # input (and possibly its output) could differ from run to run.
            sents = list(dict.fromkeys(dt_sents[t]))
            tmp.setdefault(dt, sents)

    if postprocess:
        # Over-generate candidates so that the redundancy filter has spares.
        ttmp = Parallel(n_jobs=n_jobs, backend='multiprocessing')(
            delayed(get_daily_summarization)(dat, sents, SENT_NUM * 10)
            for dat, sents in tmp.items()
        )

        # TF-IDF model fitted on all tagged sentences of the topic.
        sents = Parallel(n_jobs=n_jobs, backend='multiprocessing')(
            delayed(tokenize_sents)(dated_sent) for dated_sent in dated_sents
        )
        dictionary = Dictionary(sents)
        bow = [dictionary.doc2bow(sent) for sent in sents]
        model = TfidfModel(bow)

        def _tfidf_vector(sentence):
            # TF-IDF vector of one candidate sentence.
            # NOTE(review): `tokenize_sents` feeds the dictionary with the
            # `.token` of each unit, whereas here the mapping returned by
            # `_clean_text_by_word` is passed to `doc2bow` directly, which
            # presumably iterates its keys rather than the `.token` values --
            # confirm both sides use the same vocabulary.
            return model[dictionary.doc2bow(_clean_text_by_word(sentence))]

        vecs = []
        tmp = {}
        # First pass: the top candidate of every date is always kept.
        for dt, sents in ttmp:
            tmp.setdefault(dt, [])
            sent = sents[0]
            vecs.append(_tfidf_vector(sent))
            tmp[dt].append(sent)

        # Second pass: add further candidates only if they are dissimilar to
        # everything kept so far, until each date has SENT_NUM sentences.
        for dt, sents in ttmp:
            tmp.setdefault(dt, [])
            cnt = 1
            for sent in sents[1:]:
                if cnt >= SENT_NUM:
                    break
                vec = _tfidf_vector(sent)
                sim = max(_cos(vec, i) for i in vecs)
                if sim < 0.5:
                    tmp[dt].append(sent)
                    cnt += 1
                    vecs.append(vec)

    else:
        tmp = Parallel(n_jobs=n_jobs, backend='multiprocessing')(
            delayed(get_daily_summarization)(dat, sents, SENT_NUM)
            for dat, sents in tmp.items()
        )
        tmp = dict(tmp)

    predicted_timeline = timelines.Timeline(tmp)
    return (predicted_timeline, timelines.GroundTruth([timeline]))

In [4]:
# One (topic, reference timeline) pair per reference timeline of every topic
# directory found under DATA_PATH.
tl_timelines = []
for tl in os.listdir(DATA_PATH):
    tl_timelines.extend(
        (tl, timeline) for timeline in get_groundtruth(tl).timelines
    )

In [5]:
# Build one (predicted timeline, ground truth) pair per reference timeline and
# record how long each prediction takes.
eval_pairs = []
# (total number of candidate sentences of the topic, seconds spent) per timeline
sentnum_time = []

start_time1 = timeit.default_timer()  # wall-clock start of the whole run
for tl, timeline in tl_timelines:
    start_time = timeit.default_timer()
    eval_pairs.append(get_timeline(tl, timeline, perso=True, postprocess=True))
    # Stop the per-timeline timer before the bookkeeping below.
    elapsed = timeit.default_timer() - start_time
    dt_sents = get_dt_sents(tl)
    sentnum_time.append(
        (sum([len(dt_sents[dt]) for dt in dt_sents]), elapsed)
    )

# From here on `elapsed` holds the total run time, not the last timeline's.
elapsed = timeit.default_timer() - start_time1
print('total sec: ', elapsed)
print('avg sec. per timline: ', elapsed / len(eval_pairs))

total sec:  632.6240723449737
avg sec. per timline:  28.75563965204426


In [6]:
def get_rouge_score(eval_pairs, c=0, n_jobs=23):
    """Compute ROUGE-1 / ROUGE-2 F1 over all (prediction, ground truth) pairs.

    Precision and recall are averaged over the pairs first; F1 is then the
    harmonic mean of the two averages.

    Args:
        eval_pairs: iterable of `(predicted_timeline, ground_truth)` tuples.
        c: evaluation mode, selecting the `TimelineRougeEvaluator` method:
            0 -> `evaluate_concat`,
            1 -> `evaluate_agreement`,
            2 -> `evaluate_align_date_content_costs_many_to_one`.
        n_jobs: number of joblib workers. Defaults to 23, the value that used
            to be hard-coded.

    Returns:
        `{'rouge 1 f1': ..., 'rouge 2 f1': ...}`, or `None` for any other `c`.
        An F1 whose mean precision and mean recall are both 0 is reported as
        0.0 instead of the NaN that 0/0 used to produce.
    """
    # Compared with `==`, exactly like the if/elif chain this replaces.
    modes = (
        (0, 'evaluate_concat'),
        (1, 'evaluate_agreement'),
        (2, 'evaluate_align_date_content_costs_many_to_one'),
    )
    method_name = next((name for mode, name in modes if c == mode), None)
    if method_name is None:
        return None

    evaluator = rouge.TimelineRougeEvaluator(measures=["rouge_1", "rouge_2"])
    evaluate = getattr(evaluator, method_name)
    res = Parallel(n_jobs=n_jobs)(delayed(evaluate)(t1, t2) for t1, t2 in eval_pairs)

    def _f1_of_means(measure):
        # F1 from the mean precision and mean recall of one ROUGE measure.
        pre = numpy.mean([i[measure]['precision'] for i in res])
        rec = numpy.mean([i[measure]['recall'] for i in res])
        if pre + rec == 0:
            return 0.0
        return 2 * pre * rec / (pre + rec)

    return {'rouge 1 f1': _f1_of_means('rouge_1'),
            'rouge 2 f1': _f1_of_means('rouge_2')}

# Report ROUGE F1 for each evaluation mode: 0 = concat, 1 = agreement,
# 2 = align (date-content costs, many-to-one). `eval_pairs[:]` passes a copy.
print(get_rouge_score(eval_pairs[:], c=0))
print(get_rouge_score(eval_pairs[:], c=1))
print(get_rouge_score(eval_pairs[:], c=2))

{'rouge 1 f1': 0.36051176245962313, 'rouge 2 f1': 0.0759064124038041}
{'rouge 1 f1': 0.06787000464523299, 'rouge 2 f1': 0.02025320688329175}
{'rouge 1 f1': 0.08460087822235977, 'rouge 2 f1': 0.023219421029152542}


In [7]:
def get_date_f1(eval_pairs):
    """Date-selection F1 of predicted timelines against their references.

    Args:
        eval_pairs: iterable of `(predicted, ground_truth)` tuples. Iterating
            `predicted` yields its dates; `ground_truth.timelines[0]` is the
            reference timeline, iterated the same way.

    Returns:
        `(f1_of_means, mean_of_f1)`: the F1 computed from the mean precision
        and mean recall, and the mean of the per-pair F1 scores.
        A timeline without dates contributes precision/recall 0 instead of
        raising ZeroDivisionError, a zero denominator yields 0.0 instead of
        NaN, and an empty `eval_pairs` returns `(0.0, 0.0)`.
    """
    pre = []
    rec = []
    f = []
    for t1, t2 in eval_pairs:
        predicted_dates = set(t1)
        reference_dates = set(t2.timelines[0])
        hits = len(predicted_dates & reference_dates)
        # Guard the divisions: either side may legitimately be empty.
        p = hits / len(predicted_dates) if predicted_dates else 0.0
        r = hits / len(reference_dates) if reference_dates else 0.0
        pre.append(p)
        rec.append(r)
        f.append(2 * p * r / (p + r) if p + r != 0 else 0)

    if not pre:
        # Nothing to average; numpy.mean([]) would return NaN with a warning.
        return 0.0, 0.0

    pre = numpy.mean(pre)
    rec = numpy.mean(rec)
    f = numpy.mean(f)
    f1 = 2 * pre * rec / (pre + rec) if pre + rec != 0 else 0.0
    return f1, f

# Prints the tuple returned by get_date_f1: (F1 of the mean precision/recall, mean per-pair F1).
print(get_date_f1(eval_pairs))

(0.27562968799077764, 0.27562968799077764)


# Sanity check: statistics of the number of sentences per date in the predicted timelines

In [8]:
# Number of sentences of every date of every predicted timeline, pooled together.
sentence_cnt = [
    len(predicted[dt])
    for predicted, _ in eval_pairs
    for dt in predicted
]
print(len(sentence_cnt))
print(numpy.mean(sentence_cnt))
print(numpy.median(sentence_cnt))

643
1.181959564541213
1.0


In [9]:
# Longest daily summary (in sentences) of each predicted timeline; 0 for a
# timeline without dates.
sentence_cnt = [
    max((len(predicted[dt]) for dt in predicted), default=0)
    for predicted, _ in eval_pairs
]
print(len(sentence_cnt))
print(numpy.mean(sentence_cnt), numpy.std(sentence_cnt))
print(numpy.median(sentence_cnt))

22
1.1818181818181819 0.385694607919935
1.0
