In [None]:
######## https://github.com/smartschat/tilse

# Pre-processing (including sentence tokenization and temporal tagging) is run first, separately.
# Reference: https://github.com/smartschat/tilse/blob/master/bin/get-and-preprocess-data

# This notebook then measures only the computation time of summarization, for a fair comparison.

# Set the path to the tilse pre-processed dataset here; the experiment loop
# interpolates it into the per-dataset paths, so it must be set before running.
# After running the tilse pre-processing, the data folder should look like:
# crisis/
#    raw/
#    dumped_corpora/
# timeline17/
#    raw/
#    dumped_corpora/
TILSE_PATH = None 

In [None]:
import codecs
import json
import os
import pickle
import pprint
import sys
from collections import defaultdict
import spacy
import _pickle as cPickle
from tilse.models.submodular import submodular, upper_bounds
from tilse.models.chieu import chieu
from tilse.data import timelines
from tilse.evaluation import rouge
import timeit

In [None]:
class newSubmodular(submodular.Submodular):
    """Variant of ``submodular.Submodular`` that only generates timelines.

    ``run`` returns the predicted timelines and prints how long the per-topic
    pre-processing took; no evaluation results are computed or returned.
    """

    def _run_for_one(self, t, corpora, topic_to_preprocessed, reference_timelines):
        """Predict one timeline for every reference timeline of topic ``t``.

        Args:
            t: Topic name; key into ``corpora``, ``topic_to_preprocessed``
                and ``reference_timelines``.
            corpora: Mapping from topic to its news corpus.
            topic_to_preprocessed: Mapping from topic to the value that
                ``self.preprocess`` returned for that topic.
            reference_timelines: Mapping from topic to an object exposing a
                ``timelines`` sequence of reference timelines.

        Returns:
            dict mapping ``t + "_" + str(i)`` to the prediction made for the
            i-th reference timeline of the topic.
        """
        corpus = corpora[t]
        params = self.params

        returned_timelines = {}

        # One prediction per reference timeline: the properties extracted from
        # the reference timeline are passed on to ``self.predict``.
        for i, timeline in enumerate(reference_timelines[t].timelines):
            timeline_properties = self.get_timeline_properties(timeline)
            returned_timelines[t + "_" + str(i)] = self.predict(
                corpus, topic_to_preprocessed[t], timeline_properties, params
            )

        return returned_timelines

    def run(self, corpora, reference_timelines):
        """Pre-process every topic, then predict timelines for all topics.

        Args:
            corpora: Mapping from topic to its news corpus.
            reference_timelines: Mapping from topic to an object exposing a
                ``timelines`` sequence of reference timelines.

        Returns:
            dict mapping ``"<topic>_<index>"`` to the predicted timeline,
            merged over all topics (topics are processed in sorted order).
        """
        topics = sorted(corpora)
        returned_timelines = {}

        #################### NOTE ####################
        #  The TILSE implementation shares the same pre-processing (e.g. cached
        #  sentence similarity calculation) between all timelines generated from
        #  the same news corpus (i.e. the same topic). The pre-processing time
        #  printed below therefore has to be added back when measuring the
        #  generation speed per timeline.
        topic_to_preprocessed = {}
        for t in topics:
            start_time = timeit.default_timer()
            topic_to_preprocessed[t] = self.preprocess(t, corpora[t])
            elapsed = timeit.default_timer() - start_time
            print('pre-processing time for each topic: ', [t, elapsed])

        for t in topics:
            returned_timelines.update(
                self._run_for_one(t, corpora, topic_to_preprocessed, reference_timelines)
            )

        return returned_timelines

In [None]:
def _default_keyword_mapping():
    """Return a fresh topic -> keyword-list mapping shared by all experiments.

    A new dict (with new lists) is built on every call so that each experiment
    config owns its own copy.
    """
    return {
        "bpoil": ["bp", "oil", "spill"],
        "egypt": ["egypt", "egyptian"],
        "finan": ["financial", "economic", "crisis"],
        "h1n1": ["h1n1", "swine", "flu"],
        "haiti": ["haiti", "quake", "earthquake"],
        "iraq": ["iraq", "iraqi"],
        "libya": ["libya", "libyan"],
        "mj": ["michael", "jackson"],
        "syria": ["syria", "syrian"],
        "yemen": ["yemen"]
    }


# Per-variant settings: (sentence_representation, constraint, coefficients).
# These are the only values, besides dataset and name, that differ between
# the experiment configs below.
_MODEL_VARIANTS = {
    "asmds_tempdiv": (
        "ChieuSentenceRepresentation",
        "is_valid_total_length",
        [1, 0, 1, 1],
    ),
    "tlsconstraints_reweighting": (
        "DateWeightedChieuSentenceRepresentation",
        "is_valid_individual_constraints",
        [1, 1, 0, 1],
    ),
}


def _make_experiment(dataset, variant):
    """Build one ``(dataset, config)`` entry of ``exps``.

    Args:
        dataset: Dataset folder name, e.g. ``"timeline17"`` or ``"crisis"``.
        variant: Key into ``_MODEL_VARIANTS``.

    Returns:
        Tuple of the dataset name and the config dict; the config name is
        ``"<variant>_dateref_<dataset>"``.
    """
    sentence_representation, constraint, coefficients = _MODEL_VARIANTS[variant]
    return (dataset, {
        "name": f"{variant}_dateref_{dataset}",
        "algorithm": "submodular",
        "restrict_topics_to": None,
        "assess_length": "average_length_in_sentences",
        "sentence_representation": sentence_representation,
        "keyword_mapping": _default_keyword_mapping(),
        "rouge_computation": "original",
        "properties": {
            "constraint": constraint,
            "semantic_cluster": "clusters_by_similarity",
            "date_cluster": "clusters_by_date",
            # Copied so that configs never share one list object.
            "coefficients": list(coefficients)
        }
    })


# Experiments to run, in order: (dataset folder name, config dict).
exps = [
    _make_experiment("timeline17", "asmds_tempdiv"),
    _make_experiment("timeline17", "tlsconstraints_reweighting"),
    _make_experiment("crisis", "tlsconstraints_reweighting"),
    _make_experiment("crisis", "asmds_tempdiv"),
]

In [None]:
# Maps experiment name -> elapsed seconds of ``algorithm.run`` for that experiment.
# An experiment whose name is already a key here is skipped, so un-commenting an
# entry avoids re-running it. Keys must equal ``exp_config["name"]`` exactly
# (i.e. without a ".pkl" suffix) to trigger the skip.
# NOTE(review): the commented values are presumably timings from earlier runs -- confirm.
exp_ti = {
    # 'asmds_tempdiv_dateref_timeline17': 1321.3712241500616,
    # 'tlsconstraints_reweighting_dateref_timeline17': 2011.1612675283104,
    # 'asmds_tempdiv_dateref_crisis.pkl': 9558.274625536054,
    # 'tlsconstraints_reweighting_dateref_crisis.pkl': 15898.265180394053,
}
for (exp_dataset, exp_config) in exps:
    exp_name = exp_config["name"]

    if exp_name in exp_ti:
        continue

    # Fail early with a clear message instead of silently building "None/<dataset>".
    if TILSE_PATH is None:
        raise ValueError(
            "TILSE_PATH is not set; point it to the tilse pre-processed data folder"
        )

    news_corpora = {}
    temp_reference_timelines = defaultdict(list)

    corpus = f"{TILSE_PATH}/{exp_dataset}"
    raw_directory = corpus + "/raw/"
    dumped_corpora_directrory = corpus + "/dumped_corpora/"

    # Topic filtering options come from the experiment config itself rather
    # than from a second hard-coded copy; ``None`` disables the respective step.
    keyword_mapping = exp_config.get("keyword_mapping")
    restrict_topics_to = exp_config.get("restrict_topics_to")
    reference_timelines = {}

    for topic in sorted(os.listdir(raw_directory)):
        if restrict_topics_to is not None and topic not in restrict_topics_to:
            continue
        print(topic)

        # SECURITY NOTE: pickle can execute arbitrary code while loading; only
        # load corpus dumps that were produced by your own pre-processing run.
        corpus_path = os.path.join(dumped_corpora_directrory, topic + ".corpus.obj")
        with open(corpus_path, "rb") as corpus_file:
            news_corpora[topic] = pickle.load(corpus_file)

        if keyword_mapping is not None and keyword_mapping[topic] is not None:
            news_corpora[topic] = news_corpora[topic].filter_by_keywords_contained(keyword_mapping[topic])

        # Read every reference timeline of the topic, in sorted filename order.
        timelines_directory = os.path.join(raw_directory, topic, "timelines")
        for filename in sorted(os.listdir(timelines_directory)):
            full_path = os.path.join(timelines_directory, filename)

            with codecs.open(full_path, "r", "utf-8", "replace") as timeline_file:
                temp_reference_timelines[topic].append(
                    timelines.Timeline.from_file(timeline_file)
                )

    for topic in temp_reference_timelines:
        reference_timelines[topic] = timelines.GroundTruth(temp_reference_timelines[topic])

    algorithm = newSubmodular(exp_config, None)

    # Time only the summarization run (data loading above is excluded) and
    # record it under the experiment name.
    start_time = timeit.default_timer()
    returned_timelines = algorithm.run(news_corpora, reference_timelines)
    exp_ti[exp_name] = timeit.default_timer() - start_time
    print('summarization time for experiment: ', [exp_name, exp_ti[exp_name]])

    # Key the reference timelines the same way as the predictions
    # ("<topic>_<index>") so both dicts can be matched up later.
    groundtruths = {}

    for topic in reference_timelines:
        for i, tl in enumerate(reference_timelines[topic].timelines):
            groundtruths[topic + "_" + str(i)] = tl

    # ``with`` guarantees the dump is flushed and the file closed.
    with open(exp_name + ".pkl", "wb") as output_file:
        cPickle.dump((returned_timelines, groundtruths), output_file)