diff --git a/tensorflow_datasets/summarization/__init__.py b/tensorflow_datasets/summarization/__init__.py index 8f0bfba6c07..d2c68a8c9b5 100644 --- a/tensorflow_datasets/summarization/__init__.py +++ b/tensorflow_datasets/summarization/__init__.py @@ -27,6 +27,7 @@ from tensorflow_datasets.summarization.newsroom import Newsroom from tensorflow_datasets.summarization.opinion_abstracts import OpinionAbstracts from tensorflow_datasets.summarization.opinosis import Opinosis +from tensorflow_datasets.summarization.points_of_correspondence import PointsOfCorrespondence from tensorflow_datasets.summarization.reddit import Reddit from tensorflow_datasets.summarization.reddit_tifu import RedditTifu from tensorflow_datasets.summarization.samsum import Samsum diff --git a/tensorflow_datasets/summarization/points_of_correspondence.py b/tensorflow_datasets/summarization/points_of_correspondence.py new file mode 100644 index 00000000000..de174077089 --- /dev/null +++ b/tensorflow_datasets/summarization/points_of_correspondence.py @@ -0,0 +1,128 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Points of Correspondence Dataset.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import os + +import tensorflow.compat.v2 as tf +import tensorflow_datasets.public_api as tfds + +_CITATION = """ +@inproceedings{lebanoff-etal-2020-understanding, + title = "Understanding Points of Correspondence between Sentences for Abstractive Summarization", + author = "Lebanoff, Logan and Muchovej, John and Dernoncourt, Franck and Kim, Doo Soon and Wang, \ + Lidan and Chang, Walter and Liu, Fei", + booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: \ + Student Research Workshop", + month = jul, + year = "2020", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/2020.acl-srw.26", + pages = "191--198", +} +""" + +_DESCRIPTION = """ +The dataset contains 1,599 sentence fusion examples with fine-grained Points of Correspondence annotations. +A Point of Correspondence is an entity or event that connects two sentences together, +which is represented as a span of text from each sentence. +Please take a look at our Github for a description of each feature in our dataset: +https://github.com/ucfnlp/points-of-correspondence +""" + +_URL = "https://github.com/ucfnlp/points-of-correspondence/raw/master/PoC_dataset.json" + +_FULL_ARTICLE = "Full_Article" +_FULL_SUMMARY = "Full_Summary" +_SENTENCE_1 = "Sentence_1" +_SENTENCE_2 = "Sentence_2" +_SENTENCE_FUSED = "Sentence_Fused" +_SENTENCE_1_INDEX = "Sentence_1_Index" +_SENTENCE_2_INDEX = "Sentence_2_Index" +_SENTENCE_FUSED_INDEX = "Sentence_Fused_Index" +_POCS = "PoCs" +_EXAMPLE_ID = "Example_Id" + +_SENTENCE_1_SELECTION = "Sentence_1_Selection" +_SENTENCE_2_SELECTION = "Sentence_2_Selection" +_SENTENCE_FUSED_SELECTION = "Sentence_Fused_Selection" +_POC_TYPE = "PoC_Type" + +_POC_TYPE_CLASSES = [ + "Nominal", "Pronominal", "Common-Noun", "Repetition", "Event" +] + + +class PointsOfCorrespondence(tfds.core.GeneratorBasedBuilder): + """Points of Correspondence Dataset.""" + + VERSION = tfds.core.Version("3.0.0") + + def _info(self): + return tfds.core.DatasetInfo( + builder=self, + description=_DESCRIPTION, + features=tfds.features.FeaturesDict({ + _FULL_ARTICLE: tfds.features.Text(), + _FULL_SUMMARY: tfds.features.Text(), + _SENTENCE_1: tfds.features.Text(), + _SENTENCE_2: tfds.features.Text(), + _SENTENCE_FUSED: tfds.features.Text(), + _SENTENCE_1_INDEX: tf.int32, + _SENTENCE_2_INDEX: tf.int32, + _SENTENCE_FUSED_INDEX: tf.int32, + _POCS: tfds.features.Sequence({ + _SENTENCE_1_SELECTION: + tfds.features.Tensor(shape=(2,), dtype=tf.int32), + _SENTENCE_2_SELECTION: + tfds.features.Tensor(shape=(2,), dtype=tf.int32), + _SENTENCE_FUSED_SELECTION: + tfds.features.Tensor(shape=(2,), dtype=tf.int32), + _POC_TYPE: tfds.features.Text(), + }), + }), + supervised_keys=(_FULL_ARTICLE, _FULL_SUMMARY), + homepage="https://github.com/ucfnlp/points-of-correspondence", + citation=_CITATION, + ) + + def _split_generators(self, dl_manager): + """Returns SplitGenerators.""" + dl_path = dl_manager.download(_URL) + return [ + tfds.core.SplitGenerator( + name=tfds.Split.TEST, + gen_kwargs={ + "path": + os.path.join(dl_path), + "key": + _EXAMPLE_ID + }, + ), + ] + + def _generate_examples(self, path=None, key=None): + """Yields examples.""" + with tf.io.gfile.GFile(path + 'PoC_dataset.json') as f: + data = json.load(f) + for ex in data: + yield ex[key], {k: v for k, v in ex.items() if k != _EXAMPLE_ID} diff --git a/tensorflow_datasets/summarization/points_of_correspondence_test.py b/tensorflow_datasets/summarization/points_of_correspondence_test.py new file mode 100644 index 00000000000..bd8caeb1311 --- /dev/null +++ b/tensorflow_datasets/summarization/points_of_correspondence_test.py @@ -0,0 +1,35 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Points of Correspondence Dataset Test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow_datasets import testing +from tensorflow_datasets.summarization import points_of_correspondence + + +class PointsOfCorrespondenceTest(testing.DatasetBuilderTestCase): + DATASET_CLASS = points_of_correspondence.PointsOfCorrespondence + SPLITS = { + "test": 1, # Number of fake test example + } + DL_EXTRACT_RESULT = "" + +if __name__ == "__main__": + testing.test_main() diff --git a/tensorflow_datasets/url_checksums/points_of_correspondence.txt b/tensorflow_datasets/url_checksums/points_of_correspondence.txt new file mode 100644 index 00000000000..4cae8763ecf --- /dev/null +++ b/tensorflow_datasets/url_checksums/points_of_correspondence.txt @@ -0,0 +1 @@ +https://github.com/ucfnlp/points-of-correspondence/raw/master/PoC_dataset.json 9074642 e23c3f6f4c96ac312575ac387375ad23553f805a076893ecaf13adf825f7ce4e \ No newline at end of file