added fever and goemotions and new mixins (#976)
ANarayan committed Oct 29, 2020
1 parent ea9f9e6 commit 0644620
Showing 9 changed files with 258 additions and 11 deletions.
37 changes: 37 additions & 0 deletions ludwig/datasets/fever/__init__.py
@@ -0,0 +1,37 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from ludwig.datasets.base_dataset import BaseDataset, DEFAULT_CACHE_LOCATION
from ludwig.datasets.mixins.process import MultifileJoinProcessMixin
from ludwig.datasets.mixins.download import UncompressedFileDownloadMixin
from ludwig.datasets.mixins.load import CSVLoadMixin


def load(cache_dir=DEFAULT_CACHE_LOCATION):
dataset = Fever(cache_dir=cache_dir)
return dataset.load()


class Fever(UncompressedFileDownloadMixin, MultifileJoinProcessMixin, CSVLoadMixin, BaseDataset):
"""The Fever dataset.
This pulls in an array of mixins for different types of functionality
which belongs in the workflow for ingesting and transforming training data into a destination
dataframe that can fit into Ludwig's training API.
"""

def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
super().__init__(dataset_name="fever", cache_dir=cache_dir)
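
A minimal usage sketch (the module-level load helper above handles download, processing, and caching; the numeric split column is assigned by MultifileJoinProcessMixin, shown further down in this commit):

from ludwig.datasets import fever

df = fever.load()  # downloads on first call, then reads the cached CSV
print(df['split'].value_counts())  # 0 = train, 1 = validation, 2 = test
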
11 changes: 11 additions & 0 deletions ludwig/datasets/fever/config.yaml
@@ -0,0 +1,11 @@
version: 1.0
download_urls:
- https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl
- https://s3-eu-west-1.amazonaws.com/fever.public/paper_dev.jsonl
- https://s3-eu-west-1.amazonaws.com/fever.public/paper_test.jsonl
split_filenames:
train_file: train.jsonl
test_file: paper_dev.jsonl
val_file: paper_test.jsonl
download_file_type: jsonl
csv_filename: fever.csv
37 changes: 37 additions & 0 deletions ludwig/datasets/goemotions/__init__.py
@@ -0,0 +1,37 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from ludwig.datasets.base_dataset import BaseDataset, DEFAULT_CACHE_LOCATION
from ludwig.datasets.mixins.process import MultifileJoinProcessMixin
from ludwig.datasets.mixins.download import UncompressedFileDownloadMixin
from ludwig.datasets.mixins.load import CSVLoadMixin


def load(cache_dir=DEFAULT_CACHE_LOCATION):
dataset = GoEmotions(cache_dir=cache_dir)
return dataset.load()


class GoEmotions(UncompressedFileDownloadMixin, MultifileJoinProcessMixin, CSVLoadMixin, BaseDataset):
"""The GoEmotions dataset.
This pulls in an array of mixins for different types of functionality
which belongs in the workflow for ingesting and transforming training data into a destination
dataframe that can fit into Ludwig's training API.
"""

def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
super().__init__(dataset_name="goemotions", cache_dir=cache_dir)
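
As with fever, a minimal usage sketch; the cache_dir argument is optional and defaults to DEFAULT_CACHE_LOCATION (the path below is illustrative):

from ludwig.datasets import goemotions

df = goemotions.load(cache_dir='/tmp/ludwig_cache')
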
11 changes: 11 additions & 0 deletions ludwig/datasets/goemotions/config.yaml
@@ -0,0 +1,11 @@
version: 1.0
download_urls:
- https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv
- https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/dev.tsv
- https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/test.tsv
split_filenames:
train_file: train.tsv
test_file: test.tsv
val_file: dev.tsv
download_file_type: tsv
csv_filename: goemotions.csv
33 changes: 29 additions & 4 deletions ludwig/datasets/mixins/download.py
@@ -16,6 +16,7 @@
# ==============================================================================
import os
import tempfile
import urllib.request

from io import BytesIO
from urllib.request import urlopen
@@ -35,12 +36,36 @@ def download_raw_dataset(self):
store that in the cache location.
"""
os.makedirs(self.raw_temp_path, exist_ok=True)
with urlopen(self.download_url) as zipresp:
with ZipFile(BytesIO(zipresp.read())) as zfile:
zfile.extractall(self.raw_temp_path)
for url in self.download_urls:
with urlopen(url) as zipresp:
with ZipFile(BytesIO(zipresp.read())) as zfile:
zfile.extractall(self.raw_temp_path)
os.rename(self.raw_temp_path, self.raw_dataset_path)

@property
def download_urls(self):
return self.config["download_urls"]

class UncompressedFileDownloadMixin:
"""Downloads the json file containing the training data and extracts the contents."""

config: dict
raw_dataset_path: str
raw_temp_path: str

    def download_raw_dataset(self):
        """Download the raw dataset files and store them in the cache location."""
        os.makedirs(self.raw_temp_path, exist_ok=True)
        for url in self.download_urls:
            filename = url.split('/')[-1]
            urllib.request.urlretrieve(url, os.path.join(self.raw_temp_path, filename))

os.rename(self.raw_temp_path, self.raw_dataset_path)

    @property
    def download_urls(self):
        return self.config["download_urls"]
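
To make the mixin's contract concrete, here is a minimal sketch of a host class; TinyHost and the local file:// URL are illustrative only (real hosts are BaseDataset subclasses such as Fever above), mirroring the fake-dataset pattern used in the tests later in this commit:

import os
import tempfile

from ludwig.datasets.mixins.download import UncompressedFileDownloadMixin


class TinyHost(UncompressedFileDownloadMixin):
    # Hypothetical host supplying the three attributes the mixin expects.
    def __init__(self, urls, workdir):
        self.config = {'download_urls': urls}
        self.raw_temp_path = os.path.join(workdir, '_raw_tmp')
        self.raw_dataset_path = os.path.join(workdir, 'raw')


with tempfile.TemporaryDirectory() as tmp:
    src = os.path.join(tmp, 'train.jsonl')
    with open(src, 'w') as f:
        f.write('{"name": "joe"}\n')
    # urllib.request.urlretrieve also accepts file:// URLs, which makes
    # offline smoke tests easy.
    host = TinyHost(['file://' + src], tmp)
    host.download_raw_dataset()
    assert os.path.exists(os.path.join(host.raw_dataset_path, 'train.jsonl'))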


48 changes: 47 additions & 1 deletion ludwig/datasets/mixins/process.py
@@ -15,7 +15,7 @@
# limitations under the License.
# ==============================================================================
import os

import pandas as pd

class IdentityProcessMixin:
"""A mixin that performs a no-op for already processed raw datasets."""
@@ -25,3 +25,49 @@ class IdentityProcessMixin:

def process_downloaded_dataset(self):
os.rename(self.raw_dataset_path, self.processed_dataset_path)

class MultifileJoinProcessMixin:
"""A mixin that joins raw files to build final dataset"""

config: dict
raw_dataset_path: str
processed_dataset_path: str

    def read_file(self, filetype, filename):
        filepath = os.path.join(self.raw_dataset_path, filename)
        if filetype == 'json':
            file_df = pd.read_json(filepath)
        elif filetype == 'jsonl':
            file_df = pd.read_json(filepath, lines=True)
        elif filetype == 'tsv':
            file_df = pd.read_table(filepath)
        else:
            raise ValueError(f'Unsupported file type: {filetype}')
        return file_df

def process_downloaded_dataset(self):
downloaded_files = self.download_filenames
filetype = self.download_file_type
all_files = []
for split_name, filename in downloaded_files.items():
file_df = self.read_file(filetype, filename)
            if split_name == 'train_file':
                file_df['split'] = 0
            elif split_name == 'val_file':
                file_df['split'] = 1
            elif split_name == 'test_file':
                file_df['split'] = 2
all_files.append(file_df)

concat_df = pd.concat(all_files, ignore_index=True)
os.makedirs(self.processed_dataset_path)
concat_df.to_csv(os.path.join(self.processed_dataset_path, self.csv_filename), index=False)


@property
def download_filenames(self):
return self.config['split_filenames']

@property
def download_file_type(self):
return self.config['download_file_type']

@property
def csv_filename(self):
return self.config['csv_filename']
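
The joined CSV encodes the original splits in a numeric split column (0 = train, 1 = validation, 2 = test, as assigned in process_downloaded_dataset above). A short sketch of recovering the splits after loading; the fever.csv path is illustrative:

import pandas as pd

df = pd.read_csv('fever.csv')  # see csv_filename in the dataset configs
train_df = df[df['split'] == 0]
val_df = df[df['split'] == 1]
test_df = df[df['split'] == 2]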


3 changes: 2 additions & 1 deletion ludwig/datasets/ohsumed/config.yaml
@@ -1,3 +1,4 @@
version: 1.0
download_url: http://boston.lti.cs.cmu.edu/classes/95-865-K/HW/HW2/ohsumed-allcats-6.zip
download_urls:
- http://boston.lti.cs.cmu.edu/classes/95-865-K/HW/HW2/ohsumed-allcats-6.zip
csv_filename: ohsumed-allcats.csv
3 changes: 2 additions & 1 deletion ludwig/datasets/reuters/config.yaml
@@ -1,3 +1,4 @@
version: 1.0
download_url: http://boston.lti.cs.cmu.edu/classes/95-865-K/HW/HW2/reuters-allcats-6.zip
download_urls:
- http://boston.lti.cs.cmu.edu/classes/95-865-K/HW/HW2/reuters-allcats-6.zip
csv_filename: reuters-allcats.csv
86 changes: 82 additions & 4 deletions tests/ludwig/datasets/test_datasets.py
@@ -1,20 +1,26 @@
import os
import pytest
import tempfile
import pandas as pd

from unittest import mock

import pandas as pd

from ludwig.datasets.base_dataset import BaseDataset
from ludwig.datasets.mixins.download import ZipDownloadMixin
from ludwig.datasets.mixins.download import ZipDownloadMixin, UncompressedFileDownloadMixin
from ludwig.datasets.mixins.load import CSVLoadMixin
from ludwig.datasets.mixins.process import IdentityProcessMixin
from ludwig.datasets.mixins.process import IdentityProcessMixin, MultifileJoinProcessMixin

SUPPORTED_UNCOMPRESSED_FILETYPES = ['json', 'jsonl', 'tsv']

class FakeCSVDataset(ZipDownloadMixin, IdentityProcessMixin, CSVLoadMixin, BaseDataset):
def __init__(self, cache_dir=None):
super().__init__(dataset_name="fake", cache_dir=cache_dir)

class FakeMultiFileDataset(UncompressedFileDownloadMixin, MultifileJoinProcessMixin, CSVLoadMixin, BaseDataset):
def __init__(self, cache_dir=None):
super().__init__(dataset_name="multifiles", cache_dir=cache_dir)


def test_load_csv_dataset():
input_df = pd.DataFrame({
@@ -35,7 +41,7 @@ def test_load_csv_dataset():

config = dict(
version=1.0,
download_url='file://' + archive_filename,
download_urls=['file://' + archive_filename],
csv_filename=extracted_filename,
)

@@ -51,3 +57,75 @@

assert dataset.is_downloaded()
assert dataset.is_processed()

@pytest.mark.parametrize('f_type', SUPPORTED_UNCOMPRESSED_FILETYPES)
def test_multifile_join_dataset(f_type):
    if f_type != 'jsonl':
train_df = pd.DataFrame({
'name': ['Raphael', 'Donatello'],
'mask': ['red', 'purple'],
'weapon': ['sai', 'bo staff']
})

test_df = pd.DataFrame({
'name': ['Jack', 'Bob'],
'mask': ['green', 'yellow'],
'weapon': ['knife', 'gun']
})

val_df = pd.DataFrame({
'name': ['Tom'],
'mask': ['pink'],
'weapon': ['stick']
})
else:
        train_df = pd.DataFrame([{'name': 'joe'}, {'mask': 'green'}, {'weapon': 'stick'}])
        test_df = pd.DataFrame([{'name': 'janice'}, {'mask': 'black'}, {'weapon': 'gun'}])
        val_df = pd.DataFrame([{'name': 'sara'}, {'mask': 'pink'}, {'weapon': 'gun'}])

train_filename = 'train.' + f_type
test_filename = 'test.' + f_type
val_filename = 'val.' + f_type
with tempfile.TemporaryDirectory() as source_dir:
train_filepath = os.path.join(source_dir, train_filename)
test_filepath = os.path.join(source_dir, test_filename)
val_filepath = os.path.join(source_dir, val_filename)

if f_type == 'json':
train_df.to_json(train_filepath)
test_df.to_json(test_filepath)
val_df.to_json(val_filepath)
elif f_type == 'jsonl':
train_df.to_json(train_filepath, orient='records', lines=True)
test_df.to_json(test_filepath, orient='records', lines=True)
val_df.to_json(val_filepath, orient='records', lines=True)
else:
            train_df.to_csv(train_filepath, sep='\t', index=False)
            test_df.to_csv(test_filepath, sep='\t', index=False)
            val_df.to_csv(val_filepath, sep='\t', index=False)

config = {
'version': 1.0,
'download_urls': ['file://' + train_filepath, 'file://' + test_filepath, 'file://' + val_filepath],
'split_filenames': {
'train_file': train_filename,
                'test_file': test_filename,
                'val_file': val_filename
            },
            'download_file_type': f_type,
'csv_filename': 'fake.csv',
}

with tempfile.TemporaryDirectory() as tmpdir:
with mock.patch('ludwig.datasets.base_dataset.read_config', return_value=config):
dataset = FakeMultiFileDataset(tmpdir)

assert not dataset.is_downloaded()
assert not dataset.is_processed()

output_df = dataset.load()
assert output_df.shape[0] == train_df.shape[0] + test_df.shape[0] + val_df.shape[0]

assert dataset.is_downloaded()
assert dataset.is_processed()
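
The new parametrized test can be run on its own with a standard pytest keyword filter:

pytest tests/ludwig/datasets/test_datasets.py -k test_multifile_join_dataset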
