added fever and goemotions and new mixins (#976)
ANarayan committed Oct 29, 2020
1 parent ea9f9e6 commit 0644620
Showing 9 changed files with 258 additions and 11 deletions.
37 changes: 37 additions & 0 deletions ludwig/datasets/fever/__init__.py
@@ -0,0 +1,37 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from ludwig.datasets.base_dataset import BaseDataset, DEFAULT_CACHE_LOCATION
from ludwig.datasets.mixins.process import MultifileJoinProcessMixin
from ludwig.datasets.mixins.download import UncompressedFileDownloadMixin
from ludwig.datasets.mixins.load import CSVLoadMixin


def load(cache_dir=DEFAULT_CACHE_LOCATION):
dataset = Fever(cache_dir=cache_dir)
return dataset.load()


class Fever(UncompressedFileDownloadMixin, MultifileJoinProcessMixin, CSVLoadMixin, BaseDataset):
"""The Fever dataset.
This pulls in an array of mixins for different types of functionality
which belongs in the workflow for ingesting and transforming training data into a destination
dataframe that can fit into Ludwig's training API.
"""

def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
super().__init__(dataset_name="fever", cache_dir=cache_dir)
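
A minimal usage sketch (the module-level load helper above handles download, processing, and caching; the numeric split column is assigned by MultifileJoinProcessMixin, shown further down in this commit):

from ludwig.datasets import fever

df = fever.load()  # downloads on first call, then reads the cached CSV
print(df['split'].value_counts())  # 0 = train, 1 = validation, 2 = test
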
11 changes: 11 additions & 0 deletions ludwig/datasets/fever/config.yaml
@@ -0,0 +1,11 @@
version: 1.0
download_urls:
- https://s3-eu-west-1.amazonaws.com/fever.public/train.jsonl
- https://s3-eu-west-1.amazonaws.com/fever.public/paper_dev.jsonl
- https://s3-eu-west-1.amazonaws.com/fever.public/paper_test.jsonl
split_filenames:
train_file: train.jsonl
test_file: paper_dev.jsonl
val_file: paper_test.jsonl
download_file_type: jsonl
csv_filename: fever.csv
37 changes: 37 additions & 0 deletions ludwig/datasets/goemotions/__init__.py
@@ -0,0 +1,37 @@
#! /usr/bin/env python
# coding=utf-8
# Copyright (c) 2019 Uber Technologies, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
from ludwig.datasets.base_dataset import BaseDataset, DEFAULT_CACHE_LOCATION
from ludwig.datasets.mixins.process import MultifileJoinProcessMixin
from ludwig.datasets.mixins.download import UncompressedFileDownloadMixin
from ludwig.datasets.mixins.load import CSVLoadMixin


def load(cache_dir=DEFAULT_CACHE_LOCATION):
dataset = GoEmotions(cache_dir=cache_dir)
return dataset.load()


class GoEmotions(UncompressedFileDownloadMixin, MultifileJoinProcessMixin, CSVLoadMixin, BaseDataset):
"""The GoEmotions dataset.
This pulls in an array of mixins for different types of functionality
which belongs in the workflow for ingesting and transforming training data into a destination
dataframe that can fit into Ludwig's training API.
"""

def __init__(self, cache_dir=DEFAULT_CACHE_LOCATION):
super().__init__(dataset_name="goemotions", cache_dir=cache_dir)
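
As with fever, a minimal usage sketch; the cache_dir argument is optional and defaults to DEFAULT_CACHE_LOCATION (the path below is illustrative):

from ludwig.datasets import goemotions

df = goemotions.load(cache_dir='/tmp/ludwig_cache')
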
11 changes: 11 additions & 0 deletions ludwig/datasets/goemotions/config.yaml
@@ -0,0 +1,11 @@
version: 1.0
download_urls:
- https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv
- https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/dev.tsv
- https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/test.tsv
split_filenames:
train_file: train.tsv
test_file: test.tsv
val_file: dev.tsv
download_file_type: tsv
csv_filename: goemotions.csv
33 changes: 29 additions & 4 deletions ludwig/datasets/mixins/download.py
@@ -16,6 +16,7 @@
# ==============================================================================
import os
import tempfile
import urllib.request

from io import BytesIO
from urllib.request import urlopen
@@ -35,12 +36,36 @@ def download_raw_dataset(self):
store that in the cache location.
"""
os.makedirs(self.raw_temp_path, exist_ok=True)
with urlopen(self.download_url) as zipresp:
with ZipFile(BytesIO(zipresp.read())) as zfile:
zfile.extractall(self.raw_temp_path)
for url in self.download_urls:
with urlopen(url) as zipresp:
with ZipFile(BytesIO(zipresp.read())) as zfile:
zfile.extractall(self.raw_temp_path)
os.rename(self.raw_temp_path, self.raw_dataset_path)

@property
def download_urls(self):
return self.config["download_urls"]

class UncompressedFileDownloadMixin:
"""Downloads the json file containing the training data and extracts the contents."""

config: dict
raw_dataset_path: str
raw_temp_path: str

    def download_raw_dataset(self):
        """Download the raw dataset files and store them in the cache location."""
        os.makedirs(self.raw_temp_path, exist_ok=True)
        for url in self.download_urls:
            filename = url.split('/')[-1]
            urllib.request.urlretrieve(url, os.path.join(self.raw_temp_path, filename))

os.rename(self.raw_temp_path, self.raw_dataset_path)

    @property
    def download_urls(self):
        return self.config["download_urls"]
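
To make the mixin's contract concrete, here is a minimal sketch of a host class; TinyHost and the local file:// URL are illustrative only (real hosts are BaseDataset subclasses such as Fever above), mirroring the fake-dataset pattern used in the tests later in this commit:

import os
import tempfile

from ludwig.datasets.mixins.download import UncompressedFileDownloadMixin


class TinyHost(UncompressedFileDownloadMixin):
    # Hypothetical host supplying the three attributes the mixin expects.
    def __init__(self, urls, workdir):
        self.config = {'download_urls': urls}
        self.raw_temp_path = os.path.join(workdir, '_raw_tmp')
        self.raw_dataset_path = os.path.join(workdir, 'raw')


with tempfile.TemporaryDirectory() as tmp:
    src = os.path.join(tmp, 'train.jsonl')
    with open(src, 'w') as f:
        f.write('{"name": "joe"}\n')
    # urllib.request.urlretrieve also accepts file:// URLs, which makes
    # offline smoke tests easy.
    host = TinyHost(['file://' + src], tmp)
    host.download_raw_dataset()
    assert os.path.exists(os.path.join(host.raw_dataset_path, 'train.jsonl'))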


48 changes: 47 additions & 1 deletion ludwig/datasets/mixins/process.py
@@ -15,7 +15,7 @@
# limitations under the License.
# ==============================================================================
import os

import pandas as pd

class IdentityProcessMixin:
"""A mixin that performs a no-op for already processed raw datasets."""
@@ -25,3 +25,49 @@ class IdentityProcessMixin:

def process_downloaded_dataset(self):
os.rename(self.raw_dataset_path, self.processed_dataset_path)

class MultifileJoinProcessMixin:
"""A mixin that joins raw files to build final dataset"""

config: dict
raw_dataset_path: str
processed_dataset_path: str

    def read_file(self, filetype, filename):
        filepath = os.path.join(self.raw_dataset_path, filename)
        if filetype == 'json':
            file_df = pd.read_json(filepath)
        elif filetype == 'jsonl':
            file_df = pd.read_json(filepath, lines=True)
        elif filetype == 'tsv':
            file_df = pd.read_table(filepath)
        else:
            raise ValueError(f'Unsupported file type: {filetype}')
        return file_df

def process_downloaded_dataset(self):
downloaded_files = self.download_filenames
filetype = self.download_file_type
all_files = []
for split_name, filename in downloaded_files.items():
file_df = self.read_file(filetype, filename)
            if split_name == 'train_file':
                file_df['split'] = 0
            elif split_name == 'val_file':
                file_df['split'] = 1
            elif split_name == 'test_file':
                file_df['split'] = 2
all_files.append(file_df)

concat_df = pd.concat(all_files, ignore_index=True)
os.makedirs(self.processed_dataset_path)
concat_df.to_csv(os.path.join(self.processed_dataset_path, self.csv_filename), index=False)


@property
def download_filenames(self):
return self.config['split_filenames']

@property
def download_file_type(self):
return self.config['download_file_type']

@property
def csv_filename(self):
return self.config['csv_filename']
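
The joined CSV encodes the original splits in a numeric split column (0 = train, 1 = validation, 2 = test, as assigned in process_downloaded_dataset above). A short sketch of recovering the splits after loading; the fever.csv path is illustrative:

import pandas as pd

df = pd.read_csv('fever.csv')  # see csv_filename in the dataset configs
train_df = df[df['split'] == 0]
val_df = df[df['split'] == 1]
test_df = df[df['split'] == 2]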


3 changes: 2 additions & 1 deletion ludwig/datasets/ohsumed/config.yaml
@@ -1,3 +1,4 @@
version: 1.0
download_url: http://boston.lti.cs.cmu.edu/classes/95-865-K/HW/HW2/ohsumed-allcats-6.zip
download_urls:
- http://boston.lti.cs.cmu.edu/classes/95-865-K/HW/HW2/ohsumed-allcats-6.zip
csv_filename: ohsumed-allcats.csv
3 changes: 2 additions & 1 deletion ludwig/datasets/reuters/config.yaml
@@ -1,3 +1,4 @@
version: 1.0
download_url: http://boston.lti.cs.cmu.edu/classes/95-865-K/HW/HW2/reuters-allcats-6.zip
download_urls:
- http://boston.lti.cs.cmu.edu/classes/95-865-K/HW/HW2/reuters-allcats-6.zip
csv_filename: reuters-allcats.csv
86 changes: 82 additions & 4 deletions tests/ludwig/datasets/test_datasets.py
@@ -1,20 +1,26 @@
import os
import pytest
import tempfile
import pandas as pd

from unittest import mock

import pandas as pd

from ludwig.datasets.base_dataset import BaseDataset
from ludwig.datasets.mixins.download import ZipDownloadMixin
from ludwig.datasets.mixins.download import ZipDownloadMixin, UncompressedFileDownloadMixin
from ludwig.datasets.mixins.load import CSVLoadMixin
from ludwig.datasets.mixins.process import IdentityProcessMixin
from ludwig.datasets.mixins.process import IdentityProcessMixin, MultifileJoinProcessMixin

SUPPORTED_UNCOMPRESSED_FILETYPES = ['json', 'jsonl', 'tsv']

class FakeCSVDataset(ZipDownloadMixin, IdentityProcessMixin, CSVLoadMixin, BaseDataset):
def __init__(self, cache_dir=None):
super().__init__(dataset_name="fake", cache_dir=cache_dir)

class FakeMultiFileDataset(UncompressedFileDownloadMixin, MultifileJoinProcessMixin, CSVLoadMixin, BaseDataset):
def __init__(self, cache_dir=None):
super().__init__(dataset_name="multifiles", cache_dir=cache_dir)


def test_load_csv_dataset():
input_df = pd.DataFrame({
@@ -35,7 +41,7 @@ def test_load_csv_dataset():

config = dict(
version=1.0,
download_url='file://' + archive_filename,
download_urls=['file://' + archive_filename],
csv_filename=extracted_filename,
)

@@ -51,3 +57,75 @@

assert dataset.is_downloaded()
assert dataset.is_processed()

@pytest.mark.parametrize('f_type', SUPPORTED_UNCOMPRESSED_FILETYPES)
def test_multifile_join_dataset(f_type):
    if f_type != 'jsonl':
train_df = pd.DataFrame({
'name': ['Raphael', 'Donatello'],
'mask': ['red', 'purple'],
'weapon': ['sai', 'bo staff']
})

test_df = pd.DataFrame({
'name': ['Jack', 'Bob'],
'mask': ['green', 'yellow'],
'weapon': ['knife', 'gun']
})

val_df = pd.DataFrame({
'name': ['Tom'],
'mask': ['pink'],
'weapon': ['stick']
})
else:
        train_df = pd.DataFrame([{'name': 'joe'}, {'mask': 'green'}, {'weapon': 'stick'}])
        test_df = pd.DataFrame([{'name': 'janice'}, {'mask': 'black'}, {'weapon': 'gun'}])
        val_df = pd.DataFrame([{'name': 'sara'}, {'mask': 'pink'}, {'weapon': 'gun'}])

train_filename = 'train.' + f_type
test_filename = 'test.' + f_type
val_filename = 'val.' + f_type
with tempfile.TemporaryDirectory() as source_dir:
train_filepath = os.path.join(source_dir, train_filename)
test_filepath = os.path.join(source_dir, test_filename)
val_filepath = os.path.join(source_dir, val_filename)

if f_type == 'json':
train_df.to_json(train_filepath)
test_df.to_json(test_filepath)
val_df.to_json(val_filepath)
elif f_type == 'jsonl':
train_df.to_json(train_filepath, orient='records', lines=True)
test_df.to_json(test_filepath, orient='records', lines=True)
val_df.to_json(val_filepath, orient='records', lines=True)
else:
            train_df.to_csv(train_filepath, sep='\t', index=False)
            test_df.to_csv(test_filepath, sep='\t', index=False)
            val_df.to_csv(val_filepath, sep='\t', index=False)

config = {
'version': 1.0,
'download_urls': ['file://' + train_filepath, 'file://' + test_filepath, 'file://' + val_filepath],
'split_filenames': {
'train_file': train_filename,
                'test_file': test_filename,
                'val_file': val_filename
            },
            'download_file_type': f_type,
'csv_filename': 'fake.csv',
}

with tempfile.TemporaryDirectory() as tmpdir:
with mock.patch('ludwig.datasets.base_dataset.read_config', return_value=config):
dataset = FakeMultiFileDataset(tmpdir)

assert not dataset.is_downloaded()
assert not dataset.is_processed()

output_df = dataset.load()
assert output_df.shape[0] == train_df.shape[0] + test_df.shape[0] + val_df.shape[0]

assert dataset.is_downloaded()
assert dataset.is_processed()
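
The new parametrized test can be run on its own with a standard pytest keyword filter:

pytest tests/ludwig/datasets/test_datasets.py -k test_multifile_join_dataset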
