Skip to content

Commit

Permalink
[Python] lance.write_dataset takes pandas DataFrame (milvus-io#342)
Browse files Browse the repository at this point in the history
Write_dataset takes pandas dataframe
  • Loading branch information
eddyxu committed Dec 1, 2022
1 parent 8626d59 commit 2c77acc
Showing 1 changed file with 15 additions and 8 deletions.
23 changes: 15 additions & 8 deletions python/lance/__init__.py
Expand Up @@ -12,23 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from pathlib import Path
from typing import Optional, Union
import os

import pandas
import pyarrow as pa
import pyarrow.fs
import pyarrow.dataset as ds
import pyarrow.fs

from . import version

__version__ = version.__version__

from lance.lib import (
FileSystemDataset,
LanceFileFormat,
_lance_dataset_write,
_lance_dataset_make,
FileSystemDataset,
_lance_dataset_write,
)
from lance.types import register_extension_types

Expand Down Expand Up @@ -84,20 +85,21 @@ def dataset(


def write_dataset(
data: Union[pa.Table, pa.dataset.Dataset],
data: Union[pa.Table, pa.dataset.Dataset, pandas.DataFrame, pandas.DataFrame],
base_dir: Union[str, Path],
mode: str = "create",
filesystem: Optional[pa.fs.FileSystem] = None,
max_rows_per_file: int = 0,
min_rows_per_group: int = 0,
max_rows_per_group: int = 1024,
schema: Optional[pa.Schema] = None,
**kwargs,
):
"""Write a dataset.
Parameters
----------
data : pyarrow.Table or pyarrow.dataset.Dataset
data : pyarrow.Table, pyarrow.dataset.Dataset, pandas.DataFrame
The data to write.
base_dir : str or Path
The root directory to write dataset to.
Expand All @@ -107,7 +109,7 @@ def write_dataset(
The filesystem to write the dataset
max_rows_per_file : int
Maximum number of rows per file. If greater than 0 then this will limit how many rows
are placed in any single file. Otherwise there will be no limit and one file will be
are placed in any single file. Otherwise, there will be no limit and one file will be
created in each output directory unless files need to be closed to respect max_open_files.
min_rows_per_group : int
Minimum number of rows per group. When the value is greater than 0,
Expand All @@ -117,7 +119,10 @@ def write_dataset(
Maximum number of rows per group. If the value is greater than 0,
then the dataset writer may split up large incoming batches into multiple row groups.
If this value is set, then min_rows_per_group should also be set.
Otherwise it could end up with very small row groups.
Otherwise, it could end up with very small row groups.
schema : pyarrow.Schema, optional
The expected schema of the pandas DataFrame.
This can be used to indicate the type of columns if we cannot infer it automatically.
"""
from pyarrow.fs import _resolve_filesystem_and_path

Expand All @@ -126,6 +131,8 @@ def write_dataset(
"Max rows per file must be larger or equal to max_rows_per_group"
)

if isinstance(data, pandas.DataFrame):
data = pa.Table.from_pandas(data, schema=schema)
if isinstance(data, pa.Table):
data = pa.dataset.InMemoryDataset(data)

Expand Down

0 comments on commit 2c77acc

Please sign in to comment.