[Python] lance.write_dataset takes pandas DataFrame (milvus-io#342)

Write_dataset takes pandas dataframe
yah01 · Dec 1, 2022 · 2c77acc · 2c77acc
1 parent 8626d59
commit 2c77acc
Showing 1 changed file with 15 additions and 8 deletions.
diff --git a/python/lance/__init__.py b/python/lance/__init__.py
@@ -12,23 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 from pathlib import Path
 from typing import Optional, Union
-import os
 
+import pandas
 import pyarrow as pa
-import pyarrow.fs
 import pyarrow.dataset as ds
+import pyarrow.fs
 
 from . import version
 
 __version__ = version.__version__
 
 from lance.lib import (
+    FileSystemDataset,
     LanceFileFormat,
-    _lance_dataset_write,
     _lance_dataset_make,
-    FileSystemDataset,
+    _lance_dataset_write,
 )
 from lance.types import register_extension_types
 
@@ -84,20 +85,21 @@ def dataset(
 
 
 def write_dataset(
-    data: Union[pa.Table, pa.dataset.Dataset],
+    data: Union[pa.Table, pa.dataset.Dataset, pandas.DataFrame, pandas.DataFrame],
     base_dir: Union[str, Path],
     mode: str = "create",
     filesystem: Optional[pa.fs.FileSystem] = None,
     max_rows_per_file: int = 0,
     min_rows_per_group: int = 0,
     max_rows_per_group: int = 1024,
+    schema: Optional[pa.Schema] = None,
     **kwargs,
 ):
     """Write a dataset.
 
     Parameters
     ----------
-    data : pyarrow.Table or pyarrow.dataset.Dataset
+    data : pyarrow.Table, pyarrow.dataset.Dataset, pandas.DataFrame
         The data to write.
     base_dir : str or Path
         The root directory to write dataset to.
@@ -107,7 +109,7 @@ def write_dataset(
         The filesystem to write the dataset
     max_rows_per_file : int
         Maximum number of rows per file. If greater than 0 then this will limit how many rows
-        are placed in any single file. Otherwise there will be no limit and one file will be
+        are placed in any single file. Otherwise, there will be no limit and one file will be
         created in each output directory unless files need to be closed to respect max_open_files.
     min_rows_per_group : int
         Minimum number of rows per group. When the value is greater than 0,
@@ -117,7 +119,10 @@ def write_dataset(
         Maximum number of rows per group. If the value is greater than 0,
         then the dataset writer may split up large incoming batches into multiple row groups.
         If this value is set, then min_rows_per_group should also be set.
-        Otherwise it could end up with very small row groups.
+        Otherwise, it could end up with very small row groups.
+    schema : pyarrow.Schema, optional
+        The expected schema of the pandas DataFrame.
+        This can be used to indicate the type of columns if we cannot infer it automatically.
     """
     from pyarrow.fs import _resolve_filesystem_and_path
 
@@ -126,6 +131,8 @@ def write_dataset(
             "Max rows per file must be larger or equal to max_rows_per_group"
         )
 
+    if isinstance(data, pandas.DataFrame):
+        data = pa.Table.from_pandas(data, schema=schema)
     if isinstance(data, pa.Table):
         data = pa.dataset.InMemoryDataset(data)