<a href="https://colab.research.google.com/github/woncoh1/opendata/blob/main/excels2arrows.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [None]:
import ast
import os
import re
import statistics
from toolz import curry
from typing import Dict, Union

import pandas as pd
import ipywidgets as widgets
import ipywidgets as widgets
from IPython.display import clear_output

Colab-specific imports:

In [None]:
from google.colab import auth, data_table
from google.auth import default
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive, GoogleDriveFile
from oauth2client.client import GoogleCredentials

# Define constants

In [None]:
# Google Drive folder IDs. Both are left empty here; set them to real folder
# IDs before running the conversion and inspection cells.
# Folder whose files are imported as Excel workbooks.
SOURCE_FOLDER_ID = ''
# Folder that receives the exported Arrow files and is listed for inspection.
TARGET_FOLDER_ID = ''

# Define functions

In [None]:
def list_files(
    dir: str,
) -> Dict[str, GoogleDriveFile]:
    """Map file titles to the Drive files returned for a folder query.

    Relies on the module-level ``drive`` client.

    Args:
        dir: ID of the Google Drive folder to list.

    Returns:
        Dictionary keyed by file title. Trashed files are excluded by the
        query; when several files share a title, the last one listed wins.
    """
    listing = drive.ListFile({'q': f"parents='{dir}' and trashed=false"})
    by_title = {}
    for entry in listing.GetList():
        by_title[entry['title']] = entry
    return by_title

In [None]:
def import_excel(
    gdf: GoogleDriveFile,
    sheet_name=0,
    header=0,
    names=None,
    dtype=None,
) -> Union[dict, pd.DataFrame]:
    """Import an Excel file from Google Drive into memory.

    The file is downloaded to the working directory under its Drive title,
    parsed with pandas, and the local copy is removed again, even when
    parsing fails.

    Args:
        gdf: Drive file to download and parse.
        sheet_name: Passed through to ``pd.read_excel``.
        header: Passed through to ``pd.read_excel``.
        names: Passed through to ``pd.read_excel``.
        dtype: Passed through to ``pd.read_excel``.

    Returns:
        Whatever ``pd.read_excel`` returns for the given ``sheet_name``:
        a single dataframe, or a dict of dataframes.
    """
    filename = gdf['title']
    try:
        # Google Drive -> Colab disk
        gdf.GetContentFile(filename)
        # Colab disk -> Colab RAM
        return pd.read_excel(
            filename,
            sheet_name=sheet_name,
            header=header,
            names=names,
            dtype=dtype,
        )
    finally:
        # Clean up Colab disk. os.remove is used instead of a shell `rm` so
        # that titles containing spaces or shell metacharacters are handled
        # safely. The existence check covers a download that failed before
        # creating the file.
        if os.path.exists(filename):
            os.remove(filename)

In [None]:
def import_arrow(
    gdf: GoogleDriveFile,
) -> pd.DataFrame:
    """Import an Arrow (Feather) file from Google Drive into memory.

    The file is downloaded to the working directory under its Drive title,
    read with ``pd.read_feather``, and the local copy is removed again, even
    when reading fails.

    Args:
        gdf: Drive file to download and read.

    Returns:
        The dataframe stored in the file.
    """
    filename = gdf['title']
    try:
        # Google Drive -> Colab disk
        gdf.GetContentFile(filename)
        # Colab disk -> Colab RAM
        return pd.read_feather(filename)
    finally:
        # Clean up Colab disk. os.remove is used instead of a shell `rm` so
        # that titles containing spaces or shell metacharacters are handled
        # safely. The existence check covers a download that failed before
        # creating the file.
        if os.path.exists(filename):
            os.remove(filename)

In [None]:
def export_dataframe(
    df: pd.DataFrame,
    folder_id: str,
    filename: str,
) -> None:
    """Write a dataframe to a local file and upload it to Google Drive.

    The format is chosen from the final extension of ``filename``: ``.arrow``
    is written with ``DataFrame.to_feather``, anything else with
    ``DataFrame.to_csv``. The upload uses the module-level ``drive`` client.
    The local file is not removed afterwards.

    Args:
        df: Dataframe to export.
        folder_id: ID of the Drive folder that becomes the file's parent.
        filename: Local filename, also used as the Drive title.
    """
    # Look only at the last extension so that dotted names such as
    # 'sales.2020.arrow' are still recognised as Arrow files.
    extension = os.path.splitext(filename)[1]
    # Colab RAM -> Colab disk
    # https://arrow.apache.org/docs/r/reference/write_feather.html
    if extension == '.arrow':
        df.to_feather(filename)
    else:
        df.to_csv(filename)
    # Colab disk -> Google Drive
    uploaded = drive.CreateFile({
        'title': filename,
        'parents': [{
            'id': folder_id,
            'kind': 'drive#parentReference',
            'isRoot': 'False',
        }],
    })
    uploaded.SetContentFile(filename)
    uploaded.Upload()

In [None]:
def export_filename(import_filename: str):
    """Build the Arrow filename that corresponds to an imported file.

    Only the text before the first dot is kept as the stem, so
    'report.v2.xlsx' becomes 'report.arrow'.
    """
    stem, _, _ = import_filename.partition('.')
    return f'{stem}.arrow'

# Set options

In [None]:
# Show floating-point values with 2 decimal places when pandas renders data.
pd.set_option('display.precision', 2)

# Authenticate drive

In [None]:
# Google Colab: authenticate the notebook user
auth.authenticate_user()
# Google Drive: build a PyDrive client from the application default
# credentials. `drive` is the module-level client used by list_files and
# export_dataframe.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Convert Excel to Arrow
- Import Excel file as dataframe
- Export dataframe as arrow file

In [None]:
# Convert every file listed in the source folder: read it as an Excel
# workbook (first row as header, all columns as object dtype) and upload the
# resulting dataframe to the target folder under its Arrow filename.
for title, gdf in list_files(SOURCE_FOLDER_ID).items():
    export_dataframe(
        df=import_excel(gdf, header=0, dtype=object),
        folder_id=TARGET_FOLDER_ID,
        filename=export_filename(title),
    )

# Inspect Arrow

In [None]:
def on_change(change):
    """Refresh the preview when the dropdown reports a change.

    Args:
        change: Change notification passed by the widget; unused.
    """
    display_head()

def display_head():
    """Redraw the dropdown and show the Arrow table it currently selects.

    Reads the module-level widget ``w`` and the ``output_files`` mapping.
    """
    clear_output(wait=True)
    display(w)
    # With no files to choose from the dropdown has no selection, so there is
    # nothing to download and display.
    if w.value is None:
        return
    df = import_arrow(output_files[w.value])
    display(df)
    # DataFrame.info() prints its summary itself and returns None; wrapping
    # it in display() would add a stray "None" to the output.
    df.info()
    display(df.describe())

# Files currently in the target folder, keyed by title.
output_files: dict = list_files(TARGET_FOLDER_ID)

# Dropdown for picking which exported table to preview.
w = widgets.Dropdown(
    options=sorted(output_files),
    description='Table:',
)

# Observe only the `value` trait. Without `names`, the handler fires for
# every trait that changes on a selection, and each call downloads the file
# again.
w.observe(on_change, names='value')

# Show the initially selected table.
display_head()