## Generate larger versions of the vega datasets

In [None]:
from vega_datasets import data
import pandas as pd
from math import log10
import altair as alt
from pathlib import Path
import json

In [None]:
# Directory that receives every generated dataset file.
outdir = Path("data") / "vega"
# parents=True creates the intermediate "data" directory too, so this works
# from a fresh checkout; exist_ok=True keeps re-runs from failing.
outdir.mkdir(parents=True, exist_ok=True)

### Flights

In [None]:
# Load the base table once: every output size is built by repeating this same
# frame, so reloading it on each iteration is wasted work.
flights_base = data.flights_10k()

for dups in [10, 20, 50, 100, 200]:
    # Target row count (assumes the base table holds 10,000 rows).
    n = 10000 * dups
    # Number of complete three-digit groups in n; selects the "", "k" or "m"
    # size suffix, e.g. 100000 -> "100k" and 1000000 -> "1m".
    n_commas = int(log10(n) // 3)
    suffix = ["", "k", "m"][n_commas]
    filename = "flights_{}{}".format(n // 10 ** (n_commas * 3), suffix)
    print(filename)

    # Build dataframe: stack `dups` copies of the base table. reset_index()
    # swaps the repeated row labels for a fresh 0..n-1 range and keeps the
    # old labels as an "index" column.
    df = pd.concat([flights_base] * dups, axis=0).reset_index()

    # JSON output: the "values" entry of Altair's to_values() result.
    rows = alt.data.to_values(df)["values"]
    with open(outdir / (filename + ".json"), "wt") as f:
        json.dump(rows, f)

    # Parquet output under the same file stem.
    df.to_parquet(outdir / (filename + ".parquet"))

### Seattle Weather

In [None]:
# Load the base table once: every output size is built by repeating this same
# frame, so reloading it on each iteration is wasted work.
weather_base = data.seattle_weather()

for dups in [7, 14, 35, 69, 137, 343, 685, 1369]:
    # Target row count (assumes the base table holds 1,461 rows).
    n = 1461 * dups
    # Number of complete three-digit groups in n; selects the "", "k" or "m"
    # size suffix. The leading number is truncated, not rounded
    # (e.g. 51135 rows -> "51k").
    n_commas = int(log10(n) // 3)
    suffix = ["", "k", "m"][n_commas]
    filename = "seattle_weather_{}{}".format(n // 10 ** (n_commas * 3), suffix)
    print(filename)

    # Build dataframe: stack `dups` copies of the base table. reset_index()
    # swaps the repeated row labels for a fresh 0..n-1 range and keeps the
    # old labels as an "index" column.
    df = pd.concat([weather_base] * dups, axis=0).reset_index()

    # JSON output: the "values" entry of Altair's to_values() result.
    rows = alt.data.to_values(df)["values"]
    with open(outdir / (filename + ".json"), "wt") as f:
        json.dump(rows, f)

    # Parquet output under the same file stem.
    df.to_parquet(outdir / (filename + ".parquet"))

### Movies

In [None]:
# Load the base table once. The original loaded it twice per iteration (the
# first result was discarded immediately), which was pure overhead.
movies_base = data.movies()
# Coerce every title to str up front so all replicated copies share it.
# NOTE(review): presumably the raw Title column mixes strings with
# non-string values, which the parquet writer rejects -- confirm.
movies_base["Title"] = movies_base.Title.map(str)

for dups in [4, 7, 16, 32, 63, 157, 313]:
    # Target row count (assumes the base table holds 3,201 rows).
    n = 3201 * dups
    # Number of complete three-digit groups in n; selects the "", "k" or "m"
    # size suffix. The leading number is truncated, not rounded
    # (e.g. 12804 rows -> "12k").
    n_commas = int(log10(n) // 3)
    suffix = ["", "k", "m"][n_commas]
    filename = "movies_{}{}".format(n // 10 ** (n_commas * 3), suffix)
    print(filename)

    # Build dataframe: stack `dups` copies of the base table. reset_index()
    # swaps the repeated row labels for a fresh 0..n-1 range and keeps the
    # old labels as an "index" column.
    df = pd.concat([movies_base] * dups, axis=0).reset_index()

    # JSON output: the "values" entry of Altair's to_values() result.
    rows = alt.data.to_values(df)["values"]
    with open(outdir / (filename + ".json"), "wt") as f:
        json.dump(rows, f)

    # Parquet output under the same file stem.
    df.to_parquet(outdir / (filename + ".parquet"))