# Working with groups of rows of a data frame

In [None]:
using DataFrames
using CSV
using Arrow
using Statistics
using FreqTables
using Pipe

using Test

# Adjust table display via the LINES / COLUMNS environment variables
ENV["LINES"] = 15
ENV["COLUMNS"] = 200

# auto2.csv is expected to exist once Chapter 3 has been run
@test isfile("auto2.csv")

# Read the CSV file and materialize it as a data frame
df = DataFrame(CSV.File("auto2.csv"))

In [None]:
# The Arrow version of the data set must be on disk as well
@test isfile("auto2.arrow")
df2 = DataFrame(Arrow.Table("auto2.arrow"))

In [None]:
# The CSV-based and the Arrow-based data frames must hold the same data.
# A plain `==` comparison is used because neither table contains missing values.
@test df == df2

In general, `Arrow.jl` is the preferred way to store data frames; see the [Arrow.jl blog post](https://bkamins.github.io/julialang/2020/11/06/arrow.html).

Notice that the data frame loaded from the Arrow file uses Arrow's own `AbstractVector` type for its columns:

In [None]:
# The :mpg column of the Arrow-backed data frame is an `Arrow.Primitive`,
# not a plain `Vector`, whether it is reached as a property or via `!` indexing
@test df2.mpg isa Arrow.Primitive{Float64,Vector{Float64}}
@test df2[!, :mpg] isa Arrow.Primitive{Float64,Vector{Float64}}
@test !isa(df2.mpg, Vector)

# Turn the underlying `Arrow.Primitive` into a `Vector` by copying the data frame
df3 = copy(df2)
@test df3[!, :mpg] isa Vector

Group the data frame by the `:brand` column:

In [None]:
# Group the rows by a single column.
# Grouping by several columns takes a vector of column names instead, e.g.
#   gdf = groupby(df, [:brand, :cylinders])
#   gdf[("chevrolet", 4)]
gdf = groupby(df, :brand)

In [None]:
# Look up one group by its key; the key is a 1-tuple because gdf is grouped by :brand only
gdf[("ford",)]

In [None]:
# Aggregate per group: hand each group's :mpg column to mean() and
# store the result in a column named "MPG Average"
brand_mpg = combine(gdf, :mpg => mean => "MPG Average")

In [None]:
# Raise the display height to 50 lines before showing the sorted table
ENV["LINES"] = 50
# Sort the data frame in place by average MPG, from high to low
sort!(brand_mpg, "MPG Average"; rev=true)

In [None]:
# Check data consistency with a brand-by-origin frequency table.
# All cars of the same brand must share a single :origin value;
# for example, the :origin of every Ford must be 1.0.
freqtable(df, :brand, :origin)

In [None]:
# Count the distinct :origin values found within each brand
gdf = groupby(df, :brand)
origin_brand = combine(gdf, :origin => (length ∘ unique) => "uniqueness")

# With @pipe, the two statements above can be combined into a single pipeline
origin_brand = @pipe df |>
    groupby(_, :brand) |>
    combine(_, :origin => (length ∘ unique) => "uniqueness")


In [None]:
# Every brand must have exactly one distinct :origin, so both the smallest
# and the largest count have to be 1
@test extrema(origin_brand.uniqueness) == (1, 1)