In [1]:
import IJulia
import Base64

# The julia kernel has built in support for Revise.jl, so this is the 
# recommended approach for long-running sessions:
# https://github.com/JuliaLang/IJulia.jl/blob/9b10fa9b879574bbf720f5285029e07758e50a5e/src/kernel.jl#L46-L51

# Users should enable revise within .julia/config/startup_ijulia.jl:
# https://timholy.github.io/Revise.jl/stable/config/#Using-Revise-automatically-within-Jupyter/IJulia-1

# clear console history
IJulia.clear_history()

fig_width = 7
fig_height = 5
fig_format = :retina
fig_dpi = 96

# no retina format type, use svg for high quality type/marks
if fig_format == :retina
  fig_format = :svg
elseif fig_format == :pdf
  fig_dpi = 96
  # Enable PDF support for IJulia
  IJulia.register_mime(MIME("application/pdf"))
end

# convert inches to pixels
fig_width = fig_width * fig_dpi
fig_height = fig_height * fig_dpi

# Intialize Plots w/ default fig width/height
try
  import Plots

  # Plots.jl doesn't support PDF output for versions < 1.28.1
  # so use png (if the DPI remains the default of 300 then set to 96)
  if (Plots._current_plots_version < v"1.28.1") & (fig_format == :pdf)
    Plots.gr(size=(fig_width, fig_height), fmt = :png, dpi = fig_dpi)
  else
    Plots.gr(size=(fig_width, fig_height), fmt = fig_format, dpi = fig_dpi)
  end
catch e
  # @warn "Plots init" exception=(e, catch_backtrace())
end

# Initialize CairoMakie with default fig width/height
try
  import CairoMakie

  # CairoMakie's display() in PDF format opens an interactive window
  # instead of saving to the ipynb file, so we don't do that.
  # https://github.com/quarto-dev/quarto-cli/issues/7548
  if fig_format == :pdf
    CairoMakie.activate!(type = "png")
  else
    CairoMakie.activate!(type = string(fig_format))
  end
  CairoMakie.update_theme!(resolution=(fig_width, fig_height))
catch e
    # @warn "CairoMakie init" exception=(e, catch_backtrace())
end
  
# Set run_path if specified
try
  run_path = "L1VzZXJzL2FuZHJldy9HaXRIdWIvc2l0ZXMvdmlydHVlbGxlYWthZGVtaWUuZ2l0aHViLmlvL3Bvc3RzL2p1bGlhLXRpZGllcg=="
  if !isempty(run_path)
    run_path = String(Base64.base64decode(run_path))
    cd(run_path)
  end
catch e
  @warn "Run path init:" exception=(e, catch_backtrace())
end


# emulate old Pkg.installed beahvior, see
# https://discourse.julialang.org/t/how-to-use-pkg-dependencies-instead-of-pkg-installed/36416/9
import Pkg
function isinstalled(pkg::String)
  any(x -> x.name == pkg && x.is_direct_dep, values(Pkg.dependencies()))
end

# ojs_define
if isinstalled("JSON") && isinstalled("DataFrames")
  import JSON, DataFrames
  global function ojs_define(; kwargs...)
    convert(x) = x
    convert(x::DataFrames.AbstractDataFrame) = Tables.rows(x)
    content = Dict("contents" => [Dict("name" => k, "value" => convert(v)) for (k, v) in kwargs])
    tag = "<script type='ojs-define'>$(JSON.json(content))</script>"
    IJulia.display(MIME("text/html"), tag)
  end
elseif isinstalled("JSON")
  import JSON
  global function ojs_define(; kwargs...)
    content = Dict("contents" => [Dict("name" => k, "value" => v) for (k, v) in kwargs])
    tag = "<script type='ojs-define'>$(JSON.json(content))</script>"
    IJulia.display(MIME("text/html"), tag)
  end
else
  global function ojs_define(; kwargs...)
    @warn "JSON package not available. Please install the JSON.jl package to use ojs_define."
  end
end


# don't return kernel dependencies (b/c Revise should take care of dependencies)
nothing


In [2]:
# Load required packages
using Tidier
using DataFrames
using Random
using Statistics

# Set random seed for reproducibility
Random.seed!(123)

# Display Julia and package versions
println("Julia version: ", VERSION)
println("Tidier.jl version: v1.2.0")

Julia version: 1.11.5
Tidier.jl version: v1.2.0


In [3]:
# Create a simple dataset for demonstration
students = DataFrame(
    id = 1:100,
    name = ["Student $i" for i in 1:100],
    math_score = rand(60:100, 100),
    science_score = rand(55:100, 100),
    program = rand(["CS", "Math", "Physics"], 100),
    grade_level = rand([1, 2, 3, 4], 100)
)

println("Dataset shape: ", size(students))
first(students, 5)

Dataset shape: (

100, 6)


Row,id,name,math_score,science_score,program,grade_level
Unnamed: 0_level_1,Int64,String,Int64,Int64,String,Int64
1,1,Student 1,81,81,Physics,1
2,2,Student 2,84,57,Physics,4
3,3,Student 3,96,82,Physics,3
4,4,Student 4,67,64,Math,4
5,5,Student 5,81,83,Physics,3


In [4]:
# Add a total score column
students = @mutate(students, total = math_score + science_score)
first(students, 5)

Row,id,name,math_score,science_score,program,grade_level,total
Unnamed: 0_level_1,Int64,String,Int64,Int64,String,Int64,Int64
1,1,Student 1,81,81,Physics,1,162
2,2,Student 2,84,57,Physics,4,141
3,3,Student 3,96,82,Physics,3,178
4,4,Student 4,67,64,Math,4,131
5,5,Student 5,81,83,Physics,3,164


In [5]:
# Filter students with high math scores
high_performers = @filter(students, math_score >= 90)
println("Students with math score >= 90: ", nrow(high_performers))
first(high_performers, 5)

Students with math score >= 90: 26




Row,id,name,math_score,science_score,program,grade_level,total
Unnamed: 0_level_1,Int64,String,Int64,Int64,String,Int64,Int64
1,3,Student 3,96,82,Physics,3,178
2,8,Student 8,98,98,CS,2,196
3,12,Student 12,94,67,CS,4,161
4,22,Student 22,100,83,Physics,3,183
5,23,Student 23,96,82,CS,3,178


In [6]:
# Filter by multiple conditions
cs_seniors = @filter(students, program == "CS" && grade_level == 4)
println("CS seniors: ", nrow(cs_seniors))
first(cs_seniors, 5)

CS seniors: 9




Row,id,name,math_score,science_score,program,grade_level,total
Unnamed: 0_level_1,Int64,String,Int64,Int64,String,Int64,Int64
1,7,Student 7,61,67,CS,4,128
2,12,Student 12,94,67,CS,4,161
3,27,Student 27,63,83,CS,4,146
4,48,Student 48,90,100,CS,4,190
5,52,Student 52,69,85,CS,4,154


In [7]:
# Select specific columns
scores_only = @select(students, id, math_score, science_score, total)
first(scores_only, 5)

Row,id,math_score,science_score,total
Unnamed: 0_level_1,Int64,Int64,Int64,Int64
1,1,81,81,162
2,2,84,57,141
3,3,96,82,178
4,4,67,64,131
5,5,81,83,164


In [8]:
# Select columns using patterns
name_and_scores = @select(students, name, ends_with("score"))
first(name_and_scores, 5)

Row,name,math_score,science_score
Unnamed: 0_level_1,String,Int64,Int64
1,Student 1,81,81
2,Student 2,84,57
3,Student 3,96,82
4,Student 4,67,64
5,Student 5,81,83


In [9]:
# Add calculated columns
students_graded = @mutate(students, 
    average_score = (math_score + science_score) / 2,
    passed = total >= 140
)
first(students_graded, 5)

Row,id,name,math_score,science_score,program,grade_level,total,average_score,passed
Unnamed: 0_level_1,Int64,String,Int64,Int64,String,Int64,Int64,Float64,Bool
1,1,Student 1,81,81,Physics,1,162,81.0,True
2,2,Student 2,84,57,Physics,4,141,70.5,True
3,3,Student 3,96,82,Physics,3,178,89.0,True
4,4,Student 4,67,64,Math,4,131,65.5,False
5,5,Student 5,81,83,Physics,3,164,82.0,True


In [10]:
# Basic summary statistics
summary_stats = @summarize(students,
    avg_math = mean(math_score),
    avg_science = mean(science_score),
    max_total = maximum(total),
    min_total = minimum(total),
    n_students = length(id)
)
summary_stats

Row,avg_math,avg_science,max_total,min_total,n_students
Unnamed: 0_level_1,Float64,Float64,Int64,Int64,Int64
1,80.07,76.96,196,119,100


In [11]:
# Summary by program
program_summary = @chain students begin
    @group_by(program)
    @summarize(
        count = length(id),
        avg_math = mean(math_score),
        avg_science = mean(science_score),
        avg_total = mean(total)
    )
    @arrange(desc(avg_total))
end
program_summary

Row,program,count,avg_math,avg_science,avg_total
Unnamed: 0_level_1,String,Int64,Float64,Float64,Float64
1,Physics,41,81.7073,76.5854,158.293
2,Math,23,78.913,79.2174,158.13
3,CS,36,78.9444,75.9444,154.889


In [12]:
# Summary by grade level
grade_level_summary = @chain students begin
    @group_by(grade_level)
    @summarize(
        n_students = length(id),
        avg_math = round(mean(math_score), digits=1),
        avg_science = round(mean(science_score), digits=1)
    )
    @arrange(grade_level)
end
grade_level_summary

Row,grade_level,n_students,avg_math,avg_science
Unnamed: 0_level_1,Int64,Int64,Float64,Float64
1,1,23,82.9,84.3
2,2,18,80.1,77.6
3,3,33,80.5,73.6
4,4,26,77.0,74.2


In [13]:
# Sort by total score (descending)
top_students = @chain students begin
    @arrange(desc(total))
    @select(name, program, math_score, science_score, total)
    @slice(1:10)
end
println("Top 10 students by total score:")
top_students

Top 10 students by total score:


Row,name,program,math_score,science_score,total
Unnamed: 0_level_1,String,String,Int64,Int64,Int64
1,Student 8,CS,98,98,196
2,Student 72,Physics,98,98,196
3,Student 59,Physics,96,99,195
4,Student 48,CS,90,100,190
5,Student 50,Physics,84,100,184
6,Student 87,Physics,87,97,184
7,Student 22,Physics,100,83,183
8,Student 80,Math,89,94,183
9,Student 64,Physics,83,98,181
10,Student 86,Physics,85,96,181


In [14]:
# First, let's verify the DataFrame exists and has the right columns
if @isdefined(students)
    println("Students DataFrame columns: ", names(students))
    println("Number of rows: ", nrow(students))
else
    println("Students DataFrame not found!")
end

# Use DataFrames.jl functions instead of Tidier.jl for this example
# Filter for upper-level students (grade_level >= 3)
upper_level = filter(row -> row.grade_level >= 3, students)

# Add performance column
upper_level.performance = map(upper_level.total) do t
    if t >= 160
        "Excellent"
    elseif t >= 140
        "Good"
    else
        "Average"
    end
end

# Group and summarize using DataFrames.jl
result = combine(groupby(upper_level, [:program, :performance]), nrow => :count)
sort!(result, [:program, order(:count, rev=true)])

println("\nPerformance distribution for upper-level students:")
result

Students DataFrame columns: 

["id", "name", "math_score", "science_score", "program", "grade_level", "total"]
Number of rows: 100



Performance distribution for upper-level students:


Row,program,performance,count
Unnamed: 0_level_1,String,String,Int64
1,CS,Good,11
2,CS,Average,6
3,CS,Excellent,6
4,Math,Excellent,5
5,Math,Good,5
6,Math,Average,2
7,Physics,Good,9
8,Physics,Excellent,9
9,Physics,Average,6


In [15]:
# Create a DataFrame with some missing values
students_missing = DataFrame(
    id = 1:10,
    name = ["Student $i" for i in 1:10],
    math_score = [85, missing, 92, 78, missing, 88, 95, missing, 82, 90],
    science_score = [78, 85, missing, 82, 88, missing, 92, 85, missing, 87]
)

println("Data with missing values:")
println(students_missing)

# Count missing values
missing_counts = DataFrame(
    math_missing = sum(ismissing.(students_missing.math_score)),
    science_missing = sum(ismissing.(students_missing.science_score))
)
println("\nMissing value counts:")
println(missing_counts)

# Calculate mean, skipping missing values
math_mean = mean(skipmissing(students_missing.math_score))
science_mean = mean(skipmissing(students_missing.science_score))
println("\nMeans (excluding missing): Math = $math_mean, Science = $science_mean")

Data with missing values:
[1m10×4 DataFrame[0m
[1m Row [0m│[1m id    [0m[1m name       [0m[1m math_score [0m[1m science_score [0m
     │[90m Int64 [0m[90m String     [0m[90m Int64?     [0m[90m Int64?        [0m
─────┼──────────────────────────────────────────────
   1 │     1  Student 1           85             78
   2 │     2  Student 2  [90m    missing [0m            85
   3 │     3  Student 3           92 [90m       missing [0m
   4 │     4  Student 4           78             82
   5 │     5  Student 5  [90m    missing [0m            88
   6 │     6  Student 6           88 [90m       missing [0m
   7 │     7  Student 7           95             92
   8 │     8  Student 8  [90m    missing [0m            85
   9 │     9  Student 9           82 [90m       missing [0m
  10 │    10  Student 10          90             87



Missing value counts:
[1m1×2 DataFrame[0m
[1m Row [0m│[1m math_missing [0m[1m science_missing [0m
     │[90m Int64        [0m[90m Int64           [0m
─────┼───────────────────────────────
   1 │            3                3

Means (excluding missing): Math = 87.14285714285714, Science = 85.28571428571429


In [16]:
# Create a simple grades DataFrame
grades = DataFrame(
    id = [1, 2, 3, 4, 5],
    final_grade = ["A", "B", "A", "C", "B"]
)

# Join with students data
students_with_grades = @left_join(students[1:5, :], grades, id)
students_with_grades

Row,id,name,math_score,science_score,program,grade_level,total,final_grade
Unnamed: 0_level_1,Int64,String,Int64,Int64,String,Int64,Int64,String?
1,1,Student 1,81,81,Physics,1,162,A
2,2,Student 2,84,57,Physics,4,141,B
3,3,Student 3,96,82,Physics,3,178,A
4,4,Student 4,67,64,Math,4,131,C
5,5,Student 5,81,83,Physics,3,164,B


In [17]:
# Create wide data
wide_scores = @chain students[1:5, :] begin
    @select(id, name, math_score, science_score)
end

println("Wide format:")
wide_scores

# Convert to long format (using DataFrames stack function)
long_scores = stack(wide_scores, [:math_score, :science_score], 
                    variable_name=:subject, value_name=:score)
println("\nLong format:")
first(long_scores, 10)

Wide format:

Long format:


Row,id,name,subject,score
Unnamed: 0_level_1,Int64,String,String,Int64
1,1,Student 1,math_score,81
2,2,Student 2,math_score,84
3,3,Student 3,math_score,96
4,4,Student 4,math_score,67
5,5,Student 5,math_score,81
6,1,Student 1,science_score,81
7,2,Student 2,science_score,57
8,3,Student 3,science_score,82
9,4,Student 4,science_score,64
10,5,Student 5,science_score,83
