In [None]:
import pandas as pd
import numpy as np
pd

### Student Styling Code

In [None]:
def color_student_column(student):
    """Return the CSS declaration used to highlight a student's name cell.

    Each known student ('Ada', 'Bob', 'Cam', 'Dan') maps to a fixed
    background colour with black text. Any other value yields an empty
    string, which applies no styling to that cell.
    """
    css_by_student = {
        'Ada': 'background-color: lightgreen; color: black;',
        'Bob': 'background-color: lightblue; color: black;',
        'Cam': 'background-color: lightgoldenrodyellow; color: black;',
        'Dan': 'background-color: lightcoral; color: black;',
    }
    try:
        return css_by_student[student]
    except KeyError:
        # Unknown student: leave the cell unstyled.
        return ''

# Single Value Column Example

In [None]:
n_grades = 5

# Seeded generator so the demo table is identical on every Restart & Run All
# (the original drew from the unseeded global NumPy state).
rng = np.random.default_rng(seed=42)

# One wide column of random integer grades per student; `high` is exclusive.
grades_df = (
    pd.DataFrame({
        "Ada": rng.integers(low=87, high=100, size=n_grades),
        "Bob": rng.integers(low=77, high=93, size=n_grades),
        "Cam": rng.integers(low=67, high=85, size=n_grades),
        "Dan": rng.integers(low=60, high=76, size=n_grades),
    })
    .melt(var_name="student", value_name="grade")  # wide -> long: one row per (student, grade)
    .sample(frac=1, random_state=42)  # shuffle rows deterministically
    .reset_index(drop=True)
    .sort_values(["student", "grade"], ascending=[True, False])  # by student, best grade first
)

# Display the long table with the student names and grades colour coded.
(
    grades_df
    .style.map(color_student_column, subset=['student'])
    .background_gradient(subset=['grade'], cmap="RdYlGn", vmin=65, vmax=100)  # add color to grades table
)

In [None]:
# Simplest form: group by student and average the remaining column(s).
grade_means = grades_df.groupby("student").mean()

# Presentation only; the aggregation above is the part being demonstrated.
(
    grade_means
    .reset_index(drop=False)  # 'student' becomes a regular column so it can be styled
    .style
    .map(color_student_column, subset=['student'])  # one colour per student
    .background_gradient(subset=['grade'], cmap="RdYlGn", vmin=65, vmax=100)  # shade by mean grade
    .format(precision=2)  # show two decimals
)

## groupby.agg - <i>canonical w/dict</i>

In [None]:
# Dict form of agg: {column: [functions to apply]}.
grade_means_dict_agg = grades_df.groupby(by=["student"]).agg({"grade": ["mean"]})

# Presentation only; the aggregation above is the part being demonstrated.
(
    grade_means_dict_agg
    .reset_index(drop=False)  # 'student' becomes a regular column so it can be styled
    .style
    .map(color_student_column, subset=['student'])  # one colour per student
    .background_gradient(subset=['grade'], cmap="RdYlGn", vmin=65, vmax=100)  # shade by mean grade
    .format(precision=2)  # show two decimals
)

## groupby.agg - <i>canonical w/parens</i>

In [None]:
# Named-aggregation form: output_column=(input_column, function).
grade_means_named_agg = grades_df.groupby(by=["student"]).agg(grade_mean=("grade", "mean"))

# Presentation only; the aggregation above is the part being demonstrated.
(
    grade_means_named_agg
    .reset_index(drop=False)  # 'student' becomes a regular column so it can be styled
    .style
    .map(color_student_column, subset=['student'])  # one colour per student
    .background_gradient(subset=['grade_mean'], cmap="RdYlGn", vmin=65, vmax=100)  # shade by mean grade
    .format(precision=2)  # show two decimals
)

<br/>

# 2 Value Columns Example

In [None]:
students = ["Ada", "Bob", "Cam", "Dan"]
quarters = list(range(1, 5))

# Seeded generator so the demo table is identical on every Restart & Run All
# (the original drew from the unseeded global NumPy state).
rng = np.random.default_rng(seed=42)

# Per-student sampling bounds as (low, high); `high` is exclusive.
# A table lookup replaces the if/elif chain, so a student without an entry
# raises KeyError instead of silently reusing the previous student's values.
grade_bounds = {"Ada": (90, 101), "Bob": (80, 90), "Cam": (70, 80), "Dan": (65, 72)}
absence_bounds = {"Ada": (0, 2), "Bob": (1, 3), "Cam": (2, 4), "Dan": (3, 5)}

# One record per (student, quarter) with a random grade and absence count.
data = []
for student in students:
    for quarter in quarters:
        grade = int(rng.integers(*grade_bounds[student]))
        absences = int(rng.integers(*absence_bounds[student]))
        data.append([quarter, student, grade, absences])

df = (
    pd.DataFrame(data, columns=["qtr", "student", "grade", "absences"])
    .sort_values(["qtr", "student"])
    .reset_index(drop=True)
)

# Display the table with every column colour coded.
(
    df
    .style.map(color_student_column, subset=['student'])  # color code students
    .background_gradient(subset=['grade'], cmap="PuBuGn", vmin=65, vmax=100)  # color code grades
    # "Greys" is the long-standing Matplotlib name; "Grays" appears to be an
    # alias only available in newer releases (NOTE(review): confirm version).
    .background_gradient(subset=['qtr'], cmap="Greys", vmin=1, vmax=4)  # color code qtr
    .background_gradient(subset=['absences'], cmap="GnBu", vmin=0, vmax=10)  # color code absences
)

# groupby.agg - <i>simplest form</i>

In [None]:
# Simplest form: keep only the key and the value column, then average per student.
qtr_grade_means = df[["student", "grade"]].groupby("student").mean()

# Presentation only; the aggregation above is the part being demonstrated.
(
    qtr_grade_means
    .reset_index(drop=False)  # 'student' becomes a regular column so it can be styled
    .style
    .map(color_student_column, subset=['student'])  # one colour per student
    .background_gradient(subset=['grade'], cmap="PuBuGn", vmin=65, vmax=100)  # shade by mean grade
    .format(precision=2)  # show two decimals
)

# groupby.agg - <i>canonical w/dict</i>

In [None]:
# Dict form of agg: {column: [functions to apply]}.
# The styling below addresses the resulting columns as (column, function) tuples.
qtr_stats_dict_agg = df.groupby(by=["student"]).agg(
    {
        "grade": ["mean", "std"],
        "absences": ["sum"],
    }
)

# Presentation only; the aggregation above is the part being demonstrated.
(
    qtr_stats_dict_agg
    .reset_index(drop=False)  # 'student' becomes a regular column so it can be styled
    .style
    .map(color_student_column, subset=['student'])  # one colour per student
    .background_gradient(subset=[('grade', 'mean')], cmap="PuBuGn", vmin=65, vmax=100)  # shade by mean grade
    .background_gradient(subset=[('grade', 'std')], cmap="Blues", vmin=0, vmax=10)  # shade by grade spread
    .background_gradient(subset=[('absences', 'sum')], cmap="GnBu", vmin=0, vmax=20)  # shade by total absences
    .format(precision=2)  # show two decimals
)

# groupby.agg - <i>canonical w/parens</i>

In [None]:
# Named-aggregation form: output_column=(input_column, function).
qtr_stats_named_agg = df.groupby(by=["student"]).agg(
    grade_mean=("grade", "mean"),
    grade_std=("grade", "std"),
    absences_sum=("absences", "sum"),
)

# Presentation only; the aggregation above is the part being demonstrated.
(
    qtr_stats_named_agg
    .reset_index(drop=False)  # 'student' becomes a regular column so it can be styled
    .style
    .map(color_student_column, subset=['student'])  # one colour per student
    .background_gradient(subset=['grade_mean'], cmap="PuBuGn", vmin=65, vmax=100)  # shade by mean grade
    .background_gradient(subset=['grade_std'], cmap="Blues", vmin=0, vmax=10)  # shade by grade spread
    .background_gradient(subset=['absences_sum'], cmap="GnBu", vmin=0, vmax=20)  # shade by total absences
    .format(precision=2)  # show two decimals
)

<br/>

<br/>

<br/>

<br/>

<br/>

# RAPIDS ON/OFF - 400M Rows

In [1]:
# Load the cudf.pandas IPython extension. It runs before the `import pandas`
# cell that follows, whose output shows pandas wrapped as
# ModuleAccelerator(fast=cudf, slow=pandas).
%load_ext cudf.pandas

In [2]:
import pandas as pd
import numpy as np
pd

<module 'pandas' (ModuleAccelerator(fast=cudf, slow=pandas))>

<br/>

## Host

In [3]:
# Print the host's CPU core count as reported by `nproc`.
!echo $(nproc) "CPU cores"

32 CPU cores


In [4]:
# Print the second field of the second line of `free -g`, followed by "GB"
# (presumably the total-memory figure of the "Mem:" row; confirm on this host).
!free -g | awk 'NR==2 {print $2 " GB"}'

62 GB


<br/>

### GPU

In [5]:
# Show the nvidia-smi line that names the GPU. The grep pattern is hard-coded
# to this machine's card, so it prints nothing on a different GPU model.
!nvidia-smi | grep 'NVIDIA RTX 6000 Ada'

|   0  NVIDIA RTX 6000 Ada Gene...    Off |   00000000:01:00.0 Off |                    0 |


In [6]:
# Show the nvidia-smi line containing "46068MiB" (the memory-usage row in the
# output below). The pattern is hard-coded to this card's memory size.
!nvidia-smi | grep 46068MiB

| 30%   46C    P2             67W /  300W |     870MiB /  46068MiB |      0%      Default |


<br/>

<br/>

# <span style="color: green;">x 13 speedup</span> - 🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️

In [7]:
%%time
# Wall times noted by the author for this cell, presumably with cudf.pandas
# off (CPU) vs. on (GPU), per the "RAPIDS ON/OFF" heading:
# CPU - 39.2  s
# GPU -  3.02 s

# Load the grades CSV into a DataFrame; `pd` here is the cudf.pandas-accelerated module.
grades_df = pd.read_csv("400M_grades.csv")

CPU times: user 5.82 s, sys: 1.79 s, total: 7.61 s
Wall time: 3.03 s


In [8]:
# Peek at one randomly chosen row (no random_state is set, so the row differs between runs).
grades_df.sample(1)

Unnamed: 0,student,exam,quiz
288497474,Dan,75.129105,56.334813


# groupby.agg - <i>simplest form</i>

# <span style="color: green;">x 61 speedup</span> - 🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️

In [9]:
%%time
# Wall times noted by the author for this cell, presumably with cudf.pandas
# off (CPU) vs. on (GPU):
# CPU - 9.12  s
# GPU - 0.148 s

# Simplest form: per-student mean of the remaining columns.
summary = (
    grades_df
    .groupby("student")
    .mean()
)

CPU times: user 96.7 ms, sys: 96.4 ms, total: 193 ms
Wall time: 144 ms


# groupby.agg - <i>canonical w/dict</i>

# <span style="color: green;">x 11 speedup</span> - 🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️🏎️

In [10]:
%%time
# Wall times noted by the author for this cell, presumably with cudf.pandas
# off (CPU) vs. on (GPU):
# CPU - 12.9  s
# GPU -  1.22 s

# Dict form of agg: four statistics for each of the "exam" and "quiz" columns, per student.
summary = (
    grades_df
    .groupby(by=["student"])
    .agg({
        "exam":["mean", "min", "max", "std"],
        "quiz":["mean", "min", "max", "std"],
    })    
)

CPU times: user 1.05 s, sys: 660 ms, total: 1.71 s
Wall time: 1.24 s


# groupby.agg - <i>canonical w/parens</i>

# <span style="color: green;">x 6 speedup</span> - 🏎️🏎️🏎️🏎️🏎️🏎️

In [11]:
%%time
# Wall times noted by the author for this cell, presumably with cudf.pandas
# off (CPU) vs. on (GPU):
# CPU - 12.8  s
# GPU -  1.95 s

# Named-aggregation form: output_column=(input_column, function).
# Computes the same four statistics for "exam" and "quiz" per student,
# with flat, explicitly named output columns.
summary = (
    grades_df
    .groupby(by=["student"])
    .agg(
        exam_mean=("exam", "mean"),
        exam_min=("exam", "min"),
        exam_max=("exam", "max"),
        exam_std=("exam", "std"),

        quiz_mean=("quiz", "mean"),
        quiz_min=("quiz", "min"),
        quiz_max=("quiz", "max"),
        quiz_std=("quiz", "std")
    )    
)

CPU times: user 1.2 s, sys: 1.05 s, total: 2.25 s
Wall time: 1.9 s
