### 07 - Pandas Aggregate

In [1]:
import numpy as np
import pandas as pd
import string

In [2]:
# Quick NumPy refresher.
# So far we have used real-world data, which is massive and messy,
# so today we generate a small, simple dataset instead.
# np.arange yields the 100 integers 0..99 as a one-dimensional array;
# reshaping turns it into a 10-by-10 two-dimensional array (10 rows of 10 integers each).

x = np.arange(0, 100).reshape((10, 10))
x

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [3]:
# string.ascii_uppercase is a str holding the 26 uppercase letters.
# Keep only its first 10 letters and split them into a list of single characters.
# These 10 letters become the column names of our DataFrame.

y = list(string.ascii_uppercase[:10])
y

['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']

In [4]:
# Build a DataFrame from the matrix x, using the letters in y as column names
df = pd.DataFrame(data=x, columns=y)
df

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49
5,50,51,52,53,54,55,56,57,58,59
6,60,61,62,63,64,65,66,67,68,69
7,70,71,72,73,74,75,76,77,78,79
8,80,81,82,83,84,85,86,87,88,89
9,90,91,92,93,94,95,96,97,98,99


In [5]:
# Use the describe() method to produce summary statistics for every column:
# count, mean, std, min, the 25% / 50% / 75% quartiles, and max.
df.describe()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0
std,30.276504,30.276504,30.276504,30.276504,30.276504,30.276504,30.276504,30.276504,30.276504,30.276504
min,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0
25%,22.5,23.5,24.5,25.5,26.5,27.5,28.5,29.5,30.5,31.5
50%,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0
75%,67.5,68.5,69.5,70.5,71.5,72.5,73.5,74.5,75.5,76.5
max,90.0,91.0,92.0,93.0,94.0,95.0,96.0,97.0,98.0,99.0


In [6]:
# The summary statistics do not include the sum of each column or each row.
# For that we use the aggregate() method.


In [7]:
# By default, aggregate() works down the rows (axis=0 / "index"),
# so the result has one value per column: here, the sum of each column A..J.

df.aggregate("sum")

A    450
B    460
C    470
D    480
E    490
F    500
G    510
H    520
I    530
J    540
dtype: int64

In [59]:
# We can also aggregate across the columns: axis="columns" gives one sum per row.
# BTW, agg is an alias for aggregate.

df.agg("sum",axis="columns")

0     45
1    145
2    245
3    345
4    445
5    545
6    645
7    745
8    845
9    945
dtype: int64

In [63]:
# You can aggregate using multiple functions by passing a list of function names.
# The result has one row per function and one column per original column.

df.agg(["sum", "median","std"])

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
sum,450.0,460.0,470.0,480.0,490.0,500.0,510.0,520.0,530.0,540.0
median,45.0,46.0,47.0,48.0,49.0,50.0,51.0,52.0,53.0,54.0
std,30.276504,30.276504,30.276504,30.276504,30.276504,30.276504,30.276504,30.276504,30.276504,30.276504


In [13]:
# A smaller example: three rows of numbers followed by one row that is entirely NaN.
# The missing values make every column float-typed.

df = pd.DataFrame(
    data=[[1, 2, 3], [4, 5, 6], [7, 8, 9], [np.nan] * 3],
    columns=list("ABC"),
)
df

Unnamed: 0,A,B,C
0,1.0,2.0,3.0
1,4.0,5.0,6.0
2,7.0,8.0,9.0
3,,,


In [19]:
# count is 3 (not 4) for every column: describe() leaves the all-NaN row out.
df.describe()

Unnamed: 0,A,B,C
count,3.0,3.0,3.0
mean,4.0,5.0,6.0
std,3.0,3.0,3.0
min,1.0,2.0,3.0
25%,2.5,3.5,4.5
50%,4.0,5.0,6.0
75%,5.5,6.5,7.5
max,7.0,8.0,9.0


In [26]:
# NaN values are skipped here as well: e.g. the sum of column A is 1 + 4 + 7 = 12.
df.aggregate(["min", "max", "sum", "mean"])

Unnamed: 0,A,B,C
min,1.0,2.0,3.0
max,7.0,8.0,9.0
sum,12.0,15.0,18.0
mean,4.0,5.0,6.0


In [25]:
# Row-wise minimum; row 3 contains only NaN, so its result is NaN.
df.agg("min", axis="columns")

0    1.0
1    4.0
2    7.0
3    NaN
dtype: float64

In [75]:
# Here is another example using random numbers.
# Draw 100 random integers between 0 and 99 (inclusive) from a *seeded* generator,
# so that Restart & Run All reproduces exactly the same matrix every time.
# Then reshape the one-dimensional NumPy array into a 20-by-5 matrix (20 rows, 5 columns).
# NOTE: the outputs saved in this notebook for x (and for the cells that use it)
# came from an earlier unseeded run; re-run the cells to refresh them.

rng = np.random.default_rng(42)  # fixed seed -> reproducible draws
x = rng.integers(0, 100, size=100).reshape(20, 5)  # upper bound 100 is exclusive
x

array([[79, 80, 24, 28, 57],
       [ 1, 31, 37, 97,  7],
       [27, 64, 58, 65, 35],
       [ 1, 25, 15, 73, 60],
       [ 2, 86, 35, 47, 19],
       [44, 84, 56, 53, 66],
       [13, 72, 32, 29, 94],
       [11, 17, 33, 74, 65],
       [87, 75, 44,  4, 95],
       [16, 34, 46, 72, 16],
       [34,  6, 85,  9,  0],
       [87, 60, 72, 42, 60],
       [28, 56, 53,  6, 40],
       [11, 72, 29, 40, 90],
       [97, 20, 98, 77,  8],
       [98,  3, 20, 55, 80],
       [21, 23, 26,  0, 48],
       [39, 79, 76,  1, 62],
       [75,  8, 31, 35, 62],
       [44, 80, 60, 70, 61]])

In [69]:
# Get the 26 uppercase letters (as a single str) from the string module

y = string.ascii_uppercase
y

'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [71]:
# Keep only the first five letters, then split them into a list of single characters

y = list(y[:5])
y

['A', 'B', 'C', 'D', 'E']

In [77]:
# Build a DataFrame from the random matrix x, labelling its columns with the letters in y

df = pd.DataFrame(data=x, columns=y)
df

Unnamed: 0,A,B,C,D,E
0,79,80,24,28,57
1,1,31,37,97,7
2,27,64,58,65,35
3,1,25,15,73,60
4,2,86,35,47,19
5,44,84,56,53,66
6,13,72,32,29,94
7,11,17,33,74,65
8,87,75,44,4,95
9,16,34,46,72,16


In [None]:
# Calculate summary statistics 

In [78]:
# Every column reports count 20: the frame has 20 rows and no missing values.
df.describe()

Unnamed: 0,A,B,C,D,E
count,20.0,20.0,20.0,20.0,20.0
mean,40.75,48.75,46.5,43.85,51.25
std,34.013736,29.784975,22.816199,29.367499,29.134669
min,1.0,3.0,15.0,0.0,0.0
25%,12.5,22.25,30.5,23.25,31.0
50%,31.0,58.0,40.5,44.5,60.0
75%,76.0,76.0,58.5,70.5,65.25
max,98.0,86.0,98.0,97.0,95.0


In [None]:
# Exercise: try the pandas aggregate() method on df here