In [1]:
import pandas as pd
import numpy as np
from pandas import Series
from pandas import DataFrame

In [2]:
# Build a Series and a DataFrame whose labels are not in alphabetical order,
# so the effect of sorting by index is visible in the cells that follow.
s1 = Series(range(4), index=["d", "a", "b", "c"])
f1 = DataFrame(
    np.arange(8).reshape(2, 4),
    index=["uno", "tres"],
    columns=["d", "a", "b", "c"],
)
display(s1)
display(f1)

d    0
a    1
b    2
c    3
dtype: int64

Unnamed: 0,d,a,b,c
uno,0,1,2,3
tres,4,5,6,7


In [3]:
# Reorder the Series by its index labels (ascending is the default).
s1.sort_index(ascending=True)

a    1
b    2
c    3
d    0
dtype: int64

In [4]:
# With the default axis (the row index), a DataFrame is sorted by its row labels.
f1.sort_index(axis="index")

Unnamed: 0,d,a,b,c
tres,4,5,6,7
uno,0,1,2,3


In [5]:
# Same row-label sort, but in descending label order.
f1.sort_index(axis="index", ascending=False)

Unnamed: 0,d,a,b,c
uno,0,1,2,3
tres,4,5,6,7


In [6]:
# axis="columns" (axis=1) sorts by the column labels instead of the row labels (axis=0).
f1.sort_index(axis="columns", ascending=True)

Unnamed: 0,a,b,c,d
uno,1,2,3,0
tres,5,6,7,4


In [7]:
# Two integer columns; column "a" contains repeated values to demonstrate tie handling.
f2 = DataFrame(dict(b=[4, -7, -3, 2], a=[0, 1, 1, 0]))
f2

Unnamed: 0,b,a
0,4,0
1,-7,1
2,-3,1
3,2,0


In [8]:
# Sort the rows by column "a". The default algorithm (quicksort) is not guaranteed
# to be stable, so request a stable sort explicitly: rows with equal "a" values
# then keep their original relative order.
f2.sort_values(by="a", kind="stable")

Unnamed: 0,b,a
0,4,0
3,2,0
1,-7,1
2,-3,1


In [9]:
# To control the order of rows that tie on the first key, sort by several columns:
# rows are ordered by "a" first, and ties in "a" are broken by the value in "b".
f2.sort_values(by=["a", "b"], ascending=True)

Unnamed: 0,b,a
3,2,0
0,4,0
1,-7,1
2,-3,1


In [10]:
# A Series with repeated values (7 appears three times, 4 appears twice).
s2 = Series(data=[7, -5, 7, 4, 2, 0, 4, 7])
# Order the entries by value; each value keeps its original index label.
s2.sort_values(ascending=True)

1   -5
5    0
4    2
3    4
6    4
0    7
2    7
7    7
dtype: int64

In [11]:
# Default ranking ("average"): tied values share the mean of the ranks they span,
# e.g. labels 3 and 6 (both 4) each get 4.5, and labels 0, 2, 7 (all 7) each get 7.0.
s2.rank(method="average")

0    7.0
1    1.0
2    7.0
3    4.5
4    3.0
5    2.0
6    4.5
7    7.0
dtype: float64

In [12]:
# "min": every member of a tied group receives the lowest rank of that group.
s2.rank(method="min", ascending=True)

0    6.0
1    1.0
2    6.0
3    4.0
4    3.0
5    2.0
6    4.0
7    6.0
dtype: float64

In [13]:
# Rank from largest to smallest; ties still take the lowest rank of their group.
# Available methods: min, average (default), max, first, dense.
s2.rank(ascending=False, method="min")

0    1.0
1    8.0
2    1.0
3    4.0
4    6.0
5    7.0
6    4.0
7    1.0
dtype: float64

In [14]:
# "first": tied values are ranked in the order in which they appear in the data.
s2.rank(method="first", ascending=True)

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
7    8.0
dtype: float64

In [15]:
# "dense": like "min", but the rank increases by exactly 1 from one group to the next.
s2.rank(method="dense", ascending=True)

0    5.0
1    1.0
2    5.0
3    4.0
4    3.0
5    2.0
6    4.0
7    5.0
dtype: float64

![Summary of the rank tie-breaking methods](methods.png)

In [16]:
# Small mixed int/float frame used for the DataFrame ranking examples.
f3 = DataFrame(dict(b=[4.3, 7, -3, 2], a=[0, 1, 1, 0], c=[-2, 5, 8, -2.5]))
f3

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,1,8.0
3,2.0,0,-2.5


In [17]:
# Rank the values within each column (the default axis), averaging ties.
f3.rank(axis="index", method="average")

Unnamed: 0,b,a,c
0,3.0,1.5,2.0
1,4.0,3.5,3.0
2,1.0,3.5,4.0
3,2.0,1.5,1.0


In [18]:
# Rank across each row instead of down each column.
# For the first row the order is c (lowest), then a, then b.
f3.rank(axis="columns", method="dense")

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,3.0,2.0,1.0


## Descriptive Statistics

In [19]:
# Seed NumPy's legacy global RNG so the random frame is reproducible across runs.
np.random.seed(3571)
# 64 standard-normal draws arranged as an 8x8 frame with default integer labels.
f4 = DataFrame(data=np.random.standard_normal(size=64).reshape(8, 8))
f4

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.075195,0.804493,1.285172,-0.748446,-0.697122,-1.251434,0.893521,0.366261
1,-1.796313,-1.514583,-1.819609,0.619147,-0.573138,0.099955,0.683461,-0.183118
2,-0.216383,0.352661,0.854963,1.773649,-0.027573,1.608783,0.940662,-0.189553
3,-1.491702,-0.627679,0.928926,-1.077908,1.04737,-0.841811,0.903555,-0.749431
4,0.483074,-0.127705,0.273208,0.946763,-1.169801,1.562638,2.040584,-0.367932
5,0.718237,-0.037154,2.698077,-0.872232,-0.71363,0.820188,1.615771,0.459943
6,-0.650003,-1.462323,-0.304525,0.241432,0.268977,0.023715,0.28318,1.060606
7,1.461383,0.338723,1.406254,-0.971697,0.753918,-2.706855,1.585571,1.075724


## Exercise: apply each of the following operations along both the rows and the columns of the DataFrame

1. count()

1. describe()

1. min, max

1. argmin, argmax

1. idxmin, idxmax

1. quantile

1. sum

1. mean

1. median

1. mod

1. prod

1. var

1. std

1. skew

1. kurt

1. cumsum

1. cummin, cummax

1. cumprod

1. diff

1. pct_change

In [22]:
# count(): number of non-missing values in each column.
f4.count(axis="index")

0    8
1    8
2    8
3    8
4    8
5    8
6    8
7    8
dtype: int64

In [21]:
# count() along the columns: number of non-missing values in each row.
f4.count(axis="columns")

0    8
1    8
2    8
3    8
4    8
5    8
6    8
7    8
dtype: int64

In [24]:
# describe(): per-column summary statistics (count, mean, std, min, quartiles, max).
# The quartiles listed here are the default percentiles.
f4.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,0,1,2,3,4,5,6,7
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,0.197936,-0.284196,0.665308,-0.011161,-0.138875,-0.085603,1.118288,0.184063
std,1.601316,0.85224,1.333094,1.063006,0.781607,1.476311,0.577078,0.667962
min,-1.796313,-1.514583,-1.819609,-1.077908,-1.169801,-2.706855,0.28318,-0.749431
25%,-0.860428,-0.83634,0.128775,-0.897098,-0.701249,-0.944217,0.841006,-0.234148
50%,0.133346,-0.08243,0.891944,-0.253507,-0.300356,0.061835,0.922108,0.091571
75%,0.904023,0.342208,1.315443,0.701051,0.390212,1.0058,1.593121,0.610109
max,3.075195,0.804493,2.698077,1.773649,1.04737,1.608783,2.040584,1.075724


In [26]:
# Smallest value in each column.
f4.min(axis="index")

0   -1.796313
1   -1.514583
2   -1.819609
3   -1.077908
4   -1.169801
5   -2.706855
6    0.283180
7   -0.749431
dtype: float64

In [27]:
# Smallest value in each row.
f4.min(axis="columns")

0   -1.251434
1   -1.819609
2   -0.216383
3   -1.491702
4   -1.169801
5   -0.872232
6   -1.462323
7   -2.706855
dtype: float64

In [28]:
# Largest value in each column.
f4.max(axis="index")

0    3.075195
1    0.804493
2    2.698077
3    1.773649
4    1.047370
5    1.608783
6    2.040584
7    1.075724
dtype: float64

In [29]:
# Largest value in each row.
f4.max(axis="columns")

0    3.075195
1    0.683461
2    1.773649
3    1.047370
4    2.040584
5    2.698077
6    1.060606
7    1.585571
dtype: float64

In [32]:
# argmin(): integer position of the smallest value in a Series (here, column 0 of f4).
f4[0].argmin()

1

In [35]:
# argmax(): integer position of the largest value in column 0 of f4.
f4[0].argmax()

0

In [36]:
# idxmin(): index *label* of the smallest value in column 0. With the default
# integer index the label coincides with the position reported by argmin().
f4[0].idxmin(axis=0)

1

In [37]:
# idxmax(): index *label* of the largest value in column 0.
f4[0].idxmax(axis=0)

0

In [39]:
# quantile(): value at quantile q along the requested axis.
# The defaults are q=0.5 (the median) and axis 0, i.e. one value per column.
f4.quantile(q=0.5, axis="index")

0    0.133346
1   -0.082430
2    0.891944
3   -0.253507
4   -0.300356
5    0.061835
6    0.922108
7    0.091571
Name: 0.5, dtype: float64

In [40]:
# Median (q=0.5) of each row.
f4.quantile(q=0.5, axis="columns")

0    0.585377
1   -0.378128
2    0.603812
3   -0.688555
4    0.378141
5    0.589090
6    0.132574
7    0.914821
Name: 0.5, dtype: float64

In [42]:
# Sum of each column.
f4.sum(axis="index")

0    1.583488
1   -2.273567
2    5.322466
3   -0.089292
4   -1.111001
5   -0.684822
6    8.946306
7    1.472500
dtype: float64

In [43]:
# Sum of each row.
f4.sum(axis="columns")

0    3.727639
1   -4.484199
2    5.097209
3   -1.908681
4    3.640829
5    4.689200
6   -0.538940
7    2.943020
dtype: float64

In [44]:
# Mean of each column (axis 0 is the default).
f4.mean(axis="index")

0    0.197936
1   -0.284196
2    0.665308
3   -0.011161
4   -0.138875
5   -0.085603
6    1.118288
7    0.184063
dtype: float64

In [45]:
# Mean of each row.
f4.mean(axis="columns")

0    0.465955
1   -0.560525
2    0.637151
3   -0.238585
4    0.455104
5    0.586150
6   -0.067368
7    0.367878
dtype: float64

In [46]:
# Median of each column (same values as quantile(q=0.5) down the columns).
f4.median(axis="index")

0    0.133346
1   -0.082430
2    0.891944
3   -0.253507
4   -0.300356
5    0.061835
6    0.922108
7    0.091571
dtype: float64

In [47]:
# Median of each row.
f4.median(axis="columns")

0    0.585377
1   -0.378128
2    0.603812
3   -0.688555
4    0.378141
5    0.589090
6    0.132574
7    0.914821
dtype: float64

In [49]:
# mod(): element-wise modulo of the frame and `other` (the method form of `%`).
# The result takes the sign of the divisor, so with other=1 negative inputs
# are mapped into [0, 1) as well (e.g. -0.748446 -> 0.251554).
f4.mod(1, axis="index")

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.075195,0.804493,0.285172,0.251554,0.302878,0.748566,0.893521,0.366261
1,0.203687,0.485417,0.180391,0.619147,0.426862,0.099955,0.683461,0.816882
2,0.783617,0.352661,0.854963,0.773649,0.972427,0.608783,0.940662,0.810447
3,0.508298,0.372321,0.928926,0.922092,0.04737,0.158189,0.903555,0.250569
4,0.483074,0.872295,0.273208,0.946763,0.830199,0.562638,0.040584,0.632068
5,0.718237,0.962846,0.698077,0.127768,0.28637,0.820188,0.615771,0.459943
6,0.349997,0.537677,0.695475,0.241432,0.268977,0.023715,0.28318,0.060606
7,0.461383,0.338723,0.406254,0.028303,0.753918,0.293145,0.585571,0.075724


In [50]:
# Same operation with axis=1: `other` is a scalar, so every element is reduced
# independently and the result is identical to the axis=0 call.
f4.mod(1, axis="columns")

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.075195,0.804493,0.285172,0.251554,0.302878,0.748566,0.893521,0.366261
1,0.203687,0.485417,0.180391,0.619147,0.426862,0.099955,0.683461,0.816882
2,0.783617,0.352661,0.854963,0.773649,0.972427,0.608783,0.940662,0.810447
3,0.508298,0.372321,0.928926,0.922092,0.04737,0.158189,0.903555,0.250569
4,0.483074,0.872295,0.273208,0.946763,0.830199,0.562638,0.040584,0.632068
5,0.718237,0.962846,0.698077,0.127768,0.28637,0.820188,0.615771,0.459943
6,0.349997,0.537677,0.695475,0.241432,0.268977,0.023715,0.28318,0.060606
7,0.461383,0.338723,0.406254,0.028303,0.753918,0.293145,0.585571,0.075724


In [51]:
# prod(): product of the values along the requested axis; here, one product per column.
f4.prod(axis="index")

0    0.587652
1   -0.000634
2    0.586276
3    0.171634
4   -0.001953
5   -0.013938
6    0.768405
7    0.001840
dtype: float64

In [52]:
# Product of the values in each row.
f4.prod(axis="columns")

0   -0.679405
1   -0.021976
2   -0.000915
3   -0.559736
4   -0.021900
5   -0.027317
6   -0.000134
7    2.354395
dtype: float64

In [54]:
# Variance of each column; ddof=1 (the pandas default) gives the sample variance.
f4.var(axis="index", ddof=1)

0    2.564214
1    0.726313
2    1.777139
3    1.129981
4    0.610909
5    2.179494
6    0.333019
7    0.446173
dtype: float64

In [55]:
# Variance of each row; ddof=1 (the pandas default) gives the sample variance.
f4.var(axis="columns", ddof=1)

0    1.937650
1    1.077235
2    0.615716
3    1.053178
4    1.096193
5    1.401903
6    0.564799
7    2.236589
dtype: float64

In [56]:
# Standard deviation of each column (ddof=1, the pandas default).
f4.std(axis="index", ddof=1)

0    1.601316
1    0.852240
2    1.333094
3    1.063006
4    0.781607
5    1.476311
6    0.577078
7    0.667962
dtype: float64

In [57]:
# Standard deviation of each row (ddof=1, the pandas default).
f4.std(axis="columns", ddof=1)

0    1.391995
1    1.037899
2    0.784676
3    1.026245
4    1.046992
5    1.184020
6    0.751531
7    1.495523
dtype: float64

In [58]:
# Skewness of each column.
f4.skew(axis="index")

0    0.582866
1   -0.532074
2   -0.582630
3    0.585428
4    0.362487
5   -0.588098
6    0.315385
7    0.208444
dtype: float64

In [59]:
# Skewness of each row.
f4.skew(axis="columns")

0    0.706213
1   -0.143630
2    0.362691
3    0.394253
4    0.068236
5    0.571984
6   -0.618859
7   -1.527706
dtype: float64

In [60]:
# Kurtosis of each column.
f4.kurt(axis="index")

0    0.088989
1   -1.028798
2    1.240861
3   -1.102358
4   -1.211833
5   -0.159256
6   -0.694587
7   -1.250122
dtype: float64

In [61]:
# Kurtosis of each row.
f4.kurt(axis="columns")

0    0.538217
1   -1.881659
2   -1.490540
3   -1.969701
4   -0.497842
5    0.080592
6    1.076701
7    1.793714
dtype: float64

In [62]:
# cumsum(): cumulative sum along an axis; here, a running total down each column.
# The result has the same shape as the input, and its last row equals f4.sum(axis=0).
f4.cumsum(axis="index")

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.075195,0.804493,1.285172,-0.748446,-0.697122,-1.251434,0.893521,0.366261
1,1.278882,-0.71009,-0.534437,-0.129299,-1.270261,-1.15148,1.576982,0.183143
2,1.062499,-0.357429,0.320526,1.64435,-1.297834,0.457303,2.517644,-0.006411
3,-0.429203,-0.985108,1.249452,0.566442,-0.250464,-0.384508,3.421199,-0.755841
4,0.053871,-1.112814,1.52266,1.513205,-1.420265,1.17813,5.461783,-1.123773
5,0.772108,-1.149968,4.220737,0.640973,-2.133896,1.998318,7.077554,-0.66383
6,0.122105,-2.61229,3.916212,0.882405,-1.864919,2.022033,7.360735,0.396776
7,1.583488,-2.273567,5.322466,-0.089292,-1.111001,-0.684822,8.946306,1.4725


In [63]:
# Running total across each row, from the first column to the last.
f4.cumsum(axis="columns")

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.075195,3.879687,5.16486,4.416414,3.719291,2.467857,3.361378,3.727639
1,-1.796313,-3.310896,-5.130505,-4.511358,-5.084497,-4.984542,-4.301081,-4.484199
2,-0.216383,0.136279,0.991242,2.764891,2.737318,4.346101,5.286763,5.097209
3,-1.491702,-2.119382,-1.190456,-2.268364,-1.220994,-2.062805,-1.15925,-1.908681
4,0.483074,0.355369,0.628577,1.57534,0.405539,1.968176,4.008761,3.640829
5,0.718237,0.681083,3.37916,2.506928,1.793297,2.613485,4.229257,4.6892
6,-0.650003,-2.112326,-2.41685,-2.175418,-1.906442,-1.882726,-1.599546,-0.53894
7,1.461383,1.800106,3.20636,2.234663,2.988581,0.281725,1.867296,2.94302


In [64]:
# cummin / cummax: cumulative minimum / maximum along an axis.
# Here: the smallest value seen so far while moving down each column.
f4.cummin(axis="index")

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.075195,0.804493,1.285172,-0.748446,-0.697122,-1.251434,0.893521,0.366261
1,-1.796313,-1.514583,-1.819609,-0.748446,-0.697122,-1.251434,0.683461,-0.183118
2,-1.796313,-1.514583,-1.819609,-0.748446,-0.697122,-1.251434,0.683461,-0.189553
3,-1.796313,-1.514583,-1.819609,-1.077908,-0.697122,-1.251434,0.683461,-0.749431
4,-1.796313,-1.514583,-1.819609,-1.077908,-1.169801,-1.251434,0.683461,-0.749431
5,-1.796313,-1.514583,-1.819609,-1.077908,-1.169801,-1.251434,0.683461,-0.749431
6,-1.796313,-1.514583,-1.819609,-1.077908,-1.169801,-1.251434,0.28318,-0.749431
7,-1.796313,-1.514583,-1.819609,-1.077908,-1.169801,-2.706855,0.28318,-0.749431


In [66]:
# The largest value seen so far while moving across each row.
f4.cummax(axis="columns")

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.075195,3.075195,3.075195,3.075195,3.075195,3.075195,3.075195,3.075195
1,-1.796313,-1.514583,-1.514583,0.619147,0.619147,0.619147,0.683461,0.683461
2,-0.216383,0.352661,0.854963,1.773649,1.773649,1.773649,1.773649,1.773649
3,-1.491702,-0.627679,0.928926,0.928926,1.04737,1.04737,1.04737,1.04737
4,0.483074,0.483074,0.483074,0.946763,0.946763,1.562638,2.040584,2.040584
5,0.718237,0.718237,2.698077,2.698077,2.698077,2.698077,2.698077,2.698077
6,-0.650003,-0.650003,-0.304525,0.241432,0.268977,0.268977,0.28318,1.060606
7,1.461383,1.461383,1.461383,1.461383,1.461383,1.461383,1.585571,1.585571


In [67]:
# cumprod(): cumulative product; here, a running product down each column.
f4.cumprod(axis="index")

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.075195,0.804493,1.285172,-0.748446,-0.697122,-1.251434,0.893521,0.366261
1,-5.524011,-1.218471,-2.338511,-0.463398,0.399548,-0.125087,0.610687,-0.067069
2,1.1953,-0.429707,-1.999341,-0.821905,-0.011017,-0.201237,0.57445,0.012713
3,-1.783032,0.269718,-1.857239,0.885937,-0.011538,0.169404,0.519047,-0.009528
4,-0.861336,-0.034444,-0.507413,0.838773,0.013498,0.264717,1.059159,0.003506
5,-0.618643,0.00128,-1.36904,-0.731604,-0.009632,0.217117,1.711359,0.001612
6,0.40212,-0.001871,0.416906,-0.176633,-0.002591,0.005149,0.484623,0.00171
7,0.587652,-0.000634,0.586276,0.171634,-0.001953,-0.013938,0.768405,0.00184


In [68]:
# Running product across each row, from the first column to the last.
f4.cumprod(axis="columns")

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.075195,2.473972,3.17948,-2.379668,1.65892,-2.076029,-1.854976,-0.679405
1,-1.796313,2.720664,-4.950546,-3.065114,1.756735,0.175594,0.120011,-0.021976
2,-0.216383,-0.07631,-0.065242,-0.115716,0.003191,0.005133,0.004828,-0.000915
3,-1.491702,0.936311,0.869763,-0.937524,-0.981934,0.826603,0.746881,-0.559736
4,0.483074,-0.061691,-0.016855,-0.015957,0.018667,0.029169,0.059523,-0.0219
5,0.718237,-0.026685,-0.071999,0.0628,-0.044816,-0.036758,-0.059392,-0.027317
6,-0.650003,0.950514,-0.289455,-0.069884,-0.018797,-0.000446,-0.000126,-0.000134
7,1.461383,0.495004,0.696102,-0.6764,-0.50995,1.380361,2.188661,2.354395


In [69]:
# diff(): first discrete difference.
# Each element is replaced by its difference from another element of the frame;
# by default that is the element in the previous row, so the first row has no
# predecessor and comes out as NaN.
f4.diff(axis="index")

Unnamed: 0,0,1,2,3,4,5,6,7
0,,,,,,,,
1,-4.871507,-2.319076,-3.104782,1.367592,0.123984,1.351389,-0.21006,-0.549379
2,1.57993,1.867244,2.674572,1.154503,0.545566,1.508828,0.2572,-0.006435
3,-1.27532,-0.980341,0.073963,-2.851557,1.074942,-2.450594,-0.037107,-0.559877
4,1.974776,0.499974,-0.655717,2.02467,-2.217171,2.404449,1.137029,0.381499
5,0.235163,0.090551,2.424868,-1.818995,0.456171,-0.74245,-0.424813,0.827875
6,-1.36824,-1.425169,-3.002601,1.113664,0.982607,-0.796473,-1.332591,0.600662
7,2.111386,1.801046,1.710779,-1.213129,0.484941,-2.730571,1.30239,0.015118


In [70]:
# Difference from the element in the previous column; the first column is NaN.
f4.diff(axis="columns")

Unnamed: 0,0,1,2,3,4,5,6,7
0,,-2.270702,0.48068,-2.033618,0.051323,-0.554312,2.144955,-0.527261
1,,0.28173,-0.305026,2.438756,-1.192285,0.673093,0.583507,-0.866579
2,,0.569044,0.502302,0.918686,-1.801222,1.636356,-0.668122,-1.130215
3,,0.864023,1.556605,-2.006833,2.125277,-1.88918,1.745366,-1.652986
4,,-0.610779,0.400914,0.673554,-2.116564,2.732439,0.477947,-2.408516
5,,-0.755391,2.735231,-3.570309,0.158602,1.533818,0.795584,-1.155828
6,,-0.81232,1.157798,0.545957,0.027545,-0.245261,0.259465,0.777425
7,,-1.122659,1.067531,-2.377951,1.725615,-3.460773,4.292426,-0.509847


In [71]:
# pct_change(): relative change between the current and a prior element.
# By default each value is compared with the one in the immediately previous row.
# Despite the name, the result is a fraction, (current - previous) / previous,
# and is not multiplied by 100. Useful for comparing changes in a time series.
f4.pct_change(axis="index")

Unnamed: 0,0,1,2,3,4,5,6,7
0,,,,,,,,
1,-1.58413,-2.882656,-2.415849,-1.827243,-0.177851,-1.079872,-0.235092,-1.499966
2,-0.879541,-1.232844,-1.469861,1.864668,-0.951892,15.095135,0.37632,0.035144
3,5.893818,-2.779837,0.08651,-1.607734,-38.985779,-1.523259,-0.039447,2.953668
4,-1.323841,-0.796544,-0.705888,-1.878334,-2.116895,-2.856281,1.258395,-0.509052
5,0.486805,-0.709065,8.875525,-1.921278,-0.389956,-0.475126,-0.208182,-2.250078
6,-1.904998,38.358444,-1.112867,-1.276798,-1.376913,-0.971086,-0.82474,1.305948
7,-3.24827,-1.231634,-5.617867,-5.024723,1.802912,-115.139974,4.599153,0.014254


In [72]:
# Relative (fractional) change from the previous column within each row.
f4.pct_change(axis="columns")

Unnamed: 0,0,1,2,3,4,5,6,7
0,,-0.738393,0.597494,-1.58237,-0.068573,0.795143,-1.713998,-0.590093
1,,-0.156838,0.201393,-1.340263,-1.925691,-1.174399,5.837715,-1.267927
2,,-2.629804,1.424319,1.074533,-1.015546,-59.347006,-0.415296,-1.201511
3,,-0.579219,-2.479937,-2.160381,-1.971669,-1.803738,-2.073347,-1.829425
4,,-1.26436,-3.139366,2.46535,-2.23558,-2.335814,0.305859,-1.180307
5,,-1.051729,-73.618789,-1.323279,-0.181834,-2.149317,0.970002,-0.715341
6,,1.249716,-0.791753,-1.792816,0.114089,-0.911832,10.940871,2.745335
7,,-0.768217,3.15163,-1.690983,-1.775877,-4.590385,-1.585761,-0.321554
