In [2]:
import numpy as np
import pandas as pd 


In [3]:
# 5x5 frame holding 0..24 row by row, labelled a-e (rows) and A-E (columns).
df = pd.DataFrame(
    np.arange(25).reshape(5, 5),
    index=list("abcde"),
    columns=list("ABCDE"),
)
# Column F alternates NaN / 2 and ends with an extra 2, so rows "a" and "c"
# carry the missing values used by the skipna examples.
df["F"] = [np.nan, 2, np.nan, 2, 2]
df

Unnamed: 0,A,B,C,D,E,F
a,0,1,2,3,4,
b,5,6,7,8,9,2.0
c,10,11,12,13,14,
d,15,16,17,18,19,2.0
e,20,21,22,23,24,2.0


#### Descriptive statistics methods such as mean(), sum() and std() take a skipna option that controls whether missing data is excluded (True by default):

In [4]:
# axis=0 (the default, also spelled "index") collapses the rows and yields one mean per column;
# axis=1 ("columns") would yield one mean per row instead.
# skipna=False lets the NaN in column F propagate, but the chained .mean() skips NaN again
# by default, so the displayed value is the average of the A-E column means.
df.mean(axis=0, skipna=False).mean()

12.0

In [5]:
# Row-wise totals: axis="columns" is the label form of axis=1.
# Missing values are skipped, so rows "a" and "c" add up only their A-E entries.
df.sum(axis="columns", skipna=True)

a     10.0
b     37.0
c     60.0
d     87.0
e    112.0
dtype: float64

In [6]:
# Per-column sample standard deviation (ddof=1 is the pandas default, spelled out here).
# NaN is skipped, so F is computed from its three 2s and comes out as 0.
df.std(axis="index", ddof=1)

A    7.905694
B    7.905694
C    7.905694
D    7.905694
E    7.905694
F    0.000000
dtype: float64

In [7]:
# Broadcasting combined with the arithmetic methods makes statistical procedures such as
# standardization (shifting to zero mean and scaling to a standard deviation of 1) very concise.
# Here each column is standardized: the per-column statistics align on the column labels.
# Column F has a standard deviation of 0, so all of its entries come out as NaN.
ts_stand = df.sub(df.mean(), axis="columns").div(df.std(), axis="columns")
ts_stand

Unnamed: 0,A,B,C,D,E,F
a,-1.264911,-1.264911,-1.264911,-1.264911,-1.264911,
b,-0.632456,-0.632456,-0.632456,-0.632456,-0.632456,
c,0.0,0.0,0.0,0.0,0.0,
d,0.632456,0.632456,0.632456,0.632456,0.632456,
e,1.264911,1.264911,1.264911,1.264911,1.264911,


In [8]:
# Row-wise standardization: the mean and standard deviation are taken across the columns
# of each row, and axis="index" aligns those per-row Series with the frame's row labels.
xs_stand = (
    df.sub(df.mean(axis="columns"), axis="index")
    .div(df.std(axis="columns"), axis="index")
)
xs_stand

Unnamed: 0,A,B,C,D,E,F
a,-1.264911,-0.632456,0.0,0.632456,1.264911,
b,-0.469809,-0.067116,0.335578,0.738272,1.140965,-1.67789
c,-1.264911,-0.632456,0.0,0.632456,1.264911,
d,0.079556,0.238667,0.397779,0.55689,0.716002,-1.988893
e,0.160904,0.281581,0.402259,0.522937,0.643614,-2.011295


#### Summarizing data: describe
- The describe() method computes a variety of summary statistics for a Series or for each column of a DataFrame; missing values (NaN) are excluded from the calculations:

In [10]:
# Draw the 1000 standard-normal samples from a seeded generator so that the summary
# below is identical on every "Restart Kernel -> Run All".
series = pd.Series(np.random.default_rng(seed=0).standard_normal(1000))

# Blank out every other observation (positions 0, 2, 4, ...) to show that describe()
# ignores missing values: only the 500 remaining samples are counted.
series.iloc[::2] = np.nan

series.describe()

count    500.000000
mean       0.016537
std        0.966731
min       -2.944835
25%       -0.668137
50%        0.018219
75%        0.731153
max        2.666115
dtype: float64

In [11]:

# describe() accepts a `percentiles` list to choose which quantiles are reported;
# the median (50%) still appears in the output even though it is not listed here.
requested_percentiles = [0.05, 0.25, 0.75, 0.95]
series.describe(percentiles=requested_percentiles)

count    500.000000
mean       0.016537
std        0.966731
min       -2.944835
5%        -1.571172
25%       -0.668137
50%        0.018219
75%        0.731153
95%        1.547712
max        2.666115
dtype: float64

In [14]:
# On a non-numerical Series, describe() returns a short summary instead: the number of
# non-missing entries, the number of unique values, and the most frequent value with its count.
s = pd.Series(
    [
        "a", "a", "b", "b", "a", "a",
        np.nan,  # missing entry, left out of the count
        "c", "d", "a",
    ]
)
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

- Note that on a mixed-type DataFrame object, describe() will restrict the summary to the numerical columns only or, if there are none, to the categorical (non-numerical) columns only:

In [15]:
# Mixed-type frame: one text column and one integer column.
frame = pd.DataFrame(
    {
        "a": ["Yes", "Yes", "No", "No"],
        "b": range(4),
    }
)

# With default arguments only the numerical column "b" is summarized.
frame.describe()

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [16]:
# The columns that get summarized can be controlled with the include/exclude arguments,
# which take a list of dtypes; the special value "all" is accepted as well.
# Here only the object-dtype (text) columns are described.
frame.describe(include=[object])

Unnamed: 0,a
count,4
unique,2
top,Yes
freq,2


In [17]:
# Restrict the summary to numeric columns; np.number is the dtype behind the "number" alias.
frame.describe(include=[np.number])

Unnamed: 0,b
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


In [19]:
# include="all" summarizes every column in one table; statistics that do not apply to a
# column's type (e.g. mean for text, unique/top/freq for numbers) are shown as missing.
frame.describe(include="all")

Unnamed: 0,a,b
count,4,4.0
unique,2,
top,Yes,
freq,2,
mean,,1.5
std,,1.290994
min,,0.0
25%,,0.75
50%,,1.5
75%,,2.25


#### Index of min/max values

- The idxmin() and idxmax() functions on Series and DataFrame compute the index labels with the minimum and maximum corresponding values:

In [32]:
# 5x5 frame of integers in [1, 99] (the upper bound 100 is exclusive), drawn from a seeded
# generator so that the idxmin()/idxmax() results are reproducible across kernel restarts.
# NOTE: this rebinds `df`, replacing the labelled a-e frame used by the earlier cells.
df = pd.DataFrame(
    np.random.default_rng(seed=0).integers(1, 100, size=(5, 5)),
    columns=list("ABCDE"),
)
df

Unnamed: 0,A,B,C,D,E
0,50,59,81,99,56
1,69,92,49,27,63
2,53,1,56,37,53
3,18,90,97,68,49
4,72,26,61,15,88


In [33]:
# For each column, the row label at which the smallest value occurs
# (axis="index" is the label form of axis=0).
df.idxmin(axis="index")

A    3
B    2
C    1
D    4
E    3
dtype: int64

In [34]:
# For each column, the row label at which the largest value occurs (axis=0 is the default).
df.idxmax(axis="index")

A    4
B    1
C    3
D    0
E    4
dtype: int64

#### Value counts (histogramming) / mode
- The value_counts() Series method and the top-level function of the same name compute a histogram of a 1D array of values. The top-level function can also be applied to regular arrays:

In [44]:
# 50 draws from {1, 2, 3, 4} (the upper bound 5 is exclusive), taken from a seeded
# generator so that the counts are reproducible across kernel restarts.
# NOTE: despite its name, new_df is a Series, not a DataFrame.
new_df = pd.Series(np.random.default_rng(seed=0).integers(1, 5, size=50))
new_df.value_counts()

3    20
1    13
4    11
2     6
dtype: int64

- The value_counts() method can be used to count combinations across multiple columns. By default all columns are used but a subset can be selected using the subset argument.

In [45]:
# Small frame in which every (a, b) combination occurs exactly once.
data = dict(a=[1, 2, 3, 4], b=list("xxyy"))
frame = pd.DataFrame(data)

# DataFrame.value_counts() counts the distinct row combinations across all columns.
frame.value_counts()

a  b
1  x    1
2  x    1
3  y    1
4  y    1
dtype: int64

In [51]:
# mode() returns the most frequently occurring value(s) of a Series or DataFrame.
# Here 3 and 7 both occur three times, so both are returned.
s5 = pd.Series([1] * 2 + [3] * 3 + [5] * 2 + [7] * 3)
s5.mode()


0    3
1    7
dtype: int64

In [48]:
# Two columns of 50 random integers each, drawn from one seeded generator so the frame
# is reproducible across kernel restarts. Upper bounds are exclusive:
# A takes values in [0, 6] and B takes values in [-10, 14].
rng = np.random.default_rng(seed=0)
df5 = pd.DataFrame(
    {
        "A": rng.integers(0, 7, size=50),
        "B": rng.integers(-10, 15, size=50),
    }
)
df5

Unnamed: 0,A,B
0,4,-7
1,3,-3
2,3,-2
3,1,-5
4,6,2
5,6,-3
6,5,-8
7,1,7
8,3,10
9,2,-1
