In [1]:
import numpy as np
import pandas as pd

In [2]:
# Three random columns whose index labels only partly overlap. pandas aligns
# them on the union of the labels (a-d), so "one" has no value for "d" and
# "three" has no value for "a"; those cells become NaN.
df = pd.DataFrame(
    dict(
        one=pd.Series(np.random.randn(3), index=list("abc")),
        two=pd.Series(np.random.randn(4), index=list("abcd")),
        three=pd.Series(np.random.randn(3), index=list("bcd")),
    )
)
df

Unnamed: 0,one,two,three
a,0.219198,-0.695035,
b,1.378505,0.025738,-0.43599
c,-1.661063,-0.249727,-1.826749
d,,-0.318575,1.076304


In [6]:
# Column-wise mean: axis=0 reduces over the rows, giving one value per column.
# NaN entries are skipped by default.
df.mean(axis=0)

one     -0.021120
two     -0.309400
three   -0.395478
dtype: float64

In [7]:
# Row-wise mean: axis=1 reduces across the columns, giving one value per row.
df.mean(axis=1)

a   -0.237918
b    0.322751
c   -1.245846
d    0.378865
dtype: float64

In [8]:
# Column sums with NaN propagation: because skipna=False, any column that
# contains a NaN sums to NaN (arithmetic between a number and NaN is NaN).
df.sum(axis=0, skipna=False)

one           NaN
two     -1.237599
three         NaN
dtype: float64

In [10]:
# Row sums; with skipna=True NaN entries are left out of the calculation.
df.sum(axis=1, skipna=True)

a   -0.475837
b    0.968253
c   -3.737539
d    0.757729
dtype: float64

In [14]:
# Running total down each column. With skipna=False, once a NaN is
# encountered every later entry in that column is NaN as well.
df.cumsum(axis=0, skipna=False)

Unnamed: 0,one,two,three
a,0.219198,-0.695035,
b,1.597703,-0.669297,
c,-0.06336,-0.919024,
d,,-1.237599,


In [16]:
# Show the current contents of df again for reference.
df

Unnamed: 0,one,two,three
a,0.219198,-0.695035,
b,1.378505,0.025738,-0.43599
c,-1.661063,-0.249727,-1.826749
d,,-0.318575,1.076304


In [15]:
# Column-wise median. With an odd number of values the median is the middle
# one after sorting (e.g. the 3rd of 5 values); with an even number it is the
# average of the two middle values (e.g. (3rd + 4th) / 2 for 6 values).
df.median(axis=0)

one      0.219198
two     -0.284151
three   -0.435990
dtype: float64

In [18]:
# Per-column mode. In the output recorded below every value occurs exactly
# once, so each column lists all of its values in sorted order and the
# shorter columns are padded with NaN.
df.mode()

Unnamed: 0,one,two,three
0,-1.661063,-0.695035,-1.826749
1,0.219198,-0.318575,-0.43599
2,1.378505,-0.249727,1.076304
3,,0.025738,


In [19]:
# Small animal table used for the mode() examples: one row per animal, with a
# text column (species) and two numeric columns (legs, wings). "wings" has
# missing values for horse and ostrich.
# NOTE: this rebinds `df`, replacing the random frame used by earlier cells.
df = pd.DataFrame(
    data=[
        ("bird", 2, 2),
        ("mammal", 4, np.nan),
        ("arthropod", 8, 0),
        ("bird", 2, np.nan),
    ],
    index=["falcon", "horse", "spider", "ostrich"],
    columns=["species", "legs", "wings"],
)
df

Unnamed: 0,species,legs,wings
falcon,bird,2,2.0
horse,mammal,4,
spider,arthropod,8,0.0
ostrich,bird,2,


In [20]:
df.mode()
# Returns the most frequent value(s) in each column (Series); values tied for
# the highest count are all listed, one per row.

Unnamed: 0,species,legs,wings
0,bird,2.0,0.0
1,,,2.0


In [21]:
# With dropna=False, NaN is counted like any other value: "wings" contains
# two NaNs, so NaN becomes its mode.
df.mode(dropna=False)

Unnamed: 0,species,legs,wings
0,bird,2,


In [22]:
# Restrict the computation to numeric columns; "species" is left out of the
# result.
df.mode(numeric_only=True)

Unnamed: 0,legs,wings
0,2.0,0.0
1,,2.0


In [23]:
# Row-wise mode over the numeric columns (legs, wings). Rows whose values tie
# (e.g. spider: 0 and 8) produce more than one result column.
df.mode(axis='columns', numeric_only=True)

Unnamed: 0,0,1
falcon,2.0,
horse,4.0,
spider,0.0,8.0
ostrich,2.0,


In [25]:
series = pd.Series(np.random.randn(500))

# Keep the first 10 random values, set positions 10-19 to the constant 5 and
# blank out everything from position 20 on. The Series then holds 11 distinct
# numbers plus NaN.
series.iloc[20:500] = np.nan
series.iloc[10:20] = 5

# nunique() returns the number of distinct values in the Series; NaN is not
# counted by default. It is printed explicitly because a notebook cell only
# displays its last expression, so a bare call here would be silently dropped.
print(series.nunique())

# unique() returns the distinct values themselves. NaN is not a number, but it
# still shows up here as its own entry, so this lists one more item than
# nunique() reports.
series.unique()

array([-1.75021992, -1.04655027, -0.17835049, -1.86832265, -0.21797125,
        0.85831743, -0.63124616, -0.13961061, -0.70752525, -0.77049151,
        5.        ,         nan])

In [27]:
# Summarizing data: describe
series = pd.Series(np.random.randn(1000))

# Blank out every other element (positions 0, 2, 4, ...); describe() then
# summarises only the 500 values that remain.
series.iloc[::2] = np.nan
series.describe()

count    500.000000
mean       0.040307
std        1.024203
min       -3.223595
25%       -0.742478
50%        0.060116
75%        0.729273
max        2.872598
dtype: float64

In [28]:
# Array-valued bounds are broadcast against each other: `low` has 4 entries
# and `high` is a 2x1 column, so the result is a 2x4 uint8 array whose element
# [i, j] is drawn from the half-open range [low[j], high[i]).
np.random.randint(low=[1, 3, 5, 7], high=[[10], [20]], dtype=np.uint8)

array([[ 8,  7,  9,  7],
       [11, 10, 19,  8]], dtype=uint8)

In [29]:
# String data with one missing entry. For non-numeric data describe() reports
# count (non-missing values), unique, top (most frequent value) and freq
# (how often `top` occurs) instead of quantiles.
s = pd.Series(
    data=["a", "a", "b", "b", "a", "a", np.nan, "c", "d", "a"],
)
s.describe()

count     9
unique    4
top       a
freq      5
dtype: object

In [30]:
# Index of min/max values
# Five draws from np.random.randn; no seed is set in the visible cells, so the
# values differ from run to run.
s1 = pd.Series(np.random.randn(5))

s1

0   -0.342810
1    1.429358
2    1.170753
3    0.012521
4   -0.212278
dtype: float64

In [31]:
# idxmin()/idxmax() return the index label of the smallest/largest value;
# shown together as a (min_label, max_label) tuple.
s1.idxmin(), s1.idxmax()

(0, 1)

In [32]:
# 5x3 frame of np.random.randn draws with columns A, B and C, used for the
# DataFrame idxmin/idxmax examples.
df1 = pd.DataFrame(data=np.random.randn(5, 3), columns=list("ABC"))
df1

Unnamed: 0,A,B,C
0,-1.13125,-0.47936,-0.938903
1,0.275284,-0.250429,2.144935
2,0.228608,-1.579207,2.081531
3,-2.696867,1.84843,0.464907
4,-0.149149,-1.011692,-0.248192


In [33]:
# For each column, the row label at which that column reaches its minimum.
df1.idxmin(axis=0)

A    3
B    2
C    0
dtype: int64

In [34]:
# For each row, the column label at which that row reaches its maximum.
df1.idxmax(axis=1)

0    B
1    C
2    C
3    B
4    A
dtype: object

In [35]:
# Value counts (histogramming) / mode
# 50 random integers drawn from 0..6 (the upper bound 7 is exclusive).
data = np.random.randint(low=0, high=7, size=50)
data

array([4, 4, 3, 1, 5, 5, 6, 1, 2, 4, 0, 0, 6, 3, 4, 2, 0, 3, 3, 3, 6, 2,
       0, 1, 0, 2, 4, 1, 6, 2, 4, 0, 4, 0, 2, 1, 0, 3, 5, 2, 0, 6, 5, 6,
       2, 5, 6, 4, 5, 6])

In [36]:
# Wrap the integer array in a Series so the pandas counting methods apply.
s = pd.Series(data)

# Tally how often each distinct value occurs, most frequent first.
s.value_counts()

0    9
4    8
6    8
2    8
3    6
5    6
1    5
Name: count, dtype: int64

In [37]:
# Tally how often each distinct value occurs in the raw array. The top-level
# pd.value_counts() helper is deprecated (pandas 2.1+), so wrap the array in a
# Series and call the value_counts() method instead; the result is the same.
pd.Series(data).value_counts()

0    9
4    8
6    8
2    8
3    6
5    6
1    5
Name: count, dtype: int64

In [39]:
# Small mixed-type frame: integer column "a" and string column "b".
# NOTE: this rebinds `data`, which earlier cells used for an integer array.
data = dict(a=[1, 2, 3, 4], b=["x", "x", "y", "y"])
frame = pd.DataFrame(data)

# DataFrame.value_counts() counts distinct rows and returns a Series whose
# index is a MultiIndex built from the frame's columns.
print(type(frame.value_counts()))

<class 'pandas.core.series.Series'>


In [40]:
# 3 and 7 each occur three times, more than any other value, so mode()
# returns both of them.
s5 = pd.Series([1, 1, 3, 3, 3, 5, 5, 7, 7, 7])

s5.mode()

0    3
1    7
dtype: int64

In [44]:
# Two integer columns of 50 random draws each: A from [0, 7) and B from
# [-10, 15).
df5 = pd.DataFrame(
    {
        "A": np.random.randint(0, 7, size=50),
        "B": np.random.randint(-10, 15, size=50),
    }
)

# Per-column modes. Each column lists every value tied for the highest count;
# a column with fewer tied values than another is padded with NaN.
df5.mode()

Unnamed: 0,A,B
0,3.0,1
1,,8
2,,13


In [45]:
# Discretization and quantiling
# Split the values into 3 equal-width bins spanning the data range; each value
# is labelled with the left-open, right-closed interval it falls into.
pd.cut(np.array([1, 7, 5, 4, 6, 3]), bins=3)

[(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]]

In [46]:
# Same binning as before, but retbins=True also returns the array of bin
# edges, so the result is a (categories, edges) tuple.
pd.cut(np.array([1, 7, 5, 4, 6, 3]), bins=3, retbins=True)

([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], (0.994, 3.0]]
 Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] < (5.0, 7.0]],
 array([0.994, 3.   , 5.   , 7.   ]))