### Pandas按行遍历DataFrame的3种方法

In [1]:
import pandas as pd
import numpy as np
import collections

In [2]:
# Seed the generator so every "Restart & Run All" builds the same frame; without
# a seed the printed rows and the dict contents differ on each run.
SEED = 42
rng = np.random.default_rng(SEED)

# 100,000 rows x 4 float columns (A-D) of uniform samples in [0, 1); this is the
# frame every row-iteration method in this notebook is run against.
df = pd.DataFrame(
    rng.random(size=(100000, 4)),
    columns=list('ABCD')
)
# NOTE: outputs saved in the notebook reflect the run that produced them; re-run
# all cells after changing SEED to refresh them.
df.head(3)

Unnamed: 0,A,B,C,D
0,0.282263,0.361864,0.359934,0.209402
1,0.313442,0.323673,0.318766,0.825063
2,0.365249,0.23298,0.396346,0.16364


In [3]:
# Sanity-check the frame's dimensions, reported as (rows, columns).
df.shape

(100000, 4)

### 1. df.iterrows()

#### 使用方式

In [4]:
# iterrows() yields (index label, row) pairs, where each row is a pandas Series
# keyed by column name.
for idx, row in df.iterrows():
    print(idx, row)  # the whole row Series
    print(idx, row["A"], row["B"], row["C"], row["D"])  # single values looked up by column label
    break  # one row is enough to show what iterrows() yields

0 A    0.282263
B    0.361864
C    0.359934
D    0.209402
Name: 0, dtype: float64
0 0.28226251811935454 0.36186430483000975 0.3599340225962093 0.20940237456896993


#### 时间耗费

In [9]:
%%time
# Benchmark: walk the whole frame with iterrows(), adding A + B into a dict
# keyed by the (A, B) pair. %%time (which must stay on the first line of the
# cell) reports how long the cell takes to run.
result = collections.defaultdict(int)  # missing keys start at 0
for idx, row in df.iterrows():
    result[(row["A"], row["B"])] += row["A"] + row["B"]


Wall time: 9.61 s


### 2. df.itertuples()

#### 使用方式

In [10]:
# itertuples() yields one namedtuple per row; the index is exposed as the
# `Index` field and each column as an attribute of the same name.
for row in df.itertuples():
    print(row)  # the whole namedtuple
    print(row.Index, row.A, row.B, row.C, row.D)  # fields read by attribute access
    break  # show the first row only

Pandas(Index=0, A=0.28226251811935454, B=0.36186430483000975, C=0.3599340225962093, D=0.20940237456896993)
0 0.28226251811935454 0.36186430483000975 0.3599340225962093 0.20940237456896993


#### 时间耗费

In [11]:
%%time
# Benchmark: the same (A, B) -> A + B accumulation as the iterrows() timing
# cell, but iterating with itertuples() and reading fields by attribute.
result = collections.defaultdict(int)  # missing keys start at 0
for row in df.itertuples():
    result[(row.A, row.B)] += row.A + row.B

Wall time: 186 ms


### 3. for+zip

#### 使用方式

In [12]:
# Needs neither type checking nor namedtuple construction.
# The drawback is that every variable has to be named one by one.
# zip() pairs the values of columns A and B position by position.
for A, B in zip(df["A"], df["B"]):
    print(A, B)
    break  # show the first pair only

0.28226251811935454 0.36186430483000975


#### 时间耗费

In [13]:
%%time
# Benchmark: the same (A, B) -> A + B accumulation as the other timing cells,
# but iterating over the two columns directly with zip().
result = collections.defaultdict(int)  # missing keys start at 0
for A, B in zip(df["A"], df["B"]):
    result[(A, B)] += A + B

Wall time: 122 ms
