# Intro to pandas DataFrame iteration

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn import datasets
# Fetch the dataset once and reuse the same object for both the values and
# the column names, instead of calling load_wine() twice.
wine_bunch = datasets.load_wine()
wine_data = pd.DataFrame(wine_bunch.data, columns=wine_bunch.feature_names)
wine_data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [3]:
# ratio of color intensity to alcohol
def calc_color_alcohol(color_in, alco):
    """Return ``color_in / alco`` rounded to two decimal places.

    Parameters
    ----------
    color_in : numerator (the callers in this notebook pass the
        'color_intensity' value of a row).
    alco : denominator (the callers in this notebook pass the
        'alcohol' value of a row).

    Returns
    -------
    The quotient after ``np.round(..., 2)``.
    """
    return np.round(color_in / alco, 2)

In [4]:
%%timeit
# Baseline: fetch every row by integer position with .iloc.

ratios = []
for position in range(wine_data.shape[0]):
    current = wine_data.iloc[position]
    ratios.append(
        calc_color_alcohol(current['color_intensity'], current['alcohol'])
    )

# The result is written back onto the shared frame as the 'percentage' column.
wine_data['percentage'] = ratios

33.5 ms ± 6.77 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
%%timeit

# .iterrows() yields one (index, pandas Series) pair per DataFrame row,
# so the row no longer has to be looked up by position.

ratios = []
for _idx, current in wine_data.iterrows():
    ratios.append(
        calc_color_alcohol(current['color_intensity'], current['alcohol'])
    )

# The result is written back onto the shared frame as the 'percentage' column.
wine_data['percentage'] = ratios

16 ms ± 2.91 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
# Without unpacking, each item produced by .iterrows() is a plain tuple.

wine_data2 = wine_data.head()

for pair in wine_data2.iterrows():
    print(type(pair))
    # Position 1 of the tuple holds the row's values; index it by column name.
    row_values = pair[1]
    print(row_values['alcohol'])

<class 'tuple'>
14.23
<class 'tuple'>
13.2
<class 'tuple'>
13.16
<class 'tuple'>
14.37
<class 'tuple'>
13.24


# Another iterator method: .itertuples()

In [7]:
# .itertuples() is often more efficient than .iterrows();
# first, a look at what .iterrows() hands back.

# .iterrows() yields (index, Series) pairs: each row's values are stored
# in a pandas Series.
for i, row in wine_data.iterrows():
    if i == 0:
        print(row)
        print(type(row['ash']))  # access data with brackets
        break  # only this one row is needed, so stop instead of walking every row

alcohol                           14.23
malic_acid                         1.71
ash                                2.43
alcalinity_of_ash                 15.60
magnesium                        127.00
total_phenols                      2.80
flavanoids                         3.06
nonflavanoid_phenols               0.28
proanthocyanins                    2.29
color_intensity                    5.64
hue                                1.04
od280/od315_of_diluted_wines       3.92
proline                         1065.00
percentage                         0.40
Name: 0, dtype: float64
<class 'numpy.float64'>


In [8]:
# .itertuples() returns each DataFrame row as a special data type called a namedtuple.
# A namedtuple has fields accessible using attribute lookup.
# Column names that are not valid Python identifiers show up under a positional
# name instead (see `_12` in the printed row).
for row in wine_data.itertuples():
    if row.Index == 0:
        print(row)
        print(row.ash)  # access data by attribute
        break  # only this one row is needed, so stop instead of walking every row

Pandas(Index=0, alcohol=14.23, malic_acid=1.71, ash=2.43, alcalinity_of_ash=15.6, magnesium=127.0, total_phenols=2.8, flavanoids=3.06, nonflavanoid_phenols=0.28, proanthocyanins=2.29, color_intensity=5.64, hue=1.04, _12=3.92, proline=1065.0, percentage=0.4)
2.43


In [9]:
%%timeit -r7 -n1

# Time the bare cost of iterating with .iterrows(): the body only references the yielded pair.
for index_and_row in wine_data.iterrows():
    index_and_row

The slowest run took 5.40 times longer than the fastest. This could mean that an intermediate result is being cached.
18.7 ms ± 13.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit -r7 -n1

# Time the bare cost of iterating with .itertuples(): the body only references the yielded namedtuple.
for row_record in wine_data.itertuples():
    row_record

3.21 ms ± 1.37 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Why iterrows() is less efficient: it returns a pandas Series for every row, which adds overhead.

# pandas alternative to looping

In [12]:
# pandas .apply() method: takes a function and applies it along an axis of a DataFrame
# the axis selects what the function receives (0, the default, passes each column; 1 passes each row)
# can be used with anonymous functions (lambda functions)
wine_data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,percentage
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.4
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0.33
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0.43
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0.54
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.33


In [17]:
# helper used with .apply()
def calc_diff_ash(al_ash, ash):
    """Return the difference ``al_ash - ash``.

    Parameters
    ----------
    al_ash : value to subtract from.
    ash : value to subtract.
    """
    return al_ash - ash

In [18]:
# Practice with a lambda: axis=1 passes each row to the lambda,
# which forwards the two ash columns to calc_diff_ash.
ash_diff_apply = wine_data.apply(
    lambda record: calc_diff_ash(record['alcalinity_of_ash'], record['ash']),
    axis=1,
)

In [20]:
# Check what kind of object the row-wise .apply() call produced.
type(ash_diff_apply)

pandas.core.series.Series

In [21]:
# Store the per-row differences on the frame as the 'ash_diff' column, then preview the frame.
wine_data['ash_diff'] = ash_diff_apply
wine_data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,percentage,ash_diff
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.4,13.17
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0.33,9.06
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0.43,15.93
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0.54,14.3
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.33,18.13


# Optimal pandas iterating

In [37]:
# pandas is a library that is built on NumPy
# a pandas DataFrame can take advantage of the efficient characteristics of NumPy arrays

# take the NumPy array out of a DataFrame column
type(wine_data['alcohol'].values)

numpy.ndarray

In [40]:
# Instead of looping, subtract the two columns' NumPy arrays in a single
# element-wise operation.

alcohol_values = wine_data['alcohol'].values
malic_acid_values = wine_data['malic_acid'].values
diff = alcohol_values - malic_acid_values

wine_data['diff'] = diff
wine_data.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,percentage,ash_diff,diff
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0.4,13.17,12.52
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0.33,9.06,11.42
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0.43,15.93,10.8
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0.54,14.3,12.42
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0.33,18.13,10.65
