# Efficient Code with pandas

## 1. Selecting columns and rows efficiently

In [1]:
import time
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
poker_hands = pd.read_csv('data/poker_hands.csv')
poker_hands.shape

(25010, 12)

In [4]:
# Row selection: loc[] vs iloc[]
# iloc[] is more efficient

row_nums = range(0, 1000)

# Select the rows using .loc[] 
loc_start_time = time.time()
rows = poker_hands.loc[row_nums]
loc_end_time = time.time()

# Select the rows using .iloc[]
iloc_start_time = time.time()
rows = poker_hands.iloc[row_nums]
iloc_end_time = time.time()

print("Time using .loc[] : {} sec".format(loc_end_time - loc_start_time))
print("Time using .iloc[]: {} sec".format(iloc_end_time - iloc_start_time))

Time using .loc[] : 0.0019943714141845703 sec
Time using .iloc[]: 0.0010082721710205078 sec


In [5]:
poker_hands.columns

Index(['S1', 'R1', 'S2', 'R2', 'S3', 'R3', 'S4', 'R4', 'S5', 'R5', 'Class',
       'Explanation'],
      dtype='object')

In [6]:
# Column selection: .iloc[] vs by name

# Use .iloc to select the first 6 columns
iloc_start_time = time.time()
cols = poker_hands.iloc[:, 0:6]
iloc_end_time = time.time()

# Use simple column selection to select the first 6 columns
names_start_time = time.time()
cols = poker_hands[['S1', 'R1', 'S2', 'R2', 'S3', 'R3']]
names_end_time = time.time()

print("Time using .iloc[]           : {} sec".format(iloc_end_time - iloc_start_time))
print("Time using selection by name : {} sec".format(names_end_time - names_start_time))

Time using .iloc[]           : 0.0009984970092773438 sec
Time using selection by name : 0.0029947757720947266 sec


In [7]:
# Random row selection
# sample() is more efficient

N = poker_hands.shape[0]

# Select and time the selection of the 75% of the dataset's rows
rand_start_time = time.time()
poker_hands.iloc[np.random.randint(low=0, high=N, size=int(0.75 * N))]
print("Time using Numpy  : {} sec".format(time.time() - rand_start_time))

# Select and time the selection of the 75% of the dataset's rows using sample()
samp_start_time = time.time()
poker_hands.sample(int(0.75 * N), axis=0, replace=True)
print("Time using .sample: {} sec".format(time.time() - samp_start_time))

Time using Numpy  : 0.002993345260620117 sec
Time using .sample: 0.003991842269897461 sec


## 2. Replacing values in a DataFrame

In [8]:
names = pd.read_csv('data/baby_names.csv')
names.head()

Unnamed: 0,BRITH_YEAR,GENDER,ETHNICTY,NAME,COUNT,RANK
0,2011,FEMALE,HISPANIC,GERALDINE,13,75
1,2011,FEMALE,HISPANIC,GIA,21,67
2,2011,FEMALE,HISPANIC,GIANNA,49,42
3,2011,FEMALE,HISPANIC,GISELLE,38,51
4,2011,FEMALE,HISPANIC,GRACE,36,53


In [9]:
# Replacing scalar values

# Replace all the entries that has 'FEMALE' as a GENDER with 'GIRL'
names['GENDER'].loc[names['GENDER'] == 'FEMALE'] = 'GIRL'

# efficient way
# Replace all the entries that has 'FEMALE' as a GENDER with 'GIRL'
names['GENDER'].replace('FEMALE', 'GIRL', inplace=True)

names.head()

Unnamed: 0,BRITH_YEAR,GENDER,ETHNICTY,NAME,COUNT,RANK
0,2011,GIRL,HISPANIC,GERALDINE,13,75
1,2011,GIRL,HISPANIC,GIA,21,67
2,2011,GIRL,HISPANIC,GIANNA,49,42
3,2011,GIRL,HISPANIC,GISELLE,38,51
4,2011,GIRL,HISPANIC,GRACE,36,53


In [10]:
names.groupby('ETHNICTY').count()

Unnamed: 0_level_0,BRITH_YEAR,GENDER,NAME,COUNT,RANK
ETHNICTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ASIAN AND PACI,358,358,358,358,358
ASIAN AND PACIFIC ISLANDER,1952,1952,1952,1952,1952
BLACK NON HISP,353,353,353,353,353
BLACK NON HISPANIC,2274,2274,2274,2274,2274
HISPANIC,4254,4254,4254,4254,4254
WHITE NON HISP,701,701,701,701,701
WHITE NON HISPANIC,4070,4070,4070,4070,4070


In [11]:
# Replace values using lists

# Replace all non-Hispanic ethnicities with 'NON HISPANIC'
names['ETHNICTY'].loc[(names["ETHNICTY"] == 'BLACK NON HISP') |
                      (names["ETHNICTY"] == 'BLACK NON HISPANIC') |
                      (names["ETHNICTY"] == 'WHITE NON HISP') |
                      (names["ETHNICTY"] == 'WHITE NON HISPANIC')] = 'NON HISPANIC'

# efficient way
# Replace all non-Hispanic ethnicities with 'NON HISPANIC'
names['ETHNICTY'].replace(['BLACK NON HISP', 'BLACK NON HISPANIC', 'WHITE NON HISP', 'WHITE NON HISPANIC'],
                          'NON HISPANIC', 
                          inplace=True)

names.groupby('ETHNICTY').count()

Unnamed: 0_level_0,BRITH_YEAR,GENDER,NAME,COUNT,RANK
ETHNICTY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ASIAN AND PACI,358,358,358,358,358
ASIAN AND PACIFIC ISLANDER,1952,1952,1952,1952,1952
HISPANIC,4254,4254,4254,4254,4254
NON HISPANIC,7398,7398,7398,7398,7398


In [12]:
# Replace values using dictionaries

# Replace string to string
poker_hands['Explanation'].replace({'Royal flush': 'Flush', 'Straight flush': 'Flush'}, inplace=True)
print(poker_hands['Explanation'].head(), '\n')

# Replace the number by a string
names['RANK'].replace({1: 'FIRST', 2: 'SECOND', 3: 'THIRD'}, inplace=True)
rank_count = names.groupby('RANK').count()
print(rank_count.head())
print(rank_count.tail())

0    Flush
1    Flush
2    Flush
3    Flush
4    Flush
Name: Explanation, dtype: object 

      BRITH_YEAR  GENDER  ETHNICTY  NAME  COUNT
RANK                                           
4             57      57        57    57     57
5             56      56        56    56     56
6             58      58        58    58     58
7             57      57        57    57     57
8             57      57        57    57     57
        BRITH_YEAR  GENDER  ETHNICTY  NAME  COUNT
RANK                                             
101             26      26        26    26     26
102             16      16        16    16     16
FIRST           56      56        56    56     56
SECOND          63      63        63    63     63
THIRD           58      58        58    58     58


In [13]:
# Replace multiple values with just one value
names = pd.read_csv('data/baby_names.csv')

# Replace the rank of the first three ranked names to 'MEDAL'
names.replace({'RANK': {1: 'MEDAL', 2: 'MEDAL', 3: 'MEDAL'}}, inplace=True)

# Replace the rank of the 4th and 5th ranked names to 'ALMOST MEDAL'
names.replace({'RANK': {4: 'ALMOST MEDAL', 5: 'ALMOST MEDAL'}}, inplace=True)

rank_count = names.groupby('RANK').count()
rank_count.tail()

Unnamed: 0_level_0,BRITH_YEAR,GENDER,ETHNICTY,NAME,COUNT
RANK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
100,65,65,65,65,65
101,26,26,26,26,26
102,16,16,16,16,16
ALMOST MEDAL,113,113,113,113,113
MEDAL,177,177,177,177,177


## 3. Efficient iterating

In [14]:
for index, values in poker_hands.iterrows():
    
    # Check if index is odd
    if index % 2 == 1:
        # Sum the ranks of all the cards
        hand_sum = sum([values[1], values[3], values[5], values[7], values[9]])

In [15]:
# apply() in every cell

poker_rank = poker_hands[['R1', 'R2', 'R3', 'R4', 'R5']]

# Define the lambda transformation
get_square = lambda x: x ** 2

# Apply the transformation
data_sum = poker_rank.apply(get_square)
print(data_sum.head())

    R1   R2   R3   R4   R5
0  100  121  169  144    1
1  121  169  100  144    1
2  144  121  169  100    1
3  100  121    1  169  144
4    1  169  144  121  100


In [16]:
# apply() for rows iteration

get_variance = lambda x: np.var(x)

# Apply the transformation
data_tr = poker_rank.apply(get_variance, axis=1)
print(data_tr.head(), '\n')

data_tr = poker_rank.apply(get_variance, axis=0)
print(data_tr.head())

0    18.64
1    18.64
2    18.64
3    18.64
4    18.64
dtype: float64 

R1    14.060473
R2    14.189523
R3    14.024270
R4    14.040552
R5    13.998851
dtype: float64


In [17]:
# pandas vectorization

# Calculate the mean rank in each hand
row_start_time = time.time()
mean_r = poker_rank.mean(axis=1)
print("Time using pandas vectorization for rows: {} sec".format(time.time() - row_start_time))
print(mean_r.head())

# Calculate the mean rank of each of the 5 card in all hands
col_start_time = time.time()
mean_c = poker_rank.mean(axis=0)
print("Time using pandas vectorization for columns: {} sec".format(time.time() - col_start_time))
print(mean_c.head())

Time using pandas vectorization for rows: 0.000997304916381836 sec
0    9.4
1    9.4
2    9.4
3    9.4
4    9.4
dtype: float64
Time using pandas vectorization for columns: 0.0 sec
R1    6.995242
R2    7.014194
R3    7.014154
R4    6.942463
R5    6.962735
dtype: float64


In [18]:
# Vectorization methods for looping a DataFrame

# Calculate the variance in each hand
start_time = time.time()
poker_var = poker_rank.var(axis=1)
print("Time using pandas vectorization: {} sec".format(time.time() - start_time))
print(poker_var.head())

# Calculate the variance in each hand
start_time = time.time()
poker_var = poker_rank.values.var(axis=1, ddof=1)
print("Time using NumPy vectorization: {} sec".format(time.time() - start_time))
print(poker_var[0:5])

Time using pandas vectorization: 0.001992464065551758 sec
0    23.3
1    23.3
2    23.3
3    23.3
4    23.3
dtype: float64
Time using NumPy vectorization: 0.003994464874267578 sec
[23.3 23.3 23.3 23.3 23.3]


## 4. Data manipulation using groupby()

In [19]:
restaurant_data = pd.read_csv('data/restaurant_data.csv')
restaurant_data.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [20]:
# min-max normalization using .transform()

min_max_tr = lambda x: (x - x.min()) / (x.max() - x.min())

# Group the data according to the 'time'
restaurant_grouped = restaurant_data.groupby('time')
restaurant_min_max_group = restaurant_grouped.transform(min_max_tr)
print(restaurant_min_max_group.head(), '\n')

# Group the data according to the 'day'
restaurant_grouped = restaurant_data.groupby('day')
restaurant_min_max_group = restaurant_grouped.transform(min_max_tr)
print(restaurant_min_max_group.head())

   total_bill       tip  size
0    0.291579  0.001111   0.2
1    0.152283  0.073333   0.4
2    0.375786  0.277778   0.4
3    0.431713  0.256667   0.2
4    0.450775  0.290000   0.6 

   total_bill       tip  size
0    0.238025  0.000000  0.00
1    0.075513  0.118397  0.25
2    0.336266  0.453552  0.25
3    0.401515  0.418944  0.00
4    0.423754  0.473588  0.50


In [21]:
# exponential transformation
# exponential distribution : e**(−λ∗x)∗λ
# λ (lambda) is the mean of the group that the observation x belongs to.

# Define the exponential transformation
exp_tr = lambda x: np.exp(-x.mean() * x) * x.mean()

restaurant_grouped = restaurant_data.groupby('time')
restaurant_exp_group = restaurant_grouped['tip'].transform(exp_tr)
print(restaurant_exp_group.head())

0    0.135141
1    0.017986
2    0.000060
3    0.000108
4    0.000042
Name: tip, dtype: float64


In [22]:
# Validation of z-score normalization

zscore = lambda x: (x - x.mean()) / x.std()

restaurant_grouped = restaurant_data.groupby('time')
restaurant_trans = restaurant_grouped.transform(zscore)

# Re-group the grouped object
restaurant_regrouped = restaurant_trans.groupby(restaurant_data['day'])

# print each group's means and standard deviation
print(np.round(restaurant_regrouped.mean(), 3))
print(restaurant_regrouped.std())

      total_bill    tip   size
day                           
Fri       -0.285 -0.177 -0.468
Sat       -0.039 -0.076 -0.125
Sun        0.067  0.106  0.232
Thur       0.060  0.031  0.034
      total_bill       tip      size
day                                 
Fri     0.864878  0.710048  0.593099
Sat     1.037015  1.135612  0.900063
Sun     0.966101  0.859799  1.106675
Thur    1.022810  1.028730  1.028002


In [23]:
# filter() function

# Filter the days where the count of total_bill is greater than $40
total_bill_40 = restaurant_data.groupby('day').filter(lambda x: x['total_bill'].count() > 40)

print('Number of tables where total_bill is greater than $40:', total_bill_40.shape[0], '\n')

# the mean amount of money the customers paid
# Select only the entries that have a mean total_bill greater than $20
total_bill_20 = total_bill_40.groupby('day').filter(lambda x: x['total_bill'].mean() > 20)

print('Days of the week that have a mean total_bill greater than $20:', total_bill_20.day.unique())

Number of tables where total_bill is greater than $40: 225 

Days of the week that have a mean total_bill greater than $20: ['Sun' 'Sat']
