In [16]:
import pandas as pd
import numpy as np

In [17]:
# Load the tips data from a CSV file (path relative to the notebook) into a DataFrame.
df = pd.read_csv('files/tips.csv')

In [18]:
# Preview the first rows to check the columns that were loaded.
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608
2,21.01,3.5,Male,No,Sun,Dinner,3,7.0,Travis Walters,6011812112971322,Sun4458
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251


In [19]:
# A lambda is a small anonymous function, for example:
# lambda num: num * 2

In [20]:
# Double every bill amount by applying a lambda to each element of the column.
df['total_bill'].apply(lambda amount: amount * 2)

0      33.98
1      20.68
2      42.02
3      47.36
4      49.18
       ...  
239    58.06
240    54.36
241    45.34
242    35.64
243    37.56
Name: total_bill, Length: 244, dtype: float64

In [21]:
def tip_quality(bill, tip, threshold=0.25):
    """Classify a tip as "Generous" or "Other" by its share of the bill.

    Parameters
    ----------
    bill : float
        Total bill amount.
    tip : float
        Tip amount.
    threshold : float, optional
        Tip-to-bill ratio that must be strictly exceeded for the tip to be
        labelled generous. Defaults to 0.25 (i.e. 25% of the bill).

    Returns
    -------
    str
        "Generous" when ``tip / bill`` is strictly greater than ``threshold``,
        otherwise "Other". A zero bill has no defined ratio and is labelled
        "Other".
    """
    # Guard the division: with plain Python numbers a zero bill would raise
    # ZeroDivisionError instead of producing a label.
    if bill == 0:
        return "Other"
    return "Generous" if tip / bill > threshold else "Other"

# Calling tip_quality on each row using a lambda function

In [22]:
# Label each row by passing its bill and tip to tip_quality (axis=1 means row-wise).
df['quality'] = df[['total_bill', 'tip']].apply(
    lambda row: tip_quality(row['total_bill'], row['tip']),
    axis=1,
)

In [23]:
# Keep only the rows whose label is not 'Other'.
df[df['quality'] != 'Other']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,quality
51,10.29,2.6,Female,No,Sun,Dinner,2,5.14,Jessica Ibarra,4999759463713,Sun4474,Generous
67,3.07,1.0,Female,Yes,Sat,Dinner,1,3.07,Tiffany Brock,4359488526995267,Sat3455,Generous
93,16.32,4.3,Female,Yes,Fri,Dinner,2,8.16,Natalie Nguyen,5181236182893396,Fri6963,Generous
109,14.31,4.0,Female,Yes,Sat,Dinner,2,7.16,Amanda Anderson,375638820334211,Sat2614,Generous
149,7.51,2.0,Male,No,Thur,Lunch,2,3.76,Daniel Robbins,4823139288341889,Thur6321,Generous
172,7.25,5.15,Male,Yes,Sun,Dinner,2,3.62,Larry White,30432617123103,Sun9209,Generous
178,9.6,4.0,Female,Yes,Sun,Dinner,2,4.8,Melanie Gray,4211808859168,Sun4598,Generous
183,23.17,6.5,Male,Yes,Sun,Dinner,4,5.79,Dr. Michael James,4718501859162,Sun6059,Generous
221,13.42,3.48,Female,Yes,Fri,Lunch,2,6.71,Leslie Kaufman,379437981958785,Fri7511,Generous
232,11.61,3.39,Male,No,Sat,Dinner,2,5.8,James Taylor,6011482917327995,Sat2124,Generous


# Using NumPy's vectorize method

In [24]:
# np.vectorize wraps tip_quality so it can be called on whole columns at once.
# In the timeit comparison further down, this took about 0.20 s per 1000 runs
# versus about 1.65 s for the row-wise apply.
df['quality2'] = np.vectorize(tip_quality)(df['total_bill'], df['tip'])

In [25]:
# Display the DataFrame, which now includes the quality and quality2 columns.
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,price_per_person,Payer Name,CC Number,Payment ID,quality,quality2
0,16.99,1.01,Female,No,Sun,Dinner,2,8.49,Christy Cunningham,3560325168603410,Sun2959,Other,Other
1,10.34,1.66,Male,No,Sun,Dinner,3,3.45,Douglas Tucker,4478071379779230,Sun4608,Other,Other
2,21.01,3.50,Male,No,Sun,Dinner,3,7.00,Travis Walters,6011812112971322,Sun4458,Other,Other
3,23.68,3.31,Male,No,Sun,Dinner,2,11.84,Nathaniel Harris,4676137647685994,Sun5260,Other,Other
4,24.59,3.61,Female,No,Sun,Dinner,4,6.15,Tonya Carter,4832732618637221,Sun2251,Other,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,9.68,Michael Avila,5296068606052842,Sat2657,Other,Other
240,27.18,2.00,Female,Yes,Sat,Dinner,2,13.59,Monica Sanders,3506806155565404,Sat1766,Other,Other
241,22.67,2.00,Male,Yes,Sat,Dinner,2,11.34,Keith Wong,6011891618747196,Sat3880,Other,Other
242,17.82,1.75,Male,No,Sat,Dinner,2,8.91,Dennis Dixon,4375220550950,Sat17,Other,Other


# Compare the performance of the lambda-based apply and np.vectorize using timeit

In [29]:
import timeit

In [30]:
# Setup code handed to timeit: it is executed before timing starts and rebuilds
# the imports, the DataFrame and tip_quality, because the timed statements run
# in timeit's own namespace rather than the notebook's.
setup = '''
import numpy as np
import pandas as pd
df = pd.read_csv('files/tips.csv')
def tip_quality(bill, tip):
    if tip/bill > 0.25:
        return "Generous"
    else:
        return "Other"
'''

In [31]:
# Statement 1: label rows with DataFrame.apply and a lambda (row-wise).
stmt_one = '''
df['quality'] = df[['total_bill', 'tip']].apply(lambda df: tip_quality(df['total_bill'], df['tip']), axis=1)
'''
# Statement 2: produce the same labels through np.vectorize.
stmt_two = '''
df['quality2'] = np.vectorize(tip_quality)(df['total_bill'], df['tip'])
'''

In [32]:
# Total time in seconds for 1000 executions of the apply/lambda statement.
timeit.timeit(setup=setup,stmt=stmt_one,number=1000)

1.6479457999957958

In [33]:
# Total time in seconds for 1000 executions of the np.vectorize statement.
timeit.timeit(setup=setup,stmt=stmt_two,number=1000)

0.1954773000034038