In [1]:
import pandas as pd

# Pandas

In [2]:
users = pd.read_table('../data/user.tbl', sep='|')
ufo = pd.read_csv('../data/ufo.csv')

### Apply

The `apply` method calls the provided function on each item in a Pandas Series or along the specified dimension of a Pandas DataFrame.

In [3]:
# Series example
# /scrub/

# verbose
def is_under_30(num):
    return num < 30

users.loc[:, 'age'].apply(is_under_30)

0       True
1      False
2       True
3       True
4      False
5      False
6      False
7      False
8       True
9      False
10     False
11      True
12     False
13     False
14     False
15      True
16     False
17     False
18     False
19     False
20      True
21      True
22     False
23      True
24     False
25     False
26     False
27     False
28     False
29      True
       ...  
913    False
914    False
915     True
916     True
917    False
918     True
919    False
920     True
921     True
922     True
923     True
924     True
925    False
926     True
927     True
928    False
929     True
930    False
931    False
932     True
933    False
934    False
935     True
936    False
937    False
938     True
939    False
940     True
941    False
942     True
Name: age, Length: 943, dtype: bool

You can use the `lambda` keyword to create an "anonymous function" on the fly.

In [4]:
# Same thing, but use `lambda` to define the function inside the call to `apply`
# /scrub/

users.loc[:, 'age'].apply(lambda age: age < 30)

0       True
1      False
2       True
3       True
4      False
5      False
6      False
7      False
8       True
9      False
10     False
11      True
12     False
13     False
14     False
15      True
16     False
17     False
18     False
19     False
20      True
21      True
22     False
23      True
24     False
25     False
26     False
27     False
28     False
29      True
       ...  
913    False
914    False
915     True
916     True
917    False
918     True
919    False
920     True
921     True
922     True
923     True
924     True
925    False
926     True
927     True
928    False
929     True
930    False
931    False
932     True
933    False
934    False
935     True
936    False
937    False
938     True
939    False
940     True
941    False
942     True
Name: age, Length: 943, dtype: bool

In [5]:
# Applying to each row of a DataFrame.
# /scrub/

users.apply(lambda row: row.age < 30 and row.gender == 'M', axis=1)

0       True
1      False
2       True
3       True
4      False
5      False
6      False
7      False
8       True
9      False
10     False
11     False
12     False
13     False
14     False
15      True
16     False
17     False
18     False
19     False
20      True
21      True
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29      True
       ...  
913    False
914    False
915     True
916    False
917    False
918     True
919    False
920    False
921    False
922     True
923     True
924    False
925    False
926     True
927     True
928    False
929    False
930    False
931    False
932     True
933    False
934    False
935     True
936    False
937    False
938    False
939    False
940     True
941    False
942     True
Length: 943, dtype: bool

### String Methods

Pandas has built-in methods for operating on strings, which you access through `.str`.

In [6]:
# Use string methods to change State abbreviations in ufo data to uppercase
# /scrub/
ufo.loc[:, 'State'].str.upper()

0        NY
1        NJ
2        CO
3        KS
4        NY
5        ND
6        CA
7        MI
8        AK
9        OR
10       CA
11       AL
12       SC
13       IA
14       MI
15       CA
16       CA
17       GA
18       TN
19       AK
20       NE
21       LA
22       LA
23       KY
24       WV
25       CA
26       WV
27       NM
28       NM
29       UT
         ..
80513    NJ
80514    MA
80515    VA
80516    CA
80517    NH
80518    PA
80519    IL
80520    PA
80521    OH
80522    MA
80523    MD
80524    WA
80525    IA
80526    MA
80527    WA
80528    OH
80529    WA
80530    FL
80531    VA
80532    MA
80533    IA
80534    TX
80535    KY
80536    PA
80537    NE
80538    NE
80539    OH
80540    AZ
80541    IL
80542    FL
Name: State, Length: 80543, dtype: object

In [7]:
# Get a Boolean series that indicates which elements of ufo
# "Colors Reported" column contain the substring "RED"
# /scrub/
ufo.loc[:, 'Colors Reported'].str.contains('RED', na='False') 

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12        True
13       False
14       False
15       False
16       False
17       False
18       False
19        True
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
         ...  
80513    False
80514    False
80515    False
80516    False
80517    False
80518    False
80519     True
80520    False
80521    False
80522    False
80523    False
80524     True
80525    False
80526    False
80527    False
80528     True
80529    False
80530    False
80531    False
80532    False
80533    False
80534    False
80535    False
80536     True
80537    False
80538    False
80539    False
80540     True
80541     True
80542    False
Name: Colors Reported, Length: 80543, dtype: object