## Writing Efficient Python Code

In [1]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' presumably also hides pandas'
# SettingWithCopyWarning for the column assignments made on filtered frames
# further down -- confirm that hiding it is intended.
warnings.filterwarnings(action='ignore')

### 1. Foundations for efficiencies

In [2]:
names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']

# 1. Non-Pythonic approach: manual index bookkeeping with a while loop
i = 0
new_list = []
while i < len(names):
    current = names[i]
    i += 1
    if len(current) < 6:
        continue
    new_list.append(current)


# 2. better way: iterate over the items themselves, no index needed
better_list = []
for name in names:
    if len(name) < 6:
        continue
    better_list.append(name)


# 3. list comprehension: the whole filter in a single expression
best_list = [guest for guest in names if len(guest) >= 6]

print(best_list)

['Kramer', 'Elaine', 'George', 'Newman']


In [3]:
# range()

# range(6) is a lazy sequence: it yields 0, 1, ..., 5 on demand rather than
# storing the numbers
nums = range(6)
print(type(nums))   # range object

# list() consumes the range and materialises its values
nums_list = list(nums)
print(nums_list)

# Unpacking

# The star operator unpacks any iterable inside a list display.
# range(1, 12, 2): start at 1, stop before 12, step by 2 -> odd numbers 1..11
nums_list2 = [*range(1,12,2)]
print(nums_list2)

<class 'range'>
[0, 1, 2, 3, 4, 5]
[1, 3, 5, 7, 9, 11]


In [4]:
# enumerate()

# 1. explicit loop: enumerate() hands back (position, item) pairs
indexed_names = []
for i, name in enumerate(names):
    indexed_names.append((i, name))
print(indexed_names)

# 2. the same result written as a list comprehension
indexed_names_comp = [(position, guest) for position, guest in enumerate(names)]
print(indexed_names_comp)

# 3. unpack the enumerate object directly, counting from 1 instead of 0
indexed_names_unpack = [*enumerate(names, start=1)]
print(indexed_names_unpack)

[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(1, 'Jerry'), (2, 'Kramer'), (3, 'Elaine'), (4, 'George'), (5, 'Newman')]


In [5]:
# map()

# map() returns a lazy iterator: str.upper is applied to each name only when
# the map object is consumed
names_map  = map(str.upper, names)
print(type(names_map))

# Unpacking consumes the iterator and collects the results in a list;
# names_map is exhausted afterwards
names_uppercase = [*names_map]
print(names_uppercase)

<class 'map'>
['JERRY', 'KRAMER', 'ELAINE', 'GEORGE', 'NEWMAN']


In [6]:
import numpy as np

arrival_times = list(range(10, 60, 10))

# A NumPy array lets the 3-minute adjustment apply to every time at once
arrival_times_np = np.array(arrival_times)
new_times = arrival_times_np - 3
print(new_times)

# Pair each adjusted time with the guest at the same position in names
guest_arrivals = [(names[position], minutes) for position, minutes in enumerate(new_times)]
print(guest_arrivals)

# Map the welcome_guest function to each (guest,time) pair
def welcome_guest(*args):
    """Build one welcome message per (guest, time) pair.

    Args:
        *args: Any number of two-item iterables of the form (guest, time).

    Returns:
        A list with one formatted message per pair, in argument order.
        Calling with a single pair therefore returns a one-element list.
    """
    template = 'Welcome {}... You are {} min late.'
    message_list = []
    for guest, time in args:
        message_list.append(template.format(guest, time))
    return message_list

# Each (guest, time) tuple is passed to welcome_guest as a single argument
welcome_map = map(welcome_guest, guest_arrivals)

guest_welcomes = list(welcome_map)

# One result per line
print('\n'.join(map(str, guest_welcomes)))

[ 7 17 27 37 47]
[('Jerry', 7), ('Kramer', 17), ('Elaine', 27), ('George', 37), ('Newman', 47)]
['Welcome Jerry... You are 7 min late.']
['Welcome Kramer... You are 17 min late.']
['Welcome Elaine... You are 27 min late.']
['Welcome George... You are 37 min late.']
['Welcome Newman... You are 47 min late.']


### 2. Timing and profiling code

In [7]:
# Reference notes only: this bare string lists the IPython magics for timing
# and profiling. Nothing is executed; the cell just echoes the string.
"""
%timeit

%%timeit

%lprun : find the bottleneck

%mprun : memory usage

"""

'\n%timeit\n\n%%timeit\n\n%lprun : find the bottleneck\n\n%mprun : memory usage\n\n'

### 3. Gaining efficiencies

In [8]:
# Pokemon data
def read_lines(path):
    """Read a text file and return its lines without newline characters.

    Args:
        path: Location of a plain-text file holding one value per line.

    Returns:
        A list of strings, one per line, in file order.
    """
    with open(path, 'r') as f:
        return [line.replace('\n', '') for line in f]


# NOTE: `names` is rebound here; from this point on it holds the Pokémon
# names rather than the short guest list defined at the top of the notebook.
names = read_lines('data/chap01/pokemon_name.txt')
primary_types = read_lines('data/chap01/pokemon_type1.txt')
secondary_types = read_lines('data/chap01/pokemon_type2.txt')
# Generations are kept as strings exactly as read from the file
generations = read_lines('data/chap01/pokemon_gen.txt')

In [9]:
# Combine lists

# zip() walks the three lists in lockstep, yielding one tuple per Pokémon
names_types = list(zip(names, primary_types, secondary_types))
print(*names_types[:5], sep='\n')
print()

# zip() stops at the shortest input: five names and three types give three pairs
first_five_names, first_three_types = names[:5], primary_types[:3]
differing_lengths = list(zip(first_five_names, first_three_types))
print(*differing_lengths, sep='\n')

('Abomasnow', 'Grass', 'Ice')
('Abra', 'Psychic', 'nan')
('Absol', 'Dark', 'nan')
('Accelgor', 'Bug', 'nan')
('Aerodactyl', 'Rock', 'Flying')

('Abomasnow', 'Grass')
('Abra', 'Psychic')
('Absol', 'Dark')


In [10]:
# Counting from a sample

from collections import Counter

# Counter tallies how often each value occurs; here, Pokémon per primary type
type_count = Counter(primary_types)
print(type_count, '\n')

# Pokémon per generation. The generations were read as text, so the keys are
# strings such as '5', not integers
gen_count = Counter(generations)
print(gen_count, '\n')

# Pokémon per starting letter.
# name[0] raises IndexError on an empty string, so this assumes the names
# list contains no blank entries
starting_letters = [name[0] for name in names]
starting_letters_count = Counter(starting_letters)
print(starting_letters_count)

Counter({'Water': 105, 'Normal': 92, 'Bug': 65, 'Grass': 64, 'Fire': 48, 'Psychic': 46, 'Rock': 41, 'Electric': 40, 'Ground': 30, 'Dark': 28, 'Poison': 28, 'Dragon': 25, 'Fighting': 25, 'Ice': 23, 'Steel': 21, 'Ghost': 20, 'Fairy': 17, 'Flying': 2}) 

Counter({'5': 122, '3': 103, '1': 99, '4': 78, '2': 51, '6': 47}) 

Counter({'S': 102, 'M': 58, 'C': 55, 'P': 47, 'G': 46, 'D': 41, 'B': 39, 'T': 35, 'L': 33, 'A': 32, 'R': 30, 'H': 27, 'F': 26, 'K': 25, 'W': 23, 'V': 22, 'E': 21, 'N': 16, 'Z': 9, 'J': 7, 'O': 6, 'I': 5, 'U': 5, 'Q': 4, 'Y': 4, 'X': 2})


In [11]:
# Combinations

from itertools import combinations
import random

# Draw five distinct Pokémon. A dedicated, seeded generator keeps the sample
# (and so the printed combinations) identical on every run without touching
# the global `random` state.
pokemon = random.Random(42).sample(names, 5)

# combinations() is lazy: it returns an iterator over all pairs
combos_obj = combinations(pokemon, 2)
print(type(combos_obj), '\n')

# Unpacking consumes the iterator into a list (10 pairs for 5 items)
combos_2 = [*combos_obj]
print(combos_2, '\n')

# Collect all possible combinations of 4 Pokémon directly into a list
combos_4 = [*combinations(pokemon, 4)]
print(combos_4)

<class 'itertools.combinations'> 

[('Glalie', 'Sandslash'), ('Glalie', 'Medicham'), ('Glalie', 'Buneary'), ('Glalie', 'Shelmet'), ('Sandslash', 'Medicham'), ('Sandslash', 'Buneary'), ('Sandslash', 'Shelmet'), ('Medicham', 'Buneary'), ('Medicham', 'Shelmet'), ('Buneary', 'Shelmet')] 

[('Glalie', 'Sandslash', 'Medicham', 'Buneary'), ('Glalie', 'Sandslash', 'Medicham', 'Shelmet'), ('Glalie', 'Sandslash', 'Buneary', 'Shelmet'), ('Glalie', 'Medicham', 'Buneary', 'Shelmet'), ('Sandslash', 'Medicham', 'Buneary', 'Shelmet')]


In [12]:
# Set

ash_pokedex = ['Pikachu', 'Bulbasaur', 'Koffing', 'Spearow', 'Vulpix', 'Wigglytuff', 'Zubat', 'Rattata', 'Psyduck', 'Squirtle']
misty_pokedex = ['Krabby', 'Horsea', 'Slowbro', 'Tentacool', 'Vaporeon', 'Magikarp', 'Poliwag', 'Starmie', 'Psyduck', 'Squirtle']

ash_set, misty_set = set(ash_pokedex), set(misty_pokedex)

# Intersection: Pokémon that appear in both sets
both = ash_set & misty_set
print(both)

# Difference: Pokémon that Ash has and Misty does not
ash_only = ash_set - misty_set
print(ash_only)

# Symmetric difference: Pokémon that are in exactly one of the two sets
unique_to_set = ash_set ^ misty_set
print(unique_to_set)

{'Squirtle', 'Psyduck'}
{'Spearow', 'Pikachu', 'Bulbasaur', 'Rattata', 'Vulpix', 'Zubat', 'Wigglytuff', 'Koffing'}
{'Krabby', 'Spearow', 'Horsea', 'Pikachu', 'Bulbasaur', 'Rattata', 'Magikarp', 'Wigglytuff', 'Slowbro', 'Vulpix', 'Vaporeon', 'Poliwag', 'Zubat', 'Starmie', 'Tentacool', 'Koffing'}


In [13]:
# Searching from set

# ash_pokedex is a list: `in` compares against the elements one by one
print('Pikachu' in ash_pokedex)
# misty_set is a set: `in` is a hash lookup, which is what makes sets fast
# for membership tests
print('Pikachu' in misty_set)

True
False


In [14]:
# Gathering unique items

def find_unique_items(data):
    """Return the distinct items of ``data`` in first-seen order.

    Membership is checked against a list, so items only need to support
    ``==`` (they do not have to be hashable).

    Args:
        data: Any iterable of items.

    Returns:
        A new list holding each distinct item once.
    """
    uniques = []

    for item in data:
        if item in uniques:
            continue
        uniques.append(item)

    return uniques


# Slow path: order-preserving scan that checks membership against a list
uniq_names_func = find_unique_items(names)
print(len(uniq_names_func))

# set is faster !!!
uniq_names_set = {*names}
print(len(uniq_names_set))

# Both collections must hold the same names once ordering is normalised
func_sorted, set_sorted = sorted(uniq_names_func), sorted(uniq_names_set)
print(func_sorted == set_sorted)

720
720
True


In [15]:
# A set is the quickest way to collect the distinct primary types and generations
uniq_types, uniq_gens = set(primary_types), set(generations)
print(uniq_types)
print(uniq_gens)

{'Dark', 'Fire', 'Ghost', 'Electric', 'Ice', 'Ground', 'Dragon', 'Fighting', 'Poison', 'Psychic', 'Grass', 'Bug', 'Normal', 'Water', 'Flying', 'Steel', 'Rock', 'Fairy'}
{'4', '2', '3', '1', '6', '5'}


In [16]:
# Eliminating loops

# NOTE(review): the Counter outputs above total 720 names but only 500
# generation entries; zip() silently stops at the shorter list -- confirm the
# two files are aligned row for row.

# Loop
gen1_gen2_name_lengths_loop = []

for name, gen in zip(names, generations):
    # Generations are stored as strings, hence the int() conversion
    if int(gen) >= 3:
        continue
    gen1_gen2_name_lengths_loop.append((name, len(name)))


# Effective code
gen1_gen2_pokemon = [poke for poke, generation in zip(names, generations) if int(generation) < 3]
name_lengths_map = map(len, gen1_gen2_pokemon)
gen1_gen2_name_lengths = list(zip(gen1_gen2_pokemon, name_lengths_map))

print(gen1_gen2_name_lengths_loop[:5])
print(gen1_gen2_name_lengths[:5])

[('Abomasnow', 9), ('Abra', 4), ('Absol', 5), ('Aipom', 5), ('Alomomola', 9)]
[('Abomasnow', 9), ('Abra', 4), ('Absol', 5), ('Aipom', 5), ('Alomomola', 9)]


In [17]:
# totals and averages without a loop

# One row per Pokémon; columns: hp, attack, defense, sp_attack, sp_defense
stats = np.loadtxt('data/chap01/pokemon_stat.txt')

# Loop
poke_list = []

for pokemon, row in zip(names, stats):
    poke_list.append((pokemon, row.sum(), row.mean()))


# Effective code
# axis=1 reduces across the columns, giving one value per row
total_stats_np = stats.sum(axis=1)
avg_stats_np = stats.mean(axis=1)
poke_list_np = list(zip(names, total_stats_np, avg_stats_np))

print(poke_list_np == poke_list, '\n')
print(poke_list_np[:3])
print(poke_list[:3], '\n')


# Rank by total stats (tuple position 1), highest first, and keep three
ranked_by_total = sorted(poke_list_np, key=lambda entry: entry[1], reverse=True)
top_3 = ranked_by_total[:3]
print('3 strongest Pokémon:\n{}'.format(top_3))

True 

[('Abomasnow', 253.0, 50.6), ('Abra', 325.0, 65.0), ('Absol', 503.0, 100.6)]
[('Abomasnow', 253.0, 50.6), ('Abra', 325.0, 65.0), ('Absol', 503.0, 100.6)] 

3 strongest Pokémon:
[('Zoroark', 623.0, 124.6), ('Marshtomp', 620.0, 124.0), ('Gothorita', 605.0, 121.0)]


In [18]:
# One-time calculation loop

from collections import Counter

# Tally how many Pokémon belong to each generation
gen_counts = Counter(generations)

# Loop-invariant: compute the denominator once, outside the loop
total_count = len(generations)

report_line = 'generation {}: count = {:3}, percentage = {}'
for gen, count in gen_counts.items():
    share = count / total_count * 100
    print(report_line.format(gen, count, round(share, 2)))

generation 1: count =  99, percentage = 19.8
generation 5: count = 122, percentage = 24.4
generation 3: count = 103, percentage = 20.6
generation 6: count =  47, percentage = 9.4
generation 4: count =  78, percentage = 15.6
generation 2: count =  51, percentage = 10.2


In [19]:
# Holistic conversion loop
# gather all the possible pairs of Pokémon types.

pokemon_types = sorted(set(primary_types))
print(pokemon_types, '\n')

# Every unordered pair of distinct types
possible_pairs = list(combinations(pokemon_types, 2))

# Prefix each pair with its 1-based position
enumerated_tuples = [(position, *pair) for position, pair in enumerate(possible_pairs, 1)]

# Convert every tuple to a list in a single pass, outside the numbering step
enumerated_pairs = [list(entry) for entry in enumerated_tuples]
print(enumerated_pairs[:30])

['Bug', 'Dark', 'Dragon', 'Electric', 'Fairy', 'Fighting', 'Fire', 'Flying', 'Ghost', 'Grass', 'Ground', 'Ice', 'Normal', 'Poison', 'Psychic', 'Rock', 'Steel', 'Water'] 

[[1, 'Bug', 'Dark'], [2, 'Bug', 'Dragon'], [3, 'Bug', 'Electric'], [4, 'Bug', 'Fairy'], [5, 'Bug', 'Fighting'], [6, 'Bug', 'Fire'], [7, 'Bug', 'Flying'], [8, 'Bug', 'Ghost'], [9, 'Bug', 'Grass'], [10, 'Bug', 'Ground'], [11, 'Bug', 'Ice'], [12, 'Bug', 'Normal'], [13, 'Bug', 'Poison'], [14, 'Bug', 'Psychic'], [15, 'Bug', 'Rock'], [16, 'Bug', 'Steel'], [17, 'Bug', 'Water'], [18, 'Dark', 'Dragon'], [19, 'Dark', 'Electric'], [20, 'Dark', 'Fairy'], [21, 'Dark', 'Fighting'], [22, 'Dark', 'Fire'], [23, 'Dark', 'Flying'], [24, 'Dark', 'Ghost'], [25, 'Dark', 'Grass'], [26, 'Dark', 'Ground'], [27, 'Dark', 'Ice'], [28, 'Dark', 'Normal'], [29, 'Dark', 'Poison'], [30, 'Dark', 'Psychic']]


In [20]:
# First column of stats holds the HP values
hps = stats[:, 0]
hp_avg, hp_std = hps.mean(), hps.std()

# Loop version
poke_zscores = []

for name, hp in zip(names, hps):
    poke_zscores.append((name, hp, (hp - hp_avg)/hp_std))

high_hp_pokemon = []

for entry in poke_zscores:
    # entry is (name, hp, z-score); keep HP more than 3 std devs above the mean
    if entry[2] > 3:
        high_hp_pokemon.append(entry)


# Effective code version
# Broadcasting standardises every HP value in one array expression
z_scores = (hps - hp_avg)/hp_std
poke_zscores2 = list(zip(names, hps, z_scores))
print(*poke_zscores2[:3], sep='\n')
print()

highest_hp_pokemon = [entry for entry in poke_zscores2 if entry[2] > 3]
print(*highest_hp_pokemon, sep='\n')

('Abomasnow', 45.0, -0.9026061903496466)
('Abra', 60.0, -0.33400219736223524)
('Absol', 80.0, 0.4241364599543133)

('Corphish', 250.0, 6.868315047144975)
('Diglett', 160.0, 3.4566910892205076)
('Fletchling', 190.0, 4.59389907519533)
('Golurk', 255.0, 7.0578497114741126)
('Igglybuff', 150.0, 3.0776217605622334)
('Kyogre', 170.0, 3.835760417878782)
('Ninjask', 150.0, 3.0776217605622334)
('Quilava', 150.0, 3.0776217605622334)
('Spoink', 165.0, 3.6462257535496447)
('Zoroark', 216.0, 5.579479329706843)


### 4. pandas DataFrame iteration

In [21]:
import pandas as pd

baseball_df = pd.read_csv('data/chap01/baseball_stats.csv')

# Boolean mask selecting the Pittsburgh rows
is_pit = baseball_df['Team'] == 'PIT'
pit_df = baseball_df[is_pit]
print(pit_df.head())

    Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  \
21   PIT     NL  2012  651  674  79  0.304  0.395  0.243         0   
51   PIT     NL  2011  610  712  72  0.309  0.368  0.244         0   
81   PIT     NL  2010  587  866  57  0.304  0.373  0.242         0   
111  PIT     NL  2009  636  768  62  0.318  0.387  0.252         0   
141  PIT     NL  2008  735  884  67  0.320  0.403  0.258         0   

     RankSeason  RankPlayoffs    G   OOBP   OSLG  
21          NaN           NaN  162  0.314  0.390  
51          NaN           NaN  162  0.338  0.409  
81          NaN           NaN  162  0.348  0.449  
111         NaN           NaN  161  0.346  0.442  
141         NaN           NaN  162  0.362  0.454  


In [22]:
# Iterating with .iterrows()

# .iterrows() yields (index label, row Series) pairs. Only the first row is
# shown, so iterate over a one-row slice instead of breaking on a hard-coded
# index label that would stop matching if the data changed.
first_row_df = pit_df.head(1)

for i, row in first_row_df.iterrows():
    print(row)

print()

# Without unpacking, each item is the (index label, row Series) tuple itself
for row_tuple in first_row_df.iterrows():
    print(row_tuple)

Team              PIT
League             NL
Year             2012
RS                651
RA                674
W                  79
OBP             0.304
SLG             0.395
BA              0.243
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.314
OSLG             0.39
Name: 21, dtype: object

(21, Team              PIT
League             NL
Year             2012
RS                651
RA                674
W                  79
OBP             0.304
SLG             0.395
BA              0.243
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.314
OSLG             0.39
Name: 21, dtype: object)


In [23]:
# Run differentials with .iterrows()

# Compute the run differential for every season:
# runs scored (RS) minus runs allowed (RA).

# .copy() makes giants_df an independent frame, so adding columns to it does
# not write into a filtered slice of baseball_df (SettingWithCopyWarning).
giants_df = baseball_df[baseball_df.Team == 'SFG'].copy()

run_diffs = []

# Walk the rows and compute runs scored minus runs allowed for each season
for i, row in giants_df.iterrows():
    run_diff = row['RS'] - row['RA']
    run_diffs.append(run_diff)

giants_df['RD'] = run_diffs
print(giants_df.head())

    Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  \
24   SFG     NL  2012  718  649  94  0.327  0.397  0.269         1   
54   SFG     NL  2011  570  578  86  0.303  0.368  0.242         0   
84   SFG     NL  2010  697  583  92  0.321  0.408  0.257         1   
114  SFG     NL  2009  657  611  88  0.309  0.389  0.257         0   
144  SFG     NL  2008  640  759  72  0.321  0.382  0.262         0   

     RankSeason  RankPlayoffs    G   OOBP   OSLG   RD  
24          4.0           1.0  162  0.313  0.393   69  
54          NaN           NaN  162  0.309  0.346   -8  
84          5.0           1.0  162  0.313  0.370  114  
114         NaN           NaN  162  0.314  0.372   46  
144         NaN           NaN  162  0.341  0.404 -119  


In [24]:
# Iterating with .itertuples()

run_diffs = []

# Each row arrives as a namedtuple, so columns are read as attributes.
# Collect every season's run differential and print Index, Year and Wins (W)
# for the seasons that reached the playoffs.
for row in giants_df.itertuples():
    run_diffs.append(row.RS - row.RA)

    if row.Playoffs != 1:
        continue
    print(row.Index, row.Year, row.W)   # W : wins

giants_df['RD'] = run_diffs
print(giants_df.head())

24 2012 94
84 2010 92
295 2003 100
325 2002 95
385 2000 97
474 1997 90
634 1989 92
686 1987 90
1041 1971 90
1229 1962 103
    Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  \
24   SFG     NL  2012  718  649  94  0.327  0.397  0.269         1   
54   SFG     NL  2011  570  578  86  0.303  0.368  0.242         0   
84   SFG     NL  2010  697  583  92  0.321  0.408  0.257         1   
114  SFG     NL  2009  657  611  88  0.309  0.389  0.257         0   
144  SFG     NL  2008  640  759  72  0.321  0.382  0.262         0   

     RankSeason  RankPlayoffs    G   OOBP   OSLG   RD  
24          4.0           1.0  162  0.313  0.393   69  
54          NaN           NaN  162  0.309  0.346   -8  
84          5.0           1.0  162  0.313  0.370  114  
114         NaN           NaN  162  0.314  0.372   46  
144         NaN           NaN  162  0.341  0.404 -119  


In [25]:
# Analyzing baseball stats with .apply()

runs_df = giants_df[['RS','RA','W','Playoffs']]

# Column-wise totals: axis=0 hands each column to sum()
stat_totals = runs_df.apply(sum, axis=0)
print(stat_totals, '\n')

# Row-wise totals: runs scored plus runs allowed for each season
run_columns = runs_df[['RS', 'RA']]
total_runs_scored = run_columns.apply(sum, axis=1)
print(total_runs_scored.iloc[:5], '\n')

# Convert numeric playoffs to text
def text_playoffs(num_playoffs):
    """Translate the numeric playoff flag into text.

    Args:
        num_playoffs: Playoff indicator; the value 1 means the playoffs were reached.

    Returns:
        'Yes' when num_playoffs equals 1, otherwise 'No'.
    """
    return 'Yes' if num_playoffs == 1 else 'No'

# axis=1 passes each row to the lambda, which converts that season's Playoffs flag
textual_playoffs = runs_df.apply(lambda season: text_playoffs(season['Playoffs']), axis=1)
print(textual_playoffs.iloc[:10])

RS          33198
RA          32042
W            3975
Playoffs       10
dtype: int64 

24     1367
54     1148
84     1280
114    1268
144    1399
dtype: int64 

24     Yes
54      No
84     Yes
114     No
144     No
174     No
204     No
234     No
265     No
295    Yes
dtype: object


In [26]:
# Settle a debate with .apply()

def calc_win_perc(wins, games_played):
    """Return the share of games won, rounded to two decimals.

    Works on scalars and, element-wise, on NumPy arrays of equal shape.

    Args:
        wins: Number of games won.
        games_played: Number of games played.

    Returns:
        wins / games_played rounded with np.round(..., 2).
    """
    return np.round(wins / games_played, 2)

# Row-wise apply: one win percentage per season
win_percs = giants_df.apply(lambda season: calc_win_perc(season['W'], season['G']), axis=1)
print(win_percs.iloc[:10], '\n')

giants_df['WP'] = win_percs
print(giants_df.head(), '\n')

# Display the giants_df seasons whose WP is at least 0.60
is_high_wp = giants_df['WP'] >= 0.60
print(giants_df[is_high_wp])

24     0.58
54     0.53
84     0.57
114    0.54
144    0.44
174    0.44
204    0.47
234    0.46
265    0.56
295    0.62
dtype: float64 

    Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  \
24   SFG     NL  2012  718  649  94  0.327  0.397  0.269         1   
54   SFG     NL  2011  570  578  86  0.303  0.368  0.242         0   
84   SFG     NL  2010  697  583  92  0.321  0.408  0.257         1   
114  SFG     NL  2009  657  611  88  0.309  0.389  0.257         0   
144  SFG     NL  2008  640  759  72  0.321  0.382  0.262         0   

     RankSeason  RankPlayoffs    G   OOBP   OSLG   RD    WP  
24          4.0           1.0  162  0.313  0.393   69  0.58  
54          NaN           NaN  162  0.309  0.346   -8  0.53  
84          5.0           1.0  162  0.313  0.370  114  0.57  
114         NaN           NaN  162  0.314  0.372   46  0.54  
144         NaN           NaN  162  0.341  0.404 -119  0.44   

     Team League  Year   RS   RA    W    OBP    SLG     BA  Playoffs

In [27]:
# Replacing .iloc with underlying arrays

# old version: fetch one row at a time by position
win_percs_list = []

for i in range(len(giants_df)):
    row = giants_df.iloc[i]
    win_percs_list.append(calc_win_perc(row['W'], row['G']))

giants_df['WP'] = win_percs_list


# Effective version
# Pass the whole W and G arrays so the division and rounding run once, element-wise
wins_np, games_np = giants_df['W'].values, giants_df['G'].values
win_percs_np = calc_win_perc(wins_np, games_np)

# Store the win percentages as the WP column of giants_df
giants_df['WP'] = win_percs_np

print(giants_df.head())

    Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  \
24   SFG     NL  2012  718  649  94  0.327  0.397  0.269         1   
54   SFG     NL  2011  570  578  86  0.303  0.368  0.242         0   
84   SFG     NL  2010  697  583  92  0.321  0.408  0.257         1   
114  SFG     NL  2009  657  611  88  0.309  0.389  0.257         0   
144  SFG     NL  2008  640  759  72  0.321  0.382  0.262         0   

     RankSeason  RankPlayoffs    G   OOBP   OSLG   RD    WP  
24          4.0           1.0  162  0.313  0.393   69  0.58  
54          NaN           NaN  162  0.309  0.346   -8  0.53  
84          5.0           1.0  162  0.313  0.370  114  0.57  
114         NaN           NaN  162  0.314  0.372   46  0.54  
144         NaN           NaN  162  0.341  0.404 -119  0.44  


In [28]:
# Predict win percentage

def predict_win_perc(RS, RA):
    """Predict a win percentage from runs scored and runs allowed.

    Uses RS^2 / (RS^2 + RA^2) (the Pythagorean expectation with exponent 2).
    Works on scalars and, element-wise, on NumPy arrays of equal shape.

    Args:
        RS: Runs scored.
        RA: Runs allowed.

    Returns:
        The predicted win percentage rounded with np.round(..., 2).
    """
    scored_sq = RS ** 2
    allowed_sq = RA ** 2
    return np.round(scored_sq / (scored_sq + allowed_sq), 2)

# version 1 : Loop
win_perc_preds_loop = []

for row in baseball_df.itertuples():
    win_perc_preds_loop.append(predict_win_perc(row.RS, row.RA))


# version 2 : apply
win_perc_preds_apply = baseball_df.apply(lambda season: predict_win_perc(season['RS'], season['RA']), axis=1)

# version 3 : using NumPy arrays
# The full RS and RA arrays go through the formula in one element-wise call
runs_scored, runs_allowed = baseball_df['RS'].values, baseball_df['RA'].values
win_perc_preds_np = predict_win_perc(runs_scored, runs_allowed)
baseball_df['WP_preds'] = win_perc_preds_np

print(baseball_df.head())

  Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  RankSeason  \
0  ARI     NL  2012  734  688  81  0.328  0.418  0.259         0         NaN   
1  ATL     NL  2012  700  600  94  0.320  0.389  0.247         1         4.0   
2  BAL     AL  2012  712  705  93  0.311  0.417  0.247         1         5.0   
3  BOS     AL  2012  734  806  69  0.315  0.415  0.260         0         NaN   
4  CHC     NL  2012  613  759  61  0.302  0.378  0.240         0         NaN   

   RankPlayoffs    G   OOBP   OSLG  WP_preds  
0           NaN  162  0.317  0.415      0.53  
1           5.0  162  0.306  0.378      0.58  
2           4.0  162  0.315  0.403      0.50  
3           NaN  162  0.331  0.428      0.45  
4           NaN  162  0.335  0.424      0.39  
