## Chapter 1:Foundations for efficiencies

* Q: In the context of this course, what is meant by efficient Python code?
* A: Code that executes quickly for the task at hand, minimizes the memory footprint and follows Python's coding style principles.


In [266]:
names = ['Jerry', 'Kramer', 'Elaine', 'George', 'Newman']

In [267]:
# Print the list created using the Non-Pythonic approach
i = 0
new_list= []
while i < len(names):
    if len(names[i]) >= 6:
        new_list.append(names[i])
    i += 1
print(new_list)

['Kramer', 'Elaine', 'George', 'Newman']


In [268]:
# Print the list created by looping over the contents of names
better_list = []
for name in names:
    if len(name) >= 6:
        better_list.append(name)
print(better_list)

['Kramer', 'Elaine', 'George', 'Newman']


In [269]:
# Print the list created by using list comprehension
best_list = [name for name in names if len(name) >= 6]
print(best_list)

['Kramer', 'Elaine', 'George', 'Newman']


In [270]:
import this

In [271]:
# Create a range object that goes from 0 to 5
nums = range(6)
print(type(nums))

# Convert nums to a list
nums_list = list(nums)
print(nums_list)

# Create a new list of odd numbers from 1 to 11 by unpacking a range object
nums_list2 = [*range(1,12,2)]
print(nums_list2)

<class 'range'>
[0, 1, 2, 3, 4, 5]
[1, 3, 5, 7, 9, 11]


In [272]:
# Rewrite the for loop to use enumerate
indexed_names = []
for i,name in enumerate(names):
    index_name = (i,name)
    indexed_names.append(index_name) 
print(indexed_names)

# Rewrite the above for loop using list comprehension
indexed_names_comp = [(i,name) for i,name in enumerate(names)]
print(indexed_names_comp)

# Unpack an enumerate object with a starting index of one
indexed_names_unpack = [*enumerate(names, 1)]
print(indexed_names_unpack)

[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(0, 'Jerry'), (1, 'Kramer'), (2, 'Elaine'), (3, 'George'), (4, 'Newman')]
[(1, 'Jerry'), (2, 'Kramer'), (3, 'Elaine'), (4, 'George'), (5, 'Newman')]


In [273]:
# Use map to apply str.upper to each element in names
names_map  = map(str.upper, names)

# Print the type of the names_map
print(type(names_map))

# Unpack names_map into a list
names_uppercase = [*names_map]

# Print the list created above
print(names_uppercase)

<class 'map'>
['JERRY', 'KRAMER', 'ELAINE', 'GEORGE', 'NEWMAN']


* Basic 1-D indexing (lists)
    * nums=[-2,-1,0,1,2]
    * nums[-1] -> 2
    * nums[1:4] -> [-1,0,1]
* Basic 1-D indexing (arrays)
    * numps_np=np.arry(nums)
    * numps_np[-1] -> 2
    * numps_np[1:4] -> array([-1,0,1])

* Basic 2-D indexing (lists)
    * nums2=[[1,2,3],[4,5,6]]
    * nums2[0][1] -> 2
    * [row[0] for row in nums2] -> [1,4]
    * nums2[1:4] -> [-1,0,1]
* Basic 2-D indexing (arrays)
    * numps2_np=np.arry(nums)
    * numps2_np[0,1] -> 2
    * numps2_np[:,0] -> array([1,4])
    * numps2_np[1:4] -> array([-1,0,1])

In [274]:
import numpy as np
nums=np.reshape(range(1,11), (2,5))

In [275]:

# Print second row of nums
print(nums[1,:])

# Print all elements of nums that are greater than six
print(nums[nums > 6])

# Double every element of nums
nums_dbl = nums * 2
print(nums_dbl)

# Replace the third column of nums
nums[:,2] = nums[:,2] + 1
print(nums)

[ 6  7  8  9 10]
[ 7  8  9 10]
[[ 2  4  6  8 10]
 [12 14 16 18 20]]
[[ 1  2  4  4  5]
 [ 6  7  9  9 10]]


In [276]:
# Use range() to create a list of arrival times (10 through 50 incremented by 10). Create the list arrival_times by unpacking the range object.
# Create a list of arrival times
arrival_times = [*range(10, 51, 10)]

print(arrival_times)

[10, 20, 30, 40, 50]


In [277]:
# Create a list of arrival times
arrival_times = [*range(10,60,10)]

# Convert arrival_times to an array and update the times
arrival_times_np = np.array(arrival_times)
new_times = arrival_times_np - 3

print(new_times)

[ 7 17 27 37 47]


In [278]:
# Create a list of arrival times
arrival_times = [*range(10,60,10)]

# Convert arrival_times to an array and update the times
arrival_times_np = np.array(arrival_times)
new_times = arrival_times_np - 3

# Use list comprehension and enumerate to pair guests to new times
guest_arrivals = [(names[i],time) for i,time in enumerate(new_times)]

print(guest_arrivals)

[('Jerry', 7), ('Kramer', 17), ('Elaine', 27), ('George', 37), ('Newman', 47)]


In [279]:
def welcome_guest(guest_arrival):
    name,time=guest_arrival
    return (f"Welcome to Festivus {name}... You're {time} min late.")

In [280]:
# Create a list of arrival times
arrival_times = [*range(10,60,10)]

# Convert arrival_times to an array and update the times
arrival_times_np = np.array(arrival_times)
new_times = arrival_times_np - 3

# Use list comprehension and enumerate to pair guests to new times
guest_arrivals = [(names[i],time) for i,time in enumerate(new_times)]

# Map the welcome_guest function to each (guest,time) pair
welcome_map = map(welcome_guest, guest_arrivals)

guest_welcomes = [*welcome_map]
print(*guest_welcomes, sep='\n')

Welcome to Festivus Jerry... You're 7 min late.
Welcome to Festivus Kramer... You're 17 min late.
Welcome to Festivus Elaine... You're 27 min late.
Welcome to Festivus George... You're 37 min late.
Welcome to Festivus Newman... You're 47 min late.


## Chapter 2:Timing and profiling code

In [281]:
%timeit

In [282]:
# Create a list of integers (0-50) using list comprehension
%timeit nums_list_comp = [num for num in range(51)]


# Create a list of integers (0-50) by unpacking range
%timeit nums_unpack = [*range(51)]


2.34 µs ± 175 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
566 ns ± 6.4 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [283]:
import pandas as pd
df=pd.read_csv('../datasets/heroes.csv', index_col=0)
print(df)
wts=np.array(df['Weight'])
hts=np.array(df['Height'])

name  Gender Eye color               Race        Hair color  \
X1                                                                            
0             A-Bomb    Male    yellow              Human           No Hair   
1         Abe Sapien    Male      blue      Icthyo Sapien           No Hair   
2           Abin Sur    Male      blue            Ungaran           No Hair   
3        Abomination    Male     green  Human / Radiation           No Hair   
4            Abraxas    Male      blue      Cosmic Entity             Black   
..               ...     ...       ...                ...               ...   
729  Yellowjacket II  Female      blue              Human  Strawberry Blond   
730             Ymir    Male     white        Frost Giant           No Hair   
731             Yoda    Male     brown     Yoda's species             White   
732          Zatanna  Female      blue              Human             Black   
733             Zoom    Male       red                  -           

In [284]:
heroes=np.array(df['name'])

In [285]:
# Question: What is the correct syntax when using %timeit and only using 5 runs with 25 loops per each run?
%timeit -r5 -n25 set(heroes)

47.1 µs ± 2.79 µs per loop (mean ± std. dev. of 5 runs, 25 loops each)


In [286]:
# Create a list using the formal name
formal_list = list()
print(formal_list)

# Create a list using the literal syntax
literal_list = []
print(literal_list)

# Print out the type of formal_list
print(type(formal_list))

# Print out the type of literal_list
print(type(literal_list))

[]
[]
<class 'list'>
<class 'list'>


In [287]:
%timeit formal_list = list()
%timeit literal_list = []

105 ns ± 0.969 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
22.6 ns ± 2.04 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [288]:
%%timeit 
hero_wts_lbs = []
for wt in wts:
    hero_wts_lbs.append(wt * 2.20462)


330 µs ± 5.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [289]:
%%timeit
wts_np = np.array(wts)
hero_wts_lbs_np = wts_np * 2.20462

2.5 µs ± 76.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [290]:
def convert_units(heroes, heights, weights):

    new_hts = [ht * 0.39370  for ht in heights]
    new_wts = [wt * 2.20462  for wt in weights]

    hero_data = {}

    for i,hero in enumerate(heroes):
        hero_data[hero] = (new_hts[i], new_wts[i])

    return hero_data

* Question:
What are the necessary steps you need to take in order to profile the convert_units() function acting on your superheroes data if you'd like to see line-by-line runtimes?
* Answer:
    * Use %load_ext line_profiler to load the line_profiler within your IPython session.
    * Use %lprun -f convert_units convert_units(heroes, hts, wts) to get line-by-line runtimes.


* Question: What percentage of time is spent on the new_hts list comprehension line of code relative to the total amount of time spent in the convert_units() function?
* Answer: 11% - 20%

* Q: What percentage of time is spent on the new_hts array broadcasting line of code relative to the total amount of time spent in the convert_units_broadcast() function?
* A: 0% - 10%

* Question: What are the necessary steps you need to take in order to profile the convert_units() function acting on your superheroes data if you'd like to see the line-by-line memory consumption of convert_units()?
* Answer: 
    * Use the command from hero_funcs import convert_units to load the function you'd like to profile.
    * Use %load_ext memory_profiler to load the memory_profiler within your IPython session.
    * Use %mprun -f convert_units convert_units(heroes, hts, wts) to get line-by-line memory allocations.

In [291]:
from bmi_lists import *

* Question: How much memory do the list comprehension lines of code consume in the calc_bmi_lists() function? (i.e., what is the total sum of the Increment column for these four lines of code?)
* Answer: After above code running, Increment column value = 0.1 MiB
* Note: In exercise pre-notes: A random sample of 25,000 superheroes has been loaded into your session as an array called sample_indices. 

In [292]:
sample_indices= np.random.randint(0,479,25000)
print(sample_indices)

[167 236  74 ... 318 230 114]


In [293]:
%reload_ext memory_profiler
%mprun -f calc_bmi_lists calc_bmi_lists(sample_indices, hts, wts)





Filename: d:\çalışma\ml_days\ML_Python101\datacamp\python_programming\bmi_lists.py

Line #    Mem usage    Increment   Line Contents
     1    116.2 MiB    116.2 MiB   def calc_bmi_lists(sample_indices, hts, wts):
     2                             
     3                                 # Gather sample heights and weights as lists
     4    116.2 MiB      0.0 MiB       s_hts = [hts[i] for i in sample_indices]
     5    116.2 MiB      0.0 MiB       s_wts = [wts[i] for i in sample_indices]
     6                             
     7                                 # Convert heights from cm to m and square with list comprehension
     8    116.3 MiB      0.1 MiB       s_hts_m_sqr = [(ht / 100) ** 2 for ht in s_hts]
     9                             
    10                                 # Calculate BMIs as a list with list comprehension
    11    116.7 MiB      0.1 MiB       bmis = [s_wts[i] / s_hts_m_sqr[i] for i in range(len(sample_indices))]
    12    116.7 MiB      0.0 MiB       ret

In [294]:
from bmi_arrays import calc_bmi_arrays

In [295]:
%reload_ext memory_profiler
%mprun -f calc_bmi_arrays calc_bmi_arrays(sample_indices, hts, wts)




Filename: d:\çalışma\ml_days\ML_Python101\datacamp\python_programming\bmi_arrays.py

Line #    Mem usage    Increment   Line Contents
     1    116.2 MiB    116.2 MiB   def calc_bmi_arrays(sample_indices, hts, wts):
     2                                 
     3                                 # Gather sample heights and weights as arrays
     4    116.2 MiB      0.0 MiB       s_hts = hts[sample_indices]
     5    116.2 MiB      0.0 MiB       s_wts = wts[sample_indices]
     6                             
     7                                 # Convert heights from cm to m and square with broadcasting
     8    116.2 MiB      0.0 MiB       s_hts_m_sqr = (s_hts / 100) ** 2
     9                             
    10                                 # Calculate BMIs as an array using broadcasting
    11    116.2 MiB      0.0 MiB       bmis = s_wts / s_hts_m_sqr
    12                             
    13    116.2 MiB      0.0 MiB       return bmis

In [296]:
from publishers import *

In [297]:
publishers= list(df['Publisher'])
desired_publisher=publishers[publishers==['George Lucas']]
# Use get_publisher_heroes() to gather Star Wars heroes
star_wars_heroes = get_publisher_heroes(heroes, publishers, desired_publisher)

print(star_wars_heroes)
print(type(star_wars_heroes))

# Use get_publisher_heroes_np() to gather Star Wars heroes
star_wars_heroes_np = get_publisher_heroes_np(heroes, publishers, desired_publisher)

print(star_wars_heroes_np)
print(type(star_wars_heroes_np))


['A-Bomb', 'Abomination', 'Abraxas', 'Absorbing Man', 'Agent 13', 'Agent Bob', 'Agent Zero', 'Air-Walker', 'Ajax', 'Ammo', 'Angel', 'Angel Dust', 'Angel Salvadore', 'Annihilus', 'Ant-Man', 'Ant-Man II', 'Anti-Venom', 'Apocalypse', 'Arachne', 'Archangel', 'Arclight', 'Ardina', 'Ares', 'Ariel', 'Armor', 'Atlas', 'Aurora', 'Azazel', 'Banshee', 'Bantam', 'Battlestar', 'Beak', 'Beast', 'Beetle', 'Beta Ray Bill', 'Beyonder', 'Big Man', 'Binary', 'Bird-Brain', 'Bird-Man', 'Bird-Man II', 'Bishop', 'Black Abbott', 'Black Bolt', 'Black Cat', 'Black Goliath', 'Black Knight III', 'Black Mamba', 'Black Panther', 'Black Widow', 'Black Widow II', 'Blackout', 'Blackwing', 'Blackwulf', 'Blade', 'Blaquesmith', 'Bling!', 'Blink', 'Blizzard', 'Blizzard', 'Blizzard II', 'Blob', 'Bloodaxe', 'Bloodhawk', 'Bloodwraith', 'Bolt', 'Boom-Boom', 'Boomer', 'Box', 'Box III', 'Box IV', 'Brother Voodoo', 'Bullseye', 'Bumbleboy', 'Cable', 'Callisto', 'Cannonball', 'Captain America', 'Captain Britain', 'Captain Mar-vell

In [298]:
%load_ext memory_profiler
%mprun -f get_publisher_heroes get_publisher_heroes(heroes, publishers, desired_publisher)



The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler



Filename: d:\çalışma\ml_days\ML_Python101\datacamp\python_programming\publishers.py

Line #    Mem usage    Increment   Line Contents
     2    116.2 MiB    116.2 MiB   def get_publisher_heroes(heroes, publishers, desired_publisher):
     3                                 
     4    116.2 MiB      0.0 MiB       desired_heroes = []
     5                             
     6    116.2 MiB      0.0 MiB       for i,pub in enumerate(publishers):
     7    116.2 MiB      0.0 MiB           if pub == desired_publisher:
     8    116.2 MiB      0.0 MiB               desired_heroes.append(heroes[i])
     9                             
    10    116.2 MiB      0.0 MiB       return desired_heroes

In [299]:
%load_ext memory_profiler
%mprun -f get_publisher_heroes_np get_publisher_heroes_np(heroes, publishers, desired_publisher)

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler



Filename: d:\çalışma\ml_days\ML_Python101\datacamp\python_programming\publishers.py

Line #    Mem usage    Increment   Line Contents
    12    116.2 MiB    116.2 MiB   def get_publisher_heroes_np(heroes, publishers, desired_publisher):
    13                             
    14    116.2 MiB      0.0 MiB       heroes_np = np.array(heroes)
    15    116.2 MiB      0.0 MiB       pubs_np = np.array(publishers)
    16                             
    17    116.2 MiB      0.0 MiB       desired_heroes = heroes_np[pubs_np == desired_publisher]
    18                             
    19    116.2 MiB      0.0 MiB       return desired_heroes

In [300]:
#np function has less time and equal memory usage. so prefer np

## Chapter 3: Gaining efficiencies

In [301]:
pokeman_data=pd.read_csv('../datasets/pokemon.csv')
print(pokeman_data.columns)
names=list(pokeman_data['name'])
primary_types=list(pokeman_data['type1'])
secondary_types=list(pokeman_data['type2'])
generations=list(pokeman_data['generation'])

Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
       'against_electric', 'against_fairy', 'against_fight', 'against_fire',
       'against_flying', 'against_ghost', 'against_grass', 'against_ground',
       'against_ice', 'against_normal', 'against_poison', 'against_psychic',
       'against_rock', 'against_steel', 'against_water', 'attack',
       'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
       'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
       'japanese_name', 'name', 'percentage_male', 'pokedex_number',
       'sp_attack', 'sp_defense', 'speed', 'type1', 'type2', 'weight_kg',
       'generation', 'is_legendary'],
      dtype='object')


In [302]:
# Combine names and primary_types
names_type1 = [*zip(names, primary_types)]

print(*names_type1[:5], sep='\n')

('Bulbasaur', 'grass')
('Ivysaur', 'grass')
('Venusaur', 'grass')
('Charmander', 'fire')
('Charmeleon', 'fire')


In [303]:
# Combine all three lists together
names_types = [*zip(names, primary_types, secondary_types)]

print(*names_types[:5], sep='\n')

('Bulbasaur', 'grass', 'poison')
('Ivysaur', 'grass', 'poison')
('Venusaur', 'grass', 'poison')
('Charmander', 'fire', nan)
('Charmeleon', 'fire', nan)


In [304]:
# Combine five items from names and three items from primary_types
differing_lengths = [*zip(names[0:5], primary_types[:3])]

print(*differing_lengths, sep='\n')

('Bulbasaur', 'grass')
('Ivysaur', 'grass')
('Venusaur', 'grass')


In [305]:
from collections import Counter

In [306]:

# Collect the count of primary types
type_count = Counter(primary_types)
print(type_count, '\n')

# Collect the count of generations
gen_count = Counter(generations)
print(gen_count, '\n')

# Use list comprehension to get each Pokémon's starting letter
starting_letters = [name[0] for name in names]

# Collect the count of Pokémon for each starting_letter
starting_letters_count = Counter(starting_letters)
print(starting_letters_count)

Counter({'water': 114, 'normal': 105, 'grass': 78, 'bug': 72, 'psychic': 53, 'fire': 52, 'rock': 45, 'electric': 39, 'poison': 32, 'ground': 32, 'dark': 29, 'fighting': 28, 'ghost': 27, 'dragon': 27, 'steel': 24, 'ice': 23, 'fairy': 18, 'flying': 3}) 

Counter({5: 156, 1: 151, 3: 135, 4: 107, 2: 100, 7: 80, 6: 72}) 

Counter({'S': 111, 'M': 66, 'C': 63, 'P': 55, 'G': 51, 'T': 49, 'D': 46, 'B': 44, 'L': 38, 'A': 34, 'H': 29, 'R': 28, 'F': 27, 'K': 26, 'V': 23, 'W': 23, 'E': 21, 'N': 18, 'Z': 10, 'J': 8, 'O': 8, 'I': 6, 'Y': 5, 'U': 5, 'Q': 4, 'X': 3})


In [307]:
pokemon= ['Geodude', 'Cubone', 'Lickitung', 'Persian', 'Diglett']

In [308]:
# Import combinations from itertools
from itertools import combinations

# Create a combination object with pairs of Pokémon
combos_obj = combinations(pokemon, 2)
print(type(combos_obj), '\n')

# Convert combos_obj to a list by unpacking
combos_2 = [*combos_obj]
print(combos_2, '\n')

# Collect all possible combinations of 4 Pokémon directly into a list
combos_4 = [*combinations(pokemon, 4)]
print(combos_4)

<class 'itertools.combinations'> 

[('Geodude', 'Cubone'), ('Geodude', 'Lickitung'), ('Geodude', 'Persian'), ('Geodude', 'Diglett'), ('Cubone', 'Lickitung'), ('Cubone', 'Persian'), ('Cubone', 'Diglett'), ('Lickitung', 'Persian'), ('Lickitung', 'Diglett'), ('Persian', 'Diglett')] 

[('Geodude', 'Cubone', 'Lickitung', 'Persian'), ('Geodude', 'Cubone', 'Lickitung', 'Diglett'), ('Geodude', 'Cubone', 'Persian', 'Diglett'), ('Geodude', 'Lickitung', 'Persian', 'Diglett'), ('Cubone', 'Lickitung', 'Persian', 'Diglett')]


In [309]:
print('2 comb: ', len(combos_2), ' 4 comb:', len(combos_4))

2 comb:  10  4 comb: 5


In [310]:
ash_pokedex=['Pikachu', 'Bulbasaur', 'Koffing', 'Spearow', 'Vulpix', 'Wigglytuff', 'Zubat', 'Rattata', 'Psyduck', 'Squirtle']
misty_pokedex=['Krabby', 'Horsea', 'Slowbro', 'Tentacool', 'Vaporeon', 'Magikarp', 'Poliwag', 'Starmie', 'Psyduck', 'Squirtle']

In [311]:
# Convert both lists to sets
ash_set = set(ash_pokedex)
misty_set = set(misty_pokedex)

# Find the Pokémon that exist in both sets
both = ash_set.intersection(misty_set)
print(both)

# Find the Pokémon that Ash has, and Misty does not have
ash_only = ash_set.difference(misty_set)
print(ash_only)

# Find the Pokémon that are in only one set (not both)
unique_to_set = ash_set.symmetric_difference(misty_set)
print(unique_to_set)

{'Squirtle', 'Psyduck'}
{'Rattata', 'Pikachu', 'Koffing', 'Spearow', 'Bulbasaur', 'Vulpix', 'Zubat', 'Wigglytuff'}
{'Poliwag', 'Tentacool', 'Starmie', 'Rattata', 'Krabby', 'Pikachu', 'Vaporeon', 'Horsea', 'Slowbro', 'Koffing', 'Spearow', 'Bulbasaur', 'Vulpix', 'Zubat', 'Wigglytuff', 'Magikarp'}


In [312]:
brock_pokedex=['Onix', 'Geodude', 'Zubat', 'Golem', 'Vulpix', 'Tauros', 'Kabutops', 'Omastar', 'Machop', 'Dugtrio']

In [313]:
print(brock_pokedex)
# Convert Brock's Pokédex to a set
brock_pokedex_set = set(brock_pokedex)
print(brock_pokedex_set)

['Onix', 'Geodude', 'Zubat', 'Golem', 'Vulpix', 'Tauros', 'Kabutops', 'Omastar', 'Machop', 'Dugtrio']
{'Geodude', 'Omastar', 'Tauros', 'Machop', 'Dugtrio', 'Kabutops', 'Vulpix', 'Zubat', 'Golem', 'Onix'}


In [314]:
# Check if Psyduck is in Ash's list and Brock's set
print('Psyduck' in ash_pokedex)
print('Psyduck' in brock_pokedex_set)

True
False


In [315]:
# Check if Machop is in Ash's list and Brock's set
print('Machop' in ash_pokedex)
print('Machop' in brock_pokedex_set)

False
True


In [316]:
%timeit 'Psyduck' in ash_pokedex
%timeit 'Psyduck' in brock_pokedex_set
%timeit 'Machop' in ash_pokedex
%timeit 'Machop' in brock_pokedex_set

227 ns ± 51.1 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
48.6 ns ± 1.58 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
222 ns ± 6.33 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
52.2 ns ± 4.34 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [317]:
def find_unique_items(data):
    uniques = []

    for item in data:
        if item not in uniques:
            uniques.append(item)

    return uniques

In [318]:
# Use the provided function to collect unique Pokémon names
uniq_names_func = find_unique_items(names)
print(len(uniq_names_func))

801


In [319]:
# Convert the names list to a set to collect unique Pokémon names
uniq_names_set = set(names)
print(len(uniq_names_set))

# Check that both unique collections are equivalent
print(sorted(uniq_names_func) == sorted(uniq_names_set))

801
True


In [320]:
%timeit find_unique_items(names)
%timeit set(names)
"""Using a set to collect unique values is faster."""

5.9 ms ± 39.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
25.2 µs ± 1.62 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


'Using a set to collect unique values is faster.'

In [321]:
# Use the best approach to collect unique primary types and generations
uniq_types = set(primary_types) 
uniq_gens = set(generations)
print(uniq_types, uniq_gens, sep='\n') 

{'normal', 'fairy', 'bug', 'fire', 'ice', 'water', 'poison', 'psychic', 'dark', 'electric', 'ground', 'rock', 'flying', 'ghost', 'dragon', 'fighting', 'grass', 'steel'}
{1, 2, 3, 4, 5, 6, 7}


In [322]:
poke_names, poke_gens= names, generations
gen1_gen2_name_lengths_loop = []

# Collect Pokémon that belong to generation 1 or generation 2
gen1_gen2_pokemon = [name for name,gen in zip(poke_names, poke_gens) if gen < 3]

# Create a map object that gen1_gen2_pokemon the name lengths
name_lengths_map = map(len, gen1_gen2_pokemon)

# Combine gen1_gen2_pokemon and name_lengths_map into a list
gen1_gen2_name_lengths = [*zip(gen1_gen2_pokemon, name_lengths_map)]

print(gen1_gen2_name_lengths_loop[:5])
print(gen1_gen2_name_lengths[:5])

[]
[('Bulbasaur', 9), ('Ivysaur', 7), ('Venusaur', 8), ('Charmander', 10), ('Charmeleon', 10)]


In [323]:
#(HP, Attack, Defense, Special Attack, Special Defense, and Speed respectively.)
print(pokeman_data.columns, '\n', pokeman_data.head(1).T)
stats= np.array(pokeman_data[['hp', 'attack', 'defense', 'sp_attack', 'sp_defense', 'speed']])
poke_list= list((pokeman_data[['name', 'base_total', 'base_happiness']]))

Index(['abilities', 'against_bug', 'against_dark', 'against_dragon',
       'against_electric', 'against_fairy', 'against_fight', 'against_fire',
       'against_flying', 'against_ghost', 'against_grass', 'against_ground',
       'against_ice', 'against_normal', 'against_poison', 'against_psychic',
       'against_rock', 'against_steel', 'against_water', 'attack',
       'base_egg_steps', 'base_happiness', 'base_total', 'capture_rate',
       'classfication', 'defense', 'experience_growth', 'height_m', 'hp',
       'japanese_name', 'name', 'percentage_male', 'pokedex_number',
       'sp_attack', 'sp_defense', 'speed', 'type1', 'type2', 'weight_kg',
       'generation', 'is_legendary'],
      dtype='object') 
                                              0
abilities          ['Overgrow', 'Chlorophyll']
against_bug                                  1
against_dark                                 1
against_dragon                               1
against_electric                           0.5

In [324]:
# Create a total stats array
total_stats_np = stats.sum(axis=1)

# Create an average stats array
avg_stats_np = stats.mean(axis=1)

# Combine names, total_stats_np, and avg_stats_np into a list
poke_list_np = [*zip(names, total_stats_np, avg_stats_np)]

print(poke_list_np == poke_list, '\n')
print(poke_list_np[:3])
print(poke_list[:3], '\n')
top_3 = sorted(poke_list_np, key=lambda x: x[1], reverse=True)[:3]
print('3 strongest Pokémon:\n{}'.format(top_3))

False 

[('Bulbasaur', 318, 53.0), ('Ivysaur', 405, 67.5), ('Venusaur', 625, 104.16666666666667)]
['name', 'base_total', 'base_happiness'] 

3 strongest Pokémon:
[('Mewtwo', 780, 130.0), ('Rayquaza', 780, 130.0), ('Kyogre', 770, 128.33333333333334)]


Writing better loops

In [325]:
# Import Counter
from collections import Counter 

# Collect the count of each generation
gen_counts = Counter(generations)

# Improve for loop by moving one calculation above the loop


for gen,count in gen_counts.items():
    total_count = len(generations)
    gen_percent = round(count / total_count * 100, 2)
    print('generation {}: count = {:3} percentage = {}'
          .format(gen, count, gen_percent))

generation 1: count = 151 percentage = 18.85
generation 2: count = 100 percentage = 12.48
generation 3: count = 135 percentage = 16.85
generation 4: count = 107 percentage = 13.36
generation 5: count = 156 percentage = 19.48
generation 6: count =  72 percentage = 8.99
generation 7: count =  80 percentage = 9.99


In [326]:
pokemon_types=list(set(primary_types))

In [327]:

# Collect all possible pairs using combinations()
possible_pairs = [*combinations(pokemon_types, 2)]

# Create an empty list called enumerated_tuples
enumerated_tuples = []

# Add a line to append each enumerated_pair_tuple to the empty list above
for i,pair in enumerate(possible_pairs, 1):
    enumerated_pair_tuple = (i,) + pair
    enumerated_tuples.append(enumerated_pair_tuple)

# Convert all tuples in enumerated_tuples to a list
enumerated_pairs = [*map(list, enumerated_tuples)]
print(enumerated_pairs)

[[1, 'normal', 'fairy'], [2, 'normal', 'bug'], [3, 'normal', 'fire'], [4, 'normal', 'ice'], [5, 'normal', 'water'], [6, 'normal', 'poison'], [7, 'normal', 'psychic'], [8, 'normal', 'dark'], [9, 'normal', 'electric'], [10, 'normal', 'ground'], [11, 'normal', 'rock'], [12, 'normal', 'flying'], [13, 'normal', 'ghost'], [14, 'normal', 'dragon'], [15, 'normal', 'fighting'], [16, 'normal', 'grass'], [17, 'normal', 'steel'], [18, 'fairy', 'bug'], [19, 'fairy', 'fire'], [20, 'fairy', 'ice'], [21, 'fairy', 'water'], [22, 'fairy', 'poison'], [23, 'fairy', 'psychic'], [24, 'fairy', 'dark'], [25, 'fairy', 'electric'], [26, 'fairy', 'ground'], [27, 'fairy', 'rock'], [28, 'fairy', 'flying'], [29, 'fairy', 'ghost'], [30, 'fairy', 'dragon'], [31, 'fairy', 'fighting'], [32, 'fairy', 'grass'], [33, 'fairy', 'steel'], [34, 'bug', 'fire'], [35, 'bug', 'ice'], [36, 'bug', 'water'], [37, 'bug', 'poison'], [38, 'bug', 'psychic'], [39, 'bug', 'dark'], [40, 'bug', 'electric'], [41, 'bug', 'ground'], [42, 'bug'

In [328]:
hps=stats[:,0]
# Calculate the total HP avg and total HP standard deviation
hp_avg = np.mean(hps)
hp_std = np.std(hps)
print(hp_avg)
# Use NumPy to eliminate the previous for loop
z_scores = (hps - hp_avg)/hp_std

# Combine names, hps, and z_scores
poke_zscores2 = [*zip(names, hps, z_scores)]
print(*poke_zscores2[:3], sep='\n')

68.95880149812734
('Bulbasaur', 45, -0.9020830044724206)
('Ivysaur', 60, -0.3373116377517634)
('Venusaur', 80, 0.4157168512091129)


In [329]:
# Use list comprehension with the same logic as the highest_hp_pokemon code block
%timeit highest_hp_pokemon2 = [(name, hp, z_score) for name, hp, z_score in poke_zscores2 if z_score > 2]


237 µs ± 14.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [330]:
print(*highest_hp_pokemon2, sep='\n')

('Wigglytuff', 140, 2.674802318091742)
('Chansey', 250, 6.816459007376562)
('Lapras', 130, 2.2982880736113036)
('Vaporeon', 130, 2.2982880736113036)
('Snorlax', 160, 3.4278308070526182)
('Lanturn', 125, 2.1100309513710847)
('Wobbuffet', 190, 4.557373540493932)
('Blissey', 255, 7.004716129616781)
('Slaking', 150, 3.05131656257218)
('Hariyama', 144, 2.825408015883917)
('Wailmer', 130, 2.2982880736113036)
('Wailord', 170, 3.8043450515330566)
('Drifblim', 150, 3.05131656257218)
('Munchlax', 135, 2.486545195851523)
('Giratina', 150, 3.05131656257218)
('Alomomola', 165, 3.616087929292837)
('Kyurem', 125, 2.1100309513710847)
('Gogoat', 123, 2.034728102474997)
('Aurorus', 123, 2.034728102474997)
('Xerneas', 126, 2.1476823758191284)
('Yveltal', 126, 2.1476823758191284)
('Zygarde', 216, 5.536310576143072)
('Solgaleo', 137, 2.5618480447476104)
('Lunala', 137, 2.5618480447476104)
('Guzzlord', 223, 5.799870547279379)


In [331]:
%%timeit
poke_zscores = []

for name,hp in zip(names, hps):
    hp_avg = hps.mean()
    hp_std = hps.std()
    z_score = (hp - hp_avg)/hp_std
    poke_zscores.append((name, hp, z_score))
highest_hp_pokemon = []

for name,hp,zscore in poke_zscores:
    if zscore > 2:
        highest_hp_pokemon.append((name, hp, zscore))

37 ms ± 557 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Chapter 4: Basic pandas optimizations


In [332]:
baseball_df=pd.read_csv('../datasets/baseball_stats.csv')
pit_df=baseball_df[(baseball_df['Team']=='PIT') & (baseball_df['Year'] >2007)& (baseball_df['Year'] <2013)]

In [333]:
# Iterate over pit_df and print each row
for i,row in pit_df.iterrows():
    print(i)
    print(row)
    print(type(row))

21
Team              PIT
League             NL
Year             2012
RS                651
RA                674
W                  79
OBP             0.304
SLG             0.395
BA              0.243
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.314
OSLG             0.39
Name: 21, dtype: object
<class 'pandas.core.series.Series'>
51
Team              PIT
League             NL
Year             2011
RS                610
RA                712
W                  72
OBP             0.309
SLG             0.368
BA              0.244
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.338
OSLG            0.409
Name: 51, dtype: object
<class 'pandas.core.series.Series'>
81
Team              PIT
League             NL
Year             2010
RS                587
RA                866
W                  57
OBP             0.304
SLG             0.373
BA              0.242
Playoffs     

In [334]:
# Print the row and type of each row
for row_tuple in pit_df.iterrows():
    print(row_tuple)
    print(type(row_tuple))

(21, Team              PIT
League             NL
Year             2012
RS                651
RA                674
W                  79
OBP             0.304
SLG             0.395
BA              0.243
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.314
OSLG             0.39
Name: 21, dtype: object)
<class 'tuple'>
(51, Team              PIT
League             NL
Year             2011
RS                610
RA                712
W                  72
OBP             0.309
SLG             0.368
BA              0.244
Playoffs            0
RankSeason        NaN
RankPlayoffs      NaN
G                 162
OOBP            0.338
OSLG            0.409
Name: 51, dtype: object)
<class 'tuple'>
(81, Team              PIT
League             NL
Year             2010
RS                587
RA                866
W                  57
OBP             0.304
SLG             0.373
BA              0.242
Playoffs            0
RankSeason        NaN
R

In [335]:
giants_df=baseball_df[(baseball_df['Team']=='SFG') & (baseball_df['Year'] >2007)& (baseball_df['Year'] <2013)]
print(giants_df)

Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  \
24   SFG     NL  2012  718  649  94  0.327  0.397  0.269         1   
54   SFG     NL  2011  570  578  86  0.303  0.368  0.242         0   
84   SFG     NL  2010  697  583  92  0.321  0.408  0.257         1   
114  SFG     NL  2009  657  611  88  0.309  0.389  0.257         0   
144  SFG     NL  2008  640  759  72  0.321  0.382  0.262         0   

     RankSeason  RankPlayoffs    G   OOBP   OSLG  
24          4.0           1.0  162  0.313  0.393  
54          NaN           NaN  162  0.309  0.346  
84          5.0           1.0  162  0.313  0.370  
114         NaN           NaN  162  0.314  0.372  
144         NaN           NaN  162  0.341  0.404  


In [336]:
def calc_run_diff(runs_scored, runs_allowed):

    run_diff = runs_scored - runs_allowed

    return run_diff

In [337]:
# Create an empty list to store run differentials
run_diffs = []

# Write a for loop and collect runs allowed and runs scored for each row
for i,row in giants_df.iterrows():
    runs_scored = row['RS']
    runs_allowed = row['RA']
    
    # Use the provided function to calculate run_diff for each row
    run_diff = calc_run_diff(runs_scored, runs_allowed)
    
    # Append each run differential to the output list
    run_diffs.append(run_diff)

giants_df['RD'] = run_diffs
print(giants_df)

Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  \
24   SFG     NL  2012  718  649  94  0.327  0.397  0.269         1   
54   SFG     NL  2011  570  578  86  0.303  0.368  0.242         0   
84   SFG     NL  2010  697  583  92  0.321  0.408  0.257         1   
114  SFG     NL  2009  657  611  88  0.309  0.389  0.257         0   
144  SFG     NL  2008  640  759  72  0.321  0.382  0.262         0   

     RankSeason  RankPlayoffs    G   OOBP   OSLG   RD  
24          4.0           1.0  162  0.313  0.393   69  
54          NaN           NaN  162  0.309  0.346   -8  
84          5.0           1.0  162  0.313  0.370  114  
114         NaN           NaN  162  0.314  0.372   46  
144         NaN           NaN  162  0.341  0.404 -119  


In [338]:
rangers_df=baseball_df[(baseball_df['Team']=='TEX') ][['Team', 'League', 'Year', 'RS', 'RA', 'W', 'G',  'Playoffs']]

In [339]:
# Loop over the DataFrame and print each row
for row in rangers_df.itertuples():
  print(row)
  i = row.Index
  year = row.Year
  wins = row.W
  print(i, year, wins)
    # Check if rangers made Playoffs (1 means yes; 0 means no)
  if row.Playoffs == 1:
    print(i, year, wins)

Pandas(Index=27, Team='TEX', League='AL', Year=2012, RS=808, RA=707, W=93, G=162, Playoffs=1)
27 2012 93
27 2012 93
Pandas(Index=57, Team='TEX', League='AL', Year=2011, RS=855, RA=677, W=96, G=162, Playoffs=1)
57 2011 96
57 2011 96
Pandas(Index=87, Team='TEX', League='AL', Year=2010, RS=787, RA=687, W=90, G=162, Playoffs=1)
87 2010 90
87 2010 90
Pandas(Index=117, Team='TEX', League='AL', Year=2009, RS=784, RA=740, W=87, G=162, Playoffs=0)
117 2009 87
Pandas(Index=147, Team='TEX', League='AL', Year=2008, RS=901, RA=967, W=79, G=162, Playoffs=0)
147 2008 79
Pandas(Index=177, Team='TEX', League='AL', Year=2007, RS=816, RA=844, W=75, G=162, Playoffs=0)
177 2007 75
Pandas(Index=207, Team='TEX', League='AL', Year=2006, RS=835, RA=784, W=80, G=162, Playoffs=0)
207 2006 80
Pandas(Index=237, Team='TEX', League='AL', Year=2005, RS=865, RA=858, W=79, G=162, Playoffs=0)
237 2005 79
Pandas(Index=268, Team='TEX', League='AL', Year=2004, RS=860, RA=794, W=89, G=162, Playoffs=0)
268 2004 89
Pandas(Ind

In [346]:
yankees_df=baseball_df[(baseball_df['Team']=='NYY') ][['Team', 'League', 'Year', 'RS', 'RA', 'W', 'G',  'Playoffs']]


Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  \
26   TBR     AL  2012  697  577  90  0.317  0.394  0.240         0   
56   TBR     AL  2011  707  614  91  0.322  0.402  0.244         1   
86   TBR     AL  2010  802  649  96  0.333  0.403  0.247         1   
116  TBR     AL  2009  803  754  84  0.343  0.439  0.263         0   
146  TBR     AL  2008  774  671  97  0.340  0.422  0.260         1   

     RankSeason  RankPlayoffs    G   OOBP   OSLG  
26          NaN           NaN  162  0.294  0.352  
56          6.0           4.0  162  0.303  0.383  
86          2.0           4.0  162  0.308  0.404  
116         NaN           NaN  162  0.324  0.417  
146         2.0           2.0  162  0.314  0.400  


In [342]:
run_diffs = []

# Loop over the DataFrame and calculate each row's run differential
for row in yankees_df.itertuples():
    
    runs_scored = row.RS
    runs_allowed = row.RA

    run_diff = calc_run_diff(runs_scored, runs_allowed)
    
    run_diffs.append(run_diff)

# Append new column
yankees_df['RD'] = run_diffs
print(yankees_df)

Team League  Year   RS   RA    W    G  Playoffs   RD
18    NYY     AL  2012  804  668   95  162         1  136
48    NYY     AL  2011  867  657   97  162         1  210
78    NYY     AL  2010  859  693   95  162         1  166
108   NYY     AL  2009  915  753  103  162         1  162
138   NYY     AL  2008  789  727   89  162         0   62
168   NYY     AL  2007  968  777   94  162         1  191
198   NYY     AL  2006  930  767   97  162         1  163
228   NYY     AL  2005  886  789   95  162         1   97
259   NYY     AL  2004  897  808  101  162         1   89
289   NYY     AL  2003  877  716  101  163         1  161
319   NYY     AL  2002  897  697  103  161         1  200
349   NYY     AL  2001  804  713   95  161         1   91
379   NYY     AL  2000  871  814   87  161         1   57
409   NYY     AL  1999  900  731   98  162         1  169
439   NYY     AL  1998  965  656  114  162         1  309
468   NYY     AL  1997  891  688   96  162         1  203
496   NYY     AL  1

In [348]:
rays_df=baseball_df[(baseball_df['Team']=='TBR') ][['Year', 'RS', 'RA', 'W', 'Playoffs']].set_index('Year')
print(rays_df)

RS   RA   W  Playoffs
Year                        
2012  697  577  90         0
2011  707  614  91         1
2010  802  649  96         1
2009  803  754  84         0
2008  774  671  97         1


In [349]:
# Gather sum of all columns
stat_totals = rays_df.apply('sum', axis=1)
print(stat_totals)

Year
2012    1364
2011    1413
2010    1548
2009    1641
2008    1543
dtype: int64


In [351]:
# Gather total runs scored in all games per year
total_runs_scored = rays_df[['RS', 'RA']].apply(sum, axis=1)
print(total_runs_scored)

Year
2012    1274
2011    1321
2010    1451
2009    1557
2008    1445
dtype: int64


In [353]:
def text_playoffs(num_playoffs): 
    if num_playoffs == 1:
        return 'Yes'
    else:
        return 'No' 

In [354]:
# Convert numeric playoffs to text
textual_playoffs = rays_df.apply(lambda row: text_playoffs(row['Playoffs']), axis=1)
print(textual_playoffs)

Year
2012     No
2011    Yes
2010    Yes
2009     No
2008    Yes
dtype: object


In [355]:
dbacks_df=baseball_df[(baseball_df['Team']=='ARI') ][['Team', 'League', 'Year', 'RS', 'RA', 'W', 'G',  'Playoffs']]

In [356]:

# Display the first five rows of the DataFrame
print(dbacks_df.head())

Team League  Year   RS   RA   W    G  Playoffs
0    ARI     NL  2012  734  688  81  162         0
30   ARI     NL  2011  731  662  94  162         1
60   ARI     NL  2010  713  836  65  162         0
90   ARI     NL  2009  720  782  70  162         0
120  ARI     NL  2008  720  706  82  162         0


In [358]:
def calc_win_perc(wins, games_played):
    win_perc = wins / games_played
    return np.round(win_perc,2)

In [359]:
# Display the first five rows of the DataFrame
print(dbacks_df.head())

# Create a win percentage Series 
win_percs = dbacks_df.apply(lambda row: calc_win_perc(row['W'], row['G']), axis=1)
print(win_percs, '\n')


Team League  Year   RS   RA   W    G  Playoffs
0    ARI     NL  2012  734  688  81  162         0
30   ARI     NL  2011  731  662  94  162         1
60   ARI     NL  2010  713  836  65  162         0
90   ARI     NL  2009  720  782  70  162         0
120  ARI     NL  2008  720  706  82  162         0
0      0.50
30     0.58
60     0.40
90     0.43
120    0.51
150    0.56
180    0.47
210    0.48
241    0.31
271    0.52
301    0.60
331    0.57
361    0.52
391    0.62
421    0.40
dtype: float64 



In [360]:
# Append a new column to dbacks_df
dbacks_df['WP'] = win_percs
print(dbacks_df, '\n')

# Display dbacks_df where WP is greater than 0.50
print(dbacks_df[dbacks_df['WP'] >= 0.50])

Team League  Year   RS   RA    W    G  Playoffs    WP
0    ARI     NL  2012  734  688   81  162         0  0.50
30   ARI     NL  2011  731  662   94  162         1  0.58
60   ARI     NL  2010  713  836   65  162         0  0.40
90   ARI     NL  2009  720  782   70  162         0  0.43
120  ARI     NL  2008  720  706   82  162         0  0.51
150  ARI     NL  2007  712  732   90  162         1  0.56
180  ARI     NL  2006  773  788   76  162         0  0.47
210  ARI     NL  2005  696  856   77  162         0  0.48
241  ARI     NL  2004  615  899   51  162         0  0.31
271  ARI     NL  2003  717  685   84  162         0  0.52
301  ARI     NL  2002  819  674   98  162         1  0.60
331  ARI     NL  2001  818  677   92  162         1  0.57
361  ARI     NL  2000  792  754   85  162         0  0.52
391  ARI     NL  1999  908  676  100  162         1  0.62
421  ARI     NL  1998  665  812   65  162         0  0.40 

    Team League  Year   RS   RA    W    G  Playoffs    WP
0    ARI     NL 

In [363]:
def calc_win_perc(wins, games_played):
    win_perc = wins / games_played
    return np.round(win_perc,2)

In [364]:
# Use the W array and G array to calculate win percentages
win_percs_np = calc_win_perc(baseball_df['W'].values, baseball_df['G'].values)

In [365]:
# Append a new column to baseball_df that stores all win percentages
baseball_df['WP'] = win_percs_np

print(baseball_df.head())

Team League  Year   RS   RA   W    OBP    SLG     BA  Playoffs  RankSeason  \
0  ARI     NL  2012  734  688  81  0.328  0.418  0.259         0         NaN   
1  ATL     NL  2012  700  600  94  0.320  0.389  0.247         1         4.0   
2  BAL     AL  2012  712  705  93  0.311  0.417  0.247         1         5.0   
3  BOS     AL  2012  734  806  69  0.315  0.415  0.260         0         NaN   
4  CHC     NL  2012  613  759  61  0.302  0.378  0.240         0         NaN   

   RankPlayoffs    G   OOBP   OSLG    WP  
0           NaN  162  0.317  0.415  0.50  
1           5.0  162  0.306  0.378  0.58  
2           4.0  162  0.315  0.403  0.57  
3           NaN  162  0.331  0.428  0.43  
4           NaN  162  0.335  0.424  0.38  


In [367]:
%%timeit
win_percs_list = []

for i in range(len(baseball_df)):
    row = baseball_df.iloc[i]

    wins = row['W']
    games_played = row['G']

    win_perc = calc_win_perc(wins, games_played)

    win_percs_list.append(win_perc)

baseball_df['WP'] = win_percs_list

223 ms ± 9.46 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [369]:
%%timeit
win_percs_np = calc_win_perc(baseball_df['W'].values, baseball_df['G'].values)
baseball_df['WP'] = win_percs_np

376 µs ± 14 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


The NumPy array approach is faster than the .iloc approach

In [370]:
def predict_win_perc(RS, RA):
    prediction = RS ** 2 / (RS ** 2 + RA ** 2)
    return np.round(prediction, 2)

In [374]:
%%timeit
win_perc_preds_loop = []

# Use a loop and .itertuples() to collect each row's predicted win percentage
for baseball_tuple in baseball_df.itertuples():
    runs_scored = baseball_tuple.RS
    runs_allowed = baseball_tuple.RA
    win_perc_pred = predict_win_perc(runs_scored, runs_allowed)
    win_perc_preds_loop.append(win_perc_pred)

18.8 ms ± 782 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [376]:
%%timeit
# Apply predict_win_perc to each row of the DataFrame
win_perc_preds_apply = baseball_df.apply(lambda row: predict_win_perc(row['RS'], row['RA']), axis=1)

51.6 ms ± 1.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [377]:
%%timeit
# Calculate the win percentage predictions using NumPy arrays
win_perc_preds_np = predict_win_perc(baseball_df['RS'].values, baseball_df['RA'].values)
baseball_df['WP_preds'] = win_perc_preds_np


412 µs ± 6.98 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [None]:
print(baseball_df.head())