## Parallel Computing

 - Parallel computing can optimise the use of multiple processing units.
 - Parallel computing can optimise the use of memory across several machines.

## Prepare dataset and functions

In [13]:
import pandas as pd

In [14]:
import time

def print_timing(func):
    """
    Decorator function that prints the execution time of a function.

    Parameters
    ----------
    func : callable
        The function whose calls should be timed.

    Returns
    -------
    callable
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints ``Elapsed time: <seconds> seconds`` once ``func``
        has returned, and then returns ``func``'s result unchanged. If
        ``func`` raises, the exception propagates and nothing is printed.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    import functools

    # functools.wraps copies __name__, __doc__, etc. from func onto the wrapper,
    # so decorated functions keep their identity in tracebacks and in help().
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is the clock intended for measuring intervals;
        # time.time() follows the system clock and can jump if it is adjusted.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"Elapsed time: {end_time - start_time:.6f} seconds")
        return result
    return wrapper


In [15]:
# Load the athlete events and keep only the rows whose "Year" is 1993 or later
athlete_events = (
    pd.read_csv("athlete_events.csv")
    .loc[lambda events: events["Year"] >= 1993]
)
# Print a structural summary of the filtered frame (columns, non-null counts, dtypes)
athlete_events.info()
# The last expression is the cell's displayed output: 5 randomly drawn rows
athlete_events.sample(5)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 105803 entries, 1 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   ID      105803 non-null  int64  
 1   Name    105803 non-null  object 
 2   Sex     105803 non-null  object 
 3   Age     105788 non-null  float64
 4   Height  102950 non-null  float64
 5   Weight  102497 non-null  float64
 6   Team    105803 non-null  object 
 7   NOC     105803 non-null  object 
 8   Games   105803 non-null  object 
 9   Year    105803 non-null  int64  
 10  Season  105803 non-null  object 
 11  City    105803 non-null  object 
 12  Sport   105803 non-null  object 
 13  Event   105803 non-null  object 
 14  Medal   14751 non-null   object 
dtypes: float64(3), int64(2), object(10)
memory usage: 12.9+ MB


Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
90338,45783,Natasha Hansen,F,22.0,167.0,66.0,New Zealand,NZL,2012 Summer,2012,Summer,London,Cycling,Cycling Women's Sprint,
36464,18749,Myriam Casanova,F,19.0,172.0,70.0,Switzerland,SUI,2004 Summer,2004,Summer,Athina,Tennis,Tennis Women's Singles,
229141,115048,Ingo Steuer,M,31.0,173.0,68.0,Germany-1,GER,1998 Winter,1998,Winter,Nagano,Figure Skating,Figure Skating Mixed Pairs,Bronze
155603,78113,Kenneth Nathaniel Medwood,M,24.0,178.0,75.0,Belize,BIZ,2012 Summer,2012,Summer,London,Athletics,Athletics Men's 400 metres Hurdles,
105609,53414,Michael Georg Jakosits,M,26.0,184.0,85.0,Germany,GER,1996 Summer,1996,Summer,Atlanta,Shooting,"Shooting Men's Running Target, 10 metres",


In [16]:
# Define a function to calculate the mean age for each group
def take_mean_age(year_and_group):
    """
    Build a one-row DataFrame holding the mean "Age" of a single group.

    Parameters
    ----------
    year_and_group : tuple
        A 2-item ``(year, group)`` pair, e.g. one item obtained by iterating
        over ``DataFrame.groupby("Year")``. ``group`` must expose an "Age"
        column.

    Returns
    -------
    pandas.DataFrame
        A frame with the single column "Age" and a single row whose index
        label is ``year``.
    """
    label, frame = year_and_group
    mean_age = frame["Age"].mean()
    return pd.DataFrame({"Age": mean_age}, index=[label])

### multiprocessing.Pool API splits up a task to do calculations on several processing units.

In [17]:
# NOTE: everything between the triple quotes is a plain string literal, so none
# of the code below is executed; the cell merely evaluates to that string.
# NOTE(review): presumably disabled because multiprocessing.Pool workers cannot
# always import functions that were defined interactively in a notebook - confirm.
# NOTE(review): Pool.map hands work items to workers in chunks of `chunksize`;
# if there are far fewer than 10000 Year groups they would all end up in one
# chunk and only one worker would do any work - confirm the intended chunksize
# before re-enabling this cell.
"""
from multiprocessing import Pool

# Function to apply a function over multiple cores
@print_timing
def parallel_apply(apply_func, groups, nb_cores, chunksize):
    with Pool(nb_cores) as p:
        results = p.map(apply_func, groups, chunksize=chunksize)
    return pd.concat(results)

# Parallel apply using 2 core
parallel_apply(take_mean_age, athlete_events.groupby('Year'), 2, 10000)
"""

"\nfrom multiprocessing import Pool\n\n# Function to apply a function over multiple cores\n@print_timing\ndef parallel_apply(apply_func, groups, nb_cores, chunksize):\n    with Pool(nb_cores) as p:\n        results = p.map(apply_func, groups, chunksize=chunksize)\n    return pd.concat(results)\n\n# Parallel apply using 2 core\nparallel_apply(take_mean_age, athlete_events.groupby('Year'), 2, 10000)\n"

### Another method is using the dask framework and its abstraction of the pandas DataFrame.

In [18]:
import dask.dataframe as dd

# Wrap the pandas frame in a dask DataFrame split into 4 partitions
athlete_events_dask = dd.from_pandas(athlete_events, npartitions=4)

# Group by Year, average Age, then compute() the result and print it
print(
    athlete_events_dask
    .groupby("Year")["Age"]
    .mean()
    .compute()
)

Year
1994    24.422103
1996    24.915045
1998    25.163197
2000    25.422504
2002    25.916281
2004    25.639515
2006    25.959151
2008    25.734118
2010    26.124262
2012    25.961378
2014    25.987324
2016    26.207919
Name: Age, dtype: float64
