## Parallel Computing

 - Parallel computing can optimise the use of multiple processing units.
 - Parallel computing can optimise the use of memory between several machines.

In [1]:
import pandas as pd

from multiprocessing import Pool

In [3]:
import time

def print_timing(func):
    """
    Decorator that prints the execution time of each call to ``func``.

    Parameters
    ----------
    func : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints ``Elapsed time: <seconds> seconds`` once the call
        completes, and returns ``func``'s result unchanged. If ``func``
        raises, the exception propagates and nothing is printed.
    """
    # Local import: keeps the decorator self-contained within this cell.
    from functools import wraps

    # wraps() copies __name__, __doc__, etc. from func onto the wrapper so the
    # decorated function still identifies itself correctly.
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock changes.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"Elapsed time: {end_time - start_time:.6f} seconds")
        return result
    return wrapper


In [4]:
# Read the athlete events CSV and keep only the rows where "Year" is
# greater than or equal to 2000.
athlete_events = pd.read_csv("athlete_events.csv").loc[
    lambda events: events["Year"] >= 2000
]

# Print the column summary, then display five randomly chosen rows.
athlete_events.info()
athlete_events.sample(5)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 85258 entries, 1 to 271115
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      85258 non-null  int64  
 1   Name    85258 non-null  object 
 2   Sex     85258 non-null  object 
 3   Age     85255 non-null  float64
 4   Height  84547 non-null  float64
 5   Weight  84048 non-null  float64
 6   Team    85258 non-null  object 
 7   NOC     85258 non-null  object 
 8   Games   85258 non-null  object 
 9   Year    85258 non-null  int64  
 10  Season  85258 non-null  object 
 11  City    85258 non-null  object 
 12  Sport   85258 non-null  object 
 13  Event   85258 non-null  object 
 14  Medal   12138 non-null  object 
dtypes: float64(3), int64(2), object(10)
memory usage: 10.4+ MB


Unnamed: 0,ID,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
145903,73221,Tianna Madison-Bartoletta,F,26.0,168.0,58.0,United States,USA,2012 Summer,2012,Summer,London,Athletics,Athletics Women's 100 metres,
136863,68802,Leong Mun Yee,F,23.0,163.0,55.0,Malaysia,MAS,2008 Summer,2008,Summer,Beijing,Diving,Diving Women's Springboard,
118299,59844,Kim Dong-Hyeon,M,22.0,184.0,100.0,South Korea,KOR,2010 Winter,2010,Winter,Vancouver,Bobsleigh,Bobsleigh Men's Four,
79286,40265,Glenroy John Gilbert,M,32.0,183.0,79.0,Canada,CAN,2000 Summer,2000,Summer,Sydney,Athletics,Athletics Men's 4 x 100 metres Relay,
227679,114392,Brook Staples,M,34.0,177.0,66.0,Australia,AUS,2000 Summer,2000,Summer,Sydney,Equestrianism,"Equestrianism Mixed Three-Day Event, Individual",


In [5]:
def take_mean_age(year_and_group):
    """
    Compute the mean of the "Age" column for a single group.

    Parameters
    ----------
    year_and_group : tuple
        A ``(year, group)`` pair; ``group`` must support ``group["Age"].mean()``.

    Returns
    -------
    pandas.DataFrame
        A one-row frame with a single "Age" column holding the mean, indexed
        by ``year``.
    """
    label, frame = year_and_group
    mean_age = frame["Age"].mean()
    return pd.DataFrame(data={"Age": mean_age}, index=[label])

### Split up a task and use the low-level Python `multiprocessing.Pool` API to run calculations on several processing units.

In [5]:
@print_timing
def parallel_apply(apply_func, groups, nb_cores):
    """
    Apply ``apply_func`` to every item of ``groups`` using a process pool.

    Parameters
    ----------
    apply_func : callable
        Called once per item yielded by ``groups``; each call's return value
        is passed to ``pandas.concat``.
    groups : iterable
        The items to distribute across the worker processes.
    nb_cores : int
        Number of worker processes handed to ``multiprocessing.Pool``.

    Returns
    -------
    The result of ``pandas.concat`` over the per-item results.
    """
    # The context manager tears the pool down once the mapping has finished.
    with Pool(processes=nb_cores) as workers:
        partial_frames = workers.map(apply_func, groups)
    combined = pd.concat(partial_frames)
    return combined

# Baseline run: mean age per year with a single worker process
parallel_apply(take_mean_age, athlete_events.groupby("Year"), nb_cores=1)

# Same workload spread over two worker processes
parallel_apply(take_mean_age, athlete_events.groupby("Year"), nb_cores=2)

### Another method is to use the Dask framework and its abstraction of the pandas DataFrame.

In [7]:
import dask.dataframe as dd

# Wrap the pandas frame in a dask DataFrame split into 4 partitions
athlete_events_dask = dd.from_pandas(athlete_events, npartitions=4)

# Mean "Age" per "Year"; compute() triggers the evaluation
print(
    athlete_events_dask
    .groupby("Year")["Age"]
    .mean()
    .compute()
)

Year
2000    25.422504
2002    25.916281
2004    25.639515
2006    25.959151
2008    25.734118
2010    26.124262
2012    25.961378
2014    25.987324
2016    26.207919
Name: Age, dtype: float64
