# Feature Engineering for Olympic results
Purpose of this workbook is to build a dataset ready for proper EDA and modelling.

Plan initially is to join historic population and GDP per capita data onto the Olympics results data scraped from IOC website. Will also add the home country to each games (home advantage is reported as a key predictor of olympic success).

The population and GDPpc data will need joining based on country (fuzzy match perhaps needed where country names don't perfectly align) and also their values will need interpolating in many cases onto the dates of each games (data starts very course for further back in time, and shifts to annual data for more recent dates).

### Handling dates outside the data ranges
Some games data for some countries will be outside of the range the population and GDPpc data too. Three options for this:
1. 'Saturated': Take value of nearest date (e.g. oldest data point)
2. Extrapolated: Make a linear extrapolation based on the closest two points
3. Ignore values that can't be interpolated

Option 3 is preferable initially because it doesn't make any assumptions outside of the known datasets, and it is expected that there will be plenty of observations in the data to train models with.

In [2]:
import pandas as pd
from geopy.geocoders import Nominatim
from fuzzywuzzy import fuzz
from tqdm import tqdm
import numpy as np
import os



### Utility functions

Interpolate population data onto results

In [3]:
def historical_interp(df_lookup, country_code, year, y_col, country_code_col='alpha_3', x_col='year'):
    '''Function that interpolates historic population data for a year of an Olympic games for a given team'''
    
    if country_code is not None:
        df_history = df_lookup.loc[df_lookup[country_code_col]==country_code, [x_col, y_col]].astype('float')
        if df_history.shape[0] > 1:
            df_interp = np.interp([year],
                                  df_history[x_col],
                                  df_history[y_col],
                                          left=np.nan,
                                          right=np.nan)
            return df_interp[0]
        else:
            return np.nan
    else:
        return np.nan

## Importing the summer Olympics results

In [4]:
# load results. Path set to relative in repo (i.e. assumes the same folder)
project_path = r''
results_fname = r'olympics_summer_results.csv'
results = pd.read_csv(os.path.join(project_path, results_fname))
results_f = results.copy()
results_f.head()

Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,summer,associate_nations
0,Athens 1896,Australia,AUS,AUS,0,1896,2,0,0,2,1,
1,Athens 1896,Austria,AUT,AUT,0,1896,2,1,2,5,1,
2,Athens 1896,Bulgaria,BGR,BUL,0,1896,0,0,0,0,1,
3,Athens 1896,Chile,CHL,CHI,0,1896,0,0,0,0,1,
4,Athens 1896,Denmark,DNK,DEN,0,1896,1,2,3,6,1,


## Final cleaning of ISO Alpha 3 codes
Some codes have errors in them (probably due to the use of fuzzy matching). These need resolving to ensure that countries can be successfully visualised without incorrect duplicates.

In [12]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
countries_codes = results_f[['country', 'alpha_3', 'ioc_code']].drop_duplicates().sort_values('country')
def build_google_iso_search(row):
    country = row['country'].replace(' ', '+')
    ggl_url = f'www.google.com/search?q=iso+alpha3+{country}'
    return ggl_url
countries_codes['google_iso'] = countries_codes.apply(lambda row: build_google_iso_search(row), axis=1)
print(countries_codes)
# pd.reset_option('display.max_rows')

                                    country alpha_3 ioc_code  \
267                             Afghanistan     AFG      AFG   
809                                 Albania     ALB      ALB   
602                                 Algeria     DZA      ALG   
1244                         American Samoa     ASM      ASA   
929                                 Andorra     AND      AND   
1024                                 Angola     AGO      ANG   
930                     Antigua and Barbuda     ATG      ANT   
16                                Argentina     ARG      ARG   
1579                                Armenia     ARM      ARM   
1249                                  Aruba     ABW      ARU   
62                              Australasia     NaN      ANZ   
0                                 Australia     AUS      AUS   
1                                   Austria     AUT      AUT   
1583                             Azerbaijan     AZE      AZE   
378                                 Baha

In [47]:
iso_corrections = {'Czechoslovakia': 'CSK', # correction
                   'ROC': 'RUS', # assume ROC is Russia
                   'Kosovo': 'XXK', # corrected from Serbia SRB
                   'Niger': 'NER'} # corrected from Nigeria NGA
for country in iso_corrections:
    results_f.loc[results_f.country==country, 'alpha_3'] = iso_corrections[country]
results_f[results_f.country.isin(iso_corrections.keys())].drop_duplicates('country').head()

Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,summer,associate_nations
117,Antwerp 1920,Czechoslovakia,CSK,TCH,0,1920,0,0,2,2,1,
665,Tokyo 1964,Niger,NER,NIG,0,1964,0,0,0,0,1,
2683,Rio 2016,Kosovo,XXK,KOS,0,2016,1,0,0,1,1,
2944,Tokyo 2020,ROC,RUS,ROC,0,2021,20,28,23,71,1,


In [49]:
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')

## Historic population by country (different times ahead of games)
Difficult to know how soon population impacts olympic success. Will generate features for year -0, -4, -8, -12, -16, -20,  -24, -28, -32, -36, -40.

In [50]:
# load data
population_fname2 = r'maddison_population.csv'
population = pd.read_csv(os.path.join(project_path, population_fname2))

# extract country codes from top row (rename, convert to dataframe) and set headers
country_codes = population.iloc[0,1:].rename('alpha_3').to_frame()
population = population.rename(columns={'Population': 'year'})
population = population.iloc[1:]

#unpivot data, drop blank rows, convert year and population to numeric types
population = pd.melt(population, id_vars=['year'],
                     value_vars=population.columns[1:],
                     var_name='country',
                     value_name='population')
population.dropna(axis=0, inplace=True)
population = population.sort_values(['country', 'year'])
population['year'] = population.year.astype('int')
population['population'] = population.population.str.replace(',', '').astype('float')

# join country codes back onto data & reorder cols
population = population.merge(country_codes,
                              how='left',
                              left_on='country',
                              right_index=True)
population = population[['country', 'alpha_3', 'year', 'population']]

print('max year: {}'.format(population.year.max()))
population.head()

max year: 2018


Unnamed: 0,country,alpha_3,year,population
573,Afghanistan,AFG,1820,3280.0
623,Afghanistan,AFG,1870,4207.0
666,Afghanistan,AFG,1913,5730.0
703,Afghanistan,AFG,1950,8150.0
704,Afghanistan,AFG,1951,8284.0


In [51]:
pop_trans_fname = r'population_transformed.csv'
population.to_csv(os.path.join(project_path, pop_trans_fname), index=False)

Now we have population data for up to 2018.

### Interpolating population onto games

In [52]:
year_deltas = list(range(0, 44, 4))

for yd in tqdm(year_deltas):
    results_f[f'pop_{yd}'] = results_f.apply(lambda row: historical_interp(country_code=row.alpha_3,
                                                                           year=row.year - yd,
                                                                           df_lookup=population,
                                                                           x_col='year',
                                                                           y_col='population',
                                                                           country_code_col='alpha_3'), axis=1)

results_f.head()

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [02:22<00:00, 12.96s/it]


Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,...,pop_4,pop_8,pop_12,pop_16,pop_20,pop_24,pop_28,pop_32,pop_36,pop_40
0,Athens 1896,Australia,AUS,AUS,0,1896,2,0,0,2,...,3274.0,2932.0,2556.0,2197.0,1929.0,1722.0,1700.0,1495.0,1326.0,1065.0
1,Athens 1896,Austria,AUT,AUT,0,1896,2,1,2,5,...,5504.0,5303.0,5121.0,4941.0,4772.0,4604.0,4469.0,4350.0,4235.0,4120.0
2,Athens 1896,Bulgaria,BGR,BUL,0,1896,0,0,0,0,...,3556.0,3359.1,3187.3,3015.5,2843.7,2671.9,2577.4,2560.2,2543.0,2525.8
3,Athens 1896,Chile,CHL,CHI,0,1896,0,0,0,0,...,2678.0,2538.0,2400.0,2264.0,2132.0,2006.0,1885.0,1770.0,1661.0,1557.0
4,Athens 1896,Denmark,DNK,DEN,0,1896,1,2,3,6,...,2327.0,2257.0,2160.0,2081.0,1994.0,1918.0,1852.0,1777.0,1696.0,1612.0


In [53]:
results_f[(results_f.alpha_3=='PAK') & results_f.pop_0.isna()]

Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,...,pop_4,pop_8,pop_12,pop_16,pop_20,pop_24,pop_28,pop_32,pop_36,pop_40
355,London 1948,Pakistan,PAK,PAK,0,1948,0,0,0,0,...,,,,,,,,,,
2931,Tokyo 2020,Pakistan,PAK,PAK,0,2021,0,0,0,0,...,215301.0,197992.0,180688.0,165075.0,150448.0,134717.0,122052.0,111528.0,99076.0,88417.0


In [54]:
results_f.isna().sum() / results_f.shape[0] * 100

games                  0.000000
country                0.000000
alpha_3                0.700935
ioc_code               0.000000
host                   0.000000
year                   0.000000
gold                   0.000000
silver                 0.000000
bronze                 0.000000
total                  0.000000
summer                 0.000000
associate_nations    100.000000
pop_0                 21.428571
pop_4                 16.088117
pop_8                 16.355140
pop_12                16.688919
pop_16                17.323097
pop_20                18.090788
pop_24                19.025367
pop_28                19.359146
pop_32                20.126836
pop_36                21.628838
pop_40                23.164219
dtype: float64

~20% of population values haven't interpolated. Worth investigating why this is (e.g. Andorra seems to be a complete miss--is it not in the population data?)

Likely because some codes are missing and others have dates that start in mid-1900s for example.

Remember there is no population data beyond 2018, so no values will be interpolated onto Tokyo 2020

In [55]:
results_f.loc[results_f.pop_4.isna(), 'country'].sort_values().unique()

array(['American Samoa', 'Andorra', 'Antigua and Barbuda', 'Aruba',
       'Australasia', 'Bahamas', 'Belize', 'Bermuda', 'Bhutan', 'Bohemia',
       'British Virgin Islands', 'Brunei', 'Cayman Islands',
       'Cook Islands', 'East Timor', 'Eritrea', 'Estonia',
       'Federated States of Micronesia', 'Fiji',
       'German Democratic Republic (Germany)', 'Grenada', 'Guam',
       'Guyana', 'Iceland', 'Independent Olympic Athletes', 'Israel',
       'Kiribati', 'Kosovo', 'Latvia', 'Liechtenstein', 'Lithuania',
       'Luxembourg', 'MIX', 'Maldives', 'Malta', 'Marshall Islands',
       'Monaco', 'Nauru', 'Netherlands Antilles', 'Nigeria',
       'North Borneo', 'North Yemen', 'Pakistan', 'Palau',
       'Papua New Guinea', 'Refugee Olympic Team', 'Russian Federation',
       'Saar', 'Saint Kitts and Nevis',
       'Saint Vincent and the Grenadines', 'Samoa', 'San Marino',
       'Serbia', 'Serbia and Montenegro', 'Solomon Islands', 'Somalia',
       'South Sudan', 'South Yemen', 'Surin

## Historic GDP per capita by country
Expect best feature to be 4yrs prior (start of cycle), but will generate 0, -4, -8, -12

In [56]:
gdp_fname = r'gdp-per-capita-maddison-2020.csv'
gdp = pd.read_csv(os.path.join(project_path, gdp_fname))

col_renames = {'Entity': 'country', 'Code': 'alpha_3', 'Year': 'year', 'GDP per capita': 'gdp_pc'}
gdp.rename(columns=col_renames, inplace=True)
               
gdp['year'] = gdp.year.astype('int')
               
print('max year: {}'.format(gdp.year.max()))
gdp.head()

max year: 2018


Unnamed: 0,country,alpha_3,year,gdp_pc,145446-annotations
0,Afghanistan,AFG,1950,1156.0,
1,Afghanistan,AFG,1951,1170.0,
2,Afghanistan,AFG,1952,1189.0,
3,Afghanistan,AFG,1953,1240.0,
4,Afghanistan,AFG,1954,1245.0,


GDPpc data goes up to 2018, which should be good enough (and as good as can get).

### Inerpolating GDP per capita onto games

In [57]:
year_deltas = list(range(0, 44, 4))

for yd in tqdm(year_deltas):
    results_f[f'gdppc_{yd}'] = results_f.apply(lambda row: historical_interp(country_code=row.alpha_3,
                                                                           year=row.year - yd,
                                                                           df_lookup=gdp,
                                                                           x_col='year',
                                                                           y_col='gdp_pc',
                                                                           country_code_col='alpha_3'), axis=1)

results_f.head()

100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [02:58<00:00, 16.26s/it]


Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,...,gdppc_4,gdppc_8,gdppc_12,gdppc_16,gdppc_20,gdppc_24,gdppc_28,gdppc_32,gdppc_36,gdppc_40
0,Athens 1896,Australia,AUS,AUS,0,1896,2,0,0,2,...,6368.0,7179.0,6861.0,6830.0,6387.0,5663.0,5031.0,4616.0,4613.0,4866.0
1,Athens 1896,Austria,AUT,AUT,0,1896,2,1,2,5,...,4041.0,3792.0,3583.0,3314.0,3188.0,3150.0,2942.8,2888.4,2834.0,2752.4
2,Athens 1896,Bulgaria,BGR,BUL,0,1896,0,0,0,0,...,1804.0,1719.454545,1634.909091,1550.363636,1465.818182,1381.272727,,,,
3,Athens 1896,Chile,CHL,CHI,0,1896,0,0,0,0,...,3005.0,2617.0,2703.0,2418.0,1988.0,1910.0,1714.0,1667.0,1588.0,1427.0
4,Athens 1896,Denmark,DNK,DEN,0,1896,1,2,3,6,...,4141.0,3808.0,3642.0,3476.0,3395.0,3327.0,2954.0,2922.0,2775.0,2735.0


In [58]:
results_f.isna().sum() / results_f.shape[0] * 100

games                  0.000000
country                0.000000
alpha_3                0.700935
ioc_code               0.000000
host                   0.000000
year                   0.000000
gold                   0.000000
silver                 0.000000
bronze                 0.000000
total                  0.000000
summer                 0.000000
associate_nations    100.000000
pop_0                 21.428571
pop_4                 16.088117
pop_8                 16.355140
pop_12                16.688919
pop_16                17.323097
pop_20                18.090788
pop_24                19.025367
pop_28                19.359146
pop_32                20.126836
pop_36                21.628838
pop_40                23.164219
gdppc_0               22.897196
gdppc_4               17.590120
gdppc_8               17.990654
gdppc_12              18.457944
gdppc_16              19.359146
gdppc_20              20.460614
gdppc_24              22.096128
gdppc_28              23.064085
gdppc_32

Similar numbers (~20%) of NaNs as with population. Investigate where they are in EDA.

## Home Games
Home games are already flagged in the data at this stage (from the Wikipedia participation data). Hypotheses are:
1. Having a future games awareded may raise performance in games prior. Recently they have been awarded 7 years in advance, so only one cycle ahead of the games would see a difference
2. A country may continue to perform highly after hosting due to effects of investment etc. Assume up to +3 cycles initially (can look to more)

-1, +1, +2, +3 cycles

In [59]:
results_f[results_f.host==1].head(3)

Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,...,gdppc_4,gdppc_8,gdppc_12,gdppc_16,gdppc_20,gdppc_24,gdppc_28,gdppc_32,gdppc_36,gdppc_40
9,Athens 1896,Greece,GRC,GRE,1,1896,10,18,19,47,...,2053.0,2566.0,2351.0,2028.0,1921.0,1806.0,2020.0,1854.0,2125.0,1757.0
26,Paris 1900,France,FRA,FRA,1,1900,27,38,37,102,...,4280.0,3974.0,3617.0,3591.0,3379.0,3233.0,3312.0,3159.0,3169.0,3016.0
59,St. Louis 1904,United States of America,USA,USA,1,1904,78,82,79,239,...,8037.57,6885.67,7324.06,6447.28,6523.13,6255.73,5049.84,4991.98,4815.86,4889.14


In [60]:
def home_advantage(df_in, relative_games=[-1, 0, 1, 2, 3]):
    '''Function that adds columns of flags for whether countries are due to host games in the futures or have hosted games in the past.
    
    Args:
        df_in: The full pandas dataframe to insert the outputs to
        relative_games (optional): the relative games-cycles to a hosting year. For example, -1 would flag 1 for hosting year - 4 (i.e. the prior Olympics)
        
    Returns:
        A pandas dataframe with columns appended for each value in the relative games list. Naming uses "n" for prior cycles and "p" for future ones. Cycles are converted to years.
        
        For example, -1 would be host_n4, and 3 would be host_p12'''
    
    # copy df to output
    df = df_in.copy()
    
    # get list of hosts to iterate through
    hosts = df[df.host==1].copy()
    
    for i, row in hosts.iterrows():
        
        # filter temp dataframe for current host and extract year of hosting
        df_host = df[df.alpha_3==row['alpha_3']]
        yr = row['year']
        
        # iterate through relative games
        for rg in relative_games:
            # convert to years
            rg *= 4
            
            # set column prefixes
            if rg < 0:
                prefix = 'n'
            elif rg > 0:
                prefix = 'p'
            else:
                prefix = ''
            
            # generate column name and add to df if doesn't exist
            col = f'host_{prefix}{abs(rg)}'
            if col not in df.columns:
                df[col] = 0
            
            # get df index for relative year
            relative_yr = yr + rg
            if df_host[df_host.year==relative_yr].shape[0] != 0:
                idx = df_host[df_host.year==relative_yr].index[0]
                df.loc[idx, col] = 1
            
    return df

In [61]:
results_f = home_advantage(df_in=results_f, relative_games=[-1, 0, 1, 2, 3])
results_f[(results_f.alpha_3=='CHN') & (results_f.year>2000)]

Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,...,gdppc_24,gdppc_28,gdppc_32,gdppc_36,gdppc_40,host_n4,host_0,host_p4,host_p8,host_p12
2009,Athens 2004,People's Republic of China,CHN,CHN,0,2004,32,17,14,63,...,1930.0,1519.0,1431.0,1178.0,1152.0,1,0,0,0,0
2210,Beijing 2008,People's Republic of China,CHN,CHN,1,2008,48,22,30,100,...,2444.0,1930.0,1519.0,1431.0,1178.0,0,1,0,0,0
2415,London 2012,People's Republic of China,CHN,CHN,0,2012,38,31,22,91,...,3045.0,2444.0,1930.0,1519.0,1431.0,0,0,1,0,0
2622,Rio 2016,People's Republic of China,CHN,CHN,0,2016,26,18,26,70,...,3227.74,3045.0,2444.0,1930.0,1519.0,0,0,0,1,0
2830,Tokyo 2020,People's Republic of China,CHN,CHN,0,2021,38,32,18,88,...,4310.83,3506.48,3022.0,2619.0,1962.0,0,0,0,0,0


In [62]:
results_f.isna().sum() / results_f.shape[0] * 100

games                  0.000000
country                0.000000
alpha_3                0.700935
ioc_code               0.000000
host                   0.000000
year                   0.000000
gold                   0.000000
silver                 0.000000
bronze                 0.000000
total                  0.000000
summer                 0.000000
associate_nations    100.000000
pop_0                 21.428571
pop_4                 16.088117
pop_8                 16.355140
pop_12                16.688919
pop_16                17.323097
pop_20                18.090788
pop_24                19.025367
pop_28                19.359146
pop_32                20.126836
pop_36                21.628838
pop_40                23.164219
gdppc_0               22.897196
gdppc_4               17.590120
gdppc_8               17.990654
gdppc_12              18.457944
gdppc_16              19.359146
gdppc_20              20.460614
gdppc_24              22.096128
gdppc_28              23.064085
gdppc_32

## Medals Per Team
Hypothesis: the number of medals won is a function of medals available per team participating.

In [63]:
results_f[results_f.year==1912].shape

(28, 39)

In [64]:
games_group = results_f.groupby('games')
games = games_group[['year']].first()
games['games_gold'] = games_group['gold'].sum()
games['games_silver'] = games_group['silver'].sum()
games['games_bronze'] = games_group['bronze'].sum()
games['games_medals'] = games_group['total'].sum()
games['n_teams'] = games_group['total'].count()
games['medals_per_team'] = games.games_medals / games.n_teams
games = games.sort_values('year').reset_index()
games.head()

Unnamed: 0,games,year,games_gold,games_silver,games_bronze,games_medals,n_teams,medals_per_team
0,Athens 1896,1896,43,43,36,122,16,7.625
1,Paris 1900,1900,96,95,93,284,32,8.875
2,St. Louis 1904,1904,97,96,93,286,13,22.0
3,London 1908,1908,110,107,107,324,22,14.727273
4,Stockholm 1912,1912,108,105,104,317,28,11.321429


In [65]:
results_f[results_f.year==1896].sort_values('country')

Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,...,gdppc_24,gdppc_28,gdppc_32,gdppc_36,gdppc_40,host_n4,host_0,host_p4,host_p8,host_p12
0,Athens 1896,Australia,AUS,AUS,0,1896,2,0,0,2,...,5663.0,5031.0,4616.0,4613.0,4866.0,0,0,0,0,0
1,Athens 1896,Austria,AUT,AUT,0,1896,2,1,2,5,...,3150.0,2942.8,2888.4,2834.0,2752.4,0,0,0,0,0
2,Athens 1896,Bulgaria,BGR,BUL,0,1896,0,0,0,0,...,1381.272727,,,,,0,0,0,0,0
3,Athens 1896,Chile,CHL,CHI,0,1896,0,0,0,0,...,1910.0,1714.0,1667.0,1588.0,1427.0,0,0,0,0,0
4,Athens 1896,Denmark,DNK,DEN,0,1896,1,2,3,6,...,3327.0,2954.0,2922.0,2775.0,2735.0,0,0,0,0,0
5,Athens 1896,Egypt,EGY,EGY,0,1896,0,0,0,0,...,1217.27907,1185.44,1166.32,1147.2,1128.08,0,0,0,0,0
6,Athens 1896,France,FRA,FRA,0,1896,5,4,2,11,...,3312.0,3159.0,3169.0,3016.0,2687.0,1,0,0,0,0
7,Athens 1896,Germany,DEU,GER,0,1896,6,5,2,13,...,3078.0,2966.0,2837.0,2613.0,2440.0,0,0,0,0,0
8,Athens 1896,Great Britain,GBR,GBR,0,1896,2,3,2,7,...,5769.0,5636.0,5255.0,5086.0,5013.0,0,0,0,0,0
9,Athens 1896,Greece,GRC,GRE,1,1896,10,18,19,47,...,1806.0,2020.0,1854.0,2125.0,1757.0,0,1,0,0,0


In [66]:
cols = ['games', 'games_gold', 'games_silver', 'games_bronze', 'games_medals', 'n_teams', 'medals_per_team']
results_f = results_f.merge(games[cols],
                            how='left',
                            left_on='games',
                            right_on='games')
results_f.head()

Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,...,host_0,host_p4,host_p8,host_p12,games_gold,games_silver,games_bronze,games_medals,n_teams,medals_per_team
0,Athens 1896,Australia,AUS,AUS,0,1896,2,0,0,2,...,0,0,0,0,43,43,36,122,16,7.625
1,Athens 1896,Austria,AUT,AUT,0,1896,2,1,2,5,...,0,0,0,0,43,43,36,122,16,7.625
2,Athens 1896,Bulgaria,BGR,BUL,0,1896,0,0,0,0,...,0,0,0,0,43,43,36,122,16,7.625
3,Athens 1896,Chile,CHL,CHI,0,1896,0,0,0,0,...,0,0,0,0,43,43,36,122,16,7.625
4,Athens 1896,Denmark,DNK,DEN,0,1896,1,2,3,6,...,0,0,0,0,43,43,36,122,16,7.625


## Medals won at previous games
Total medals won at last x games. Store nan where a country hasn't attended a games.

In [67]:
def year_country_total(year, country, df):
    ''' Returns total value for a given year/country combination in df. If no year country match, returns nan.'''
    
    mask = (df.year==year) & (df.country==country)
    if df[mask].shape[0] != 0:
        return df.loc[mask, 'total'].iloc[0]
    else:
        return np.nan

    
def previous_games_medal_total(df, n_games):
    ''' Adds historic medal counts for countries from.
    
    ARGS:
        df: dataframe containing olympic results data. Must have country, year, total columns
        n_games: list of integers used to count back the number of four-year cycles
    
    RETURNS:
        Dataframe with column(s) added for each value in n_games'''
    
    df = df.copy()
    
    # add games name year col (required to account for Tokyo)
    games_name_year_col = 'games_name_year'
    df[games_name_year_col] = df.games.str[-4:].astype(int)
    
    # iterate through n_games
    for n in tqdm(n_games):
        feature_name = f'past_tot_{n}'
        df[feature_name] = df.apply(lambda row: year_country_total(year=row[games_name_year_col] - 4 * n, 
                                                                   country=row['country'], 
                                                                   df=df), axis=1)
    # drop games name year col
    df.drop(columns=[games_name_year_col], inplace=True)
    return df

In [68]:
year_country_total(2016, 'Great Britain', results_f)

67

In [69]:
results_f = previous_games_medal_total(results_f, [1, 2, 3, 4, 5])
results_f[results_f.country=='Great Britain'].head()

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:30<00:00,  6.04s/it]


Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,...,games_silver,games_bronze,games_medals,n_teams,medals_per_team,past_tot_1,past_tot_2,past_tot_3,past_tot_4,past_tot_5
8,Athens 1896,Great Britain,GBR,GBR,0,1896,2,3,2,7,...,43,36,122,16,7.625,,,,,
28,Paris 1900,Great Britain,GBR,GBR,0,1900,15,8,9,32,...,95,93,284,32,8.875,7.0,,,,
54,St. Louis 1904,Great Britain,GBR,GBR,0,1904,1,1,0,2,...,96,93,286,13,22.0,32.0,7.0,,,
71,London 1908,Great Britain,GBR,GBR,1,1908,56,51,39,146,...,107,107,324,22,14.727273,2.0,32.0,7.0,,
94,Stockholm 1912,Great Britain,GBR,GBR,0,1912,10,15,16,41,...,105,104,317,28,11.321429,146.0,2.0,32.0,7.0,


In [70]:
results_f.head()

Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,...,games_silver,games_bronze,games_medals,n_teams,medals_per_team,past_tot_1,past_tot_2,past_tot_3,past_tot_4,past_tot_5
0,Athens 1896,Australia,AUS,AUS,0,1896,2,0,0,2,...,43,36,122,16,7.625,,,,,
1,Athens 1896,Austria,AUT,AUT,0,1896,2,1,2,5,...,43,36,122,16,7.625,,,,,
2,Athens 1896,Bulgaria,BGR,BUL,0,1896,0,0,0,0,...,43,36,122,16,7.625,,,,,
3,Athens 1896,Chile,CHL,CHI,0,1896,0,0,0,0,...,43,36,122,16,7.625,,,,,
4,Athens 1896,Denmark,DNK,DEN,0,1896,1,2,3,6,...,43,36,122,16,7.625,,,,,


## Save results with features

In [71]:
results_f = results_f.drop(columns=['associate_nations'])
results_f.head()

Unnamed: 0,games,country,alpha_3,ioc_code,host,year,gold,silver,bronze,total,...,games_silver,games_bronze,games_medals,n_teams,medals_per_team,past_tot_1,past_tot_2,past_tot_3,past_tot_4,past_tot_5
0,Athens 1896,Australia,AUS,AUS,0,1896,2,0,0,2,...,43,36,122,16,7.625,,,,,
1,Athens 1896,Austria,AUT,AUT,0,1896,2,1,2,5,...,43,36,122,16,7.625,,,,,
2,Athens 1896,Bulgaria,BGR,BUL,0,1896,0,0,0,0,...,43,36,122,16,7.625,,,,,
3,Athens 1896,Chile,CHL,CHI,0,1896,0,0,0,0,...,43,36,122,16,7.625,,,,,
4,Athens 1896,Denmark,DNK,DEN,0,1896,1,2,3,6,...,43,36,122,16,7.625,,,,,


In [72]:
olympics_data_fname = r'olympics_summer_features.csv'
results_f.to_csv(os.path.join(project_path, olympics_data_fname), index=False)