In [1]:
import pandas as pd
import numpy as np
import altair as alt
import fidap
from datetime import datetime
import fidap
import config

# instantiate connection
fidap = fidap.fidap_client(api_key = config.api_key)

### Olympics and the Eastern Bloc  
  
Those of us old enough to remember the 1990s would remember that the Union of Soviet Socialist Republics disintegrated rather messily in the early to mid-1990s to form a total of 15 nation-states today. Athletes from these 15 countries used to compete under a different flag.  
  
The question here today is, how have these new republics performed vis-a-vis their respective previous political entity?  
  
Since the USSR only started participating in the Olympics starting in 1952, we can look at how it fared between 1952 and 1988.    

In [2]:
ussr_performance_event = fidap.sql("""
WITH ussr AS (
SELECT DISTINCT Games, Sport, Event, Medal
FROM fidap-301014.kaggle.olympics_athlete_events
WHERE Sex IN ('M', 'F')
AND NOC = 'URS'
AND Season = 'Summer'
AND CAST(Year AS INT) >= 1948
AND Medal <> "NA"
)

SELECT Games, 
    SUM(CASE Medal WHEN 'Gold' THEN 1 ELSE 0 END) AS n_gold,
    SUM(CASE Medal WHEN 'Silver' THEN 1 ELSE 0 END) AS n_silver,
    SUM(CASE Medal WHEN 'Bronze' THEN 1 ELSE 0 END) AS n_bronze,
    COUNT(Medal) AS total_medal_count
FROM ussr
GROUP BY Games
ORDER BY Games
""")

In [3]:
print(ussr_performance_event)

         Games  n_gold  n_silver  n_bronze  total_medal_count
0  1952 Summer      22        29        19                 70
1  1956 Summer      37        29        32                 98
2  1960 Summer      43        29        31                103
3  1964 Summer      30        30        33                 93
4  1968 Summer      29        32        30                 91
5  1972 Summer      50        27        22                 99
6  1976 Summer      49        41        35                125
7  1980 Summer      80        69        46                195
8  1988 Summer      54        31        46                131


We can skip 1992 because the USSR was in the midst of its breakup and the respective International Olympic Committees had not been formed in time for the 1992 Barcelona Games. We can begin comparing the succesor nations starting in 1996. Basically, we want to ask if the successor nations' combined performance is at least similar to that of the USSR. 

In [4]:
former_ussr_performance_event = fidap.sql("""
WITH ussr AS (
SELECT DISTINCT Games, Sport, Event, Medal
FROM fidap-301014.kaggle.olympics_athlete_events
WHERE Sex IN ('M', 'F')
AND Team IN ('Lithuania', 'Moldova', 'Latvia', 'Estonia', 'Ukraine', 'Belarus', 'Russia', 
'Azerbaijan', 'Armenia', 'Georgia', 'Turkmenistan', 'Tajikistan', 'Uzbekistan', 'Kazakhstan', 'Kyrgystan')
AND Season = 'Summer'
AND CAST(Year AS INT) >= 1996
AND Medal <> "NA"
)

SELECT Games, 
    SUM(CASE Medal WHEN 'Gold' THEN 1 ELSE 0 END) AS n_gold,
    SUM(CASE Medal WHEN 'Silver' THEN 1 ELSE 0 END) AS n_silver,
    SUM(CASE Medal WHEN 'Bronze' THEN 1 ELSE 0 END) AS n_bronze,
    COUNT(Medal) AS total_medal_count
FROM ussr
GROUP BY Games
ORDER BY Games
""")

In [5]:
former_ussr_performance_event

Unnamed: 0,Games,n_gold,n_silver,n_bronze,total_medal_count
0,1996 Summer,40,38,43,121
1,2000 Summer,48,48,64,160
2,2004 Summer,45,50,60,155
3,2008 Summer,43,44,76,163
4,2012 Summer,46,45,65,156
5,2016 Summer,33,45,59,137


Some might even argue that the former Soviet Union republics have done better post-independence.

In [6]:
fsr_medal_count = fidap.sql("""
WITH event AS (
SELECT DISTINCT r.region AS Region, e.Event, e.Medal, e.Year
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE Season = 'Summer'
AND CAST(Year AS INT) > 1992
AND Medal <> "NA"
)

SELECT DISTINCT Region as Country, COUNT(Medal) AS n_medals, Year,
ROW_NUMBER() OVER (PARTITION BY Year ORDER BY COUNT(Medal) DESC) AS rank
FROM event
WHERE Region IS NOT NULL
AND Region IN ('Lithuania', 'Moldova', 'Latvia', 'Estonia', 'Ukraine', 'Belarus', 'Russia', 
'Azerbaijan', 'Armenia', 'Georgia', 'Turkmenistan', 'Tajikistan', 'Uzbekistan', 'Kazakhstan', 'Kyrgystan')
GROUP BY Region, Year;
""")

In [7]:
alt.Chart(fsr_medal_count).mark_line(point = True).encode(
    x = 'Year:O',
    y = alt.Y('n_medals', title = 'Medal Count'),
    color = alt.Color("Country:N"),
    tooltip = [alt.Tooltip('Country')]).properties(
    title = "Comrades in Sports",
    width = 550,
    height = 350
)

Clearly, Russia is the powerhouse here although Ukraine and Belarus do a pretty good job as well.

### Sporting Dominance

What were some of the top 10 sports that the USSR was really good at in the Olympics? 

In [8]:
ussr_sporting_dominance = fidap.sql("""
SELECT r.region AS Region, Sport, SUM(CASE Medal
    WHEN 'Gold' THEN 3
    WHEN 'Silver' THEN 2
    WHEN 'Bronze' THEN 1
    ELSE 0
    END) AS Performance
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE e.NOC = 'URS'
AND Season = 'Summer'
AND CAST(Year AS INT) < 1992
GROUP BY e.Sport, r.region
ORDER BY Performance DESC
LIMIT 10;
""")

# resetting index
ussr_sporting_dominance.index = ussr_sporting_dominance.index + 1
ussr_sporting_dominance = ussr_sporting_dominance.reset_index()
ussr_sporting_dominance = ussr_sporting_dominance.rename(columns = {
    "index" : "USSR Rank",
    'Performance': 'USSR Performance'
})
ussr_sporting_dominance

Unnamed: 0,USSR Rank,Region,Sport,USSR Performance
0,1,Russia,Gymnastics,669
1,2,Russia,Athletics,478
2,3,Russia,Volleyball,351
3,4,Russia,Fencing,319
4,5,Russia,Basketball,292
5,6,Russia,Rowing,288
6,7,Russia,Wrestling,271
7,8,Russia,Canoeing,216
8,9,Russia,Handball,208
9,10,Russia,Swimming,206


Are these sports still what they do best in?

In [9]:
fsr_sporting_performance = fidap.sql("""
SELECT Sport, SUM(CASE Medal
    WHEN 'Gold' THEN 3
    WHEN 'Silver' THEN 2
    WHEN 'Bronze' THEN 1
    ELSE 0
    END) AS Performance
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE e.Team IN ('Lithuania', 'Moldova', 'Latvia', 'Estonia', 'Ukraine', 'Belarus', 'Russia', 
'Azerbaijan', 'Armenia', 'Georgia', 'Turkmenistan', 'Tajikistan', 'Uzbekistan', 'Kazakhstan', 'Kyrgystan')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
GROUP BY e.Sport
ORDER BY Performance DESC;
""")

# resetting index
fsr_sporting_performance.index = fsr_sporting_performance.index + 1
fsr_sporting_performance = fsr_sporting_performance.reset_index()
fsr_sporting_performance = fsr_sporting_performance.rename(columns = {
    "index" : "Post-USSR Rank",
    'Performance': 'Post-USSR Performance'
})

In [10]:
sporting_performance_comparison = ussr_sporting_dominance[['USSR Rank', 'Sport']].merge(fsr_sporting_performance[['Post-USSR Rank', 'Sport']], on = 'Sport')
sporting_performance_comparison

Unnamed: 0,USSR Rank,Sport,Post-USSR Rank
0,1,Gymnastics,3
1,2,Athletics,2
2,3,Volleyball,10
3,4,Fencing,5
4,5,Basketball,18
5,6,Rowing,14
6,7,Wrestling,1
7,8,Canoeing,12
8,9,Handball,8
9,10,Swimming,11


In [11]:
fsr_sporting_performance.head(n=10)

Unnamed: 0,Post-USSR Rank,Sport,Post-USSR Performance
0,1,Wrestling,304
1,2,Athletics,299
2,3,Gymnastics,204
3,4,Boxing,166
4,5,Fencing,164
5,6,Synchronized Swimming,162
6,7,Rhythmic Gymnastics,159
7,8,Handball,147
8,9,Weightlifting,140
9,10,Volleyball,128


If we look at it a little more closely, we notice that even though the top 10 sports have remained largely unchanged, the order has changed. Some sports have been in ascendance, such as Wrestling, Boxing, Weightlifting, and Synchronized Swimming.  
  
This implies that countries' competencies in sports can change over time.   

In [12]:
fsr_sporting_performance_country = fidap.sql("""
SELECT r.region AS Country, Sport, SUM(CASE Medal
    WHEN 'Gold' THEN 3
    WHEN 'Silver' THEN 2
    WHEN 'Bronze' THEN 1
    ELSE 0
    END) AS Performance
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE e.Team IN ('Lithuania', 'Moldova', 'Latvia', 'Estonia', 'Ukraine', 'Belarus', 'Russia', 
'Azerbaijan', 'Armenia', 'Georgia', 'Turkmenistan', 'Tajikistan', 'Uzbekistan', 'Kazakhstan', 'Kyrgystan')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
GROUP BY e.Sport, r.region
ORDER BY Performance DESC
LIMIT 10;
""")
fsr_sporting_performance_country

Unnamed: 0,Country,Sport,Performance
0,Russia,Athletics,206
1,Russia,Synchronized Swimming,162
2,Russia,Gymnastics,153
3,Russia,Fencing,138
4,Russia,Handball,132
5,Russia,Volleyball,128
6,Russia,Wrestling,128
7,Russia,Rhythmic Gymnastics,117
8,Russia,Swimming,77
9,Russia,Water Polo,63


That being said, Russia is still THE powerhouse amoung the former Soviet satellite states. This is not a surprise because of its population size.  
  
### US - Russia Rivalry  
  
US and Russia have always been rivals, even at the Olympics. Since we can conclude from our little analysis above that even among the CIS-states, Russia dominates in the Olympics. Hence, the USSR's overall performance since its inclusion in 1952 is largely reflective of Russia's. Furthermore, Russia is the successor state of the USSR.  
  
However, we still cannot compare the two directly because the USSR is quite different from post-Communist Russia. Hence, we can only start the comparison in 1996.     
  
1) Has Russia outperformed the United States in terms of medal count and medal score?  
2) Is performance a matter of contingent size? On a per-athlete basis, which country is more efficient?  
3) Which are the fields in which Russia and the United States dominate in respectively?  


If we assign each Gold, Silver, and Bronze medal 3, 2, and 1 point respectively, and sum them up by country, we can see that the United States has outperformed Russia. 

In [13]:
united_states_russia_2 = fidap.sql("""
WITH event AS (
SELECT DISTINCT r.region AS Country, e.Event, e.Medal
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE r.region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
)

SELECT Country, SUM(CASE Medal
    WHEN 'Gold' THEN 3
    WHEN 'Silver' THEN 2
    WHEN 'BRONZE' THEN 1
    ELSE 0 END) AS Performance
FROM event
GROUP BY Country
ORDER BY Performance DESC;                                                              
""")
united_states_russia_2

Unnamed: 0,Country,Performance
0,USA,574
1,Russia,483


What about the medal count per Games? Team events will only be counted once.  

In [14]:
medal_count = fidap.sql("""
WITH event AS (
SELECT DISTINCT r.region AS Country, e.Event, e.Medal, e.Year
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE r.region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
AND Medal <> "NA"
)

SELECT DISTINCT Country, COUNT(Medal) AS n_medals, Year
FROM event
GROUP BY Country, Year
""")

In [15]:
alt.Chart(medal_count).mark_bar().encode(
    x = alt.X('Country', axis = alt.Axis(labels = False), title = None),
    y = alt.Y('n_medals', title = "No. of Medals"),
    color = alt.Color('Country'),
    column = 'Year'
)

What if we look at the size of each country's contingent?   

In [16]:
contingent_size = fidap.sql("""
SELECT DISTINCT r.Region,  e.Year, COUNT(Name) AS n_athletes 
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON e.NOC = r.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(YEAR AS INT) > 1992
GROUP BY r.Region, Year
ORDER BY e.Year, r.Region;
""")

In [17]:
alt.Chart(contingent_size).mark_bar().encode(
    x = alt.X('Region', axis = alt.Axis(labels = False), title = None),
    y = alt.Y('n_athletes', title = "No. of Athletes"),
    color = alt.Color('Region', title = "Country"),
    column = 'Year'
)

The United States definitely sends more athletes to the Games than Russia.

In [18]:
medal_count_per_Games = fidap.sql("""
WITH athlete_count AS (
SELECT DISTINCT r.Region AS Country,  e.Year, COUNT(Name) AS n_athletes 
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON e.NOC = r.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(YEAR AS INT) > 1992
GROUP BY r.Region, Year
ORDER BY e.Year, r.Region
),

medal_count AS (
WITH event AS (
SELECT DISTINCT r.region AS Region, e.Event, e.Medal, e.Year
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
AND Medal <> "NA"
)

SELECT DISTINCT Region as Country, COUNT(Medal) AS n_medals, Year
FROM event
GROUP BY Region, Year
)

SELECT DISTINCT ac.Country, ac.Year, ROUND(mc.n_medals/ac.n_athletes,2) AS medals_per_athlete,
FROM medal_count AS mc
INNER JOIN athlete_count AS ac
ON mc.Country = ac.Country
AND mc.Year = ac.Year
ORDER BY ac.Year, ac.Country 
""")


In [19]:
alt.Chart(medal_count_per_Games).mark_line(point = True).encode(
    x = "Year:N",
    y = 'medals_per_athlete',
    color = 'Country'
).properties(width = 500, height = 300)

Of course, countries that send more athletes are more likely to win more medals simply because they compete in more events.    
  
How about dominance? Are the US and Russia good at different things? We can count this by looking at each country's medal score for each sport between 1996 and 2016.    

In [20]:
sport_dominance = fidap.sql("""
WITH event AS (
SELECT DISTINCT r.region AS Region, e.Event, e.Medal, e.Sport, e.Year
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
AND Medal <> "NA"
)

SELECT region AS Region, Sport, SUM(CASE Medal
    WHEN 'Gold' THEN 3
    WHEN 'Silver' THEN 2
    WHEN 'Bronze' THEN 1
    ELSE 0
    END) AS Performance
FROM event
GROUP BY Sport, region
ORDER BY Sport, Performance DESC;
""")

In [21]:
sport_dominance_wide = sport_dominance.pivot(index = 'Sport', columns = 'Region', values = 'Performance')
sport_dominance_wide = sport_dominance_wide.reset_index().fillna(0)
sport_dominance_wide['performance_gap'] = abs(sport_dominance_wide['Russia'] - sport_dominance_wide['USA'])
sport_dominance_wide = sport_dominance_wide.sort_values('performance_gap', ascending = False)

In [22]:
mean_performance_gap = round(np.mean(sport_dominance_wide.performance_gap),2)
median_performance_gap = np.median(sport_dominance_wide.performance_gap)

In [23]:
print("Mean Performance Gap: " + str(mean_performance_gap))
print("Median Performance Gap: " + str(median_performance_gap))

Mean Performance Gap: 32.42
Median Performance Gap: 16.0


The statistics above suggest that there are certain sports where either country significantly outshines the other. Given the ability of both countries to rack up high medal counts, it probably is the case that they dominate in different sports. And we can see this in the table below where Russia does best in Wrestling, Weightlifting, Gymnastics etc and the United States powers ahead in Swimming and Athletics. 

In [24]:
#alt.Chart(sport_dominance_wide).mark_bar().encode(
#    x = alt.X('performance_gap:Q', bin = True),
#    y = 'count()'
#)
sport_dominance_wide.head(n=10)

Region,Sport,Russia,USA,performance_gap
23,Swimming,42.0,394.0,352.0
1,Athletics,156.0,309.0,153.0
32,Wrestling,128.0,59.0,69.0
31,Weightlifting,45.0,5.0,40.0
18,Rhythmic Gymnastics,40.0,0.0,40.0
11,Fencing,57.0,22.0,35.0
4,Basketball,3.0,34.0,31.0
10,Equestrianism,0.0,30.0,30.0
19,Rowing,5.0,32.0,27.0
6,Boxing,55.0,29.0,26.0


### New Kids On The Podium  
  
We know that Russia and the United States have both done very well historically speaking. We also see that their medal gain has remained fairly consistent. Are there other countries that are fast becoming sporting powerhouses?    

In [25]:
up_and_coming = fidap.sql("""
WITH event AS (
SELECT DISTINCT r.region AS Region, e.Event, e.Medal, e.Year
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
AND Medal <> "NA"
),

games_medal_count AS(
SELECT DISTINCT Region as Country, COUNT(Medal) AS n_medals, Year,
ROW_NUMBER() OVER (PARTITION BY Year ORDER BY COUNT(Medal) DESC) AS rank
FROM event
WHERE Region IS NOT NULL
GROUP BY Region, Year
)

SELECT *,
n_medals - (LAG(n_medals) OVER (PARTITION BY Country ORDER BY Year)) AS delta_previous
FROM games_medal_count
WHERE rank <= 10;
""")

In [26]:
alt.Chart(up_and_coming).mark_line(point = True).encode(
    x = 'Year:O',
    y = 'rank:O',
    color = alt.Color("Country:N"),
    tooltip = [alt.Tooltip('Country')]).properties(
    title = "Top Performing Countries",
    width = 550,
    height = 350
)

Interestingly, we see that the United States has consistently remained at the top where medal haul is concerned. The UK has actually improved over the years to be a serious contender whereas Russia's performance looks like it is on the way down. China is now the US' most serious contender as a sporting power. 