In [1]:
import pandas as pd
import numpy as np
import altair as alt
import fidap
from datetime import datetime
import fidap
import config

# instantiate connection
fidap = fidap.fidap_client(api_key = config.api_key)

### Olympics and the Eastern Bloc  
  
Those of us old enough to remember the 1990s would remember that there were two nations, one called the Union of Soviet Socialist Republics and the other Socialist Federal Republic of Yugoslavia, that disintegrated rather messily in the early to mid-1990s to form a total of 22 nation-states today. Athletes from these 22 countries used to compete under a different flag.  
  
The question here today is, how have these new republics performed vis-a-vis their respective previous political entity?  
  
We can start by looking at just the post-WWII Summer Games up until 1988.    

In [24]:
yugoslavia_performance_event = fidap.sql("""
WITH yugoslavia AS (
SELECT DISTINCT Games, Sport, Event, Medal
FROM fidap-301014.kaggle.olympics_athlete_events
WHERE Sex IN ('M', 'F')
AND (Team LIKE '%Yugoslavia%')
AND Season = 'Summer'
AND CAST(Year AS INT) >= 1948
AND Medal <> "NA"
)

SELECT Games, 
    SUM(CASE Medal WHEN 'Gold' THEN 1 ELSE 0 END) AS n_gold,
    SUM(CASE Medal WHEN 'Silver' THEN 1 ELSE 0 END) AS n_silver,
    SUM(CASE Medal WHEN 'Bronze' THEN 1 ELSE 0 END) AS n_bronze,
    COUNT(Medal) AS total_medal_count
FROM yugoslavia
GROUP BY Games
""") 

In [26]:
print(yugoslavia_performance_event)

          Games  n_gold  n_silver  n_bronze  total_medal_count
0   1948 Summer       0         2         0                  2
1   1952 Summer       1         2         0                  3
2   1956 Summer       0         3         0                  3
3   1960 Summer       1         1         0                  2
4   1964 Summer       2         1         2                  5
5   1968 Summer       3         3         2                  8
6   1972 Summer       2         1         2                  5
7   1976 Summer       2         3         3                  8
8   1980 Summer       2         3         4                  9
9   1984 Summer       7         4         7                 18
10  1988 Summer       3         4         5                 12


In [27]:
ussr_performance_event = fidap.sql("""
WITH ussr AS (
SELECT DISTINCT Games, Sport, Event, Medal
FROM fidap-301014.kaggle.olympics_athlete_events
WHERE Sex IN ('M', 'F')
AND NOC = 'URS'
AND Season = 'Summer'
AND CAST(Year AS INT) >= 1948
AND Medal <> "NA"
)

SELECT Games, 
    SUM(CASE Medal WHEN 'Gold' THEN 1 ELSE 0 END) AS n_gold,
    SUM(CASE Medal WHEN 'Silver' THEN 1 ELSE 0 END) AS n_silver,
    SUM(CASE Medal WHEN 'Bronze' THEN 1 ELSE 0 END) AS n_bronze,
    COUNT(Medal) AS total_medal_count
FROM ussr
GROUP BY Games
""")

In [28]:
print(ussr_performance_event)

         Games  n_gold  n_silver  n_bronze  total_medal_count
0  1960 Summer      43        29        31                103
1  1952 Summer      22        29        19                 70
2  1956 Summer      37        29        32                 98
3  1964 Summer      30        30        33                 93
4  1968 Summer      29        32        30                 91
5  1972 Summer      50        27        22                 99
6  1976 Summer      49        41        35                125
7  1980 Summer      80        69        46                195
8  1988 Summer      54        31        46                131


We can skip 1992 because Yugoslavia and the USSR were in the midst of their breakup. We can begin comparing the succesor nations starting in 1996.

In [29]:
former_ussr_performance_event = fidap.sql("""
WITH ussr AS (
SELECT DISTINCT Games, Sport, Event, Medal
FROM fidap-301014.kaggle.olympics_athlete_events
WHERE Sex IN ('M', 'F')
AND Team IN ('Lithuania', 'Moldova', 'Latvia', 'Estonia', 'Ukraine', 'Belarus', 'Russia', 
'Azerbaijan', 'Armenia', 'Georgia', 'Turkmenistan', 'Tajikistan', 'Uzbekistan', 'Kazakhstan', 'Kyrgystan')
AND Season = 'Summer'
AND CAST(Year AS INT) >= 1996
AND Medal <> "NA"
)

SELECT Games, 
    SUM(CASE Medal WHEN 'Gold' THEN 1 ELSE 0 END) AS n_gold,
    SUM(CASE Medal WHEN 'Silver' THEN 1 ELSE 0 END) AS n_silver,
    SUM(CASE Medal WHEN 'Bronze' THEN 1 ELSE 0 END) AS n_bronze,
    COUNT(Medal) AS total_medal_count
FROM ussr
GROUP BY Games
""")

In [30]:
former_ussr_performance_event

Unnamed: 0,Games,n_gold,n_silver,n_bronze,total_medal_count
0,1996 Summer,40,38,43,121
1,2000 Summer,48,48,64,160
2,2004 Summer,45,50,60,155
3,2008 Summer,43,44,76,163
4,2012 Summer,46,45,65,156
5,2016 Summer,33,45,59,137


In [33]:
former_sfry_performance_event = fidap.sql("""
WITH sfry AS (
SELECT DISTINCT Games, Sport, Event, Medal
FROM fidap-301014.kaggle.olympics_athlete_events
WHERE Sex IN ('M', 'F')
AND Team IN ('Serbia', 'Kosovo', 'Montenegro', 'Croatia', 'Slovenia', 'Bosnia and Herzegovina', 'North Macedonia')
AND Season = 'Summer'
AND CAST(Year AS INT) >= 1996
AND Medal <> "NA"
)

SELECT Games, 
    SUM(CASE Medal WHEN 'Gold' THEN 1 ELSE 0 END) AS n_gold,
    SUM(CASE Medal WHEN 'Silver' THEN 1 ELSE 0 END) AS n_silver,
    SUM(CASE Medal WHEN 'Bronze' THEN 1 ELSE 0 END) AS n_bronze,
    COUNT(Medal) AS total_medal_count
FROM sfry
GROUP BY Games
ORDER BY Games
""")

In [34]:
former_sfry_performance_event

Unnamed: 0,Games,n_gold,n_silver,n_bronze,total_medal_count
0,1996 Summer,1,3,0,4
1,2000 Summer,3,0,1,4
2,2004 Summer,1,3,5,9
3,2008 Summer,1,5,7,13
4,2012 Summer,5,4,6,15
5,2016 Summer,9,9,5,23


### Sporting Dominance

What were some of the top 10 sports that the USSR was really good at in the Olympics? 

In [47]:
ussr_sporting_dominance = fidap.sql("""
SELECT r.region AS Region, Sport, SUM(CASE Medal
    WHEN 'Gold' THEN 3
    WHEN 'Silver' THEN 2
    WHEN 'Bronze' THEN 1
    ELSE 0
    END) AS Performance
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE e.NOC = 'URS'
AND Season = 'Summer'
AND CAST(Year AS INT) < 1992
GROUP BY e.Sport, r.region
ORDER BY Performance DESC
LIMIT 10;
""")

In [48]:
ussr_sporting_dominance

Unnamed: 0,Region,Sport,Performance
0,Russia,Gymnastics,669
1,Russia,Athletics,478
2,Russia,Volleyball,351
3,Russia,Fencing,319
4,Russia,Basketball,292
5,Russia,Rowing,288
6,Russia,Wrestling,271
7,Russia,Canoeing,216
8,Russia,Handball,208
9,Russia,Swimming,206


And what about its successor republics today? 

In [51]:
fsr_sporting_performance = fidap.sql("""
SELECT r.region AS Region, Sport, SUM(CASE Medal
    WHEN 'Gold' THEN 3
    WHEN 'Silver' THEN 2
    WHEN 'Bronze' THEN 1
    ELSE 0
    END) AS Performance
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE e.Team IN ('Lithuania', 'Moldova', 'Latvia', 'Estonia', 'Ukraine', 'Belarus', 'Russia', 
'Azerbaijan', 'Armenia', 'Georgia', 'Turkmenistan', 'Tajikistan', 'Uzbekistan', 'Kazakhstan', 'Kyrgystan')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
GROUP BY e.Sport, r.region
ORDER BY Performance DESC;
""")

In [56]:
fsr_sporting_performance = fsr_sporting_performance.loc[(fsr_sporting_performance.loc[:,'Performance']>20),:]

In [57]:
fsr_sporting_performance

Unnamed: 0,Region,Sport,Performance
0,Russia,Athletics,206
1,Russia,Synchronized Swimming,162
2,Russia,Gymnastics,153
3,Russia,Fencing,138
4,Russia,Handball,132
5,Russia,Volleyball,128
6,Russia,Wrestling,128
7,Russia,Rhythmic Gymnastics,117
8,Russia,Swimming,77
9,Russia,Water Polo,63


Unsurprisingly, it is still dominated by Russia which is not a surprise because of its population size.  
  
### US - Russia Rivalry  
  
US and Russia have always been rivals, even at the Olympics. Since we can conclude from our little analysis above that even among the CIS-states, Russia dominates in the Olympics. Hence, the USSR's overall performance since its inclusion in 1952 is largely reflective of Russia's. Furthermore, Russia is the successor state of the USSR.  
  
However, we still cannot compare the two directly because the USSR is quite different from post-Communist Russia. Hence, we can only start the comparison in 1996.     
  
1) Has Russia outperformed the United States in terms of medal count and medal score?  
2) Is performance a matter of contingent size? On a per-athlete basis, which country is more efficient?  
3) Which are the fields in which Russia and the United States dominate in respectively?  


In [60]:
united_states_russia = fidap.sql("""
SELECT r.region AS Region, SUM(CASE Medal
    WHEN 'Gold' THEN 3
    WHEN 'Silver' THEN 2
    WHEN 'Bronze' THEN 1
    ELSE 0
    END) AS Performance
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
GROUP BY r.region
ORDER BY Performance DESC
LIMIT 10;
""")

In [61]:
united_states_russia

Unnamed: 0,Region,Performance
0,USA,3596
1,Russia,1749


If we assign each Gold, Silver, and Bronze medal 3, 2, and 1 point respectively, and sum them up by country, we can see that the United States has outperformed Russia. 

In [66]:
united_states_russia_2 = fidap.sql("""
WITH event AS (
SELECT DISTINCT r.region AS Region, e.Event, e.Medal
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
)

SELECT Region, SUM(CASE Medal
    WHEN 'Gold' THEN 3
    WHEN 'Silver' THEN 2
    WHEN 'BRONZE' THEN 1
    ELSE 0 END) AS Performance
FROM event
GROUP BY Region
ORDER BY Performance DESC;                                                              
""")

In [67]:
united_states_russia_2

Unnamed: 0,Region,Performance
0,USA,574
1,Russia,483


What about the medal count per Games? 

In [110]:
medal_count = fidap.sql("""
WITH event AS (
SELECT DISTINCT r.region AS Region, e.Event, e.Medal, e.Year
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
AND Medal <> "NA"
)

SELECT DISTINCT Region as Country, COUNT(Medal) AS n_medals, Year
FROM event
GROUP BY Region, Year
""")

In [111]:
medal_count

Unnamed: 0,Country,n_medals,Year
0,Russia,63,1996
1,Russia,89,2000
2,Russia,90,2004
3,Russia,72,2008
4,Russia,82,2012
5,Russia,56,2016
6,USA,96,1996
7,USA,91,2000
8,USA,101,2004
9,USA,110,2008


The previous count while technically correct, was also a little imprecise. It recorded the medal haul of the United States by athletes. That assigns team sports such as Basketball and Soccer a stronger weight in the final medal tally. If we look at each event only once, regardless of the number of participants needed for each event, the outcome is a lot more nuanced. The United States is still ahead. But not by so much.  
  
What if we look at the size of each country's contingent?   

In [80]:
contingent_size = fidap.sql("""
SELECT DISTINCT r.Region,  e.Year, COUNT(Name) AS n_athletes 
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON e.NOC = r.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(YEAR AS INT) > 1992
GROUP BY r.Region, Year
ORDER BY e.Year, r.Region;
""")

In [90]:
alt.Chart(contingent_size).mark_bar().encode(
    x = alt.X('Region', axis = alt.Axis(labels = False), title = None),
    y = alt.Y('n_athletes', title = "No. of Athletes"),
    color = alt.Color('Region', title = "Country"),
    column = 'Year'
)

The United States definitely sends more athletes to the Games than Russia.

In [115]:
medal_count_per_Games = fidap.sql("""
WITH athlete_count AS (
SELECT DISTINCT r.Region AS Country,  e.Year, COUNT(Name) AS n_athletes 
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON e.NOC = r.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(YEAR AS INT) > 1992
GROUP BY r.Region, Year
ORDER BY e.Year, r.Region
),

medal_count AS (
WITH event AS (
SELECT DISTINCT r.region AS Region, e.Event, e.Medal, e.Year
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
AND Medal <> "NA"
)

SELECT DISTINCT Region as Country, COUNT(Medal) AS n_medals, Year
FROM event
GROUP BY Region, Year
)

SELECT DISTINCT ac.Country, ac.Year, ROUND(mc.n_medals/ac.n_athletes,2) AS medals_per_athlete,
FROM medal_count AS mc
INNER JOIN athlete_count AS ac
ON mc.Country = ac.Country
AND mc.Year = ac.Year
ORDER BY ac.Year, ac.Country 
""")


In [125]:
alt.Chart(medal_count_per_Games).mark_line(point = True).encode(
    x = "Year:N",
    y = 'medals_per_athlete',
    color = 'Country'
).properties(width = 800, height = 300)

Of course, countries that send more athletes are more likely to win more medals simply because they compete in more events.    
  
How about dominance? Are the US and Russia good at different things? We can count this by looking at each country's medal score for each sport between 1996 and 2016.    

In [142]:
sport_dominance = fidap.sql("""
WITH event AS (
SELECT DISTINCT r.region AS Region, e.Event, e.Medal, e.Sport, e.Year
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON r.NOC = e.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(Year AS INT) > 1992
AND Medal <> "NA"
)

SELECT region AS Region, Sport, SUM(CASE Medal
    WHEN 'Gold' THEN 3
    WHEN 'Silver' THEN 2
    WHEN 'Bronze' THEN 1
    ELSE 0
    END) AS Performance
FROM event
GROUP BY Sport, region
ORDER BY Sport, Performance DESC;
""")

In [144]:
sport_dominance_wide = sport_dominance.pivot(index = 'Sport', columns = 'Region', values = 'Performance')
sport_dominance_wide = sport_dominance_wide.reset_index().fillna(0)
sport_dominance_wide['performance_gap'] = abs(sport_dominance_wide['Russia'] - sport_dominance_wide['USA'])
sport_dominance_wide = sport_dominance_wide.sort_values('performance_gap', ascending = False)

In [148]:
mean_performance_gap = round(np.mean(sport_dominance_wide.performance_gap),2)
median_performance_gap = np.median(sport_dominance_wide.performance_gap)

In [150]:
print("Mean Performance Gap: " + str(mean_performance_gap))
print("Median Performance Gap: " + str(median_performance_gap))

Mean Performance Gap: 32.42
Median Performance Gap: 16.0


The statistics above suggest that there are certain sports where either country significantly outshines the other. Given the ability of both countries to rack up high medal counts, it probably is the case that they dominate in different sports. And we can see this in the table below where Russia does best in Wrestling, Weightlifting, Gymnastics etc and the United States powers ahead in Swimming and Athletics. 

In [151]:
sport_dominance_wide.head(n=10)

Region,Sport,Russia,USA,performance_gap
23,Swimming,42.0,394.0,352.0
1,Athletics,156.0,309.0,153.0
32,Wrestling,128.0,59.0,69.0
31,Weightlifting,45.0,5.0,40.0
18,Rhythmic Gymnastics,40.0,0.0,40.0
11,Fencing,57.0,22.0,35.0
4,Basketball,3.0,34.0,31.0
10,Equestrianism,0.0,30.0,30.0
19,Rowing,5.0,32.0,27.0
6,Boxing,55.0,29.0,26.0


In [143]:
sport_dominance

Unnamed: 0,Region,Sport,Performance
0,USA,Archery,14
1,Russia,Archery,3
2,USA,Athletics,309
3,Russia,Athletics,156
4,Russia,Badminton,1
5,USA,Baseball,5
6,USA,Basketball,34
7,Russia,Basketball,3
8,USA,Beach Volleyball,24
9,Russia,Boxing,55


In [None]:
contingent_size = fidap.sql("""
SELECT DISTINCT r.Region,  e.Year, COUNT(Name) AS n_athletes 
FROM fidap-301014.kaggle.olympics_athlete_events AS e
LEFT JOIN fidap-301014.kaggle.olympics_noc_regions AS r
ON e.NOC = r.NOC
WHERE r.Region IN ('USA', 'Russia')
AND Season = 'Summer'
AND CAST(YEAR AS INT) > 1992
GROUP BY r.Region, Year
ORDER BY e.Year, r.Region;
""")

In [None]:
eastern_bloc = fidap.sql("""
SELECT * 
FROM fidap-301014.kaggle.olympics_athlete_events
WHERE Sex IN ('M', 'F')
AND (Team IN ('Soviet Union', 'Lithuania', 'Moldova', 'Latvia', 'Estonia', 'Ukraine', 'Belarus', 'Russia', 
'Azerbaijan', 'Armenia', 'Georgia', 'Turkmenistan', 'Tajikistan', 'Uzbekistan', 'Kazakhstan', 'Kyrgystan',
'Yugoslavia', 'Serbia', 'Kosovo', 'Montenegro', 'Croatia', 'Slovenia', 'Bosnia and Herzegovina') OR Team LIKE '%Macedonia%' OR Team LIKE '%Serbia%')
AND Season = 'Summer'
AND CAST(Year AS INT) >= 1948
""")



