In [1]:
from pandasql import sqldf
import pandas as pd

def execute(q):
    """Run the SQL query ``q`` with pandasql against this notebook's globals.

    Parameters
    ----------
    q : str
        SQL text. It is passed to ``sqldf`` together with ``globals()``, so
        DataFrames defined at notebook level (e.g. ``Summer_Medals``) can be
        referenced as tables by their variable name.

    Returns
    -------
    Whatever ``sqldf`` returns for the query (the cells below treat it as a
    pandas DataFrame and call ``.head()`` on it).
    """
    return sqldf(q, globals())


# Load the medals CSV into a DataFrame; the variable name doubles as the
# table name used in every SQL query in this notebook.
Summer_Medals = pd.read_csv("dataset/summer.csv")

# Select every row and column of the medals table (no join involved) to preview it
query = """
    SELECT *
    FROM Summer_Medals
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,HAJOS Alfred,HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,HERSCHMANN Otto,AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,DRIVAS Dimitrios,GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,MALOKINIS Ioannis,GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,CHASAPIS Spiridon,GRE,Men,100M Freestyle For Sailors,Silver


# Future gold medalists

Fetching functions allow you to get values from different parts of the table into one row. If you have time-ordered data, you can "peek into the future" with the LEAD fetching function. This is especially useful if you want to compare a current value to a future value.

In [3]:
# Baseline query: distinct (Year, Athlete) pairs for women's Discus Throw gold
# medals from 2000 onward, oldest Games first.
query = """
  SELECT DISTINCT
    Year,
    Athlete
  FROM Summer_Medals
  WHERE Medal = 'Gold'
    AND Event = 'Discus Throw'
    AND Gender = 'Women'
    AND Year >= 2000
  ORDER BY Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Athlete
0,2000,ZVEREVA Ellina
1,2004,SADOVA Natalya
2,2008,BROWN TRAFTON Stephanie
3,2012,PERKOVIC Sandra


In [2]:
# Same medalist set as a CTE, plus a Future_Champion column.
# LEAD(athlete, 3) looks three rows ahead in year order, so only rows that have
# a row three positions later get a value; the rest come back empty (NULL).
query = """
WITH Discus_Medalists AS (
  SELECT DISTINCT
    Year,
    Athlete
  FROM Summer_Medals
  WHERE Medal = 'Gold'
    AND Event = 'Discus Throw'
    AND Gender = 'Women'
    AND Year >= 2000)

SELECT
  -- For each year, fetch the current and future medalists
  year,
  athlete,
  LEAD(athlete,3) OVER (ORDER BY year ASC) AS Future_Champion
FROM Discus_Medalists
ORDER BY Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Athlete,Future_Champion
0,2000,ZVEREVA Ellina,PERKOVIC Sandra
1,2004,SADOVA Natalya,
2,2008,BROWN TRAFTON Stephanie,
3,2012,PERKOVIC Sandra,


# First athlete by name

It's often useful to get the first or last value in a dataset to compare all other values to it. With absolute fetching functions like FIRST_VALUE, you can fetch a value at an absolute position in the table, like its beginning or end.

In [6]:
# Baseline query: distinct names of men who won a gold medal, in alphabetical order.
query = """
  SELECT DISTINCT
    Athlete
  FROM Summer_Medals
  WHERE Medal = 'Gold'
    AND Gender = 'Men'
  ORDER BY athlete ASC
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Athlete
0,AABYE Edgar
1,AALTONEN Paavo Johannes
2,AAS Thomas Valentin
3,ABALMASAU Aliaksei
4,ABALO Luc


In [7]:
# Attach to every male gold medalist the alphabetically first athlete:
# FIRST_VALUE over a window ordered by athlete ASC yields the same name on every row.
# NOTE(review): the outer SELECT has no ORDER BY, so the row order of the result
# is not guaranteed by SQL even though the output shown here is alphabetical.
query = """
WITH All_Male_Medalists AS (
  SELECT DISTINCT
    Athlete
  FROM Summer_Medals
  WHERE Medal = 'Gold'
    AND Gender = 'Men')

SELECT
  -- Fetch all athletes and the first athlete alphabetically
  athlete,
  FIRST_VALUE(athlete) OVER (
    ORDER BY athlete ASC
  ) AS First_Athlete
FROM All_Male_Medalists;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Athlete,First_Athlete
0,AABYE Edgar,AABYE Edgar
1,AALTONEN Paavo Johannes,AABYE Edgar
2,AAS Thomas Valentin,AABYE Edgar
3,ABALMASAU Aliaksei,AABYE Edgar
4,ABALO Luc,AABYE Edgar


# Last host city by year

Just like you can get the first row's value in a dataset, you can get the last row's value. This is often useful when you want to compare the most recent value to previous values.

In [10]:
# Baseline query: distinct (Year, City) pairs, i.e. one row per host, oldest first.
query = """
  SELECT DISTINCT Year, City
    FROM Summer_Medals
ORDER BY year ASC
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,City
0,1896,Athens
1,1900,Paris
2,1904,St Louis
3,1908,London
4,1912,Stockholm


In [8]:
# Attach the most recent host city to every (Year, City) row.
# The explicit frame (UNBOUNDED PRECEDING .. UNBOUNDED FOLLOWING) is what makes
# LAST_VALUE see the whole partition; with the default frame, which ends at the
# current row (and its peers), it would just return each row's own City.
query = """
WITH Hosts AS (
  SELECT DISTINCT Year, City
    FROM Summer_Medals)

SELECT
  Year,
  City,
  -- Get the last city in which the Olympic games were held
  LAST_VALUE(city) OVER (
   ORDER BY year ASC
   RANGE BETWEEN
     UNBOUNDED PRECEDING AND
     UNBOUNDED FOLLOWING
  ) AS Last_City
FROM Hosts
ORDER BY Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,City,Last_City
0,1896,Athens,London
1,1900,Paris,London
2,1904,St Louis,London
3,1908,London,London
4,1912,Stockholm,London


# Ranking athletes by medals earned

In chapter 1, you used ROW_NUMBER to rank athletes by awarded medals. However, ROW_NUMBER assigns different numbers to athletes with the same count of awarded medals, so it's not a useful ranking function; if two athletes earned the same number of medals, they should have the same rank.

In [11]:
# Baseline query: number of medal rows per athlete, highest count first.
# Athletes with equal counts have no tie-break, so their relative order is unspecified.
query = """
  SELECT
    Athlete,
    COUNT(*) AS Medals
  FROM Summer_Medals
  GROUP BY Athlete
ORDER BY Medals DESC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Athlete,Medals
0,PHELPS Michael,22
1,LATYNINA Larisa,18
2,ANDRIANOV Nikolay,15
3,SHAKHLIN Boris,13
4,ONO Takashi,13


In [12]:
# Rank athletes by medal count with RANK(): athletes with the same count share
# a rank (see the two rows ranked 4 in the output), and RANK leaves a gap after ties.
query = """
WITH Athlete_Medals AS (
  SELECT
    Athlete,
    COUNT(*) AS Medals
  FROM Summer_Medals
  GROUP BY Athlete)

SELECT
  Athlete,
  Medals,
  -- Rank athletes by the medals they've won
  RANK() OVER (ORDER BY medals DESC) AS Rank_N
FROM Athlete_Medals
ORDER BY Medals DESC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Athlete,Medals,Rank_N
0,PHELPS Michael,22,1
1,LATYNINA Larisa,18,2
2,ANDRIANOV Nikolay,15,3
3,MANGIAROTTI Edoardo,13,4
4,ONO Takashi,13,4


# Ranking athletes from multiple countries

In the previous exercise, you used RANK to assign rankings to one group of athletes. In real-world data, however, you'll often find numerous groups within your data. Without partitioning your data, one group's values will influence the rankings of the others.

Also, while RANK skips numbers in case of identical values, the most natural way to assign rankings is not to skip numbers. If two countries are tied for second place, the country after them is considered to be third by most people.

In [16]:
# Baseline query: medal counts per (Country, Athlete) for JPN and KOR since 2000,
# keeping only athletes with more than one medal; sorted by country, then count.
query = """
  SELECT
    Country, Athlete, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country IN ('JPN', 'KOR')
    AND Year >= 2000
  GROUP BY Country, Athlete
  HAVING COUNT(*) > 1
  ORDER BY Country, Medals DESC
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Country,Athlete,Medals
0,JPN,KITAJIMA Kosuke,7
1,JPN,UCHIMURA Kohei,5
2,JPN,TAKEDA Miho,4
3,JPN,TACHIBANA Miya,4
4,JPN,YOSHIDA Saori,3


In [14]:
# Rank athletes within each country: PARTITION BY country restarts the ranking
# per country, and DENSE_RANK gives tied athletes the same rank without skipping
# the next number. The Medals count drives the ranking but is not selected.
query = """
WITH Athlete_Medals AS (
  SELECT
    Country, Athlete, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country IN ('JPN', 'KOR')
    AND Year >= 2000
  GROUP BY Country, Athlete
  HAVING COUNT(*) > 1)

SELECT
  Country,
  -- Rank athletes in each country by the medals they've won
  Athlete,
  DENSE_RANK() OVER (PARTITION BY country
                ORDER BY Medals DESC) AS Rank_N
FROM Athlete_Medals
ORDER BY Country ASC, RANK_N ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Country,Athlete,Rank_N
0,JPN,KITAJIMA Kosuke,1
1,JPN,UCHIMURA Kohei,2
2,JPN,TACHIBANA Miya,3
3,JPN,TAKEDA Miho,3
4,JPN,ICHO Kaori,4


# DENSE_RANK's output

You have the following table:
```
| Country | Medals |
|---------|--------|
| IRN     | 23     |
| IRQ     | 19     |
| LBN     | 19     |
| SYR     | 19     |
| BHR     | 7      |
| KSA     | 3      |
```
If you were to use DENSE_RANK to order the Medals column in descending order, what rank would BHR be assigned?

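- 3 (IRN is ranked 1; IRQ, LBN, and SYR tie at rank 2; DENSE_RANK does not skip a number after a tie, so BHR is ranked 3)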

# Paging events

There are exactly 666 unique events in the Summer Olympics medals dataset. If you want to chunk them up to analyze them piece by piece, you'll need to split the events into groups of approximately equal size, for example 111 pages of 6 events each.

In [19]:
# Baseline query: every distinct event name, in alphabetical order.
query = """
  SELECT DISTINCT Event
  FROM Summer_Medals
  ORDER BY event ASC
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Event
0,+ 100KG
1,+ 100KG (Heavyweight)
2,+ 100KG (Super Heavyweight)
3,+ 105KG
4,+ 108KG Total (Super Heavyweight)


In [17]:
# Assign each distinct event to one of 111 pages: NTILE(111) walks the events in
# alphabetical order and numbers the buckets 1..111. With the 666 events quoted
# in the text above, that works out to 6 events per page.
query = """
WITH Events AS (
  SELECT DISTINCT Event
  FROM Summer_Medals)
  
SELECT
  --- Split up the distinct events into 111 unique groups
  event,
  NTILE(111) OVER (ORDER BY event ASC) AS Page
FROM Events
ORDER BY Event ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Event,Page
0,+ 100KG,1
1,+ 100KG (Heavyweight),1
2,+ 100KG (Super Heavyweight),1
3,+ 105KG,1
4,+ 108KG Total (Super Heavyweight),1


# Top, middle, and bottom thirds

Splitting your data into thirds or quartiles is often useful to understand how the values in your dataset are spread. Getting summary statistics (averages, sums, standard deviations, etc.) of the top, middle, and bottom thirds can help you determine what distribution your values follow.

In [23]:
# Baseline query: athletes with more than one medal, highest count first,
# with ties ordered alphabetically by athlete name.
query = """
  SELECT Athlete, COUNT(*) AS Medals
  FROM Summer_Medals
  GROUP BY Athlete
  HAVING COUNT(*) > 1
  ORDER BY Medals DESC, Athlete ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Athlete,Medals
0,PHELPS Michael,22
1,LATYNINA Larisa,18
2,ANDRIANOV Nikolay,15
3,MANGIAROTTI Edoardo,13
4,ONO Takashi,13


In [21]:
# Bucket athletes with more than one medal into thirds by medal count.
# The NTILE window uses the same tie-break (athlete ASC) as the final ORDER BY.
# Athlete is unique after the GROUP BY, so the window ordering is total: bucket
# assignment among athletes with equal medal counts is deterministic, and the
# Third column never decreases when reading the displayed rows top to bottom.
query = """
WITH Athlete_Medals AS (
  SELECT Athlete, COUNT(*) AS Medals
  FROM Summer_Medals
  GROUP BY Athlete
  HAVING COUNT(*) > 1)
  
SELECT
  Athlete,
  Medals,
  -- Split athletes into thirds by their earned medals (ties broken by name)
  NTILE(3) OVER(ORDER BY medals DESC, athlete ASC) AS Third
FROM Athlete_Medals
ORDER BY Medals DESC, Athlete ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Athlete,Medals,Third
0,PHELPS Michael,22,1
1,LATYNINA Larisa,18,1
2,ANDRIANOV Nikolay,15,1
3,MANGIAROTTI Edoardo,13,1
4,ONO Takashi,13,1


In [24]:
# Average medal count within each third.
# Two chained CTEs: Athlete_Medals counts medals per athlete (more than one only),
# Thirds buckets those athletes with NTILE(3) by medal count, and the final
# SELECT averages Medals per bucket.
# The window has no tie-break, but tied athletes share the same Medals value, so
# which tied athlete lands in which bucket cannot change the per-bucket averages.
query = """
WITH Athlete_Medals AS (
  SELECT Athlete, COUNT(*) AS Medals
  FROM Summer_Medals
  GROUP BY Athlete
  HAVING COUNT(*) > 1),
  
  Thirds AS (
  SELECT
    Athlete,
    Medals,
    NTILE(3) OVER (ORDER BY Medals DESC) AS Third
  FROM Athlete_Medals)
  
SELECT
  -- Get the average medals earned in each third
  Third,
  AVG(Medals) AS Avg_Medals
FROM Thirds
GROUP BY Third
ORDER BY Third ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Third,Avg_Medals
0,1,3.786446
1,2,2.0
2,3,2.0
