In [1]:
from pandasql import sqldf
import pandas as pd

def execute(q):
    """Run the SQL string ``q`` through pandasql's ``sqldf`` and return its result.

    ``globals()`` is passed as the lookup environment, so DataFrames bound to
    notebook-level names (such as ``Summer_Medals``) can be referenced as
    tables inside the query.
    """
    return sqldf(q, globals())


# Load dataset/summer.csv into the DataFrame the queries refer to as Summer_Medals
Summer_Medals = pd.read_csv("dataset/summer.csv")

# Select every column of Summer_Medals (plain SELECT, no join) and store the result
query = """
    SELECT *
    FROM Summer_Medals
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,HAJOS Alfred,HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,HERSCHMANN Otto,AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,DRIVAS Dimitrios,GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,MALOKINIS Ioannis,GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,CHASAPIS Spiridon,GRE,Men,100M Freestyle For Sailors,Silver


# Running totals of athlete medals

The running total (or cumulative sum) of a column helps you determine what each row's contribution is to the total sum.

In [2]:
# Base query for the running-total exercise: one row per athlete with the
# number of matching Summer_Medals rows, restricted to Country = 'USA',
# Medal = 'Gold' and Year >= 2000, sorted alphabetically by athlete.
query = """
  SELECT
    Athlete, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country = 'USA' AND Medal = 'Gold'
    AND Year >= 2000
  GROUP BY Athlete
ORDER BY Athlete ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Athlete,Medals
0,ABDUR-RAHIM Shareef,1
1,ABERNATHY Brent,1
2,ADRIAN Nathan,3
3,AHRENS Chris,1
4,AINSWORTH Kurt,1


In [5]:
# Running total of gold-medal counts over USA athletes (Year >= 2000).
# The CTE yields one row per athlete; the outer SELECT adds RT_Medals.
# The OVER clause has ORDER BY but no explicit frame, so SQLite applies its
# default frame (start of the ordering through the current row), which makes
# SUM accumulate Medals row by row in athlete-name order.
query = """
WITH Athlete_Medals AS (
  SELECT
    Athlete, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country = 'USA' AND Medal = 'Gold'
    AND Year >= 2000
  GROUP BY Athlete)

SELECT
  -- Calculate the running total of athlete medals
  athlete,
  medals,
  SUM(medals) OVER (ORDER BY athlete ASC) AS RT_Medals
FROM Athlete_Medals
ORDER BY Athlete ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Athlete,Medals,RT_Medals
0,ABDUR-RAHIM Shareef,1,1
1,ABERNATHY Brent,1,2
2,ADRIAN Nathan,3,5
3,AHRENS Chris,1,6
4,AINSWORTH Kurt,1,7


# Maximum country medals by year

Getting the maximum of a country's earned medals so far helps you determine whether a country has broken its medals record by comparing the current year's earned medals and the maximum so far.

In [3]:
# Base query for the running-maximum exercise: one row per (Year, Country)
# with the number of matching rows, restricted to Country in CHN/KOR/JPN,
# Medal = 'Gold' and Year >= 2000, sorted by country and then by year.
query = """
  SELECT
    Year, Country, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country IN ('CHN', 'KOR', 'JPN')
    AND Medal = 'Gold' AND Year >= 2000
  GROUP BY Year, Country
  ORDER BY Country ASC, Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Country,Medals
0,2000,CHN,39
1,2004,CHN,52
2,2008,CHN,74
3,2012,CHN,56
4,2000,JPN,5


In [4]:
# Running maximum of gold-medal counts per country (CHN/KOR/JPN, Year >= 2000).
# PARTITION BY country gives each country its own window, so one country's
# counts never influence another's. Within a partition the ORDER BY year
# window runs from the earliest year through the current row, so Max_Medals
# is the highest count that country has reached so far.
query = """
WITH Country_Medals AS (
  SELECT
    Year, Country, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country IN ('CHN', 'KOR', 'JPN')
    AND Medal = 'Gold' AND Year >= 2000
  GROUP BY Year, Country)

SELECT
  -- Return the max medals earned so far per country
  year,
  country,
  medals,
  MAX(medals) OVER (PARTITION BY country
                ORDER BY year ASC) AS Max_Medals
FROM Country_Medals
ORDER BY Country ASC, Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Country,Medals,Max_Medals
0,2000,CHN,39,39
1,2004,CHN,52,52
2,2008,CHN,74,74
3,2012,CHN,56,74
4,2000,JPN,5,5


# Minimum country medals by year

So far, you've seen MAX and SUM, aggregate functions normally used with GROUP BY, being used as window functions. You can also use the other aggregate functions, like MIN, as window functions.

In [6]:
# Base query for the running-minimum exercise: one row per Year with the
# number of matching rows, restricted to Country = 'FRA', Medal = 'Gold'
# and Year >= 2000, sorted by year.
query = """
  SELECT
    Year, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country = 'FRA'
    AND Medal = 'Gold' AND Year >= 2000
  GROUP BY Year
  ORDER BY Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Medals
0,2000,22
1,2004,21
2,2008,25
3,2012,30


In [5]:
# Running minimum of France's yearly gold-medal counts (Year >= 2000).
# The CTE yields one row per year; MIN over an ORDER BY Year window with no
# explicit frame looks at every row from the first year through the current
# one, so Min_Medals is the lowest yearly count seen so far.
query = """
WITH France_Medals AS (
  SELECT
    Year, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country = 'FRA'
    AND Medal = 'Gold' AND Year >= 2000
  GROUP BY Year)

SELECT
  Year,
  Medals,
  MIN(Medals) OVER (ORDER BY Year ASC) AS Min_Medals
FROM France_Medals
ORDER BY Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Medals,Min_Medals
0,2000,22,22
1,2004,21,21
2,2008,25,21
3,2012,30,21


# Number of rows in a frame

How many rows does the following frame span?

`ROWS BETWEEN 3 PRECEDING AND 2 FOLLOWING`

- 6 (3 previous rows, 1 current row and 2 next rows)

# Moving maximum of Scandinavian athletes' medals

Frames allow you to restrict the rows passed as input to your window function to a sliding window whose start and finish you define.

Adding a frame to your window function allows you to calculate "moving" metrics, inputs of which slide from row to row.

In [8]:
# Base query for the moving-maximum exercise: one row per Year with the number
# of matching rows, restricted to Medal = 'Gold' and the five listed country
# codes (DEN, NOR, FIN, SWE, ISL). There is no Year filter, so every year
# present in the data is included. Sorted by year.
query = """
  SELECT
    Year, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country IN ('DEN', 'NOR', 'FIN', 'SWE', 'ISL')
    AND Medal = 'Gold'
  GROUP BY Year
ORDER BY Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Medals
0,1896,1
1,1900,1
2,1908,77
3,1912,141
4,1920,159


In [7]:
# Moving maximum over the yearly gold-medal counts of the five listed countries.
# The frame ROWS BETWEEN CURRENT ROW AND 1 FOLLOWING spans two rows: the
# current year and the next row in year order. ROWS counts result rows, so
# "next" means the next year present in the data, not the next calendar year.
# The last row has no following row, so its frame contains only itself.
query = """
WITH Scandinavian_Medals AS (
  SELECT
    Year, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country IN ('DEN', 'NOR', 'FIN', 'SWE', 'ISL')
    AND Medal = 'Gold'
  GROUP BY Year)

SELECT
  -- Select each year's medals
  year,
  Medals,
  -- Get the max of the current and next years'  medals
  MAX(medals) OVER (ORDER BY year ASC
             ROWS BETWEEN CURRENT ROW
             AND 1 FOLLOWING) AS Max_Medals
FROM Scandinavian_Medals
ORDER BY Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Medals,Max_Medals
0,1896,1,1
1,1900,1,77
2,1908,77,141
3,1912,141,159
4,1920,159,159


# Moving maximum of Chinese athletes' medals

Frames allow you to "peek" forward or backward without first using the relative fetching functions, LAG and LEAD, to fetch neighboring rows' values into the current row.

In [10]:
# Base query for the second moving-maximum exercise: one row per athlete with
# the number of matching rows, restricted to Country = 'CHN', Medal = 'Gold'
# and Year >= 2000, sorted alphabetically by athlete.
query = """
  SELECT
    Athlete, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country = 'CHN' AND Medal = 'Gold'
    AND Year >= 2000
  GROUP BY Athlete
  ORDER BY Athlete ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Athlete,Medals
0,CAI Yalin,1
1,CAI Yun,1
2,CAO Lei,1
3,CAO Yuan,1
4,CHEN Ding,1


In [9]:
# Moving maximum over Chinese athletes' gold-medal counts (Year >= 2000).
# The frame ROWS BETWEEN 2 PRECEDING AND CURRENT ROW spans at most three rows
# in athlete-name order: the current athlete and the two listed before it.
# The first two rows have fewer preceding rows, so their frames are shorter.
query = """
WITH Chinese_Medals AS (
  SELECT
    Athlete, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country = 'CHN' AND Medal = 'Gold'
    AND Year >= 2000
  GROUP BY Athlete)

SELECT
  -- Select the athletes and the medals they've earned
  Athlete,
  Medals,
  -- Get the max of the last two and current rows' medals 
  MAX(Medals) OVER (ORDER BY Athlete ASC
            ROWS BETWEEN 2 PRECEDING
            AND CURRENT ROW) AS Max_Medals
FROM Chinese_Medals
ORDER BY Athlete ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Athlete,Medals,Max_Medals
0,CAI Yalin,1,1
1,CAI Yun,1,1
2,CAO Lei,1,1
3,CAO Yuan,1,1
4,CHEN Ding,1,1


# Moving average's frame

If you want your moving average to cover the last 3 and current Olympic games, how would you define its frame?

- `ROWS BETWEEN 3 PRECEDING AND CURRENT ROW`

# Moving average of Russian medals

Using frames with aggregate window functions allows you to calculate many common metrics, including moving averages and totals. These metrics track the change in performance over time.

In [11]:
# Base query for the moving-average exercise: one row per Year with the number
# of matching rows, restricted to Country = 'RUS', Medal = 'Gold' and
# Year >= 1980, sorted by year.
query = """
  SELECT
    Year, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country = 'RUS'
    AND Medal = 'Gold'
    AND Year >= 1980
  GROUP BY Year
ORDER BY Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Medals
0,1996,36
1,2000,66
2,2004,47
3,2008,43
4,2012,47


In [12]:
# Moving average of Russia's yearly gold-medal counts (Year >= 1980).
# The frame ROWS BETWEEN 2 PRECEDING AND CURRENT ROW averages up to three
# result rows: the current Games and the two before it. ROWS counts rows, not
# calendar years, so this is a 3-game window; the first two rows average over
# fewer rows because fewer preceding rows exist.
query = """
WITH Russian_Medals AS (
  SELECT
    Year, COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE
    Country = 'RUS'
    AND Medal = 'Gold'
    AND Year >= 1980
  GROUP BY Year)

SELECT
  Year, Medals,
  -- Calculate the 3-game moving average of medals earned
  AVG(Medals) OVER
    (ORDER BY Year ASC
     ROWS BETWEEN
     2 PRECEDING AND CURRENT ROW) AS Medals_MA
FROM Russian_Medals
ORDER BY Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Medals,Medals_MA
0,1996,36,36.0
1,2000,66,51.0
2,2004,47,49.666667
3,2008,43,52.0
4,2012,47,45.666667


# Moving total of countries' medals

What if your data is split into multiple groups spread over one or more columns in the table? Even with a defined frame, if you can't somehow separate the groups' data, one group's values will affect the average of another group's values.

In [14]:
# Base query for the moving-total exercise: one row per (Year, Country) with
# the number of rows in Summer_Medals. There is no WHERE clause, so all medal
# types, all years and all countries are counted. Sorted by country, then year.
# Rows whose Country is missing form their own group and sort first (the
# preview shows one such row with an empty Country).
query = """
  SELECT
    Year, Country, COUNT(*) AS Medals
  FROM Summer_Medals
  GROUP BY Year, Country
ORDER BY Country ASC, Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Country,Medals
0,2012,,4
1,2008,AFG,1
2,2012,AFG,1
3,1988,AHO,1
4,1984,ALG,2


In [13]:
# Moving total of medal counts per country.
# PARTITION BY country keeps each country's rows in a separate window, and the
# frame ROWS BETWEEN 2 PRECEDING AND CURRENT ROW sums up to three rows per
# country: the current Games and the two before it in year order.
# NOTE(review): the result column is aliased Medals_MA, but it holds a SUM
# (moving total), not an average. Renaming the alias would change the output
# column name, so confirm nothing depends on it before changing it.
query = """
WITH Country_Medals AS (
  SELECT
    Year, Country, COUNT(*) AS Medals
  FROM Summer_Medals
  GROUP BY Year, Country)

SELECT
  Year, Country, Medals,
  -- Calculate each country's 3-game moving total
  SUM(Medals) OVER
    (PARTITION BY country
     ORDER BY Year ASC
     ROWS BETWEEN
     2 PRECEDING AND CURRENT ROW) AS Medals_MA
FROM Country_Medals
ORDER BY Country ASC, Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Year,Country,Medals,Medals_MA
0,2012,,4,4
1,2008,AFG,1,1
2,2012,AFG,1,2
3,1988,AHO,1,1
4,1984,ALG,2,2
