In [1]:
from pandasql import sqldf
import pandas as pd

def execute(q):
    """Run the SQL string ``q`` through pandasql's ``sqldf``.

    Table names in the query are resolved against this notebook's global
    namespace, so any DataFrame bound to a top-level name can be queried.
    The value returned by ``sqldf`` is passed back unchanged.
    """
    return sqldf(q, globals())


# Load the medals CSV into a DataFrame; its variable name doubles as the
# table name used inside the SQL queries below
Summer_Medals = pd.read_csv("dataset/summer.csv")

# Smoke test: pull every column of the table through the helper
query = """
    SELECT *
    FROM Summer_Medals
"""
result_df = execute(query)

# Preview the first rows
result_df.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,HAJOS Alfred,HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,HERSCHMANN Otto,AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,DRIVAS Dimitrios,GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,MALOKINIS Ioannis,GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,CHASAPIS Spiridon,GRE,Men,100M Freestyle For Sailors,Silver


# A basic pivot

You have the following table of Pole Vault gold medalist countries by gender in 2008 and 2012.
```
| Gender | Year | Country |
|--------|------|---------|
| Men    | 2008 | AUS     |
| Men    | 2012 | FRA     |
| Women  | 2008 | RUS     |
| Women  | 2012 | USA     |
```
Pivot it by Year to get the following reshaped, cleaner table.
```
| Gender | 2008 | 2012 |
|--------|------|------|
| Men    | AUS  | FRA  |
| Women  | RUS  | USA  |
```

In [4]:
# Base rows for the pivot: Gender, Year and Country of the Pole Vault gold
# medals awarded in 2008 and 2012, sorted by gender and then by year.
query = """
SELECT
    Gender, Year, Country
  FROM Summer_Medals
  WHERE
    Year IN (2008, 2012)
    AND Medal = 'Gold'
    AND Event = 'Pole Vault'
  ORDER By Gender ASC, Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Gender,Year,Country
0,Men,2008,AUS
1,Men,2012,FRA
2,Women,2008,RUS
3,Women,2012,USA


In [5]:
# Pivot Year into columns without CROSSTAB: each CASE expression keeps Country
# only for its own year (NULL otherwise), and MAX collapses every Gender group
# to the single non-NULL country for that year.
query = """
-- Conditional Aggregation without CROSSTAB
SELECT
  Gender,
  MAX(CASE WHEN Year = 2008 THEN Country END) AS "2008",
  MAX(CASE WHEN Year = 2012 THEN Country END) AS "2012"
FROM Summer_Medals
WHERE
  Year IN (2008, 2012)
  AND Medal = 'Gold'
  AND Event = 'Pole Vault'
GROUP BY Gender
ORDER BY Gender ASC;

"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Gender,2008,2012
0,Men,AUS,FRA
1,Women,RUS,USA


In [6]:
# Reference only (kept commented out): the same pivot written with
# PostgreSQL's CROSSTAB function from the tablefunc extension.
# NOTE(review): presumably disabled because the SQL engine behind `execute`
# does not provide CROSSTAB -- confirm before re-enabling.
# query = """
# -- Create the correct extension to enable CROSSTAB
# CREATE EXTENSION IF NOT EXISTS tablefunc;

# SELECT * FROM CROSSTAB($$
#   SELECT
#     Gender, Year, Country
#   FROM Summer_Medals
#   WHERE
#     Year IN (2008, 2012)
#     AND Medal = 'Gold'
#     AND Event = 'Pole Vault'
#   ORDER By Gender ASC, Year ASC;
# -- Fill in the correct column names for the pivoted table
# $$) AS ct (Gender VARCHAR,
#            "2008" VARCHAR,
#            "2012" VARCHAR)

# ORDER BY Gender ASC;

# """
# result_df = execute(query)

# # Show results
# result_df.head()

# Pivoting with ranking

You want to produce an easily scannable table of the rankings of the three most populous EU countries by how many gold medals they've earned in the 2004 through 2012 Olympic games. The table needs to be in this format:
```
| Country | 2004 | 2008 | 2012 |
|---------|------|------|------|
| FRA     | ...  | ...  | ...  |
| GBR     | ...  | ...  | ...  |
| GER     | ...  | ...  | ...  |
```
You'll need to count the gold medals each country has earned, produce the ranks of each country by medals earned, then pivot the table to this shape.

In [7]:
# First step of the ranking pivot: number of gold-medal rows per country and
# year, restricted to FRA, GBR and GER in the 2004, 2008 and 2012 games.
query = """
-- Count the gold medals per country and year
SELECT
  country,
  year,
  COUNT(*) AS Awards
FROM Summer_Medals
WHERE
  Country IN ('FRA', 'GBR', 'GER')
  AND Year IN (2004, 2008, 2012)
  AND Medal = 'Gold'
GROUP BY country, year
ORDER BY Country ASC, Year ASC
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Country,Year,Awards
0,FRA,2004,21
1,FRA,2008,25
2,FRA,2012,30
3,GBR,2004,17
4,GBR,2008,31


In [9]:
query = """
WITH Country_Awards AS (
  SELECT
    Country,
    Year,
    COUNT(*) AS Awards
  FROM Summer_Medals
  WHERE
    Country IN ('FRA', 'GBR', 'GER')
    AND Year IN (2004, 2008, 2012)
    AND Medal = 'Gold'
  GROUP BY Country, Year)

SELECT
  -- Select Country and Year
  Country,
  year,
  -- Rank by gold medals earned per year
  DENSE_RANK() OVER(ORDER BY Awards) AS rank
FROM Country_Awards
ORDER BY Country ASC, Year ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Country,Year,rank
0,FRA,2004,2
1,FRA,2008,3
2,FRA,2012,4
3,GBR,2004,1
4,GBR,2008,5


In [10]:
# Reference only (kept commented out): the ranking pivot written with
# PostgreSQL's CROSSTAB. The inner query ranks countries per year
# (PARTITION BY Year, ORDER BY Awards DESC) and CROSSTAB spreads the ranks
# over one column per year.
# NOTE(review): presumably disabled because the SQL engine behind `execute`
# does not provide CROSSTAB or the :: cast syntax -- confirm before re-enabling.
# query = """
# CREATE EXTENSION IF NOT EXISTS tablefunc;

# SELECT * FROM CROSSTAB($$
#   WITH Country_Awards AS (
#     SELECT
#       Country,
#       Year,
#       COUNT(*) AS Awards
#     FROM Summer_Medals
#     WHERE
#       Country IN ('FRA', 'GBR', 'GER')
#       AND Year IN (2004, 2008, 2012)
#       AND Medal = 'Gold'
#     GROUP BY Country, Year)

#   SELECT
#     Country,
#     Year,
#     RANK() OVER
#       (PARTITION BY Year
#        ORDER BY Awards DESC) :: INTEGER AS rank
#   FROM Country_Awards
#   ORDER BY Country ASC, Year ASC;
# -- Fill in the correct column names for the pivoted table
# $$) AS ct (Country VARCHAR,
#            "2004" INTEGER,
#            "2008" INTEGER,
#            "2012" INTEGER)

# Order by Country ASC;
# """
# result_df = execute(query)

# # Show results
# result_df.head()

In [21]:
# CROSSTAB-free version of the ranking pivot.
# The CTE groups by (Country, Year) and ranks countries inside each year by
# the group's COUNT(*), highest count first. The outer query then turns each
# year into its own column: CASE keeps the rank only for the matching year and
# MAX picks that single non-NULL value per country.
query = """
WITH Country_Ranks AS (
  SELECT
    Country,
    Year,
    DENSE_RANK() OVER (PARTITION BY Year ORDER BY COUNT(*) DESC) AS rank
  FROM Summer_Medals
  WHERE
    Country IN ('FRA', 'GBR', 'GER')
    AND Year IN (2004, 2008, 2012)
    AND Medal = 'Gold'
  GROUP BY Country, Year
)

SELECT
  Country,
  MAX(CASE WHEN Year = 2004 THEN rank END) AS "2004",
  MAX(CASE WHEN Year = 2008 THEN rank END) AS "2008",
  MAX(CASE WHEN Year = 2012 THEN rank END) AS "2012"
FROM Country_Ranks
GROUP BY Country
ORDER BY Country ASC;


"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Country,2004,2008,2012
0,FRA,2,3,3
1,GBR,3,2,1
2,GER,1,1,2


# Country-level subtotals

You want to look at three Scandinavian countries' earned gold medals per country and gender in the year 2004. You're also interested in Country-level subtotals to get the total medals earned for each country, but Gender-level subtotals don't make much sense in this case, so disregard them.

In [23]:
# Reference only (kept commented out): country-level subtotals written with
# GROUP BY ... ROLLUP(gender).
# NOTE(review): presumably disabled because the SQL engine behind `execute`
# does not support ROLLUP -- confirm before re-enabling.
# query = """
# -- Count the gold medals per country and gender
# SELECT
#   country,
#   gender,
#   COUNT(*) AS Gold_Awards
# FROM Summer_Medals
# WHERE
#   Year = 2004
#   AND Medal = 'Gold'
#   AND Country IN ('DEN', 'NOR', 'SWE')
# -- Generate Country-level subtotals
# GROUP BY country, ROLLUP(gender)
# ORDER BY Country ASC, Gender ASC;
# """
# result_df = execute(query)

# # Show results
# result_df.head()

In [24]:
# ROLLUP-free country subtotals: the CTE counts 2004 gold medals per
# (country, gender); the first SELECT returns those detail rows and the second
# adds one row per country holding the sum, with gender set to NULL to mark it
# as the subtotal. The trailing ORDER BY sorts the combined UNION ALL result;
# in the displayed output the NULL-gender subtotal row comes first per country.
query = """
-- Count the gold medals per country and gender
WITH Gold_Awards AS (
  SELECT
    country,
    gender,
    COUNT(*) AS Gold_Awards
  FROM Summer_Medals
  WHERE
    Year = 2004
    AND Medal = 'Gold'
    AND Country IN ('DEN', 'NOR', 'SWE')
  GROUP BY country, gender
)

-- Generate Country-level subtotals
SELECT
  country,
  gender,
  Gold_Awards
FROM Gold_Awards

UNION ALL

-- Calculate the total count for each country
SELECT
  country,
  NULL AS gender,  -- Use NULL to represent the total
  SUM(Gold_Awards) AS Gold_Awards
FROM Gold_Awards
GROUP BY country

ORDER BY country ASC, gender ASC;

"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,country,gender,Gold_Awards
0,DEN,,19
1,DEN,Men,4
2,DEN,Women,15
3,NOR,,5
4,NOR,Men,3


# All group-level subtotals

You want to break down all medals awarded to Russia in the 2012 Olympic games per gender and medal type. Since the medals all belong to one country, Russia, it makes sense to generate all possible subtotals (Gender- and Medal-level subtotals), as well as a grand total.

Generate a breakdown of the medals awarded to Russia per gender and medal type, including all group-level subtotals and a grand total.

In [26]:
# Reference only (kept commented out): all gender/medal subtotals plus the
# grand total written with GROUP BY CUBE(gender, medal).
# NOTE(review): presumably disabled because the SQL engine behind `execute`
# does not support CUBE -- confirm before re-enabling.
# query = """
# -- Count the medals per gender and medal type
# SELECT
#   gender,
#   medal,
#   COUNT(*) AS Awards
# FROM Summer_Medals
# WHERE
#   Year = 2012
#   AND Country = 'RUS'
# -- Get all possible group-level subtotals
# GROUP BY CUBE(gender, medal)
# ORDER BY Gender ASC, Medal ASC;

# """
# result_df = execute(query)

# # Show results
# result_df.head()

In [29]:
# CUBE-free breakdown of Russia's 2012 medals. The CTE counts medals per
# (gender, medal); four UNION ALL branches then supply the detail rows, the
# per-gender subtotals (medal = NULL), the per-medal subtotals (gender = NULL)
# and the grand total (both NULL).
# NOTE(review): unlike the commented CUBE reference, this query has no
# ORDER BY, so the row order is whatever the engine emits for the UNION ALL --
# confirm whether a sort is wanted.
# NOTE(review): head() displays only the first five rows, so the subtotal and
# grand-total rows do not appear in the output shown for this cell.
query = """
-- Count the medals per gender and medal type
WITH Gender_Medal_Awards AS (
  SELECT
    gender,
    medal,
    COUNT(*) AS Awards
  FROM Summer_Medals
  WHERE
    Year = 2012
    AND Country = 'RUS'
  GROUP BY gender, medal
)

-- Get all possible group-level subtotals
SELECT
  gender,
  medal,
  Awards
FROM Gender_Medal_Awards

UNION ALL

-- Calculate subtotals by gender
SELECT
  gender,
  NULL AS medal,  -- Use NULL to represent the total for gender
  SUM(Awards) AS Awards
FROM Gender_Medal_Awards
GROUP BY gender

UNION ALL

-- Calculate subtotals by medal type
SELECT
  NULL AS gender,  -- Use NULL to represent the total for medal
  medal,
  SUM(Awards) AS Awards
FROM Gender_Medal_Awards
GROUP BY medal

UNION ALL

-- Calculate the grand total
SELECT
  NULL AS gender,  -- Use NULL to represent the total
  NULL AS medal,   -- Use NULL to represent the total
  SUM(Awards) AS Awards
FROM Gender_Medal_Awards;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,gender,medal,Awards
0,Men,Bronze,34
1,Men,Gold,23
2,Men,Silver,7
3,Women,Bronze,17
4,Women,Gold,24


# Cleaning up results

Returning to the breakdown of Scandinavian awards you previously made, you want to clean up the results by replacing the nulls with meaningful text.

In [32]:
# Reference only (kept commented out): ROLLUP(Country, Gender) with COALESCE
# replacing the NULLs that mark subtotal and grand-total rows.
# NOTE(review): presumably disabled because the SQL engine behind `execute`
# does not support ROLLUP -- confirm before re-enabling.
# query = """
# SELECT
#   -- Replace the nulls in the columns with meaningful text
#   COALESCE(Country, 'All countries') AS Country,
#   COALESCE(Gender, 'All genders') AS Gender,
#   COUNT(*) AS Awards
# FROM Summer_Medals
# WHERE
#   Year = 2004
#   AND Medal = 'Gold'
#   AND Country IN ('DEN', 'NOR', 'SWE')
# GROUP BY ROLLUP(Country, Gender)
# ORDER BY Country ASC, Gender ASC;
# """
# result_df = execute(query)

# # Show results
# result_df.head()

In [37]:
# Same UNION ALL subtotal emulation as the earlier Scandinavian breakdown, but
# the subtotal rows carry the literal label 'All genders' instead of NULL.
# NOTE(review): the CTE filters Country to DEN/NOR/SWE, so
# COALESCE(country, 'All countries') has no NULL to replace in either branch,
# and no grand-total row is produced -- unlike ROLLUP(Country, Gender) in the
# commented reference cell. Confirm that omitting the grand total is intended.
query = """
-- Count the gold medals per country and gender
WITH Gold_Awards AS (
  SELECT
    country,
    gender,
    COUNT(*) AS Gold_Awards
  FROM Summer_Medals
  WHERE
    Year = 2004
    AND Medal = 'Gold'
    AND Country IN ('DEN', 'NOR', 'SWE')
  GROUP BY country, gender
)

-- Generate Country-level subtotals
SELECT
  COALESCE(country, 'All countries') AS country,
  COALESCE(gender, 'All genders') AS gender,
  Gold_Awards
FROM Gold_Awards

UNION ALL

-- Calculate the total count for each country
SELECT
  COALESCE(country, 'All countries') AS country,
  'All genders' AS gender,
  SUM(Gold_Awards) AS Gold_Awards
FROM Gold_Awards
GROUP BY country

ORDER BY country ASC, gender ASC;

"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,country,gender,Gold_Awards
0,DEN,All genders,19
1,DEN,Men,4
2,DEN,Women,15
3,NOR,All genders,5
4,NOR,Men,3


# Summarizing results

After ranking each country in the 2000 Olympics by gold medals awarded, you want to return the top 3 countries in one row, as a comma-separated string. In other words, turn this:
```
| Country | Rank |
|---------|------|
| USA     | 1    |
| RUS     | 2    |
| AUS     | 3    |
| ...     | ...  |
```
into this:

`USA, RUS, AUS`

In [38]:
# Rank every country by its number of gold-medal rows in the 2000 games.
# RANK() orders by Medals descending, so rank 1 is the country with the most
# golds; the final ORDER BY lists the best-ranked countries first.
query = """
WITH Country_Medals AS (
  SELECT
    Country,
    COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE Year = 2000
    AND Medal = 'Gold'
  GROUP BY Country)

  SELECT
    Country,
    -- Rank countries by the medals awarded
    RANK() OVER(ORDER BY Medals DESC) AS Rank
  FROM Country_Medals
  ORDER BY Rank ASC;
"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Country,Rank
0,USA,1
1,RUS,2
2,AUS,3
3,CHN,4
4,GER,5


In [42]:
# Reference only (kept commented out): the top three countries compressed into
# one comma-separated string with STRING_AGG.
# NOTE(review): presumably disabled because the SQL engine behind `execute`
# does not provide STRING_AGG -- confirm before re-enabling.
# query = """
# WITH Country_Medals AS (
#   SELECT
#     Country,
#     COUNT(*) AS Medals
#   FROM Summer_Medals
#   WHERE Year = 2000
#     AND Medal = 'Gold'
#   GROUP BY Country),

#   Country_Ranks AS (
#   SELECT
#     Country,
#     RANK() OVER (ORDER BY Medals DESC) AS Rank
#   FROM Country_Medals
#   ORDER BY Rank ASC)

# -- Compress the countries column
# SELECT STRING_AGG(Country, ', ')
# FROM Country_Ranks
# -- Select only the top three ranks
# WHERE rank <=3;
# """
# result_df = execute(query)

# # Show results
# result_df.head()

In [44]:
query = """
WITH Country_Medals AS (
  SELECT
    Country,
    COUNT(*) AS Medals
  FROM Summer_Medals
  WHERE Year = 2000
    AND Medal = 'Gold'
  GROUP BY Country
),
Country_Ranks AS (
  SELECT
    Country,
    RANK() OVER (ORDER BY Medals DESC) AS Rank
  FROM Country_Medals
)

-- Select only the top three ranks
SELECT Country
FROM Country_Ranks
WHERE Rank <= 3
ORDER BY Rank ASC;

"""
result_df = execute(query)

# Show results
result_df.head()

Unnamed: 0,Country
0,USA
1,RUS
2,AUS
