Answers for the SQL coding challenge for the STL Big Data Meetup in May 2025.

Questions and the source data are posted in Kit's [Github repository](https://github.com/kitmenke/sql-coding-challenge).

In [1]:
#Import libraries
import pandas as pd
from sqlalchemy import create_engine, text

# Create database connection
engine = create_engine('postgresql+psycopg2://tharinduabeysinghe:#####@localhost/practice_datasets')

# Run query and load data to a dataframe
def execute_sql_query(sql):
    # Load data into a pandas DataFrame
    df = pd.DataFrame()
    with engine.connect() as conn:
        df = pd.read_sql_query(text(sql), conn)
    return df

Question 1: Retrieve the names of all attractions located in the city of 'St. Louis'.

In [2]:
sql = '''
SELECT DISTINCT(name)
FROM Attractions;
'''

execute_sql_query(sql)

Unnamed: 0,name
0,Gateway Arch National Park
1,Cahokia Mounds State Hist.
2,City Museum
3,Saint Louis Zoo
4,Forest Park
5,Missouri Botanical Garden
6,Ted Drewes Frozen Custard


Question 2: List the names of attractions along with the number of reviews they have received. Order the results from the attraction with the most reviews to the least.

In [3]:
sql = '''
SELECT name,
    COUNT(ReviewID) AS review_count
FROM Attractions a
JOIN Reviews r
  ON a.AttractionID = r.AttractionID
GROUP BY name
ORDER BY review_count DESC;
'''

execute_sql_query(sql)

Unnamed: 0,name,review_count
0,Gateway Arch National Park,3
1,City Museum,2
2,Saint Louis Zoo,2
3,Forest Park,1
4,Missouri Botanical Garden,1
5,Ted Drewes Frozen Custard,1


Question 3: Find the average rating for each attraction. Display the attraction name and its average rating, rounded to two decimal places.

In [4]:
sql = '''
WITH cte AS (
    SELECT 
        a.attractionid,
        name,
        rating
    FROM Attractions a
    JOIN Reviews r
        ON a.AttractionID = r.AttractionID
)
SELECT
    name,
    ROUND(AVG(rating), 2) AS average_rating
FROM cte
GROUP BY name;
'''

execute_sql_query(sql)

Unnamed: 0,name,average_rating
0,Gateway Arch National Park,4.33
1,City Museum,4.0
2,Saint Louis Zoo,4.0
3,Forest Park,5.0
4,Missouri Botanical Garden,5.0
5,Ted Drewes Frozen Custard,5.0


Question 4: Identify the names of attractions that have at least one review with a rating of 5 and at least one review with a rating of 3.

In [5]:
sql = '''
SELECT
    name,
    rating
FROM Attractions a
JOIN Reviews r
    ON a.AttractionID = r.AttractionID
GROUP BY name, rating
HAVING
    rating = 5
    OR rating = 3;
'''

execute_sql_query(sql)

Unnamed: 0,name,rating
0,City Museum,3
1,City Museum,5
2,Forest Park,5
3,Gateway Arch National Park,3
4,Gateway Arch National Park,5
5,Missouri Botanical Garden,5
6,Ted Drewes Frozen Custard,5


Question 5: For each attraction category, find the attraction with the highest average rating. Display the category, attraction name, and its average rating. (Hint: You might need to use window functions or a subquery).


In [6]:
sql = '''
WITH cte AS (
    SELECT 
        a.attractionid,
        category,
        name,
        rating
    FROM Attractions a
    JOIN Reviews r
        ON a.AttractionID = r.AttractionID		
),
cte2 AS (
    SELECT
        name,
        category,
        ROUND(AVG(rating), 2) AS average_rating
    FROM cte
    GROUP BY name, category
)
SELECT category,
       name,
       average_rating
FROM (
    SELECT 
        category,
        name,
        average_rating,
        RANK() OVER(PARTITION BY category ORDER BY average_rating DESC) as rn
    FROM cte2
    GROUP BY category, name, average_rating
) as ranked
WHERE rn = 1;
'''
execute_sql_query(sql)

Unnamed: 0,category,name,average_rating
0,Food,Ted Drewes Frozen Custard,5.0
1,Garden,Missouri Botanical Garden,5.0
2,Landmark,Gateway Arch National Park,4.33
3,Museum,City Museum,4.0
4,Park,Forest Park,5.0
