In [1]:
import sqlite3
import pandas as pd

def execute(query, database_path='dataset/database.sqlite'):
    connection = sqlite3.connect(database_path)
    result = connection.execute(query).fetchall()
    column_names = [description[0] for description in connection.execute(query).description]
    df = pd.DataFrame(result, columns=column_names)
    connection.close()
    return df


In [2]:
from pandasql import sqldf

# Create helper function for easier query execution
execute_df = lambda q: sqldf(q, globals())

In [3]:
import sqlite3
import pandas as pd

def get_table_names(database_path='dataset/database.sqlite'):
    connection = sqlite3.connect(database_path)
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    result = connection.execute(query).fetchall()
    table_names = [row[0] for row in result]
    connection.close()
    return table_names

# Get and print all table names in the database
tables = get_table_names()
print("Tables in the database:", tables)


Tables in the database: ['sqlite_sequence', 'Player_Attributes', 'Player', 'Match', 'League', 'Country', 'Team', 'Team_Attributes']


In [4]:
player_attributes = execute("SELECT * FROM Player_Attributes;")
player = execute("SELECT * FROM Player;")
match = execute("SELECT * FROM Match;")
league = execute("SELECT * FROM League;")
country = execute("SELECT * FROM Country;")
team = execute("SELECT * FROM Team;")
team_attributes = execute("SELECT * FROM Team_Attributes;")

# Filtering using scalar subqueries

Subqueries are incredibly powerful for performing complex filters and transformations. You can filter data based on single, scalar values using a subquery in ways you cannot by using WHERE statements or joins. Subqueries can also be used for more advanced manipulation of your data set. You will likely encounter subqueries in any real-world setting that uses relational databases.

In this exercise, you will generate a list of matches where the total goals scored (for both teams in total) is more than 3 times the average for games in the matches_2013_2014 table, which includes all games played in the 2013/2014 season.

In [9]:
# matches_2013_2014
query = """
-- Select the average of home + away goals, multiplied by 3
SELECT * 
FROM match
WHERE season='2013/2014';
"""
matches_2013_2014 = execute_df(query)

# Show results
matches_2013_2014.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1237,1,1,2013/2014,1,2014-03-29 00:00:00,1677179,8475,9989,2,...,,,,,,,,,,
1,1238,1,1,2013/2014,1,2014-03-29 00:00:00,1677180,9991,8573,0,...,,,,,,,,,,
2,1239,1,1,2013/2014,2,2014-04-05 00:00:00,1677181,9989,9991,1,...,,,,,,,,,,
3,1240,1,1,2013/2014,2,2014-04-05 00:00:00,1677182,8573,8475,0,...,,,,,,,,,,
4,1241,1,1,2013/2014,3,2014-04-12 00:00:00,1677183,9991,8475,2,...,,,,,,,,,,


In [10]:
query = """
-- Select the average of home + away goals, multiplied by 3
SELECT 
	3 * AVG(home_team_goal + away_team_goal)
FROM matches_2013_2014;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,3 * AVG(home_team_goal + away_team_goal)
0,8.300462


In [11]:
query = """
SELECT 
	-- Select the date, home goals, and away goals scored
    date,
	home_team_goal,
	away_team_goal
FROM  matches_2013_2014
-- Filter for matches where total goals exceeds 3x the average
WHERE (home_team_goal + away_team_goal) > 
       (SELECT 3 * AVG(home_team_goal + away_team_goal)
        FROM matches_2013_2014); 
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,date,home_team_goal,away_team_goal
0,2013-12-14 00:00:00,6,3
1,2014-03-22 00:00:00,3,6
2,2013-10-30 00:00:00,7,3


# Filtering using a subquery with a list

Your goal in this exercise is to generate a list of teams that never played a game in their home city. Using a subquery, you will generate a list of unique hometeam_ID values from the unfiltered match table to exclude in the team table's team_api_ID column.

In addition to filtering using a single-value (scalar) subquery, you can create a list of values in a subquery to filter data based on a complex set of conditions. This type of subquery generates a one column reference list for the main query. As long as the values in your list match a column in your main query's table, you don't need to use a join -- even if the list is from a separate table.

In [15]:
query = """
SELECT 
	-- Select the team long and short names
	team_long_name,
	team_short_name
FROM team 
-- Exclude all values from the subquery
WHERE team_api_id NOT IN
     (SELECT DISTINCT home_team_api_id  FROM matches_2013_2014);
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,team_long_name,team_short_name
0,KRC Genk,GEN
1,Beerschot AC,BAC
2,SV Zulte-Waregem,ZUL
3,Sporting Lokeren,LOK
4,KSV Cercle Brugge,CEB


# Filtering with more complex subquery conditions

In the previous exercise, you generated a list of teams that have no home matches listed in the soccer database using a subquery in WHERE. Let's do some further exploration in this database by creating a list of teams that scored 8 or more goals in a home match.

In order to do this, you will construct a subquery in the WHERE statement with its own filtering condition.

In [17]:
query = """
SELECT
	-- Select the team long and short names
	team_long_name,
	team_short_name
FROM team
-- Filter for teams with 8 or more home goals
WHERE team_api_id IN
	  (SELECT home_team_api_id 
       FROM match
       WHERE home_team_goal >= 8);
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,team_long_name,team_short_name
0,Manchester United,MUN
1,Tottenham Hotspur,TOT
2,Chelsea,CHE
3,Southampton,SOU
4,FC Bayern Munich,BMU


# Joining Subqueries in FROM

The match table in the European Soccer Database does not contain country or team names. You can get this information by joining it to the country table, and use this to aggregate information, such as the number of matches played in each country.

If you're interested in filtering data from one of these tables, you can also create a subquery from one of the tables, and then join it to an existing table in the database. A subquery in FROM is an effective way of answering detailed questions that requires filtering or transforming data before including it in your final results.

Your goal in this exercise is to generate a subquery using the match table, and then join that subquery to the country table to calculate information about matches with 10 or more goals in total!

In [18]:
query = """
SELECT 
	-- Select the country ID and match ID
	country_id, 
    id 
FROM match
-- Filter for matches with 10 or more goals in total
WHERE (home_team_goal + away_team_goal) >= 10;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,country_id,id
0,1729,2157
1,1729,3093
2,1729,3369
3,1729,3566
4,4769,5192


In [19]:
query = """
SELECT
	-- Select country name and the count match IDs
    c.name AS country_name,
    COUNT(sub.id) AS matches
FROM country AS c
-- Inner join the subquery onto country
-- Select the country id and match id columns
INNER JOIN (SELECT 
	-- Select the country ID and match ID
	country_id, 
    id 
FROM match
-- Filter for matches with 10 or more goals in total
WHERE (home_team_goal + away_team_goal) >= 10) AS sub
ON c.id = sub.country_id
GROUP BY country_name;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,country_name,matches
0,England,4
1,France,1
2,Germany,1
3,Netherlands,2
4,Scotland,1


# Building on Subqueries in FROM

In the previous exercise, you found that England, Netherlands, Germany and Spain were the only countries that had matches in the database where 10 or more goals were scored overall. Let's find out some more details about those matches -- when they were played, during which seasons, and how many of the goals were home versus away goals.

You'll notice that in this exercise, the table alias is excluded for every column selected in the main query. This is because the main query is extracting data from the subquery, which is treated as a single table.

In [21]:
query = """
SELECT
	-- Select country, date, home, and away goals from the subquery
    country,
    date,
    home_team_goal,
    away_team_goal
FROM 
	-- Select country name, date, home_team_goal, away_team_goal, and total goals in the subquery
	(SELECT c.name AS country, 
     	    m.date, 
     		m.home_team_goal, 
     		m.away_team_goal,
           (m.home_team_goal + m.away_team_goal) AS total_goals
    FROM match AS m
    LEFT JOIN country AS c
    ON m.country_id = c.id) AS subq
-- Filter by total goals scored in the main query
WHERE total_goals >= 10;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,country,date,home_team_goal,away_team_goal
0,England,2009-11-22 00:00:00,9,1
1,England,2011-08-28 00:00:00,8,2
2,England,2012-12-29 00:00:00,7,3
3,England,2013-05-19 00:00:00,5,5
4,France,2009-11-08 00:00:00,5,5


# Add a subquery to the SELECT clause

Subqueries in SELECT statements generate a single value that allow you to pass an aggregate value down a data frame. This is useful for performing calculations on data within your database.

In the following exercise, you will construct a query that calculates the average number of goals per match in each country's league.

In [22]:
query = """
SELECT 
	l.name AS league,
    -- Select and round the league's total goals
    ROUND(AVG(m.home_team_goal + m.away_team_goal), 2) AS avg_goals,
    -- Select & round the average total goals for the season
    (SELECT ROUND(AVG(home_team_goal + away_team_goal), 2) 
     FROM match
     WHERE season = '2013/2014') AS overall_avg
FROM league AS l
LEFT JOIN match AS m
ON l.country_id = m.country_id
-- Filter for the 2013/2014 season
WHERE m.season = '2013/2014'
GROUP BY league;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,league,avg_goals,overall_avg
0,Belgium Jupiler League,2.5,2.77
1,England Premier League,2.77,2.77
2,France Ligue 1,2.46,2.77
3,Germany 1. Bundesliga,3.16,2.77
4,Italy Serie A,2.72,2.77


# Subqueries in Select for Calculations

Subqueries in SELECT are a useful way to create calculated columns in a query. A subquery in SELECT can be treated as a single numeric value to use in your calculations. When writing queries in SELECT, it's important to remember that filtering the main query does not filter the subquery -- and vice versa.

In the previous exercise, you created a column to compare each league's average total goals to the overall average goals in the 2013/2014 season. In this exercise, you will add a column that directly compares these values by subtracting the overall average from the subquery.

In [23]:
query = """
SELECT
	-- Select the league name and average goals scored
	l.name AS league,
	ROUND(AVG(m.home_team_goal + m.away_team_goal),2) AS avg_goals,
    -- Subtract the overall average from the league average
	ROUND(AVG(m.home_team_goal + m.away_team_goal) - 
		(SELECT AVG(home_team_goal + away_team_goal)
		 FROM match 
         WHERE season = '2013/2014'),2) AS diff
FROM league AS l
LEFT JOIN match AS m
ON l.country_id = m.country_id
-- Only include 2013/2014 results
WHERE season = '2013/2014'
GROUP BY l.name;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,league,avg_goals,diff
0,Belgium Jupiler League,2.5,-0.27
1,England Premier League,2.77,0.0
2,France Ligue 1,2.46,-0.31
3,Germany 1. Bundesliga,3.16,0.39
4,Italy Serie A,2.72,-0.04


# ALL the subqueries EVERYWHERE

In soccer leagues, games are played at different stages. Winning teams progress from one stage to the next, until they reach the final stage. In each stage, the stakes become higher than the previous one. The match table includes data about the different stages that each match took place in.

In this lesson, you will build a final query across 3 exercises that will contain three subqueries -- one in the SELECT clause, one in the FROM clause, and one in the WHERE clause. In the final exercise, your query will extract data examining the average goals scored in each stage of a match. Does the average number of goals scored change as the stakes get higher from one stage to the next?

In [24]:
query = """
SELECT 
	-- Select the stage and average goals for each stage
	m.stage,
    ROUND(AVG(m.home_team_goal + m.away_team_goal),2) AS avg_goals,
    -- Select the average overall goals for the 2012/2013 season
    ROUND((SELECT AVG(home_team_goal + away_team_goal) 
           FROM match 
           WHERE season = '2012/2013'),2) AS overall
FROM match AS m
-- Filter for the 2012/2013 season
WHERE season = '2012/2013'
-- Group by stage
GROUP BY m.stage;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,stage,avg_goals,overall
0,1,2.68,2.77
1,2,2.65,2.77
2,3,2.83,2.77
3,4,2.8,2.77
4,5,2.61,2.77


# Add a subquery in FROM

In the previous exercise, you created a data set listing the average home and away goals in each match stage of the 2012/2013 match season.

In this next step, you will turn the main query into a subquery to extract a list of stages where the average home goals in a stage is higher than the overall average for home goals in a match.

In [25]:
query = """
SELECT 
	-- Select the stage and average goals from the subquery
	s.stage,
	ROUND(s.avg_goals,2) AS avg_goals
FROM 
	-- Select the stage and average goals in 2012/2013
	(SELECT
		 stage,
         AVG(home_team_goal + away_team_goal) AS avg_goals
	 FROM match
	 WHERE season = '2012/2013'
	 GROUP BY stage) AS s
WHERE 
	-- Filter the main query using the subquery
	s.avg_goals > (SELECT AVG(home_team_goal + away_team_goal) 
                    FROM match WHERE season = '2012/2013');
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,stage,avg_goals
0,3,2.83
1,4,2.8
2,6,2.78
3,8,3.09
4,10,2.96


# Add a subquery in SELECT

In the previous exercise, you added a subquery to the FROM statement and selected the stages where the number of average goals in a stage exceeded the overall average number of goals in the 2012/2013 match season. In this final step, you will add a subquery in SELECT to compare the average number of goals scored in each stage to the total.

In [26]:
query = """
SELECT 
	-- Select the stage and average goals from s
	s.stage,
    ROUND(s.avg_goals,2) AS avg_goal,
    -- Select the overall average for 2012/2013
    (SELECT AVG(home_team_goal + away_team_goal) FROM match WHERE season = '2012/2013') AS overall_avg
FROM 
	-- Select the stage and average goals in 2012/2013 from match
	(SELECT
		 stage,
         AVG(home_team_goal + away_team_goal) AS avg_goals
	 FROM match
	 WHERE season = '2012/2013'
	 GROUP BY stage) AS s
WHERE 
	-- Filter the main query using the subquery
	s.avg_goals > (SELECT AVG(home_team_goal + away_team_goal) 
                    FROM match WHERE season = '2012/2013');
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,stage,avg_goal,overall_avg
0,3,2.83,2.772699
1,4,2.8,2.772699
2,6,2.78,2.772699
3,8,3.09,2.772699
4,10,2.96,2.772699
