In [1]:
import sqlite3
import pandas as pd

def execute(query, database_path='dataset/database.sqlite'):
    """Run one SQL statement against a SQLite file and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text handed unchanged to ``sqlite3``.
    database_path : str, optional
        Path of the SQLite database file to open.

    Returns
    -------
    pandas.DataFrame
        One row per fetched record, with columns named after the cursor
        description. A statement that produces no result set yields an
        empty frame.

    Raises
    ------
    sqlite3.Error
        Propagated unchanged when the statement fails; the connection is
        closed before the exception leaves this function.

    Notes
    -----
    No ``commit()`` is issued by this helper.
    """
    connection = sqlite3.connect(database_path)
    try:
        # Execute exactly once and read both the rows and the column names
        # from the same cursor (running the statement a second time just to
        # get the description doubles the work and breaks statements that
        # cannot be repeated).
        cursor = connection.execute(query)
        result = cursor.fetchall()
        # description is None for statements that return no result set
        column_names = [description[0] for description in cursor.description or ()]
    finally:
        # Release the file handle even when the statement raises
        connection.close()
    return pd.DataFrame(result, columns=column_names)


In [2]:
from pandasql import sqldf

# Create helper function for easier query execution
def execute_df(q):
    """Run the SQL text ``q`` through ``pandasql.sqldf`` and return its result.

    The namespace handed to ``sqldf`` is this module's ``globals()``, looked
    up at call time, so names bound in later cells are visible to queries
    issued after them.
    """
    return sqldf(q, globals())

In [3]:
import sqlite3
import pandas as pd

def get_table_names(database_path='dataset/database.sqlite'):
    """Return the names of all tables listed in the database's ``sqlite_master``.

    Parameters
    ----------
    database_path : str, optional
        Path of the SQLite database file to open.

    Returns
    -------
    list of str
        The ``name`` column of every ``sqlite_master`` row whose type is
        ``'table'``, in the order SQLite returns them.

    Raises
    ------
    sqlite3.Error
        Propagated unchanged if the lookup fails; the connection is closed
        before the exception leaves this function.
    """
    connection = sqlite3.connect(database_path)
    try:
        query = "SELECT name FROM sqlite_master WHERE type='table';"
        return [row[0] for row in connection.execute(query)]
    finally:
        # Close on both the success and the error path
        connection.close()

# Get and print all table names in the database
# (called without arguments, so the default path in get_table_names is used)
tables = get_table_names()
print("Tables in the database:", tables)


Tables in the database: ['sqlite_sequence', 'Player_Attributes', 'Player', 'Match', 'League', 'Country', 'Team', 'Team_Attributes']


In [4]:
# Read each data table in full into a DataFrame bound to the lower-case
# table name; the later queries refer to these names (match, league,
# country, team, ...) when they go through execute_df.
(
    player_attributes,
    player,
    match,
    league,
    country,
    team,
    team_attributes,
) = (
    execute(f"SELECT * FROM {table_name};")
    for table_name in (
        "Player_Attributes",
        "Player",
        "Match",
        "League",
        "Country",
        "Team",
        "Team_Attributes",
    )
)

# The match is OVER

The OVER() clause allows you to pass an aggregate function down a data set, similar to subqueries in SELECT. The OVER() clause offers significant benefits over subqueries in select -- namely, your queries will run faster, and the OVER() clause has a wide range of additional functions and clauses you can include with it that we will cover later on in this chapter.

In this exercise, you will revise some queries from previous chapters using the OVER() clause.

In [5]:
# Attach the average of (home + away) goals across all joined rows to every
# match row by using an empty OVER() window. The country name comes from a
# left join on match.country_id = country.id.
query = """
SELECT 
	-- Select the id, country name, season, home, and away goals
	m.id, 
    c.name AS country, 
    m.season,
	m.home_team_goal,
	m.away_team_goal,
    -- Use a window to include the aggregate average in each row
	AVG(m.home_team_goal + m.away_team_goal) OVER() AS overall_avg
FROM match AS m
LEFT JOIN country AS c ON m.country_id = c.id;
"""
# execute_df forwards the SQL text to pandasql's sqldf
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,id,country,season,home_team_goal,away_team_goal,overall_avg
0,1,Belgium,2008/2009,1,1,2.705531
1,2,Belgium,2008/2009,0,0,2.705531
2,3,Belgium,2008/2009,0,3,2.705531
3,4,Belgium,2008/2009,5,0,2.705531
4,5,Belgium,2008/2009,1,3,2.705531


# What's OVER here?

Window functions allow you to create a RANK of information according to any variable you want to use to sort your data. When setting this up, you will need to specify what column/calculation you want to use to calculate your rank. This is done by including an ORDER BY clause inside the OVER() clause. 

In this exercise, you will create a data set of leagues ranked by the average number of goals scored per match.

In [6]:
# Average total goals per match for each league in 2011/2012, plus a RANK()
# over that average. The window's ORDER BY has no DESC, so the lowest
# average receives rank 1.
# The WHERE filter on m.season drops any league row that found no match in
# the LEFT JOIN, so only leagues with 2011/2012 matches appear.
# NOTE(review): the join pairs league.id with match.country_id; this assumes
# the two id spaces coincide -- confirm against the schema.
query = """
SELECT 
	-- Select the league name and average goals scored
	l.name AS league,
    AVG(m.home_team_goal + m.away_team_goal) AS avg_goals,
    -- Rank each league according to the average goals
    RANK() OVER(ORDER BY AVG(m.home_team_goal + m.away_team_goal)) AS league_rank
FROM league AS l
LEFT JOIN match AS m 
ON l.id = m.country_id
WHERE m.season = '2011/2012'
GROUP BY l.name
-- Order the query by the rank you created
ORDER BY league_rank;
"""
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,league,avg_goals,league_rank
0,Poland Ekstraklasa,2.195833,1
1,France Ligue 1,2.515789,2
2,Italy Serie A,2.583799,3
3,Switzerland Super League,2.623457,4
4,Scotland Premier League,2.635965,5


# Flip OVER your results

In the last exercise, the rank generated in your query was organized from smallest to largest. By adding DESC to your window function, you can create a rank sorted from largest to smallest.

In [7]:
# Same league ranking for 2011/2012 as the ascending version, but the
# window orders the average DESC, so the highest-scoring league gets rank 1.
# NOTE(review): the join pairs league.id with match.country_id; this assumes
# the two id spaces coincide -- confirm against the schema.
query = """
SELECT 
	-- Select the league name and average goals scored
	l.name AS league,
    AVG(m.home_team_goal + m.away_team_goal) AS avg_goals,
    -- Rank each league according to the average goals
    RANK() OVER(ORDER BY AVG(m.home_team_goal + m.away_team_goal) DESC) AS league_rank
FROM league AS l
LEFT JOIN match AS m 
ON l.id = m.country_id
WHERE m.season = '2011/2012'
GROUP BY l.name
-- Order the query by the rank you created
ORDER BY league_rank;
"""
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,league,avg_goals,league_rank
0,Netherlands Eredivisie,3.25817,1
1,Belgium Jupiler League,2.879167,2
2,Germany 1. Bundesliga,2.859477,3
3,England Premier League,2.805263,4
4,Spain LIGA BBVA,2.763158,5


# PARTITION BY a column

The PARTITION BY clause allows you to calculate separate "windows" based on columns you want to divide your results. For example, you can create a single column that calculates an overall average of goals scored for each season.

In this exercise, you will be creating a data set of games played by Legia Warszawa (Warsaw League), the top ranked team in Poland, and comparing their individual game performance to the overall average for that season.

Where do you see more outliers? Are they Legia Warszawa's home or away games?

In [8]:
# Matches involving team_api_id 8673 (Legia Warszawa, per the SQL comment),
# each tagged as a 'home' or 'away' game for that team and paired with the
# per-season averages of home and away goals. The window averages are taken
# over the filtered rows only, i.e. over this team's matches in each season.
# Rows are sorted by total goals in the match, highest first.
query = """
SELECT
	date,
	season,
	home_team_goal,
	away_team_goal,
	CASE WHEN home_team_api_id = 8673 THEN 'home' 
		 ELSE 'away' END AS warsaw_location,
    -- Calculate the average goals scored partitioned by season
    AVG(home_team_goal) OVER(PARTITION BY season) AS season_homeavg,
    AVG(away_team_goal) OVER(PARTITION BY season) AS season_awayavg
FROM match
-- Filter the data set for Legia Warszawa matches only
WHERE 
	away_team_api_id = 8673 
    OR home_team_api_id = 8673
ORDER BY (home_team_goal + away_team_goal) DESC;
"""
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,date,season,home_team_goal,away_team_goal,warsaw_location,season_homeavg,season_awayavg
0,2013-09-14 00:00:00,2013/2014,3,5,away,1.766667,1.233333
1,2009-10-24 00:00:00,2009/2010,5,2,home,1.233333,0.7
2,2011-05-25 00:00:00,2010/2011,2,5,away,1.633333,1.133333
3,2014-09-13 00:00:00,2014/2015,4,3,home,1.566667,1.333333
4,2011-02-25 00:00:00,2010/2011,3,3,away,1.633333,1.133333


# PARTITION BY multiple columns

The PARTITION BY clause can be used to break out window averages by multiple data points (columns). You can even calculate the information you want to use to partition your data! For example, you can calculate average goals scored by season and by country, or by the calendar year (taken from the date column).

In this exercise, you will calculate the average number of home and away goals scored by Legia Warszawa and their opponents, partitioned by the month in each season.

In [12]:
# Same selection of team 8673 matches as the per-season query, but the
# window averages are partitioned by season AND calendar month. The month is
# extracted from the date column with strftime('%m', ...) and cast to an
# integer so it can serve as a partition key.
# Rows are sorted by total goals in the match, highest first.
query = """
SELECT 
	date,
	season,
	home_team_goal,
	away_team_goal,
	CASE WHEN home_team_api_id = 8673 THEN 'home' 
         ELSE 'away' END AS warsaw_location,
	-- Calculate average goals partitioned by season and month
    AVG(home_team_goal) OVER(PARTITION BY season, 
         	CAST(strftime('%m', date) AS INTEGER)) AS season_mo_home,
    AVG(away_team_goal) OVER(PARTITION BY season, 
         	CAST(strftime('%m', date) AS INTEGER)) AS season_mo_away
FROM match
WHERE 
	home_team_api_id = 8673
    OR away_team_api_id = 8673
ORDER BY (home_team_goal + away_team_goal) DESC;
"""
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,date,season,home_team_goal,away_team_goal,warsaw_location,season_mo_home,season_mo_away
0,2013-09-14 00:00:00,2013/2014,3,5,away,2.25,2.5
1,2009-10-24 00:00:00,2009/2010,5,2,home,2.5,0.75
2,2011-05-25 00:00:00,2010/2011,2,5,away,2.0,1.166667
3,2014-09-13 00:00:00,2014/2015,4,3,home,2.0,2.666667
4,2011-02-25 00:00:00,2010/2011,3,3,away,3.0,3.0


# Slide to the left

Sliding windows allow you to create running calculations between any two points in a window using functions such as PRECEDING, FOLLOWING, and CURRENT ROW. You can calculate running counts, sums, averages, and other aggregate functions between any two points you specify in the data set.

In this exercise, you will expand on the examples discussed in the video, calculating the running total of goals scored by FC Utrecht when they were the home team during the 2011/2012 season. Do they score more goals at the end of the season as the home or away team?

In [13]:
# Running total and running average of home goals for home team 9908
# (FC Utrecht, per the exercise text) in season 2011/2012. Each window
# frame spans from the first row up to the current row in date order.
query = """
SELECT
    date,
    home_team_goal,
    away_team_goal,
    -- Create a running total and running average of home goals
    SUM(home_team_goal) OVER(ORDER BY date
         ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_total,
    AVG(home_team_goal) OVER(ORDER BY date
         ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS running_avg
FROM match
WHERE
    home_team_api_id = 9908
    AND season = '2011/2012'
-- Sort explicitly: the ORDER BY inside OVER() only defines the window,
-- and without an outer ORDER BY the order of returned rows is undefined
ORDER BY date;
"""
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,date,home_team_goal,away_team_goal,running_total,running_avg
0,2011-08-14 00:00:00,2,2,2,2.0
1,2011-08-27 00:00:00,3,1,5,2.5
2,2011-09-18 00:00:00,2,2,7,2.333333
3,2011-10-01 00:00:00,3,0,10,2.5
4,2011-10-22 00:00:00,1,4,11,2.2


# Slide to the right

Now let's see how FC Utrecht performs when they're the away team. You'll notice that the total for the season is at the bottom of the data set you queried. Depending on your results, this could be pretty long, and scrolling down is not very helpful.

In this exercise, you will slightly modify the query from the previous exercise by sorting the data set in reverse order and calculating a backward running total from the CURRENT ROW to the end of the data set (earliest record).

In [14]:
# Backward running total and average for the matches where team 9908
# (FC Utrecht, per the exercise text) is the AWAY side in 2011/2012.
# The aggregated column is home_team_goal, which in these rows is the
# opponent's score. With the window sorted by date DESC, each frame spans
# the current row through the earliest match, so the first row displayed
# carries the season total.
query = """
SELECT
    -- Select the date, home goal, and away goals
    date,
    home_team_goal,
    away_team_goal,
    -- Create a running total and running average of home goals
    SUM(home_team_goal) OVER(ORDER BY date DESC
         ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_total,
    AVG(home_team_goal) OVER(ORDER BY date DESC
         ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING) AS running_avg
FROM match
WHERE
    away_team_api_id = 9908
    AND season = '2011/2012'
-- Sort explicitly: the ORDER BY inside OVER() only defines the window,
-- and without an outer ORDER BY the order of returned rows is undefined
ORDER BY date DESC;
"""
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,date,home_team_goal,away_team_goal,running_total,running_avg
0,2012-05-06 00:00:00,1,3,25,1.470588
1,2012-04-21 00:00:00,0,2,24,1.5
2,2012-04-12 00:00:00,3,0,24,1.6
3,2012-03-25 00:00:00,3,1,21,1.5
4,2012-03-11 00:00:00,1,1,18,1.384615


# Setting up the home team CTE

In this course, we've covered ways in which you can use CASE statements, subqueries, common table expressions, and window functions in your queries to structure a data set that best meets your needs. For this exercise, you will be using all of these concepts to generate a list of matches in which Manchester United was defeated during the 2014/2015 English Premier League season.

Your first task is to create the first query that filters for matches where Manchester United played as the home team. This will become a common table expression in a later exercise.

In [15]:
# 2014/2015 matches in which Manchester United is the HOME team, with the
# result labelled from Manchester United's point of view (home goals greater
# than away goals means 'MU Win').
# The WHERE filter on t.team_long_name drops rows where the LEFT JOIN found
# no team, so only matched rows are returned.
query = """
SELECT 
	m.id, 
    t.team_long_name,
    -- Identify matches as home/away wins or ties
	CASE WHEN m.home_team_goal > m.away_team_goal THEN 'MU Win'
		WHEN m.home_team_goal < m.away_team_goal THEN 'MU Loss'
        ELSE 'Tie' END AS outcome
FROM match AS m
-- Left join team on the home team ID and team API id
LEFT JOIN team
 AS t 
ON m.home_team_api_id = t.team_api_id
WHERE 
	-- Filter for 2014/2015 and Manchester United as the home team
	season  = '2014/2015'
	AND t.team_long_name = 'Manchester United';
"""
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,id,team_long_name,outcome
0,4013,Manchester United,MU Loss
1,4031,Manchester United,MU Win
2,4051,Manchester United,MU Win
3,4062,Manchester United,MU Win
4,4085,Manchester United,MU Win


# Setting up the away team CTE

Now that you have a query identifying the home team in a match, you will perform a similar set of steps to identify the away team. Just like the previous step, you will join the match and team tables. Each of these two queries will be declared as a Common Table Expression in the following step.

The primary difference in this query is that you will be joining the tables on the away team's ID (`away_team_api_id`), and reversing the match outcomes in the CASE statement.

When altering CASE statement logic in your own work, you can reverse either the logical condition (i.e., home_goal > away_goal) or the outcome in THEN -- just make sure you only reverse one of the two!

In [16]:
# 2014/2015 matches in which Manchester United is the AWAY team. The join is
# on away_team_api_id, and the CASE labels are reversed relative to the
# home-team query: home goals greater than away goals means 'MU Loss'.
query = """
SELECT 
	m.id, 
    t.team_long_name,
    -- Identify matches as home/away wins or ties
	CASE WHEN m.home_team_goal > m.away_team_goal THEN 'MU Loss'
		WHEN m.home_team_goal < m.away_team_goal THEN 'MU Win'
        ELSE 'Tie' END AS outcome
-- Join team table to the match table
FROM match AS m
LEFT JOIN team AS t 
ON m.away_team_api_id = t.team_api_id
WHERE 
	-- Filter for 2014/2015 and Manchester United as the away team
	season  = '2014/2015'
	AND t.team_long_name = 'Manchester United';
"""
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,id,team_long_name,outcome
0,4026,Manchester United,MU Loss
1,4039,Manchester United,MU Win
2,4075,Manchester United,MU Win
3,4089,Manchester United,Tie
4,4117,Manchester United,Tie


# Putting the CTEs together

Now that you've created the two subqueries identifying the home and away team opponents, it's time to rearrange your query with the home and away subqueries as Common Table Expressions (CTEs). You'll notice that the main query includes the phrase, SELECT DISTINCT. Without identifying only DISTINCT matches, you will return a duplicate record for each game played.

Continue building the query to extract all matches played by Manchester United in the 2014/2015 season.

In [17]:
# All 2014/2015 matches played by Manchester United, home or away.
# The `home` and `away` CTEs resolve each match's home and away team names
# and label the outcome from that side's point of view. The outer query
# joins both CTEs back onto match by id and keeps the rows where either
# side is Manchester United.
query = """
-- Set up the home team CTE
WITH home AS (
  SELECT m.id, t.team_long_name,
      CASE WHEN m.home_team_goal > m.away_team_goal THEN 'MU Win'
           WHEN m.home_team_goal < m.away_team_goal THEN 'MU Loss'
           ELSE 'Tie' END AS outcome
  FROM match AS m
  LEFT JOIN team AS t ON m.home_team_api_id = t.team_api_id),
-- Set up the away team CTE
away AS (
  SELECT m.id, t.team_long_name,
      -- Reversed relative to the home CTE: when the away side is the team
      -- of interest, more home goals means a loss
      CASE WHEN m.home_team_goal > m.away_team_goal THEN 'MU Loss'
           WHEN m.home_team_goal < m.away_team_goal THEN 'MU Win'
           ELSE 'Tie' END AS outcome
  FROM match AS m
  LEFT JOIN team AS t ON m.away_team_api_id = t.team_api_id)
-- Select team names, the date and goals
SELECT DISTINCT
    m.date,
    home.team_long_name AS home_team,
    away.team_long_name AS away_team,
    m.home_team_goal,
    m.away_team_goal
-- Join the CTEs onto the match table
FROM match AS m
LEFT JOIN home ON m.id = home.id
LEFT JOIN away ON m.id = away.id
WHERE m.season = '2014/2015'
      AND (home.team_long_name = 'Manchester United'
           OR away.team_long_name = 'Manchester United');
"""
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,date,home_team,away_team,home_team_goal,away_team_goal
0,2014-08-16 00:00:00,Manchester United,Swansea City,1,2
1,2014-11-02 00:00:00,Manchester City,Manchester United,1,0
2,2014-11-08 00:00:00,Manchester United,Crystal Palace,1,0
3,2014-11-22 00:00:00,Arsenal,Manchester United,1,2
4,2014-11-29 00:00:00,Manchester United,Hull City,3,0


# Add a window function

You now have a result set that retrieves the match date, home team, away team, and the goals scored by each team. You have one final component of the question left -- how badly did Manchester United lose in each match?

In order to determine this, let's add a window function to the main query that ranks matches by the absolute value of the difference between home_goal and away_goal. This allows us to directly compare the difference in scores without having to consider whether Manchester United played as the home or away team!

The equation is complete for you -- all you need to do is properly complete the window function.

In [18]:
# 2014/2015 matches that Manchester United lost, ranked by the size of the
# defeat. The rank uses the absolute goal difference, so home and away
# losses are compared on the same scale; the largest margin gets rank 1.
query = """
-- Set up the home team CTE
WITH home AS (
  SELECT m.id, t.team_long_name,
      CASE WHEN m.home_team_goal > m.away_team_goal THEN 'MU Win'
           WHEN m.home_team_goal < m.away_team_goal THEN 'MU Loss'
           ELSE 'Tie' END AS outcome
  FROM match AS m
  LEFT JOIN team AS t ON m.home_team_api_id = t.team_api_id),
-- Set up the away team CTE
away AS (
  SELECT m.id, t.team_long_name,
      CASE WHEN m.home_team_goal > m.away_team_goal THEN 'MU Loss'
           WHEN m.home_team_goal < m.away_team_goal THEN 'MU Win'
           ELSE 'Tie' END AS outcome
  FROM match AS m
  LEFT JOIN team AS t ON m.away_team_api_id = t.team_api_id)
-- Select columns and rank the matches by goal difference
SELECT DISTINCT
    m.date,
    home.team_long_name AS home_team,
    away.team_long_name AS away_team,
    m.home_team_goal, m.away_team_goal,
    RANK() OVER(ORDER BY ABS(m.home_team_goal - m.away_team_goal) DESC) AS match_rank
-- Join the CTEs onto the match table
FROM match AS m
LEFT JOIN home ON m.id = home.id
LEFT JOIN away ON m.id = away.id
WHERE m.season = '2014/2015'
      AND ((home.team_long_name = 'Manchester United' AND home.outcome = 'MU Loss')
      OR (away.team_long_name = 'Manchester United' AND away.outcome = 'MU Loss'))
-- Sort explicitly: the ORDER BY inside OVER() only defines the ranking, and
-- without an outer ORDER BY the order of returned rows is undefined.
-- The date breaks ties between equally ranked matches.
ORDER BY match_rank, m.date;
"""
result = execute_df(query)

# Show results
result.head()

Unnamed: 0,date,home_team,away_team,home_team_goal,away_team_goal,match_rank
0,2015-04-26 00:00:00,Everton,Manchester United,3,0,1
1,2014-09-21 00:00:00,Leicester City,Manchester United,5,3,2
2,2014-08-16 00:00:00,Manchester United,Swansea City,1,2,3
3,2014-11-02 00:00:00,Manchester City,Manchester United,1,0,3
4,2015-01-11 00:00:00,Manchester United,Southampton,0,1,3
