In [43]:
import sqlite3
import pandas as pd

def execute(query, database_path='dataset/database.sqlite'):
    """Run a SQL statement against a SQLite file and return the rows as a DataFrame.

    Parameters
    ----------
    query : str
        SQL text to execute. It is executed exactly once.
    database_path : str, optional
        Path to the SQLite database file.

    Returns
    -------
    pandas.DataFrame
        One row per result row, with columns named after the result-set
        columns. Statements that produce no result set (e.g. DDL) yield an
        empty DataFrame with no columns.

    Raises
    ------
    sqlite3.Error
        Propagated from sqlite3 if the statement fails; the connection is
        still closed in that case.
    """
    connection = sqlite3.connect(database_path)
    try:
        # Keep the cursor so rows and column metadata come from the same
        # execution instead of running the statement a second time.
        cursor = connection.execute(query)
        rows = cursor.fetchall()
        # `description` is None when the statement returns no result set.
        description = cursor.description or ()
        column_names = [column[0] for column in description]
        return pd.DataFrame(rows, columns=column_names)
    finally:
        connection.close()


In [44]:
from pandasql import sqldf

# Query helper: runs SQL text through pandasql, resolving table names
# against the variables in this module's global namespace.
def execute_df(q):
    """Return the result of running SQL text `q` via `sqldf` with `globals()` as the environment."""
    return sqldf(q, globals())

In [45]:
import sqlite3
import pandas as pd

def get_table_names(database_path='dataset/database.sqlite'):
    """Return the names of all tables recorded in a SQLite database.

    Parameters
    ----------
    database_path : str, optional
        Path to the SQLite database file.

    Returns
    -------
    list of str
        The `name` column of `sqlite_master` for rows whose type is 'table',
        in the order SQLite returns them.

    Raises
    ------
    sqlite3.Error
        Propagated from sqlite3 if the lookup fails; the connection is still
        closed in that case.
    """
    query = "SELECT name FROM sqlite_master WHERE type='table';"
    connection = sqlite3.connect(database_path)
    try:
        rows = connection.execute(query).fetchall()
    finally:
        # Release the file handle even when the query raises.
        connection.close()
    return [row[0] for row in rows]

# List the tables in the default database file; the printed names are the
# ones loaded into DataFrames further down.
tables = get_table_names()
print("Tables in the database:", tables)


Tables in the database: ['sqlite_sequence', 'Player_Attributes', 'Player', 'Match', 'League', 'Country', 'Team', 'Team_Attributes']


In [51]:
# Load each table into a module-level DataFrame. `execute_df` hands
# `globals()` to pandasql, so these variable names are the table names the
# SQL strings below refer to (e.g. `FROM team`, `FROM match`).
player_attributes = execute("SELECT * FROM Player_Attributes;")
player = execute("SELECT * FROM Player;")
match = execute("SELECT * FROM Match;")
league = execute("SELECT * FROM League;")
country = execute("SELECT * FROM Country;")
team = execute("SELECT * FROM Team;")
team_attributes = execute("SELECT * FROM Team_Attributes;")

# Basic CASE statements

What is your favorite team?

The European Soccer Database contains data about 12,800 matches from 11 countries played between 2011-2015! Throughout this course, you will be shown filtered versions of the tables in this database in order to better explore their contents.

In this exercise, you will identify matches played between FC Schalke 04 and FC Bayern Munich. There are 2 teams identified in each match in the hometeam_id and awayteam_id columns, available to you in the filtered `matches_germany` table. Each of these IDs can be joined to the team_api_id column in the teams_germany table, but you cannot perform a join on both at the same time.

However, you can perform this operation using a CASE statement once you've identified the `team_api_id` associated with each team!

In [52]:
team.head()

Unnamed: 0,id,team_api_id,team_fifa_api_id,team_long_name,team_short_name
0,1,9987,673.0,KRC Genk,GEN
1,2,9993,675.0,Beerschot AC,BAC
2,3,10000,15005.0,SV Zulte-Waregem,ZUL
3,4,9994,2007.0,Sporting Lokeren,LOK
4,5,9984,1750.0,KSV Cercle Brugge,CEB


In [53]:
# Look up the team_api_id values of the two clubs in the `team` DataFrame.
# The ids shown in the output are hard-coded in the CASE query that follows.
query = """
SELECT
	-- Select the team long name and team API id
	team_long_name,
	team_api_id
FROM team
-- Only include FC Schalke 04 and FC Bayern Munich
WHERE team_long_name in ('FC Schalke 04', 'FC Bayern Munich');
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,team_long_name,team_api_id
0,FC Bayern Munich,9823
1,FC Schalke 04,10189


In [56]:
match.head()

Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,1,1,1,2008/2009,1,2008-08-17 00:00:00,492473,9987,9993,1,...,4.0,1.65,3.4,4.5,1.78,3.25,4.0,1.73,3.4,4.2
1,2,1,1,2008/2009,1,2008-08-16 00:00:00,492474,10000,9994,0,...,3.8,2.0,3.25,3.25,1.85,3.25,3.75,1.91,3.25,3.6
2,3,1,1,2008/2009,1,2008-08-16 00:00:00,492475,9984,8635,0,...,2.5,2.35,3.25,2.65,2.5,3.2,2.5,2.3,3.2,2.75
3,4,1,1,2008/2009,1,2008-08-17 00:00:00,492476,9991,9998,5,...,7.5,1.45,3.75,6.5,1.5,3.75,5.5,1.44,3.75,6.5
4,5,1,1,2008/2009,1,2008-08-16 00:00:00,492477,7947,9985,1,...,1.73,4.5,3.4,1.65,4.5,3.5,1.65,4.75,3.3,1.67


In [58]:
# Label every row of `match` by its home team id (10189 / 9823, anything else
# falls into 'Other') and count the matches per label.
query = """
-- Identify the home team as Bayern Munich, Schalke 04, or neither
SELECT 
	CASE WHEN home_team_api_id = 10189 THEN 'FC Schalke 04'
        WHEN home_team_api_id = 9823 THEN 'FC Bayern Munich'
         ELSE 'Other' END AS home_team,
	COUNT(id) AS total_matches
FROM match
-- Group by the CASE statement alias
GROUP BY home_team;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,home_team,total_matches
0,FC Bayern Munich,136
1,FC Schalke 04,136
2,Other,25707


# CASE statements comparing column values

Barcelona is considered one of the strongest teams in Spain's soccer league.

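In this exercise, you will be creating a list of matches in the 2011/2012 season where Barcelona was the home team. You will do this using a CASE statement that compares the values of two columns to create a new group -- wins, losses, and ties. (Note: in this notebook `matches_spain` is built from the full `match` table with no season filter, so the results below cover every season rather than only 2011/2012.)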

In 3 steps, you will build a query that identifies a match's winner, identifies the identity of the opponent, and finally filters for Barcelona as the home team. Completing a query in this order will allow you to watch your results take shape with each new piece of information.

In [62]:
country[country["name"]=="Spain"]

Unnamed: 0,id,name
9,21518,Spain


In [66]:
# Keep only the matches whose country_id is 21518, the id shown for Spain in
# the `country` output above. No season filter is applied. Later queries read
# this DataFrame by the name `matches_spain`.
matches_spain = execute_df( """
SELECT *
FROM match
WHERE country_id = 21518;
""")
matches_spain.head()


Unnamed: 0,id,country_id,league_id,season,stage,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,...,SJA,VCH,VCD,VCA,GBH,GBD,GBA,BSH,BSD,BSA
0,21518,21518,21518,2008/2009,1,2008-08-30 00:00:00,530023,10267,8661,3,...,5.0,1.65,3.4,4.5,1.75,3.3,5.0,1.67,3.4,5.0
1,21519,21518,21518,2008/2009,1,2008-08-31 00:00:00,530084,8371,10205,1,...,2.6,2.9,3.2,2.25,2.75,3.2,2.5,2.8,3.25,2.3
2,21520,21518,21518,2008/2009,1,2008-08-31 00:00:00,530085,9783,8633,2,...,1.9,3.25,3.25,2.0,3.5,3.2,2.1,3.25,3.25,2.1
3,21521,21518,21518,2008/2009,1,2008-08-31 00:00:00,530086,8388,8634,1,...,1.57,7.0,3.75,1.45,6.5,3.75,1.53,5.5,3.75,1.53
4,21522,21518,21518,2008/2009,1,2008-08-31 00:00:00,530087,8696,8302,1,...,2.3,2.7,3.1,2.4,2.75,3.25,2.45,2.7,3.25,2.38


In [67]:
# Label each row of `matches_spain` from the home side by comparing the two
# goal columns; equal goals fall through to the ELSE branch ('Tie').
query = """
SELECT 
	-- Select the date of the match
	date,
	-- Identify home wins, losses, or ties
	CASE WHEN home_team_goal > away_team_goal THEN 'Home win!'
        WHEN home_team_goal < away_team_goal THEN 'Home loss :(' 
        ELSE 'Tie' END AS outcome
FROM matches_spain;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,date,outcome
0,2008-08-30 00:00:00,Home win!
1,2008-08-31 00:00:00,Tie
2,2008-08-31 00:00:00,Home win!
3,2008-08-31 00:00:00,Home win!
4,2008-08-31 00:00:00,Tie


In [72]:
# Same outcome label as the previous query, plus the away team's long name
# joined in as `opponent`. The SQL comment mentions teams_spain, but the
# table actually joined is the unfiltered `team` DataFrame.
query = """
SELECT 
	m.date,
	--Select the team long name column and call it 'opponent'
	t.team_long_name AS opponent, 
	-- Complete the CASE statement with an alias
	CASE WHEN m.home_team_goal >  m.away_team_goal THEN 'Home win!'
        WHEN m.home_team_goal <  m.away_team_goal THEN 'Home loss :('
        ELSE 'Tie' END AS outcome
FROM matches_spain AS m
-- Left join teams_spain onto matches_spain
LEFT JOIN team AS t
ON m.away_team_api_id = t.team_api_id;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,date,opponent,outcome
0,2008-08-30 00:00:00,RCD Mallorca,Home win!
1,2008-08-31 00:00:00,Villarreal CF,Tie
2,2008-08-31 00:00:00,Real Madrid CF,Home win!
3,2008-08-31 00:00:00,FC Barcelona,Home win!
4,2008-08-31 00:00:00,Sevilla FC,Tie


In [73]:
# Restrict to rows where the home team id is 8634 (Barcelona, per the
# notebook text), so a home win can be reported as a Barcelona win.
# The opponent is the away team. No season filter is applied here.
query = """
SELECT 
	m.date,
	t.team_long_name AS opponent,
    -- Complete the CASE statement with an alias
	CASE WHEN m.home_team_goal > m.away_team_goal THEN 'Barcelona win!'
        WHEN m.home_team_goal <  m.away_team_goal THEN 'Barcelona loss :(' 
        ELSE 'Tie' END AS outcome 
FROM matches_spain AS m
LEFT JOIN team AS t 
ON m.away_team_api_id = t.team_api_id
-- Filter for Barcelona as the home team
WHERE m.home_team_api_id = 8634; 
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,date,opponent,outcome
0,2008-11-08 00:00:00,Real Valladolid,Barcelona win!
1,2008-11-23 00:00:00,Getafe CF,Tie
2,2008-12-06 00:00:00,Valencia CF,Barcelona win!
3,2008-12-13 00:00:00,Real Madrid CF,Barcelona win!
4,2009-01-03 00:00:00,RCD Mallorca,Barcelona win!


# CASE statements comparing two column values part 2

Similar to the previous exercise, you will construct a query to determine the outcome of Barcelona's matches where they played as the away team. You will learn how to combine these two queries in chapters 2 and 3.

Did their performance differ from the matches where they were the home team?

In [74]:
# Mirror of the home-team query: team 8634 is now the away side, so the goal
# comparisons are flipped and the opponent name comes from the home team id.
query = """
-- Select matches where Barcelona was the away team
SELECT  
	m.date,
	t.team_long_name AS opponent,
	CASE WHEN m.home_team_goal < m.away_team_goal THEN 'Barcelona win!'
        WHEN m.home_team_goal > m.away_team_goal THEN 'Barcelona loss :(' 
        ELSE 'Tie' END AS outcome
FROM matches_spain AS m
-- Join teams_spain to matches_spain
LEFT JOIN team AS t 
ON m.home_team_api_id = t.team_api_id
WHERE m.away_team_api_id = 8634;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,date,opponent,outcome
0,2008-08-31 00:00:00,CD Numancia,Barcelona loss :(
1,2008-11-16 00:00:00,RC Recreativo,Barcelona win!
2,2008-11-29 00:00:00,Sevilla FC,Barcelona win!
3,2008-12-21 00:00:00,Villarreal CF,Barcelona win!
4,2009-01-11 00:00:00,CA Osasuna,Barcelona win!


# In CASE of rivalry

Barcelona and Real Madrid have been rival teams for more than 80 years. Matches between these two teams are given the name El Clásico (The Classic). In this exercise, you will query a list of matches played between these two rivals.

You will notice in Step 2 that when you have multiple logical conditions in a CASE statement, you may quickly end up with a large number of WHEN clauses to logically test every outcome you are interested in. It's important to make sure you don't accidentally exclude key information in your ELSE clause.

In this exercise, you will retrieve information about matches played between Barcelona (id = 8634) and Real Madrid (id = 8633). Note that the query you are provided with already identifies the Clásico matches using a filter in the WHERE clause.

In [75]:
# Name the home and away side of every match in which both 8634 and 8633
# take part. The CASE expressions have no ELSE, so any other id would give
# NULL; the WHERE clause only lets these two ids through.
query = """
SELECT 
	date,
	-- Identify the home team as Barcelona or Real Madrid
	CASE WHEN home_team_api_id = 8634 THEN 'FC Barcelona' 
        WHEN home_team_api_id = 8633 THEN 'Real Madrid CF' END AS home,
    -- Identify the away team as Barcelona or Real Madrid
	CASE WHEN away_team_api_id = 8634 THEN 'FC Barcelona' 
        WHEN away_team_api_id = 8633 THEN 'Real Madrid CF' END AS away
FROM matches_spain
WHERE (away_team_api_id = 8634 OR home_team_api_id = 8634)
      AND (away_team_api_id = 8633 OR home_team_api_id = 8633);
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,date,home,away
0,2008-12-13 00:00:00,FC Barcelona,Real Madrid CF
1,2009-05-02 00:00:00,Real Madrid CF,FC Barcelona
2,2009-11-29 00:00:00,FC Barcelona,Real Madrid CF
3,2010-04-10 00:00:00,Real Madrid CF,FC Barcelona
4,2010-11-29 00:00:00,FC Barcelona,Real Madrid CF


In [77]:
# Adds the winner of each 8634-vs-8633 match. The `ELSE 'Real Madrid CF'`
# shortcuts are only correct because the WHERE clause limits the rows to
# these two teams. The outcome CASE lists the four winning combinations;
# whatever is left (equal goals) is reported as 'Tie!'.
query = """
SELECT 
	date,
	CASE WHEN home_team_api_id = 8634 THEN 'FC Barcelona' 
         ELSE 'Real Madrid CF' END as home,
	CASE WHEN away_team_api_id = 8634 THEN 'FC Barcelona' 
         ELSE 'Real Madrid CF' END as away,
	-- Identify all possible match outcomes
	CASE WHEN home_team_goal > away_team_goal AND home_team_api_id = 8634 THEN 'Barcelona win!'
        WHEN home_team_goal > away_team_goal AND home_team_api_id = 8633 THEN 'Real Madrid win!'
        WHEN home_team_goal < away_team_goal AND away_team_api_id = 8634 THEN 'Barcelona win!'
        WHEN home_team_goal < away_team_goal AND away_team_api_id = 8633 THEN 'Real Madrid win!'
        ELSE 'Tie!' END AS outcome
FROM matches_spain
WHERE (away_team_api_id = 8634 OR home_team_api_id = 8634)
      AND (away_team_api_id = 8633 OR home_team_api_id = 8633);
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,date,home,away,outcome
0,2008-12-13 00:00:00,FC Barcelona,Real Madrid CF,Barcelona win!
1,2009-05-02 00:00:00,Real Madrid CF,FC Barcelona,Barcelona win!
2,2009-11-29 00:00:00,FC Barcelona,Real Madrid CF,Barcelona win!
3,2010-04-10 00:00:00,Real Madrid CF,FC Barcelona,Barcelona win!
4,2010-11-29 00:00:00,FC Barcelona,Real Madrid CF,Barcelona win!


# Filtering your CASE statement

Let's generate a list of matches won by Italy's Bologna team! There are quite a few additional teams in the two tables, so a key part of generating a usable query will be using your CASE statement as a filter in the WHERE clause.

CASE statements allow you to categorize data that you're interested in -- and exclude data you're not interested in. In order to do this, you can use a CASE statement as a filter in the WHERE statement to remove output you don't want to see.

Here is how you might set that up:
```
SELECT *
FROM table
WHERE 
    CASE WHEN a > 5 THEN 'Keep'
         WHEN a <= 5 THEN 'Exclude' END = 'Keep';
```

In [78]:
# Look up Bologna's team_api_id in `team`; the id shown in the output (9857)
# is hard-coded in the two queries that follow.
query = """
-- Select team_long_name and team_api_id from team
SELECT
	team_long_name,
	team_api_id
FROM team
-- Filter for team long name
WHERE team_long_name = 'Bologna';
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,team_long_name,team_api_id
0,Bologna,9857


In [79]:
# Mark matches won by team 9857, either at home or away. The CASE has no
# ELSE, so every other row gets a NULL outcome; this is why the first rows
# in the output show an empty `outcome`.
query = """
-- Select the season and date columns
SELECT 
	season,
	date,
    -- Identify when Bologna won a match
	CASE WHEN home_team_api_id = 9857 
        AND home_team_goal > away_team_goal 
        THEN 'Bologna Win'
		WHEN away_team_api_id = 9857 
        AND away_team_goal > home_team_goal 
        THEN 'Bologna Win' 
		END AS outcome
FROM match;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,season,date,outcome
0,2008/2009,2008-08-17 00:00:00,
1,2008/2009,2008-08-16 00:00:00,
2,2008/2009,2008-08-16 00:00:00,
3,2008/2009,2008-08-17 00:00:00,
4,2008/2009,2008-08-16 00:00:00,


In [83]:
# Use the same CASE expression as a filter: `IS NOT NULL` keeps only the rows
# where one of the two WHEN branches matched, i.e. the matches team 9857 won.
query = """
-- Select the season, date, home_goal, and away_goal columns
SELECT 
	season,
    date,
	home_team_goal,
	away_team_goal
FROM match
WHERE 
-- Exclude games not won by Bologna
	CASE WHEN home_team_api_id = 9857 
        AND home_team_goal > away_team_goal 
        THEN 'Bologna Win'
		WHEN away_team_api_id = 9857 
        AND away_team_goal > home_team_goal 
        THEN 'Bologna Win' 
		END IS NOT NULL;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,season,date,home_team_goal,away_team_goal
0,2008/2009,2008-08-31 00:00:00,1,2
1,2008/2009,2008-12-13 00:00:00,5,2
2,2008/2009,2009-01-18 00:00:00,1,2
3,2008/2009,2009-01-28 00:00:00,0,1
4,2008/2009,2009-03-08 00:00:00,3,0


# COUNT using CASE WHEN

Does the number of soccer matches played in a given European country differ across seasons? We will use the European Soccer Database to answer this question.

You will examine the number of matches played in 3 seasons within each country listed in the database. This is much easier to explore with each season's matches in separate columns. Using the country and unfiltered match table, you will count the number of matches played in each country during the 2012/2013, 2013/2014, and 2014/2015 match seasons.

In [84]:
# Per-country match count for one season. The CASE yields the match id for
# 2012/2013 rows and NULL otherwise, and COUNT skips NULLs, so only matches
# from that season are counted.
query = """
SELECT 
	c.name AS country,
    -- Count games from the 2012/2013 season
	COUNT(CASE WHEN m.season = '2012/2013' 
        	THEN m.id ELSE NULL END) AS matches_2012_2013
FROM country AS c
LEFT JOIN match AS m
ON c.id = m.country_id
-- Group by country name alias
GROUP BY country;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,country,matches_2012_2013
0,Belgium,240
1,England,380
2,France,380
3,Germany,306
4,Italy,380


In [85]:
# Same counting technique with one column per season. Here the CASE has no
# explicit ELSE; a row from another season yields NULL and is not counted.
query = """
SELECT 
	c.name AS country,
    -- Count matches in each of the 3 seasons
	COUNT(CASE WHEN m.season = '2012/2013' THEN m.id END) AS matches_2012_2013,
	COUNT(CASE WHEN m.season = '2013/2014' THEN m.id END) AS matches_2013_2014,
	COUNT(CASE WHEN m.season = '2014/2015' THEN m.id END) AS matches_2014_2015
FROM country AS c
LEFT JOIN match AS m
ON c.id = m.country_id
-- Group by country name alias
GROUP BY country;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,country,matches_2012_2013,matches_2013_2014,matches_2014_2015
0,Belgium,240,12,240
1,England,380,380,380
2,France,380,380,380
3,Germany,306,306,306
4,Italy,380,380,379


# COUNT and CASE WHEN with multiple conditions

In R or Python, you have the ability to calculate a SUM of logical values (i.e., TRUE/FALSE) directly. In SQL, you have to convert these values into 1 and 0 before calculating a sum. This can be done using a CASE statement.

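There's one key difference when using SUM to aggregate logical values compared to using COUNT in the previous exercise -- with COUNT, the CASE expression returns the match id (or NULL) and every non-NULL value is counted; with SUM, the CASE expression must return 1 for a matching row and 0 otherwise, so that adding the values up gives the number of matching rows.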

Your goal here is to use the country and match table to determine the total number of matches won by the home team in each country during the 2012/2013, 2013/2014, and 2014/2015 seasons.

In [87]:
# Home wins per country and season. Each CASE turns a row into 1 (right
# season and home goals > away goals) or 0, and SUM adds them up. The output
# columns keep the `matches_<season>` names although they hold home-win counts.
query = """
SELECT 
	c.name AS country,
    -- Sum the total records in each season where the home team won
	SUM(CASE WHEN m.season = '2012/2013' AND m.home_team_goal > m.away_team_goal 
        THEN 1 ELSE 0 END) AS matches_2012_2013,
 	SUM(CASE WHEN m.season = '2013/2014' AND m.home_team_goal > m.away_team_goal
        THEN 1 ELSE 0 END) AS matches_2013_2014,
	SUM(CASE WHEN m.season = '2014/2015' AND m.home_team_goal > m.away_team_goal
        THEN 1 ELSE 0 END) AS matches_2014_2015
FROM country AS c
LEFT JOIN match AS m
ON c.id = m.country_id
-- Group by country name alias
GROUP BY country;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,country,matches_2012_2013,matches_2013_2014,matches_2014_2015
0,Belgium,102,6,106
1,England,166,179,172
2,France,170,168,181
3,Germany,130,145,145
4,Italy,177,181,152


# Calculating percent with CASE and AVG

CASE statements will return any value you specify in your THEN clause. This is an incredibly powerful tool for robust calculations and data manipulation when used in conjunction with an aggregate statement. One key task you can perform is using CASE inside an AVG function to calculate a percentage of information in your database.

In [89]:
# Per-country counts of home wins, away wins and ties. There is no season
# condition, so every row of `match` for the country contributes.
query = """
SELECT 
    c.name AS country,
    -- Count the home wins, away wins, and ties in each country
	COUNT(CASE WHEN m.home_team_goal > m.away_team_goal THEN m.id 
        END) AS home_wins,
	COUNT(CASE WHEN m.home_team_goal < m.away_team_goal THEN m.id 
        END) AS away_wins,
	COUNT(CASE WHEN m.home_team_goal = m.away_team_goal THEN m.id 
        END) AS ties
FROM country AS c
LEFT JOIN match AS m
ON c.id = m.country_id
GROUP BY country;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,country,home_wins,away_wins,ties
0,Belgium,810,493,425
1,England,1390,867,783
2,France,1359,822,859
3,Germany,1107,744,597
4,Italy,1407,814,796


In [90]:
# Share of tied matches per country for two seasons. Within the named season
# a tie maps to 1 and any other result to 0. Rows from other seasons match
# no WHEN and yield NULL, which AVG leaves out, so each average is taken over
# that season's matches only. The result is a fraction (e.g. 0.25), not
# a value out of 100.
query = """
SELECT 
	c.name AS country,
    -- Calculate the percentage of tied games in each season
	AVG(CASE WHEN m.season='2013/2014' AND m.home_team_goal = m.away_team_goal THEN 1
			WHEN m.season='2013/2014' AND m.home_team_goal != m.away_team_goal THEN 0
			END) AS ties_2013_2014,
	AVG(CASE WHEN m.season='2014/2015' AND m.home_team_goal = m.away_team_goal THEN 1
			WHEN m.season='2014/2015' AND m.home_team_goal != m.away_team_goal THEN 0
			END) AS ties_2014_2015
FROM country AS c
LEFT JOIN match AS m
ON c.id = m.country_id
GROUP BY country;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,country,ties_2013_2014,ties_2014_2015
0,Belgium,0.166667,0.25
1,England,0.205263,0.244737
2,France,0.284211,0.231579
3,Germany,0.20915,0.267974
4,Italy,0.236842,0.316623


In [91]:
# Same tie-share calculation as the previous query, with each average wrapped
# in ROUND(..., 2) so the fractions are shown with two decimal places.
query = """
SELECT 
	c.name AS country,
    -- Round the percentage of tied games to 2 decimal points
	ROUND(AVG(CASE WHEN m.season='2013/2014' AND m.home_team_goal = m.away_team_goal THEN 1
			 WHEN m.season='2013/2014' AND m.home_team_goal != m.away_team_goal THEN 0
			 END),2) AS pct_ties_2013_2014,
	ROUND(AVG(CASE WHEN m.season='2014/2015' AND m.home_team_goal = m.away_team_goal THEN 1
			 WHEN m.season='2014/2015' AND m.home_team_goal != m.away_team_goal THEN 0
			 END),2) AS pct_ties_2014_2015
FROM country AS c
LEFT JOIN match AS m
ON c.id = m.country_id
GROUP BY country;
"""
result_df = execute_df(query)

# Show results
result_df.head()

Unnamed: 0,country,pct_ties_2013_2014,pct_ties_2014_2015
0,Belgium,0.17,0.25
1,England,0.21,0.24
2,France,0.28,0.23
3,Germany,0.21,0.27
4,Italy,0.24,0.32
