In [1]:
import pandas as pd
from sqlalchemy import create_engine
from config import sql_password
import pickle

# Open connection to Postgres DB

In [2]:
# Postgres username, password, and database name
POSTGRES_ADDRESS = 'nbadb.ca9dadq6ltaa.us-east-2.rds.amazonaws.com' ## INSERT YOUR DB ADDRESS IF IT'S NOT ON PANOPLY
POSTGRES_USERNAME = 'team'
POSTGRES_PASSWORD = sql_password 
POSTGRES_DBNAME = "NBA_database"

# A long string that contains the necessary Postgres login information
postgres_str = ('postgresql://{username}:{password}@{ipaddress}:5432/{dbname}'.format(username=POSTGRES_USERNAME, password=POSTGRES_PASSWORD,ipaddress=POSTGRES_ADDRESS,dbname=POSTGRES_DBNAME))

# Create the connection
cnx = create_engine(postgres_str)
cnx

Engine(postgresql://team:***@nbadb.ca9dadq6ltaa.us-east-2.rds.amazonaws.com:5432/NBA_database)

In [3]:
# Query playerStats
aggregatedTeamStats = pd.read_sql_query('''
WITH
player_roster AS (
	SELECT
		player AS playerName,
		year AS seasonYear,
		SUM(g) AS games,
		SUM(gs) AS gamesStarted,
		SUM(mp) AS minutesPlayed,
		SUM(ast) AS totalAssists,
		SUM(stl) AS totalSteals,
		SUM(blk) AS totalBlocks,
		SUM(tov) AS totalTurnOvers,
		SUM(pf) AS personalFouls,
		SUM(pts) AS totalPoints
	FROM public.player_roaster
	WHERE
		tm != 'TOT'
	GROUP BY 1, 2
	ORDER BY 1, 2
),
player_tm_mapping AS (
	SELECT
		player,
		year,
		tm,
		ROW_NUMBER() OVER (
			PARTITION BY player, year
			ORDER BY g DESC
		) AS rn
	FROM public.player_roaster
	WHERE
		tm != 'TOT'
),
player_stats AS (
	SELECT
		tm.tm AS team,
		r.year AS seasonYear,
		r.player AS playerName,
		SPLIT_PART(r.pos, '-', 1) AS playerPosition,
-- 		AVG(i.games) AS avgGamesLast3Years,
-- 		AVG(i.gamesStarted) AS avgGamesStartedLast3Years,
-- 		AVG(i.minutesPlayed) AS avgMinutesPlayedLast3Years,
		AVG(i.totalAssists) AS avgTotalAssistsLast3Years,
		AVG(i.totalSteals) AS avgTotalStealsLast3Years,
		AVG(i.totalBlocks) AS avgTotalBlocksLast3Years,
		AVG(i.totalTurnOvers) AS avgTotalTurnoversLast3Years,
		AVG(i.personalFouls) AS avgPersonalFoulsLast3Years,
		AVG(i.totalPoints) AS avgTotalPointsLast3Years
	FROM player_roaster r
		JOIN player_roster i
			ON r.player = i.playerName
		JOIN player_tm_mapping tm
			ON tm.player = r.player AND tm.year = r.year AND tm.rn = 1
	WHERE
		r.year >= 1995
		AND i.seasonYear <= r.year
		AND i.seasonYear > (r.year - 3)
	GROUP BY 1, 2, 3, 4
),
team_stats_sg_year AS (
	SELECT
		team,
		seasonYear,
		AVG(avgTotalAssistsLast3Years) AS SG_TotalAssistsLast3Years,
		AVG(avgTotalStealsLast3Years) AS SG_TotalStealsLast3Years,
		AVG(avgTotalBlocksLast3Years) AS SG_TotalBlocksLast3Years,
		AVG(avgTotalTurnoversLast3Years) AS SG_TotalTurnoversLast3Years,
		AVG(avgPersonalFoulsLast3Years) AS SG_PersonalFoulsLast3Years,
		AVG(avgTotalPointsLast3Years) AS SG_TotalPointsLast3Years
	FROM player_stats
	WHERE
		playerPosition = 'SG'
	GROUP BY 1, 2
),
team_stats_pf_year AS (
	SELECT
		team,
		seasonYear,
		AVG(avgTotalAssistsLast3Years) AS PF_TotalAssistsLast3Years,
		AVG(avgTotalStealsLast3Years) AS PF_TotalStealsLast3Years,
		AVG(avgTotalBlocksLast3Years) AS PF_TotalBlocksLast3Years,
		AVG(avgTotalTurnoversLast3Years) AS PF_TotalTurnoversLast3Years,
		AVG(avgPersonalFoulsLast3Years) AS PF_PersonalFoulsLast3Years,
		AVG(avgTotalPointsLast3Years) AS PF_TotalPointsLast3Years
	FROM player_stats
	WHERE
		playerPosition = 'PF'
	GROUP BY 1, 2
),
team_stats_sf_year AS (
	SELECT
		team,
		seasonYear,
		AVG(avgTotalAssistsLast3Years) AS SF_TotalAssistsLast3Years,
		AVG(avgTotalStealsLast3Years) AS SF_TotalStealsLast3Years,
		AVG(avgTotalBlocksLast3Years) AS SF_TotalBlocksLast3Years,
		AVG(avgTotalTurnoversLast3Years) AS SF_TotalTurnoversLast3Years,
		AVG(avgPersonalFoulsLast3Years) AS SF_PersonalFoulsLast3Years,
		AVG(avgTotalPointsLast3Years) AS SF_TotalPointsLast3Years
	FROM player_stats
	WHERE
		playerPosition = 'SF'
	GROUP BY 1, 2
),
team_stats_c_year AS (
	SELECT
		team,
		seasonYear,
		AVG(avgTotalAssistsLast3Years) AS C_TotalAssistsLast3Years,
		AVG(avgTotalStealsLast3Years) AS C_TotalStealsLast3Years,
		AVG(avgTotalBlocksLast3Years) AS C_TotalBlocksLast3Years,
		AVG(avgTotalTurnoversLast3Years) AS C_TotalTurnoversLast3Years,
		AVG(avgPersonalFoulsLast3Years) AS C_PersonalFoulsLast3Years,
		AVG(avgTotalPointsLast3Years) AS C_TotalPointsLast3Years
	FROM player_stats
	WHERE
		playerPosition = 'C'
	GROUP BY 1, 2
),
team_stats_pg_year AS (
	SELECT
		team,
		seasonYear,
		AVG(avgTotalAssistsLast3Years) AS PG_TotalAssistsLast3Years,
		AVG(avgTotalStealsLast3Years) AS PG_TotalStealsLast3Years,
		AVG(avgTotalBlocksLast3Years) AS PG_TotalBlocksLast3Years,
		AVG(avgTotalTurnoversLast3Years) AS PG_TotalTurnoversLast3Years,
		AVG(avgPersonalFoulsLast3Years) AS PG_PersonalFoulsLast3Years,
		AVG(avgTotalPointsLast3Years) AS PG_TotalPointsLast3Years
	FROM player_stats
	WHERE
		playerPosition = 'PG'
	GROUP BY 1, 2
)
SELECT DISTINCT
	-- Overall fields to groupby
	s.team,
	s.seasonYear,
	
	-- SG Stats
	sg.SG_TotalAssistsLast3Years,
	sg.SG_TotalStealsLast3Years,
	sg.SG_TotalBlocksLast3Years,
	sg.SG_TotalTurnoversLast3Years,
	sg.SG_PersonalFoulsLast3Years,
	sg.SG_TotalPointsLast3Years,
	
	-- PF stats
	pf.PF_TotalAssistsLast3Years,
	pf.PF_TotalStealsLast3Years,
	pf.PF_TotalBlocksLast3Years,
	pf.PF_TotalTurnoversLast3Years,
	pf.PF_PersonalFoulsLast3Years,
	pf.PF_TotalPointsLast3Years,
	
	-- SF stats
	sf.SF_TotalAssistsLast3Years,
	sf.SF_TotalStealsLast3Years,
	sf.SF_TotalBlocksLast3Years,
	sf.SF_TotalTurnoversLast3Years,
	sf.SF_PersonalFoulsLast3Years,
	sf.SF_TotalPointsLast3Years,
	
	-- C stats
	c.C_TotalAssistsLast3Years,
	c.C_TotalStealsLast3Years,
	c.C_TotalBlocksLast3Years,
	c.C_TotalTurnoversLast3Years,
	c.C_PersonalFoulsLast3Years,
	c.C_TotalPointsLast3Years,
	
	-- PG stats
	pg.PG_TotalAssistsLast3Years,
	pg.PG_TotalStealsLast3Years,
	pg.PG_TotalBlocksLast3Years,
	pg.PG_TotalTurnoversLast3Years,
	pg.PG_PersonalFoulsLast3Years,
	pg.PG_TotalPointsLast3Years
	
FROM player_stats s
	JOIN team_stats_sg_year sg USING(team, seasonYear)
	JOIN team_stats_pf_year pf USING(team, seasonYear)
	JOIN team_stats_sf_year sf USING(team, seasonYear)
	JOIN team_stats_c_year c USING(team, seasonYear) 
	JOIN team_stats_pg_year pg USING(team, seasonYear);
''', cnx)
aggregatedTeamStats.head()

Unnamed: 0,team,seasonyear,sg_totalassistslast3years,sg_totalstealslast3years,sg_totalblockslast3years,sg_totalturnoverslast3years,sg_personalfoulslast3years,sg_totalpointslast3years,pf_totalassistslast3years,pf_totalstealslast3years,...,c_totalblockslast3years,c_totalturnoverslast3years,c_personalfoulslast3years,c_totalpointslast3years,pg_totalassistslast3years,pg_totalstealslast3years,pg_totalblockslast3years,pg_totalturnoverslast3years,pg_personalfoulslast3years,pg_totalpointslast3years
0,ATL,1995,177.666667,68.5,20.666667,106.25,125.166667,746.833333,68.5,47.722222,...,107.166667,65.666667,226.833333,413.333333,226.875,65.5,8.916667,72.625,67.833333,384.666667
1,ATL,1996,158.333333,54.444444,14.333333,92.555556,115.555556,681.777778,64.583333,41.125,...,62.0,53.666667,131.833333,373.5,271.791667,74.333333,11.916667,96.875,82.333333,546.125
2,ATL,1997,114.75,32.791667,8.958333,65.416667,83.541667,491.208333,60.0,31.466667,...,114.555556,83.444444,134.0,434.444444,519.0,208.0,21.0,205.0,152.0,1331.666667
3,ATL,1998,109.166667,29.583333,9.333333,67.75,94.333333,539.166667,65.916667,33.833333,...,162.333333,102.666667,189.5,574.166667,154.375,60.208333,6.833333,69.0,58.5,349.5
4,ATL,1999,92.444444,24.722222,9.0,62.277778,75.833333,473.388889,58.722222,31.0,...,128.166667,87.5,147.666667,503.0,207.444444,71.388889,7.722222,83.722222,78.5,461.222222


# Label the data for ML

In [4]:
# Insert standings to create labels (while no standings in DB right now)
bottom_four_standings = pd.read_sql_query('''SELECT * FROM public."Standings" WHERE "Made Playoffs" = False ;''', cnx)
bottom_four_standings.head()

Unnamed: 0,Year,Team,W,L,PCT,GB,Made Playoffs,Team Abbreviation
0,1970,Seattle Supersonics,38,44,0.463,28,False,SEA
1,1970,Portland Trail Blazers,29,53,0.354,37,False,POR
2,1970,Buffalo Braves,22,60,0.268,30,False,BUF
3,1970,Cleveland Cavaliers,15,67,0.183,37,False,CLE
4,1971,Detroit Pistons,26,56,0.317,43,False,DET


In [5]:
# Create dictionaries for nonIdeal teams
nonIdeal={}

def createDictionary(row):
    if row["Year"] in nonIdeal:
        nonIdeal[row["Year"]].append(row["Team Abbreviation"])
    else:
        nonIdeal[row["Year"]] = []
        nonIdeal[row["Year"]].append(row["Team Abbreviation"])
        
bottom_four_standings.apply(createDictionary, axis=1) 
nonIdeal

{1970: ['SEA', 'POR', 'BUF', 'CLE'],
 1971: ['DET', 'POR', 'CLE', 'BUF'],
 1972: ['SEA', 'POR', 'BUF', 'PHL'],
 1973: ['PHX', 'POR', 'CLE', 'PHL'],
 1974: ['PHX', 'LAL', 'ATL', 'NOJ'],
 1975: ['KCK', 'CHI', 'NOJ', 'ATL'],
 1976: ['PHX', 'MIL', 'BUF', 'NYN'],
 1977: ['KCK', 'IND', 'BUF', 'NJN'],
 1978: ['IND', 'CHI', 'BOS', 'NOJ'],
 1979: ['GOS', 'UTH', 'NJN', 'DET'],
 1980: ['UTH', 'DAL', 'NJN', 'DET'],
 1981: ['UTH', 'SDC', 'NYK', 'CLE'],
 1982: ['SDC', 'HOU', 'CLE', 'IND'],
 1983: ['SDC', 'HOU', 'CHI', 'IND'],
 1984: ['LAC', 'GOS', 'NYK', 'IND'],
 1985: ['SEA', 'GOS', 'IND', 'NYK'],
 1986: ['SAN', 'LAC', 'NJN', 'NYK'],
 1987: ['GOS', 'LAC', 'PHL', 'NJN'],
 1988: ['LAC', 'MIA', 'NJN', 'CHH'],
 1989: ['MIN', 'CHH', 'ORL', 'NJN'],
 1990: ['SAC', 'DN', 'CHH', 'MIA'],
 1991: ['DAL', 'MIN', 'WAS', 'ORL'],
 1992: ['MIN', 'DAL', 'PHL', 'WAS'],
 1993: ['MIN', 'DAL', 'MIL', 'DET'],
 1994: ['MIN', 'LAC', 'PHL', 'WAS'],
 1995: ['DAL', 'VAN', 'TOR', 'PHL'],
 1996: ['SAN', 'VAN', 'PHL', 'BOS'],
 1

In [6]:
# Create dictionary for ideal teams
ideal = pd.read_sql_query('''SELECT "Year", "Abbreviation" FROM public."Champions by Year";''', cnx)
ideal = dict(zip(ideal.Year, ideal.Abbreviation))

In [7]:
aggregatedTeamStats.head()

Unnamed: 0,team,seasonyear,sg_totalassistslast3years,sg_totalstealslast3years,sg_totalblockslast3years,sg_totalturnoverslast3years,sg_personalfoulslast3years,sg_totalpointslast3years,pf_totalassistslast3years,pf_totalstealslast3years,...,c_totalblockslast3years,c_totalturnoverslast3years,c_personalfoulslast3years,c_totalpointslast3years,pg_totalassistslast3years,pg_totalstealslast3years,pg_totalblockslast3years,pg_totalturnoverslast3years,pg_personalfoulslast3years,pg_totalpointslast3years
0,ATL,1995,177.666667,68.5,20.666667,106.25,125.166667,746.833333,68.5,47.722222,...,107.166667,65.666667,226.833333,413.333333,226.875,65.5,8.916667,72.625,67.833333,384.666667
1,ATL,1996,158.333333,54.444444,14.333333,92.555556,115.555556,681.777778,64.583333,41.125,...,62.0,53.666667,131.833333,373.5,271.791667,74.333333,11.916667,96.875,82.333333,546.125
2,ATL,1997,114.75,32.791667,8.958333,65.416667,83.541667,491.208333,60.0,31.466667,...,114.555556,83.444444,134.0,434.444444,519.0,208.0,21.0,205.0,152.0,1331.666667
3,ATL,1998,109.166667,29.583333,9.333333,67.75,94.333333,539.166667,65.916667,33.833333,...,162.333333,102.666667,189.5,574.166667,154.375,60.208333,6.833333,69.0,58.5,349.5
4,ATL,1999,92.444444,24.722222,9.0,62.277778,75.833333,473.388889,58.722222,31.0,...,128.166667,87.5,147.666667,503.0,207.444444,71.388889,7.722222,83.722222,78.5,461.222222


In [8]:
# Label the playerStats data based on association to best or worst teams in the league
def labelData(row):
    
    convertYear = int(row["seasonyear"])
    
    if (row["team"] in ideal[convertYear]):
        return int(1)
    elif (row["team"] in nonIdeal[convertYear]):
        return int(0)

aggregatedTeamStats_copy = aggregatedTeamStats
aggregatedTeamStats_copy["label"] = aggregatedTeamStats_copy.apply(labelData, axis=1)
aggregatedTeamStats_copy = aggregatedTeamStats_copy.dropna(subset=['label'])
aggregatedTeamStats_copy.head()

Unnamed: 0,team,seasonyear,sg_totalassistslast3years,sg_totalstealslast3years,sg_totalblockslast3years,sg_totalturnoverslast3years,sg_personalfoulslast3years,sg_totalpointslast3years,pf_totalassistslast3years,pf_totalstealslast3years,...,c_totalturnoverslast3years,c_personalfoulslast3years,c_totalpointslast3years,pg_totalassistslast3years,pg_totalstealslast3years,pg_totalblockslast3years,pg_totalturnoverslast3years,pg_personalfoulslast3years,pg_totalpointslast3years,label
4,ATL,1999,92.444444,24.722222,9.0,62.277778,75.833333,473.388889,58.722222,31.0,...,87.5,147.666667,503.0,207.444444,71.388889,7.722222,83.722222,78.5,461.222222,0.0
9,ATL,2004,150.166667,57.166667,26.5,110.666667,155.166667,479.166667,47.380952,20.857143,...,42.111111,101.944444,234.833333,271.277778,68.166667,5.777778,118.944444,129.555556,662.0,0.0
10,ATL,2005,112.041667,48.416667,18.541667,71.041667,122.166667,442.0,98.583333,30.75,...,42.133333,96.133333,274.933333,181.888889,41.666667,4.111111,72.666667,132.222222,414.444444,0.0
22,ATL,2017,63.944444,17.111111,4.555556,38.388889,54.5,372.166667,96.0,44.416667,...,92.333333,130.0,575.166667,290.666667,51.666667,5.333333,146.0,125.0,705.666667,0.0
24,BOS,1996,116.166667,65.0,34.416667,96.333333,184.0,813.75,43.333333,28.083333,...,45.633333,114.466667,230.966667,324.666667,97.777778,16.0,130.333333,141.777778,943.555556,0.0


In [9]:
stats = pd.DataFrame(aggregatedTeamStats_copy.groupby("label").describe())
stats.to_csv("~/Downloads/Stats.csv")

# Data Preprocessing

In [10]:
# View all columns to check for those that need encoding / dropping
aggregatedTeamStats_copy.head()

Unnamed: 0,team,seasonyear,sg_totalassistslast3years,sg_totalstealslast3years,sg_totalblockslast3years,sg_totalturnoverslast3years,sg_personalfoulslast3years,sg_totalpointslast3years,pf_totalassistslast3years,pf_totalstealslast3years,...,c_totalturnoverslast3years,c_personalfoulslast3years,c_totalpointslast3years,pg_totalassistslast3years,pg_totalstealslast3years,pg_totalblockslast3years,pg_totalturnoverslast3years,pg_personalfoulslast3years,pg_totalpointslast3years,label
4,ATL,1999,92.444444,24.722222,9.0,62.277778,75.833333,473.388889,58.722222,31.0,...,87.5,147.666667,503.0,207.444444,71.388889,7.722222,83.722222,78.5,461.222222,0.0
9,ATL,2004,150.166667,57.166667,26.5,110.666667,155.166667,479.166667,47.380952,20.857143,...,42.111111,101.944444,234.833333,271.277778,68.166667,5.777778,118.944444,129.555556,662.0,0.0
10,ATL,2005,112.041667,48.416667,18.541667,71.041667,122.166667,442.0,98.583333,30.75,...,42.133333,96.133333,274.933333,181.888889,41.666667,4.111111,72.666667,132.222222,414.444444,0.0
22,ATL,2017,63.944444,17.111111,4.555556,38.388889,54.5,372.166667,96.0,44.416667,...,92.333333,130.0,575.166667,290.666667,51.666667,5.333333,146.0,125.0,705.666667,0.0
24,BOS,1996,116.166667,65.0,34.416667,96.333333,184.0,813.75,43.333333,28.083333,...,45.633333,114.466667,230.966667,324.666667,97.777778,16.0,130.333333,141.777778,943.555556,0.0


In [11]:
# Check that null values need to be removed
aggregatedTeamStats_model = aggregatedTeamStats_copy.drop(["team","seasonyear"], axis=1)
aggregatedTeamStats_model.isnull().sum(axis = 0)

sg_totalassistslast3years      0
sg_totalstealslast3years       0
sg_totalblockslast3years       0
sg_totalturnoverslast3years    0
sg_personalfoulslast3years     0
sg_totalpointslast3years       0
pf_totalassistslast3years      0
pf_totalstealslast3years       0
pf_totalblockslast3years       0
pf_totalturnoverslast3years    0
pf_personalfoulslast3years     0
pf_totalpointslast3years       0
sf_totalassistslast3years      0
sf_totalstealslast3years       0
sf_totalblockslast3years       0
sf_totalturnoverslast3years    0
sf_personalfoulslast3years     0
sf_totalpointslast3years       0
c_totalassistslast3years       0
c_totalstealslast3years        0
c_totalblockslast3years        0
c_totalturnoverslast3years     0
c_personalfoulslast3years      0
c_totalpointslast3years        0
pg_totalassistslast3years      0
pg_totalstealslast3years       0
pg_totalblockslast3years       0
pg_totalturnoverslast3years    0
pg_personalfoulslast3years     0
pg_totalpointslast3years       0
label     

In [12]:
aggregatedTeamStats_model = aggregatedTeamStats_model.dropna()
aggregatedTeamStats_model.isnull().sum(axis = 0)

sg_totalassistslast3years      0
sg_totalstealslast3years       0
sg_totalblockslast3years       0
sg_totalturnoverslast3years    0
sg_personalfoulslast3years     0
sg_totalpointslast3years       0
pf_totalassistslast3years      0
pf_totalstealslast3years       0
pf_totalblockslast3years       0
pf_totalturnoverslast3years    0
pf_personalfoulslast3years     0
pf_totalpointslast3years       0
sf_totalassistslast3years      0
sf_totalstealslast3years       0
sf_totalblockslast3years       0
sf_totalturnoverslast3years    0
sf_personalfoulslast3years     0
sf_totalpointslast3years       0
c_totalassistslast3years       0
c_totalstealslast3years        0
c_totalblockslast3years        0
c_totalturnoverslast3years     0
c_personalfoulslast3years      0
c_totalpointslast3years        0
pg_totalassistslast3years      0
pg_totalstealslast3years       0
pg_totalblockslast3years       0
pg_totalturnoverslast3years    0
pg_personalfoulslast3years     0
pg_totalpointslast3years       0
label     

In [13]:
aggregatedTeamStats_model

Unnamed: 0,sg_totalassistslast3years,sg_totalstealslast3years,sg_totalblockslast3years,sg_totalturnoverslast3years,sg_personalfoulslast3years,sg_totalpointslast3years,pf_totalassistslast3years,pf_totalstealslast3years,pf_totalblockslast3years,pf_totalturnoverslast3years,...,c_totalturnoverslast3years,c_personalfoulslast3years,c_totalpointslast3years,pg_totalassistslast3years,pg_totalstealslast3years,pg_totalblockslast3years,pg_totalturnoverslast3years,pg_personalfoulslast3years,pg_totalpointslast3years,label
4,92.444444,24.722222,9.000000,62.277778,75.833333,473.388889,58.722222,31.000000,17.888889,63.000000,...,87.500000,147.666667,503.000000,207.444444,71.388889,7.722222,83.722222,78.500000,461.222222,0.0
9,150.166667,57.166667,26.500000,110.666667,155.166667,479.166667,47.380952,20.857143,14.380952,51.333333,...,42.111111,101.944444,234.833333,271.277778,68.166667,5.777778,118.944444,129.555556,662.000000,0.0
10,112.041667,48.416667,18.541667,71.041667,122.166667,442.000000,98.583333,30.750000,18.166667,72.000000,...,42.133333,96.133333,274.933333,181.888889,41.666667,4.111111,72.666667,132.222222,414.444444,0.0
22,63.944444,17.111111,4.555556,38.388889,54.500000,372.166667,96.000000,44.416667,33.416667,64.583333,...,92.333333,130.000000,575.166667,290.666667,51.666667,5.333333,146.000000,125.000000,705.666667,0.0
24,116.166667,65.000000,34.416667,96.333333,184.000000,813.750000,43.333333,28.083333,27.333333,57.416667,...,45.633333,114.466667,230.966667,324.666667,97.777778,16.000000,130.333333,141.777778,943.555556,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652,80.055556,41.666667,16.722222,66.777778,113.555556,505.333333,31.222222,23.333333,17.777778,60.888889,...,77.500000,182.000000,513.500000,222.555556,53.000000,4.000000,104.944444,94.611111,437.888889,0.0
656,109.444444,40.777778,7.555556,86.666667,107.777778,661.666667,73.555556,30.277778,25.055556,74.722222,...,67.777778,106.944444,331.333333,272.777778,51.333333,8.555556,96.222222,96.777778,488.888889,0.0
664,104.333333,39.222222,16.666667,82.888889,117.888889,522.111111,92.500000,58.500000,39.166667,87.833333,...,62.222222,132.111111,406.000000,240.055556,62.500000,10.444444,107.000000,125.222222,885.722222,0.0
665,159.458333,59.875000,12.125000,106.541667,120.666667,741.250000,136.333333,93.666667,31.666667,114.000000,...,58.055556,130.194444,379.888889,127.000000,36.700000,4.633333,69.833333,84.766667,427.900000,0.0


# Scaling and Creating Model

In [14]:
# Import dependencies
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [15]:
# Define the features set.
X = aggregatedTeamStats_model.copy()
X = X.drop("label", axis=1)
X.head()

Unnamed: 0,sg_totalassistslast3years,sg_totalstealslast3years,sg_totalblockslast3years,sg_totalturnoverslast3years,sg_personalfoulslast3years,sg_totalpointslast3years,pf_totalassistslast3years,pf_totalstealslast3years,pf_totalblockslast3years,pf_totalturnoverslast3years,...,c_totalblockslast3years,c_totalturnoverslast3years,c_personalfoulslast3years,c_totalpointslast3years,pg_totalassistslast3years,pg_totalstealslast3years,pg_totalblockslast3years,pg_totalturnoverslast3years,pg_personalfoulslast3years,pg_totalpointslast3years
4,92.444444,24.722222,9.0,62.277778,75.833333,473.388889,58.722222,31.0,17.888889,63.0,...,128.166667,87.5,147.666667,503.0,207.444444,71.388889,7.722222,83.722222,78.5,461.222222
9,150.166667,57.166667,26.5,110.666667,155.166667,479.166667,47.380952,20.857143,14.380952,51.333333,...,55.777778,42.111111,101.944444,234.833333,271.277778,68.166667,5.777778,118.944444,129.555556,662.0
10,112.041667,48.416667,18.541667,71.041667,122.166667,442.0,98.583333,30.75,18.166667,72.0,...,16.733333,42.133333,96.133333,274.933333,181.888889,41.666667,4.111111,72.666667,132.222222,414.444444
22,63.944444,17.111111,4.555556,38.388889,54.5,372.166667,96.0,44.416667,33.416667,64.583333,...,57.5,92.333333,130.0,575.166667,290.666667,51.666667,5.333333,146.0,125.0,705.666667
24,116.166667,65.0,34.416667,96.333333,184.0,813.75,43.333333,28.083333,27.333333,57.416667,...,34.433333,45.633333,114.466667,230.966667,324.666667,97.777778,16.0,130.333333,141.777778,943.555556


In [16]:
# Define the target set.
y = aggregatedTeamStats_model["label"].values
y[:5]

array([0., 0., 0., 0., 0.])

In [17]:
# Splitting into Train and Test sets into an 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=70, train_size=0.80)

In [18]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(64, 30)
(17, 30)
(64,)
(17,)


In [19]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
# Creating the decision tree classifier instance.
model = tree.DecisionTreeClassifier()

# Fitting the model.
model = model.fit(X_train_scaled, y_train)

In [21]:
# Making predictions using the testing data.
predictions = model.predict(X_test_scaled)

In [22]:
predictions

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.])

In [23]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13,1
Actual 1,2,1


In [24]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

0.8235294117647058

In [25]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13,1
Actual 1,2,1


Accuracy Score : 0.8235294117647058
Classification Report
              precision    recall  f1-score   support

         0.0       0.87      0.93      0.90        14
         1.0       0.50      0.33      0.40         3

    accuracy                           0.82        17
   macro avg       0.68      0.63      0.65        17
weighted avg       0.80      0.82      0.81        17



Let's go over the results in the classification report:

Precision: Precision is the measure of how reliable a positive classification is. From our results, the precision for the good loan applications can be determined by the ratio TP/(TP + FP), which is 50/(50 + 22) = 0.69. The precision for the bad loan applications can be determined as follows: 19/(19 + 34) = 0.358. A low precision is indicative of a large number of false positives—of the 53 loan applications we predicted to be bad applications, 34 were actually good loan applications.

Recall: Recall is the ability of the classifier to find all the positive samples. It can be determined by the ratio: TP/(TP + FN), or 50/(50 + 34) = 0.595 for the good loans and 19/(19 + 22) = 0.463 for the bad loans. A low recall is indicative of a large number of false negatives.

F1 score: F1 score is a weighted average of the true positive rate (recall) and precision, where the best score is 1.0 and the worst is 0.0.

Support: Support is the number of actual occurrences of the class in the specified dataset. For our results, there are 84 actual occurrences for the good loans and 41 actual occurrences for bad loans.


In [26]:
# save the model to disk
filename = 'finalized_model_v2.sav'
pickle.dump(model, open(filename, 'wb'))