In [1]:
%pylab inline
import pandas as pd
from sqlalchemy import create_engine

Populating the interactive namespace from numpy and matplotlib


In [3]:
# get all the tables into pandas dataframes
allstarfull = pd.read_sql_query("SELECT * FROM allstarfull", cnx)
fielding = pd.read_sql_query("SELECT * FROM fielding", cnx)
salaries = pd.read_sql_query("SELECT * FROM salaries", cnx)
schools = pd.read_sql_query("SELECT * FROM schools", cnx)
collegeplaying = pd.read_sql_query("SELECT * FROM collegeplaying", cnx)
teams = pd.read_sql_query("SELECT * FROM teams", cnx)

In [4]:
# a helper function to see if the sql query matches the pandas query
# prints the length of each result and returns the sum of matches for each column
def test_query(pd_q, sql_q):
    print 'pandas df length:', len(pd_q)
    print 'sql query length:', len(sql_q)
    return (pd_q == sql_q).sum()

In [5]:
""" 
Query Number 1.
Show all playerids and salaries with a salary in the year 1985 above 500k.
"""
q1_pd = salaries[(salaries.salary > 500000) & (salaries.yearid > 1985)][['playerid','salary']]

q1_query = "SELECT playerid, salary FROM salaries WHERE yearid > 1985 AND salary > 500000"
q1_sql = pd.read_sql_query(q1_query, cnx)

test_query(q1_sql.reset_index().sort(columns='playerid'),
           q1_pd.reset_index().sort(columns='playerid'))

pandas df length: 12627
sql query length: 12627


index           0
playerid    12627
salary      12627
dtype: int64

In [6]:
""" 
Query Number 2.
Show the team for each year that had a rank of 1.
"""
q2_pd = teams[teams['rank'] == 1][['teamid', 'yearid']]

q2_query = "SELECT teamid, yearid FROM teams WHERE rank = 1"
q2_sql = pd.read_sql_query(q2_query, cnx)

test_query(q2_sql.reset_index().sort(columns=['teamid', 'yearid']), 
           q2_pd.reset_index().sort(columns=['teamid', 'yearid']))

pandas df length: 406
sql query length: 406


index       0
teamid    406
yearid    406
dtype: int64

In [7]:
""" 
Query Number 3.
How many schools are in schoolstate of CT?
"""
q3_pd = len(schools[schools.schoolstate == 'CT'])

q3_query = "SELECT COUNT(schoolid) FROM schools WHERE schoolstate = 'CT'"
q3_sql = pd.read_sql_query(q3_query, cnx)

q3_sql == q3_pd

Unnamed: 0,count
0,True


In [8]:
""" 
Query Number 4.
How many schools are there in each state?
"""
q4_pd = schools[['schoolstate', 'schoolid']].groupby('schoolstate').count()

q4_query = "SELECT schoolstate, COUNT(schoolid) FROM schools GROUP BY schoolstate"
q4_sql = pd.read_sql_query(q4_query, cnx)
q4_sql = q4_sql.set_index('schoolstate').sort()
q4_sql.columns = ['schoolid']

test_query(q4_sql, q4_pd)

pandas df length: 49
sql query length: 49


schoolid    49
dtype: int64

In [9]:
""" 
Query Number 5.
What was the total spend on salaries by each team, each year?
"""
q5_pd = salaries[['teamid', 'yearid', 'salary']].groupby(['teamid', 'yearid']).sum()

q5_query = "SELECT teamid, yearid, SUM(salary) FROM salaries GROUP BY teamid, yearid"
q5_sql = pd.read_sql_query(q5_query, cnx)
q5_sql = q5_sql.groupby(['teamid', 'yearid']).sum().sort()
q5_sql.columns = ['salary']

test_query(q5_sql, q5_pd)

pandas df length: 860
sql query length: 860


salary    860
dtype: int64

In [10]:
""" 
Query Number 6.
Find all of the salaries of shortstops (fieldings, pos) for the year 2012.
"""
q6_pd = salaries[salaries.yearid == 2012][['playerid', 'salary']].merge(
                fielding[(fielding.pos == 'SS') & (fielding.yearid == 2012)][['playerid']],
                on=['playerid'],
                how="inner")

q6_query = "SELECT \
    salaries.playerid, salaries.salary \
 FROM salaries \
 INNER JOIN fielding on salaries.playerid = fielding.playerid \
 WHERE \
    fielding.pos = 'SS' AND \
    salaries.yearid = 2012 AND \
    fielding.yearid = 2012"
q6_sql = pd.read_sql_query(q6_query, cnx)

test_query(q6_sql, q6_pd)

pandas df length: 90
sql query length: 90


playerid    90
salary      90
dtype: int64

In [11]:
""" 
Query Number 7.
What is the first and last year played for each player?
"""
def min_max_year(df):
    min_year = df.yearid.min()
    max_year = df.yearid.max()
    return pd.Series([min_year, max_year], index = ['min', 'max'])
q7_pd = fielding.groupby('playerid').apply(min_max_year)

q7_query = 'SELECT playerid, MIN(yearid), MAX(yearid) FROM fielding GROUP BY playerid'
q7_sql = pd.read_sql_query(q7_query, cnx)

test_query(q7_sql.groupby('playerid').sum().sort(), q7_pd)

pandas df length: 18214
sql query length: 18214


min    18214
max    18214
dtype: int64

In [12]:
""" 
Query Number 8.
Who has played the most all star games?
"""
q8_pd = allstarfull[['playerid','gameid']].groupby('playerid').count()
q8_pd = q8_pd.sort('gameid', ascending=False).head(1)

q8_query = '\
SELECT \
    playerid, \
    COUNT(gameid) \
FROM allstarfull \
GROUP BY playerid \
ORDER BY  COUNT(gameid) DESC \
LIMIT 1'
q8_sql = pd.read_sql_query(q8_query, cnx)
q8_sql

q8_pd.index[0] == q8_sql.playerid.ix[0]

True

In [16]:
""" 
Query Number 9.
Which school has generated the most distinct players?

I interpret this as the number of players in college 
that also were in fielding and the college that had the most of these.

The problem with this is that there are some players who went to multiple schools
Ex. aardsda01 went to both Penn State and Rice
These people will be counted as having belonged to both schools
"""
q9_pd = schools[['schoolid', 'schoolname']].merge(collegeplaying, 
                                                  how='inner', 
                                                  left_on='schoolid', 
                                                  right_on='schoolid',)
q9_pd = q9_pd.merge(fielding, how='inner', left_on='playerid', right_on='playerid')
q9_pd = q9_pd[['schoolname', 'playerid']].drop_duplicates()
q9_pd = q9_pd.groupby('schoolname').count().sort('playerid', ascending=False)

q9_query = '\
SELECT \
    table2.schoolname, \
    COUNT (table2.playerid) \
FROM (SELECT DISTINCT \
        schools.schoolid, \
        schools.schoolname, \
        fielding.playerid \
    FROM collegeplaying \
    INNER JOIN schools on collegeplaying.schoolID = schools.schoolid \
    INNER JOIN fielding on collegeplaying.playerid = fielding.playerid) table2 \
GROUP BY table2.schoolname \
ORDER BY COUNT(table2.playerid) DESC'
q9_sql = pd.read_sql_query(q9_query, cnx)
q9_sql

q9_pd.index[0] == q9_sql.schoolname.ix[0]

True