#### Set Up

In [1]:
import pandas as pd
from sqlalchemy import create_engine

In [2]:
# Be sure that Postgres/PGAdmin is launched

# establish a database connection
engine = create_engine("postgres+psycopg2://postgres:postgres@localhost:5432/lahman_baseball")
# Replace `<lahman_baseball_database_name>` with the actual name of your lahman baseball database as it appears in pgadmin

In [3]:
# use the connection to run a query using pandas!
batting_df = pd.read_sql("SELECT * FROM batting;", con=engine)
batting_df.head()

Unnamed: 0,playerid,yearid,stint,teamid,lgid,g,ab,r,h,h2b,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,


#### All players in the database who played at Vanderbilt University

In [4]:
#pull in schools table to know what schoolid is associated with Vanderbilt University
schools_df = pd.read_sql("SELECT * FROM schools;", con=engine)
schools_df.head()

Unnamed: 0,schoolid,schoolname,schoolcity,schoolstate,schoolnick
0,abilchrist,Abilene Christian University,Abilene,TX,USA
1,adelphi,Adelphi University,Garden City,NY,USA
2,adrianmi,Adrian College,Adrian,MI,USA
3,akron,University of Akron,Akron,OH,USA
4,alabama,University of Alabama,Tuscaloosa,AL,USA


In [5]:
#Find the schoolid for Vanderbilt University
vandy_id = schools_df[schools_df['schoolname'] == 'Vanderbilt University']
vandy_id

Unnamed: 0,schoolid,schoolname,schoolcity,schoolstate,schoolnick
1110,vandy,Vanderbilt University,Nashville,TN,USA


In [6]:
# Now we know Vanderbilt University schoolid is 'vandy'
#Next, let's pull in collegeplaying table to get all the player id

#collegeplaying table 
collegeplaying_df = pd.read_sql("SELECT * FROM collegeplaying;", con=engine)
collegeplaying_df.head()

Unnamed: 0,playerid,schoolid,yearid
0,aardsda01,pennst,2001
1,aardsda01,rice,2002
2,aardsda01,rice,2003
3,abadan01,gamiddl,1992
4,abadan01,gamiddl,1993


In [7]:
#Get all the players id for vandy schoolid from collegeplaying table

vandy_playerid = collegeplaying_df[collegeplaying_df['schoolid'] == 'vandy']


In [8]:
# get unique playerid list
vandy_playerid = vandy_playerid['playerid'].unique()
vandy_playerid

array(['alvarpe01', 'baxtemi01', 'chrisni01', 'colliwi01', 'corajo01',
       'embresl01', 'flahery01', 'grayso01', 'hendrha01', 'katama01',
       'lewisje01', 'madissc01', 'minormi01', 'mooresc01', 'mossma01',
       'pauljo01', 'priceda01', 'priorma01', 'richaan01', 'sandesc01',
       'sewelri01', 'sowerje01', 'willimi01', 'zeidjo01'], dtype=object)

#### Create a list showing each player’s first and last names as well as the total salary they earned in the major leagues.

In [9]:
# Let's pull in <people> table to get first and last names for vandy players

#people table
people_df = pd.read_sql("SELECT * FROM people;", con=engine)
people_df.head()

#find out all vandy unique players the list
vandy_playernames = people_df[people_df.playerid.isin(vandy_playerid)]
vandy_playernames

Unnamed: 0,playerid,birthyear,birthmonth,birthday,birthcountry,birthstate,birthcity,deathyear,deathmonth,deathday,...,namelast,namegiven,weight,height,bats,throws,debut,finalgame,retroid,bbrefid
282,alvarpe01,1987.0,2.0,6.0,D.R.,Distrito Nacional,Santo Domingo,,,,...,Alvarez,Pedro Manuel,250.0,75.0,L,R,2010-06-16,2016-10-01,alvap001,alvarpe01
958,baxtemi01,1984.0,12.0,7.0,USA,NY,Queens,,,,...,Baxter,Michael Joseph,205.0,72.0,L,R,2010-09-06,2015-07-08,baxtm001,baxtemi01
2989,chrisni01,1987.0,7.0,17.0,USA,NJ,Elizabeth,,,,...,Christiani,Nicholas John,190.0,72.0,R,R,2013-08-23,2014-05-10,chrin001,chrisni01
3292,colliwi01,1889.0,5.0,7.0,USA,TN,Pulaski,1941.0,2.0,28.0,...,Collins,Cyril Wilson,165.0,69.0,R,R,1913-05-12,1914-07-08,collw101,colliwi01
3466,corajo01,1965.0,5.0,14.0,P.R.,,Caguas,,,,...,Cora,Jose Manuel,150.0,67.0,B,R,1987-04-06,1998-09-27,coraj001,corajo01
4990,embresl01,1901.0,8.0,17.0,USA,TN,Columbia,1947.0,10.0,10.0,...,Embry,Charles Akin,184.0,74.0,R,R,1923-10-01,1923-10-01,embrs101,embresl01
5457,flahery01,1986.0,7.0,27.0,USA,ME,Portland,,,,...,Flaherty,Ryan Edward,220.0,75.0,L,R,2012-04-07,2016-09-28,flahr001,flahery01
6605,grayso01,1989.0,11.0,7.0,USA,TN,Nashville,,,,...,Gray,Sonny Douglas,190.0,70.0,R,R,2013-07-10,2016-09-28,grays001,grayso01
7467,hendrha01,1897.0,11.0,9.0,USA,TN,Mason,1941.0,10.0,29.0,...,Hendrick,Harvey,190.0,74.0,L,R,1923-04-20,1934-08-28,hendh101,hendrha01
8861,katama01,1978.0,3.0,14.0,USA,OH,Avon Lake,,,,...,Kata,Matthew John,185.0,73.0,B,R,2003-06-15,2009-08-11,katam001,katama01


In [10]:
#Subset vandy_playernames df by only getting columns of yearid, school id 'vandy', playerid, namefirst, namelast 
vandy_playernames = vandy_playernames[['playerid','namefirst','namelast']]
vandy_playernames

Unnamed: 0,playerid,namefirst,namelast
282,alvarpe01,Pedro,Alvarez
958,baxtemi01,Mike,Baxter
2989,chrisni01,Nick,Christiani
3292,colliwi01,Wilson,Collins
3466,corajo01,Joey,Cora
4990,embresl01,Slim,Embry
5457,flahery01,Ryan,Flaherty
6605,grayso01,Sonny,Gray
7467,hendrha01,Harvey,Hendrick
8861,katama01,Matt,Kata


In [11]:
#import salaries table from sql database
salaries_df = pd.read_sql("SELECT * FROM salaries;", con=engine)
salaries_df.head()

Unnamed: 0,yearid,teamid,lgid,playerid,salary
0,1985,ATL,NL,barkele01,870000.0
1,1985,ATL,NL,bedrost01,550000.0
2,1985,ATL,NL,benedbr01,545000.0
3,1985,ATL,NL,campri01,633333.0
4,1985,ATL,NL,ceronri01,625000.0


In [12]:
# inner merge dfs vandy_playernames with salareis to get the total salaries earned from each player
vandy_players_salaries = pd.merge(vandy_playernames, salaries_df, on='playerid')

# Notice that above I want to get all unique playerids and their names not other elements like years as that might cause 
# duplicate rows with years with the salaries df. The major things here are to analyze the salaires of each player
vandy_players_salaries = pd.merge(vandy_playernames, salaries_df, on='playerid')
vandy_players_salaries 

Unnamed: 0,playerid,namefirst,namelast,yearid,teamid,lgid,salary
0,alvarpe01,Pedro,Alvarez,2011,PIT,NL,2050000.0
1,alvarpe01,Pedro,Alvarez,2012,PIT,NL,2200000.0
2,alvarpe01,Pedro,Alvarez,2013,PIT,NL,700000.0
3,alvarpe01,Pedro,Alvarez,2014,PIT,NL,4250000.0
4,alvarpe01,Pedro,Alvarez,2015,PIT,NL,5750000.0
...,...,...,...,...,...,...,...
69,sandesc01,Scott,Sanderson,1993,NYA,AL,250000.0
70,sandesc01,Scott,Sanderson,1994,CHA,AL,350000.0
71,sandesc01,Scott,Sanderson,1995,CAL,AL,250000.0
72,sandesc01,Scott,Sanderson,1996,CAL,AL,200000.0


#### Total salareis earned by each player in descending orders

In [13]:
# group by vandy_players_salaries df by playerid,namefirst,namelast, salary (sum the salary)
vandy_total_salaries = vandy_players_salaries.groupby(by= ['playerid','namefirst','namelast']).salary.sum()
vandy_total_salaries

#in descending orders
vandy_total_salaries_desc = vandy_total_salaries.sort_values(ascending = False)
vandy_total_salaries_desc

#From the list,  David Price earned the most money in the majors with $81851296. 

playerid   namefirst  namelast  
priceda01  David      Price         81851296.0
alvarpe01  Pedro      Alvarez       20681704.0
priorma01  Mark       Prior         12800000.0
sandesc01  Scott      Sanderson     10750000.0
minormi01  Mike       Minor          6837500.0
corajo01   Joey       Cora           5622500.0
flahery01  Ryan       Flaherty       4061000.0
pauljo01   Josh       Paul           2640000.0
baxtemi01  Mike       Baxter         2094418.0
grayso01   Sonny      Gray           1542500.0
lewisje01  Jensen     Lewis          1234000.0
katama01   Matt       Kata           1060000.0
chrisni01  Nick       Christiani      500000.0
sowerje01  Jeremy     Sowers          384800.0
madissc01  Scotti     Madison         135000.0
Name: salary, dtype: float64

#### Using the fielding table, group players into three groups based on their position

In [14]:
#label players with position OF as "Outfield", those with position "SS", "1B", "2B", and "3B" as "Infield",
#and those with position "P" or "C" as "Battery"

#import fielding table from sql database
fielding_df = pd.read_sql("SELECT * FROM fielding;", con=engine)
fielding_df.head()

Unnamed: 0,playerid,yearid,stint,teamid,lgid,pos,g,gs,innouts,po,a,e,dp,pb,wp,sb,cs,zr
0,abercda01,1871,1,TRO,,SS,1,,,1,3.0,2.0,0.0,,,,,
1,addybo01,1871,1,RC1,,2B,22,,,67,72.0,42.0,5.0,,,,,
2,addybo01,1871,1,RC1,,SS,3,,,8,14.0,7.0,0.0,,,,,
3,allisar01,1871,1,CL1,,2B,2,,,1,4.0,0.0,0.0,,,,,
4,allisar01,1871,1,CL1,,OF,29,,,51,3.0,7.0,1.0,,,,,


In [15]:
# I need to create a new column 'position' to group the positions in pos column into three groups as reqeusted above

# Apply if conditions in Pandas DataFrame
#label players with position OF as "Outfield"
fielding_df.loc[fielding_df['pos'] == 'OF','positions'] = 'Outfield'

#position "SS", "1B", "2B", and "3B" as "Infield"
# an example of code with different conditions : df.loc[(df['First_name'] == 'Ria') | (df['First_name'] == 'Jay'), 'Status'] = 'Found'
fielding_df.loc[(fielding_df['pos'] == 'SS')|(fielding_df['pos'] == '1B')|(fielding_df['pos'] == '2B')|(fielding_df['pos'] == '3B'),'positions'] = 'Infield'

#position "P" or "C" as "Battery"
fielding_df.loc[(fielding_df['pos'] == 'P')|(fielding_df['pos'] == 'C'),'positions'] = 'Battery'

In [16]:
fielding_df

Unnamed: 0,playerid,yearid,stint,teamid,lgid,pos,g,gs,innouts,po,a,e,dp,pb,wp,sb,cs,zr,positions
0,abercda01,1871,1,TRO,,SS,1,,,1,3.0,2.0,0.0,,,,,,Infield
1,addybo01,1871,1,RC1,,2B,22,,,67,72.0,42.0,5.0,,,,,,Infield
2,addybo01,1871,1,RC1,,SS,3,,,8,14.0,7.0,0.0,,,,,,Infield
3,allisar01,1871,1,CL1,,2B,2,,,1,4.0,0.0,0.0,,,,,,Infield
4,allisar01,1871,1,CL1,,OF,29,,,51,3.0,7.0,1.0,,,,,,Outfield
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136810,zobribe01,2016,1,CHN,NL,2B,119,113.0,2929.0,177,250.0,7.0,52.0,,,,,,Infield
136811,zobribe01,2016,1,CHN,NL,OF,46,29.0,859.0,43,1.0,0.0,0.0,,,,,,Outfield
136812,zobribe01,2016,1,CHN,NL,SS,1,0.0,6.0,0,0.0,0.0,0.0,,,,,,Infield
136813,zuninmi01,2016,1,SEA,AL,C,52,48.0,1331.0,400,15.0,0.0,0.0,3.0,,19.0,7.0,,Battery


#### The total number of putouts made by each of these three groups in 2016.

In [17]:
# yearid is 2016 and use .sum() and groupby
fielding_putouts_2016 = fielding_df[fielding_df['yearid'] == 2016]

fielding_total_putouts_2016 = fielding_putouts_2016.groupby(by= 'positions').po.sum()
fielding_total_putouts_2016

positions
Battery     41424
Infield     58934
Outfield    29560
Name: po, dtype: int64

#### From 1970 – 2016, examine what is the largest number of wins for a team that did not win the world series?

In [18]:
#First, let's get the teams table from SQL 

#import teams table
teams_df = pd.read_sql("SELECT * FROM teams;", con=engine)
teams_df.head()

Unnamed: 0,yearid,lgid,teamid,franchid,divid,rank,g,ghome,w,l,...,dp,fp,name,park,attendance,bpf,ppf,teamidbr,teamidlahman45,teamidretro
0,1871,,BS1,BNA,,3,31,,20,10,...,,0.838,Boston Red Stockings,South End Grounds I,,103,98,BOS,BS1,BS1
1,1871,,CH1,CNA,,2,28,,19,9,...,,0.829,Chicago White Stockings,Union Base-Ball Grounds,,104,102,CHI,CH1,CH1
2,1871,,CL1,CFC,,8,29,,10,19,...,,0.814,Cleveland Forest Citys,National Association Grounds,,96,100,CLE,CL1,CL1
3,1871,,FW1,KEK,,7,19,,7,12,...,,0.803,Fort Wayne Kekiongas,Hamilton Field,,101,107,KEK,FW1,FW1
4,1871,,NY2,NNA,,5,33,,16,17,...,,0.839,New York Mutuals,Union Grounds (Brooklyn),,90,88,NYU,NY2,NY2


#### Seattle Mariners (SEA) 2001 has the most win with 116 times which is not a world series winner. 

In [19]:
#we need to get the teamid, which has the highest number in 'w' column, and is a 'N' in wswin,  yearid (in the period of 
#1970-2016)

#let's subset the teams_df to have 'N' in wswin
not_worldwin_teams = teams_df[teams_df['wswin'] == 'N']

#subset the largest number of win in 'w' column
most_win_not_worldwin_teams = not_worldwin_teams.nlargest(5, 'w')
most_win_not_worldwin_teams



Unnamed: 0,yearid,lgid,teamid,franchid,divid,rank,g,ghome,w,l,...,dp,fp,name,park,attendance,bpf,ppf,teamidbr,teamidlahman45,teamidretro
467,1906,NL,CHN,CHC,,1,155,79.0,116,36,...,100.0,0.97,Chicago Cubs,West Side Park II,654300.0,106,100,CHC,CHN,CHN
2379,2001,AL,SEA,SEA,W,1,162,81.0,116,46,...,137.0,0.986,Seattle Mariners,Safeco Field,3507326.0,94,93,SEA,SEA,SEA
1253,1954,AL,CLE,CLE,,1,156,77.0,111,43,...,148.0,0.979,Cleveland Indians,Cleveland Stadium,1335472.0,102,98,CLE,CLE,CLE
1518,1969,AL,BAL,BAL,E,1,162,81.0,109,53,...,145.0,0.984,Baltimore Orioles,Memorial Stadium,1062069.0,101,99,BAL,BAL,BAL
889,1931,AL,PHA,OAK,,1,153,75.0,107,45,...,151.0,0.976,Philadelphia Athletics,Shibe Park,627464.0,106,103,PHA,PHA,PHA


#### Examine the smallest number of wins for a team that did win the world series?

#### Los Angeles Dodgers (LAN) in 1981 has the smallest win with 63 times as world series winner team. It is found that in 1981, the wins average number regardless of worldseries winners, the average wins is only 53 in 1981 while is 74 in the teams dataframe taking all years into consideration. 1981 is an odd year. 

In [20]:
#we need to get the teamid, which has the highest number in 'w' column, and is a 'N' in wswin,  yearid (in the period of 
#1970-2016)

#let's subset the teams_df to have 'Y' in wswin
is_worldwin_teams = teams_df[teams_df['wswin'] == 'Y']

#subset the largest number of win in 'w' column
most_win_is_worldwin_teams = is_worldwin_teams.nsmallest(5, 'w')
print(most_win_is_worldwin_teams)

#let examine year in 1981 to see why the  number is small
#we can see wins in this year seems small
year_1981 = teams_df[teams_df['yearid'] == 1981]
year_1981

#compare mean value of 1981 in wins number with the mean value of the total teams_df 

#Year 1981 average win is 53
year_1981_wins = year_1981['w'].mean()
year_1981_wins

#Average win in teams_df is 74 
average_wins = teams_df['w'].mean()
average_wins

      yearid lgid teamid franchid divid  rank    g  ghome   w   l  ...     dp  \
1824    1981   NL    LAN      LAD     W     2  110   56.0  63  47  ...  101.0   
671     1918   AL    BOS      BOS  None     1  126   70.0  75  51  ...   89.0   
195     1887   NL    DTN      DTN  None     1  127    NaN  79  45  ...    NaN   
231     1889   NL    NY1      SFG  None     1  131    NaN  83  43  ...    NaN   
2530    2006   NL    SLN      STL     C     1  161   80.0  83  78  ...  170.0   

         fp                 name               park  attendance  bpf  ppf  \
1824  0.980  Los Angeles Dodgers     Dodger Stadium   2381292.0   97   96   
671   0.971       Boston Red Sox      Fenway Park I    249513.0   98   97   
195   0.926   Detroit Wolverines    Recreation Park         NaN  104  100   
231   0.920      New York Giants    Polo Grounds II         NaN  104  101   
2530  0.984  St. Louis Cardinals  Busch Stadium III   3407104.0   99   99   

      teamidbr  teamidlahman45  teamidretro  
1824

74.81410934744268

#### Let's exclude 1981 to get the smallest wins for a worldseries winner again from 1970-2016

#### After excluding 1981, St. Louis Cardinals (SLN) in 2006 has 83 wins that is also a world series winner. 

In [21]:
#let's subset the is_worldwin_teams to exclude year 1981
is_worldwin_teams_no1981 = is_worldwin_teams[is_worldwin_teams['yearid'] != 1981]

#subset the largest number of win in 'w' column
most_win_is_worldwin_teams_no1981 = is_worldwin_teams_no1981.nsmallest(5, 'w')
most_win_is_worldwin_teams_no1981

Unnamed: 0,yearid,lgid,teamid,franchid,divid,rank,g,ghome,w,l,...,dp,fp,name,park,attendance,bpf,ppf,teamidbr,teamidlahman45,teamidretro
671,1918,AL,BOS,BOS,,1,126,70.0,75,51,...,89.0,0.971,Boston Red Sox,Fenway Park I,249513.0,98,97,BOS,BOS,BOS
195,1887,NL,DTN,DTN,,1,127,,79,45,...,,0.926,Detroit Wolverines,Recreation Park,,104,100,DTN,DTN,DTN
231,1889,NL,NY1,SFG,,1,131,,83,43,...,,0.92,New York Giants,Polo Grounds II,,104,101,NYG,NY1,NY1
2530,2006,NL,SLN,STL,C,1,161,80.0,83,78,...,170.0,0.984,St. Louis Cardinals,Busch Stadium III,3407104.0,99,99,STL,SLN,SLN
147,1884,NL,PRO,PRO,,1,114,,84,28,...,,0.918,Providence Grays,Messer Street Grounds,,99,96,PRO,PRO,PRO


####  How often from 1970 – 2016 was it the case that a team with the most wins also won the world series?
#### What percentage of the time?

In [22]:
# to understand the question, we first need to subset the is_worldwin_teams with yearid between 1970-2016
#then we want to get the maximum wins each year

#subset between years: surveys_df[(surveys_df.year >= 1980) & (surveys_df.year <= 1985)]
is_worldwin_teams_1970_2016 = is_worldwin_teams[ (is_worldwin_teams.yearid >=1970) & (is_worldwin_teams.yearid <=2016)]
is_worldwin_teams_1970_2016

#there are 46 rows in total

Unnamed: 0,yearid,lgid,teamid,franchid,divid,rank,g,ghome,w,l,...,dp,fp,name,park,attendance,bpf,ppf,teamidbr,teamidlahman45,teamidretro
1542,1970,AL,BAL,BAL,E,1,162,81.0,108,54,...,148.0,0.981,Baltimore Orioles,Memorial Stadium,1057069.0,101,98,BAL,BAL,BAL
1584,1971,NL,PIT,PIT,E,1,162,80.0,97,65,...,164.0,0.979,Pittsburgh Pirates,Three Rivers Stadium,1501132.0,102,100,PIT,PIT,PIT
1606,1972,AL,OAK,OAK,W,1,155,77.0,93,62,...,146.0,0.979,Oakland Athletics,Oakland Coliseum,921323.0,95,93,OAK,OAK,OAK
1630,1973,AL,OAK,OAK,W,1,162,81.0,94,68,...,170.0,0.978,Oakland Athletics,Oakland Coliseum,1000763.0,95,93,OAK,OAK,OAK
1654,1974,AL,OAK,OAK,W,1,162,81.0,90,72,...,154.0,0.977,Oakland Athletics,Oakland Coliseum,845693.0,94,92,OAK,OAK,OAK
1667,1975,NL,CIN,CIN,W,1,162,81.0,108,54,...,173.0,0.984,Cincinnati Reds,Riverfront Stadium,2315603.0,102,99,CIN,CIN,CIN
1691,1976,NL,CIN,CIN,W,1,162,81.0,102,60,...,157.0,0.984,Cincinnati Reds,Riverfront Stadium,2629708.0,102,100,CIN,CIN,CIN
1724,1977,AL,NYA,NYY,E,1,162,81.0,100,62,...,151.0,0.979,New York Yankees,Yankee Stadium II,2103092.0,99,97,NYY,NYA,NYA
1750,1978,AL,NYA,NYY,E,1,163,81.0,100,63,...,134.0,0.982,New York Yankees,Yankee Stadium II,2335871.0,97,96,NYY,NYA,NYA
1780,1979,NL,PIT,PIT,E,1,163,81.0,98,64,...,163.0,0.979,Pittsburgh Pirates,Three Rivers Stadium,1435454.0,105,105,PIT,PIT,PIT


In [23]:
#subset teams_1970_2016 first and then to get the maximum wins of each year
#example df.groupby(['Mt'], sort=False)['count'].max()
teams_1970_2016 = teams_df[(teams_df.yearid >= 1970) & (teams_df.yearid <= 2016)]
#calculate teams_1970_2016 max_wins , we do not care here yet if the team is worldseries or not
#.reset_index changed the series to dataframe
teams_1970_2016_max_wins = teams_1970_2016.groupby(['yearid'], sort=True)['w'].max().reset_index()
teams_1970_2016_max_wins



Unnamed: 0,yearid,w
0,1970,108
1,1971,101
2,1972,96
3,1973,99
4,1974,102
5,1975,108
6,1976,102
7,1977,102
8,1978,100
9,1979,102


In [24]:
#Use inner join to retrieve only matching values of teams_1970_2016_max_wins and is_worldwin_teams_1970_2016 on 'w'
#pd.merge(adf, bdf,how='inner', on='x1')

#make sure you join on both w and yearid becasue w has duplicate values that might make the final merge funny. 
#you need to further restrict yearid so you can get better accuracy
most_wins_also_are_worldseries = pd.merge(teams_1970_2016_max_wins,is_worldwin_teams_1970_2016, how='inner', on=['w','yearid'])
most_wins_also_are_worldseries

Unnamed: 0,yearid,w,lgid,teamid,franchid,divid,rank,g,ghome,l,...,dp,fp,name,park,attendance,bpf,ppf,teamidbr,teamidlahman45,teamidretro
0,1970,108,AL,BAL,BAL,E,1,162,81.0,54,...,148.0,0.981,Baltimore Orioles,Memorial Stadium,1057069.0,101,98,BAL,BAL,BAL
1,1975,108,NL,CIN,CIN,W,1,162,81.0,54,...,173.0,0.984,Cincinnati Reds,Riverfront Stadium,2315603.0,102,99,CIN,CIN,CIN
2,1976,102,NL,CIN,CIN,W,1,162,81.0,60,...,157.0,0.984,Cincinnati Reds,Riverfront Stadium,2629708.0,102,100,CIN,CIN,CIN
3,1978,100,AL,NYA,NYY,E,1,163,81.0,63,...,134.0,0.982,New York Yankees,Yankee Stadium II,2335871.0,97,96,NYY,NYA,NYA
4,1984,104,AL,DET,DET,E,1,162,82.0,58,...,162.0,0.98,Detroit Tigers,Tiger Stadium,2704794.0,100,98,DET,DET,DET
5,1986,108,NL,NYN,NYM,E,1,162,81.0,54,...,145.0,0.978,New York Mets,Shea Stadium,2767601.0,98,96,NYM,NYN,NYN
6,1989,99,AL,OAK,OAK,W,1,162,81.0,63,...,159.0,0.979,Oakland Athletics,Oakland Coliseum,2667225.0,97,95,OAK,OAK,OAK
7,1998,114,AL,NYA,NYY,E,1,162,81.0,48,...,146.0,0.984,New York Yankees,Yankee Stadium II,2955193.0,97,95,NYY,NYA,NYA
8,2007,96,AL,BOS,BOS,E,1,162,81.0,66,...,145.0,0.986,Boston Red Sox,Fenway Park II,2970755.0,106,105,BOS,BOS,BOS
9,2009,103,AL,NYA,NYY,E,1,162,81.0,59,...,131.0,0.985,New York Yankees,Yankee Stadium III,3719358.0,105,103,NYY,NYA,NYA


#### Percentage of the time most wins also are world series winners : 26.09%

In [25]:
#et row counts of most_wins_also_are_worldseries/ row counts of is_worldwin_teams_1970_2016 (to get duration of years 1970-2016)
#total_rows = len(df)

pct_mostwins_world_series = len(most_wins_also_are_worldseries) / len(is_worldwin_teams_1970_2016)
pct_mostwins_world_series

0.2608695652173913

#### Which managers have won the TSN Manager of the Year award in both the National League (NL) and the American League (AL)? 
#### Provide managers full name and the teams that they were managing when they won the award.

In [30]:
#Import awardsmanagers table from SQL and subset awardid ='TSN Manager of the Year', playerid, lgid in 'NL' and 'AL'

#import awardsmanagers table
awardsmanagers_df = pd.read_sql("SELECT * FROM awardsmanagers;", con=engine)
awardsmanagers_df.columns

# an example of code meeting with multiple conditions: df1 = df[(df.a != -1) & (df.b != -1)]
tsn_managers_both_al_nl = awardsmanagers_df[(awardsmanagers_df.awardid == 'TSN Manager of the Year') & ((awardsmanagers_df.lgid == 'NL') | (awardsmanagers_df.lgid == 'AL'))]
tsn_managers_both_al_nl

#find playerid has lgid in NL or AL. Then find the yearid, manager full name as well as team names.

Unnamed: 0,playerid,awardid,yearid,lgid,tie,notes
113,lanieha01,TSN Manager of the Year,1986,NL,,
114,mcnamjo99,TSN Manager of the Year,1986,AL,,
115,andersp01,TSN Manager of the Year,1987,AL,,
116,rodgebu01,TSN Manager of the Year,1987,NL,,
117,larusto01,TSN Manager of the Year,1988,AL,,
118,leylaji99,TSN Manager of the Year,1988,NL,,
119,robinfr02,TSN Manager of the Year,1989,AL,,
120,zimmedo01,TSN Manager of the Year,1989,NL,,
121,leylaji99,TSN Manager of the Year,1990,NL,,
122,torboje01,TSN Manager of the Year,1990,AL,,
