In [1]:
!pip install pybaseball

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pybaseball as pb 

In [3]:
# load season record
season_record = pb.standings(2020)
season_record

[]

In [None]:
# load season record
season_record = pb.standings(2001)

# convert list to array
sr_array = np.array(season_record)

# re-shape to be used as a 2-D input for pd
sr = sr_array.reshape(30, -1)
# convert array to dataframe
sr_df = pd.DataFrame(sr)
sr_df

ValueError: ignored

In [None]:
season_record = pb.standings(2007)
season_record

[]

In [None]:
sr_cols = ["Team", "Win", "Loss", "Win %"]
season_record = season_record.drop(4, axis = 1)
season_record = season_record.rename(columns={0:"Team", 1:"Win", 2:"Loss", 3:"Win %"})
season_record

AttributeError: ignored

In [None]:
# create a dictionary of team names and abbreviations
teams = {
    'Arizona Diamondbacks': 'ARI',
    'Atlanta Braves': 'ATL',
    'Baltimore Orioles': 'BAL',
    'Boston Red Sox': 'BOS',
    'Chicago Cubs': 'CHC',
    'Chicago White Sox': 'CHW',
    'Cincinnati Reds': 'CIN',
    'Cleveland Indians': 'CLE',
    'Colorado Rockies': 'COL',
    'Detroit Tigers': 'DET',
    'Houston Astros': 'HOU',
    'Kansas City Royals': 'KCR',
    'Los Angeles Angels': 'LAA',
    'Los Angeles Dodgers': 'LAD',
    'Miami Marlins': 'MIA',
    'Milwaukee Brewers': 'MIL',
    'Minnesota Twins': 'MIN',
    'New York Mets': 'NYM',
    'New York Yankees': 'NYY',
    'Oakland Athletics': 'OAK',
    'Philadelphia Phillies': 'PHI',
    'Pittsburgh Pirates': 'PIT',
    'San Diego Padres': 'SDP',
    'Seattle Mariners': 'SEA',
    'San Francisco Giants': 'SFG',
    'St. Louis Cardinals': 'STL',
    'Tampa Bay Rays': 'TBR',
    'Texas Rangers': 'TEX',
    'Toronto Blue Jays': 'TOR',
    'Washington Nationals': 'WSN'
}

# replace team names with abbreviations to match rest of dataset
sr_df["Team"] = sr_df["Team"].replace(teams)

sr_df

Unnamed: 0,Team,Win,Loss,Win %
0,TBR,100,62,0.617
1,BOS,92,70,0.568
2,NYY,92,70,0.568
3,TOR,91,71,0.562
4,BAL,52,110,0.321
5,CHW,93,69,0.574
6,CLE,80,82,0.494
7,DET,77,85,0.475
8,KCR,74,88,0.457
9,MIN,73,89,0.451


In [None]:
# sort by team abbrv
sr_df_sorted = sr_df.sort_values("Team")
sr_df_sorted

Unnamed: 0,Team,Win,Loss,Win %
29,ARI,52,110,0.321
15,ATL,88,73,0.547
4,BAL,52,110,0.321
1,BOS,92,70,0.568
23,CHC,71,91,0.438
5,CHW,93,69,0.574
22,CIN,83,79,0.512
6,CLE,80,82,0.494
28,COL,74,87,0.46
7,DET,77,85,0.475


In [None]:
# Prepare Data

years = range(2000, 2023)

teams_new = {
    'Diamondbacks': 'ARI',
    'Braves': 'ATL',
    'Orioles': 'BAL',
    'Red Sox': 'BOS',
    'Cubs': 'CHC',
    'White Sox': 'CHW',
    'Reds': 'CIN',
    'Cleveland': 'CLE',
    'Indians': 'CLE',
    'Guardians': 'CLE',
    'Rockies': 'COL',
    'Tigers': 'DET',
    'Astros': 'HOU',
    'Royals': 'KCR',
    'Angels': 'LAA',
    'Dodgers': 'LAD',
    'Marlins': 'MIA',
    'Brewers': 'MIL',
    'Twins': 'MIN',
    'Mets': 'NYM',
    'Yankees': 'NYY',
    'Athletics': 'OAK',
    'Phillies': 'PHI',
    'Pirates': 'PIT',
    'Padres': 'SDP',
    'Mariners': 'SEA',
    'Giants': 'SFG',
    'Cardinals': 'STL',
    'Rays': 'TBR',
    'Rangers': 'TEX',
    'Blue Jays': 'TOR',
    'Nationals': 'WSN'
}

batting_columns = ['Season', 'Team', 'R', 'AB', 'H', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF']
pitching_columns = ['Season', 'Team', 'ER', 'ERA', 'CG', 'ShO', 'SV', 'H', 'HR', 'BB', 'SO']
fielding_columns = ['Season', 'Team', 'E', 'DP', 'FP']

season_batting = pd.DataFrame()
season_pitching = pd.DataFrame()
season_fielding = pd.DataFrame()

for year in years:
  # load batting stats
  season_batting_year = pb.team_batting(year)
  # chosen batting stats
  season_batting_year = season_batting_year[batting_columns]
  season_batting_year = season_batting_year.sort_values("Team")
  # concatenate dataframes
  season_batting = pd.concat([season_batting, season_batting_year], ignore_index=True)

  # load batting stats
  season_pitching_year = pb.team_pitching(year)
  # chosen batting stats
  season_pitching_year = season_pitching_year[pitching_columns]
  season_pitching_year = season_pitching_year.sort_values("Team")
  # concatenate dataframes
  season_pitching = pd.concat([season_pitching, season_pitching_year], ignore_index=True)

  # load fielding stats 
  season_fielding_year = pb.team_fielding(year)
  # chosen fielding stats
  season_fielding_year = season_fielding_year[fielding_columns]
  season_fielding_year["Team"] = season_fielding_year["Team"].replace(teams_new)
  season_fielding_year = season_fielding_year.sort_values("Team")
  # concatenate dataframes
  season_fielding = pd.concat([season_fielding, season_fielding_year], ignore_index=True)

In [None]:
# Merge season_batting and season_pitching dataframes
final_stats = pd.merge(season_batting, season_pitching, on=["Season", "Team"])

# Merge season_fielding dataframe with merged_df
final_stats = pd.merge(final_stats, season_fielding, on=["Season", "Team"])

final_stats

Unnamed: 0,Season,Team,R,AB,H_x,HR_x,BB_x,SO_x,SB,CS,...,CG,ShO,SV,H_y,HR_y,BB_y,SO_y,E,DP,FP
0,2000,ARI,792,5527,1466,179,535,975,97,44,...,16,4,38,1441,190,500,1220,107,370,0.982
1,2000,ATL,810,5489,1490,179,595,1010,148,56,...,13,6,53,1428,165,484,1093,129,373,0.979
2,2000,BAL,794,5549,1508,184,558,900,126,65,...,14,2,33,1547,202,665,1017,116,403,0.981
3,2000,BOS,792,5630,1503,167,611,1019,43,30,...,7,4,46,1433,173,498,1121,109,320,0.982
4,2000,CHC,764,5577,1426,183,632,1120,93,37,...,10,1,39,1505,231,658,1143,100,366,0.983
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
655,2022,STL,772,5496,1386,197,537,1226,95,25,...,3,1,37,1335,146,489,1177,66,449,0.989
656,2022,TBR,666,5412,1294,139,500,1395,95,37,...,0,0,44,1260,172,384,1384,84,281,0.985
657,2022,TEX,707,5478,1308,198,456,1446,128,41,...,1,1,37,1345,169,581,1314,96,344,0.984
658,2022,TOR,775,5555,1464,200,500,1242,67,35,...,0,0,46,1356,180,424,1390,82,270,0.986
