In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [None]:
# Download the Baseball-Reference postseason index page.
# A timeout keeps the cell from hanging forever if the server never answers, and
# raise_for_status() stops here on an HTTP error (e.g. rate limiting) instead of
# letting an error page be parsed as if it were the postseason table.
postseason_Request = requests.get('https://www.baseball-reference.com/postseason/', timeout=30)
postseason_Request.raise_for_status()

In [None]:
# Parse the downloaded HTML. The parser is named explicitly so the result does not
# depend on which optional parsers happen to be installed (and to avoid bs4's
# "guessed at parser" warning).
postseason_soup = BeautifulSoup(postseason_Request.text, 'html.parser')
# Every link inside the element with id "postseason_series"; the following cell
# consumes these links three at a time (series name, winner, loser).
postseason_data = postseason_soup.select('#postseason_series a')

In [None]:
# Build the Post_Teams DataFrame: one row per playoff series with the columns
# ['Year', 'Winner', 'Loser'].
index_names = ['Year', 'Winner', 'Loser']
links_per_series = len(index_names)  # each series contributes 3 consecutive links
# Number of playoff series between 2000 and 2021 when this notebook was written.
# NOTE(review): the scraped output lists the newest season first, so this count
# presumably needs updating once later seasons appear on the page -- confirm.
n_series = 180

series_links = postseason_data[:n_series * links_per_series]
if len(series_links) < n_series * links_per_series:
    # Fail loudly rather than build a short or misaligned table from a partial scrape.
    raise ValueError(
        f'expected at least {n_series * links_per_series} links in postseason_data, '
        f'found {len(series_links)}'
    )

# Collect the link texts and cut them into consecutive groups of three. The frame is
# constructed once from all rows: DataFrame.append was removed in pandas 2.0, and
# growing a frame one row at a time copies it on every iteration.
link_texts = [link.text for link in series_links]
rows = [
    link_texts[start:start + links_per_series]
    for start in range(0, len(link_texts), links_per_series)
]
Post_Teams = pd.DataFrame(rows, columns=index_names)
Post_Teams # the assembled table

Unnamed: 0,Year,Winner,Loser
0,2021 World Series,"Atlanta Braves (88-73, NL)","Houston Astros (95-67, AL)"
1,2021 ALCS,"Houston Astros (95-67, AL)","Boston Red Sox* (92-70, AL)"
2,2021 NLCS,"Atlanta Braves (88-73, NL)","Los Angeles Dodgers* (106-56, NL)"
3,2021 ALDS1,"Houston Astros (95-67, AL)","Chicago White Sox (93-69, AL)"
4,2021 ALDS2,"Boston Red Sox* (92-70, AL)","Tampa Bay Rays (100-62, AL)"
...,...,...,...
175,2000 NLCS,"New York Mets* (94-68, NL)","St. Louis Cardinals (95-67, NL)"
176,2000 ALDS1,"New York Yankees (87-74, AL)","Oakland Athletics (91-70, AL)"
177,2000 ALDS2,"Seattle Mariners* (91-71, AL)","Chicago White Sox (95-67, AL)"
178,2000 NLDS1,"St. Louis Cardinals (95-67, NL)","Atlanta Braves (95-67, NL)"


In [None]:
# Use regular expressions to reduce each column to just the value we want.
# The patterns are raw strings so that \d, \w, \s and \( reach the regex engine
# without Python treating them as (invalid) string escape sequences.
# expand=False makes str.extract return a Series instead of a one-column DataFrame.

# Year: pull the four-digit year out of labels such as "2021 World Series" and store it as an integer.
Post_Teams['Year'] = Post_Teams['Year'].str.extract(r'(\d{4})+', expand=False).astype('int')

# Team name: word characters/spaces, an optional ".", more word characters/spaces,
# followed by an optional "*" and then " (" which starts the record;
# e.g. "Boston Red Sox* (92-70, AL)" -> "Boston Red Sox".
team_name_pattern = r'([\w\s]+\.?[\w\s]+)\*? \('
Post_Teams['Winner'] = Post_Teams['Winner'].str.extract(team_name_pattern, expand=False)
Post_Teams['Loser'] = Post_Teams['Loser'].str.extract(team_name_pattern, expand=False)
Post_Teams # DataFrame with cleaned columns

Unnamed: 0,Year,Winner,Loser
0,2021,Atlanta Braves,Houston Astros
1,2021,Houston Astros,Boston Red Sox
2,2021,Atlanta Braves,Los Angeles Dodgers
3,2021,Houston Astros,Chicago White Sox
4,2021,Boston Red Sox,Tampa Bay Rays
...,...,...,...
175,2000,New York Mets,St. Louis Cardinals
176,2000,New York Yankees,Oakland Athletics
177,2000,Seattle Mariners,Chicago White Sox
178,2000,St. Louis Cardinals,Atlanta Braves


In [None]:
# Example query: list every team that played in the postseason of a given season.
# Set `year` to the season you are interested in.
year = 2021
# Keep only the series played in that season
Post_Teams_from_year = Post_Teams[Post_Teams['Year'] == year]
# Stack the winners on top of the losers, then drop repeated team names
winners_then_losers = pd.concat([Post_Teams_from_year['Winner'], Post_Teams_from_year['Loser']])
unique_Post_Teams_from_year = list(winners_then_losers.unique())
unique_Post_Teams_from_year

['Atlanta Braves',
 'Houston Astros',
 'Boston Red Sox',
 'Los Angeles Dodgers',
 'Chicago White Sox',
 'Tampa Bay Rays',
 'Milwaukee Brewers',
 'San Francisco Giants',
 'New York Yankees',
 'St. Louis Cardinals']

In [None]:
# Example: flag whether each team of interest appears in the postseason list
# computed above for the selected year (1 = appears, 0 = does not).
teams = ['Los Angeles Dodgers', 'Los Angeles Angels']
made_playoffs = []
for club in teams:
    made_playoffs.append(club in unique_Post_Teams_from_year)
playoff_status = np.array(made_playoffs).astype('int')  # True/False -> 1/0
playoff_columns = {'Tm': teams, 'Playoff': playoff_status}
SmallPlayoffDF = pd.DataFrame(playoff_columns)
SmallPlayoffDF

Unnamed: 0,Tm,Playoff
0,Los Angeles Dodgers,1
1,Los Angeles Angels,0
