In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Season schedule/results page to scrape (Houston Astros, 2023).
url = 'https://www.baseball-reference.com/teams/HOU/2023-schedule-scores.shtml'

# Fetch the HTML. The timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() reports HTTP errors (e.g. 403/429) here
# rather than as a confusing parsing failure further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the schedule table and fail with a clear message if it is absent.
table = soup.find('table', id='team_schedule')
if table is None:
    raise ValueError(f"No table with id 'team_schedule' found at {url}")
rows = table.find_all('tr')
if not rows:
    raise ValueError(f"Table 'team_schedule' at {url} contains no rows")

# Column names are the text of the <th> cells in the first table row.
header = [th.get_text(strip=True) for th in rows[0].find_all('th')]

data = []
for row in rows[1:]:
    # Collect both <td> and <th> cells so the row lines up with the header.
    rowData = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]

    # Keep only rows whose first cell is a number; rows with no cells at all
    # are skipped too (indexing rowData[0] on them would raise IndexError).
    if not rowData or not rowData[0].isdigit():
        continue

    if len(rowData) != len(header):  # Check for discrepancies
        print(f"Row discrepancy: Expected {len(header)} columns but got {len(rowData)}.")
        # Pads short rows with empty strings; a row longer than the header is
        # left unchanged (the multiplier is negative, so nothing is added).
        rowData.extend([''] * (len(header) - len(rowData)))

    data.append(rowData)

# Construct the DataFrame
df = pd.DataFrame(data, columns=header)

# Drop the columns found at positions 2 and 21.
# NOTE(review): DataFrame.drop removes by *label*, so if either of these
# columns shares its header text with another column (e.g. several blank
# headers), every column carrying that label is removed — confirm intended.
df = df.drop(df.columns[[2, 21]], axis=1)

# To save this data to CSV
#df.to_csv('astros_schedule_2023.csv', index=False)


In [4]:
# Change variable names

from bs4 import BeautifulSoup
import requests
import pandas as pd

def scrape_baseball_schedule(url, timeout=30):
    """Scrape the 'team_schedule' table from a schedule page into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the page containing a ``<table id="team_schedule">``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per table row whose first cell is numeric. Columns are named
        after the header cells of the table's first row, with the columns at
        positions 2 and 21 removed. All values are strings.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page has no 'team_schedule' table or the table has no rows.
    """
    # Fetch the HTML; fail fast on stalled connections and HTTP error codes.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    table = soup.find('table', id='team_schedule')
    if table is None:
        raise ValueError(f"No table with id 'team_schedule' found at {url}")
    rows = table.find_all('tr')
    if not rows:
        raise ValueError(f"Table 'team_schedule' at {url} contains no rows")

    # Column names are the text of the <th> cells in the first table row.
    header = [th.get_text(strip=True) for th in rows[0].find_all('th')]

    games = []
    for row in rows[1:]:
        # Collect both <td> and <th> cells so the row lines up with the header.
        row_data = [cell.get_text(strip=True) for cell in row.find_all(['td', 'th'])]

        # Keep only rows whose first cell is a number; rows with no cells at
        # all are skipped too (row_data[0] would raise IndexError on them).
        if not row_data or not row_data[0].isdigit():
            continue

        if len(row_data) != len(header):  # Check for discrepancies
            # Uses the local row_data; the previous code referenced an
            # undefined name (rowData) here and raised NameError.
            print(f"Row discrepancy: Expected {len(header)} columns but got {len(row_data)}.")
            # Pads short rows with empty strings; longer rows are left as-is.
            row_data.extend([''] * (len(header) - len(row_data)))

        games.append(row_data)

    # Construct the DataFrame
    df = pd.DataFrame(games, columns=header)

    # Drop the columns found at positions 2 and 21.
    # NOTE(review): DataFrame.drop removes by *label*, so columns that share
    # the header text of either position are removed as well — confirm intended.
    df = df.drop(df.columns[[2, 21]], axis=1)

    return df

In [None]:
# Write the scraped schedule to disk without the index column.
# `df` is the module-level frame built by the first cell; the `df` inside
# scrape_baseball_schedule is local to that function.
# NOTE(review): the last cell writes `astros` to this same 'astros.csv' path,
# so this file is overwritten later in the notebook.
df.to_csv('astros.csv', index=False)

In [None]:
# `astros` is a second name for `df`, not a copy: the column assignment below
# is visible through both names.
astros = df

# Normalise 'Attendance' to a nullable integer column in one pass:
#   1. force every value to text,
#   2. strip the thousands separators,
#   3. turn empty strings into pd.NA,
#   4. parse as numbers (anything unparseable becomes missing),
#   5. store as pandas' nullable Int64 so missing values survive.
astros['Attendance'] = pd.to_numeric(
    astros['Attendance'].astype(str).str.replace(',', '').replace('', pd.NA),
    errors='coerce',
).astype('Int64')


In [None]:
# Tally how many rows carry each distinct 'W/L' value.
astros.value_counts(subset='W/L')

W/L
W       590
L       360
L-wo     41
W-wo     41
dtype: int64

In [None]:
# Shorten the '-wo' labels: 'W-wo' becomes 'W*' and 'L-wo' becomes 'L*'.
# The two lists are matched position by position; other values are untouched.
astros['W/L'] = astros['W/L'].replace(['W-wo', 'L-wo'], ['W*', 'L*'])

In [None]:
# Parse "Inn" as a number; entries that cannot be parsed (including blanks)
# become NaN and are then filled with 9.
astros['Inn'] = pd.to_numeric(astros['Inn'], errors='coerce').fillna(9)


In [None]:
# Swap the literal text 'up' in 'GB' for a '+' sign. regex=False makes the
# plain-substring match explicit, so the result does not depend on the pandas
# version's default for `regex`.
astros['GB'] = astros['GB'].str.replace('up', '+', regex=False)
astros

Unnamed: 0,Gm#,Date,Tm,Opp,W/L,R,RA,Inn,W-L,Rank,GB,Win,Loss,Save,Time,D/N,Attendance,cLI,Streak,Season
0,1,"Thursday, Mar 30",HOU,CHW,L,2,3,9.0,0-1,4,1.0,Graveman,Pressly,López,2:38,N,43032,.92,-,2023
1,2,"Friday, Mar 31",HOU,CHW,W,6,3,9.0,1-1,3,0.5,Martinez,Graveman,Montero,2:40,N,41453,.92,+,2023
2,3,"Saturday, Apr 1",HOU,CHW,W,6,4,9.0,2-1,2,0.5,Stanek,Kelly,Neris,3:11,D,37519,.93,++,2023
3,4,"Sunday, Apr 2",HOU,CHW,L,3,6,9.0,2-2,3,1.5,Clevinger,Garcia,,3:06,D,42835,1.00,-,2023
4,5,"Monday, Apr 3",HOU,DET,L,6,7,11.0,2-3,3,1.5,Wingenter,Neris,Hill,3:21,N,29272,.93,--,2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,158,"Wednesday, Sep 27",HOU,TEX,W,12,2,9.0,98-60,1,+20.0,Verlander,Martinez,,3:17,D,26053,.18,+++,2017
1028,159,"Thursday, Sep 28",HOU,BOS,W,12,2,9.0,99-60,1,+21.0,Peacock,Rodriguez,,3:31,N,34222,.10,++++,2017
1029,160,"Friday, Sep 29",HOU,BOS,W,3,2,9.0,100-60,1,+21.0,Morton,Fister,Giles,2:47,N,36623,.07,+++++,2017
1030,161,"Saturday, Sep 30",HOU,BOS,L,3,6,9.0,100-61,1,+21.0,Pomeranz,McCullers,,3:19,D,35722,.14,-,2017


In [None]:
# Convert 'Time' ("H:MM") to total minutes.
# The parsing is vectorised and tolerant of missing, blank or malformed
# values: those become <NA> instead of raising. The earlier per-row lambda
# called .split()/int() unconditionally and so failed on NaN or ''.
# The regex captures the hour and minute digits; rows that do not match yield
# NaN in both captured columns. Int64 is the nullable integer dtype already
# used for 'Attendance'.
astros['Time_minutes'] = (
    astros['Time']
    .astype(str)
    .str.extract(r'^\s*(\d+)\s*:\s*(\d+)')
    .astype(float)
    .pipe(lambda hm: hm[0] * 60 + hm[1])
    .astype('Int64')
)
astros

Unnamed: 0,Gm#,Date,Tm,Opp,W/L,R,RA,Inn,W-L,Rank,...,Win,Loss,Save,Time,D/N,Attendance,cLI,Streak,Season,Time_minutes
0,1,"Thursday, Mar 30",HOU,CHW,L,2,3,9.0,0-1,4,...,Graveman,Pressly,López,2:38,N,43032,.92,-,2023,158
1,2,"Friday, Mar 31",HOU,CHW,W,6,3,9.0,1-1,3,...,Martinez,Graveman,Montero,2:40,N,41453,.92,+,2023,160
2,3,"Saturday, Apr 1",HOU,CHW,W,6,4,9.0,2-1,2,...,Stanek,Kelly,Neris,3:11,D,37519,.93,++,2023,191
3,4,"Sunday, Apr 2",HOU,CHW,L,3,6,9.0,2-2,3,...,Clevinger,Garcia,,3:06,D,42835,1.00,-,2023,186
4,5,"Monday, Apr 3",HOU,DET,L,6,7,11.0,2-3,3,...,Wingenter,Neris,Hill,3:21,N,29272,.93,--,2023,201
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1027,158,"Wednesday, Sep 27",HOU,TEX,W,12,2,9.0,98-60,1,...,Verlander,Martinez,,3:17,D,26053,.18,+++,2017,197
1028,159,"Thursday, Sep 28",HOU,BOS,W,12,2,9.0,99-60,1,...,Peacock,Rodriguez,,3:31,N,34222,.10,++++,2017,211
1029,160,"Friday, Sep 29",HOU,BOS,W,3,2,9.0,100-60,1,...,Morton,Fister,Giles,2:47,N,36623,.07,+++++,2017,167
1030,161,"Saturday, Sep 30",HOU,BOS,L,3,6,9.0,100-61,1,...,Pomeranz,McCullers,,3:19,D,35722,.14,-,2017,199


In [None]:
# Write the cleaned frame to 'astros.csv' without the index column.
# NOTE(review): an earlier cell writes `df` to this same path, so this call
# replaces that file's contents.
astros.to_csv('astros.csv', index=False)