In [183]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re

In [184]:
def scrape_and_format(url, output_csv):
    year_match = re.search(r'(\d{4})-schedule-scores', url)
    if year_match:
        year = int(year_match.group(1))
    else:
        raise ValueError("Year not found in the URL")
    data = requests.get(url)
    win_loss = pd.read_html(data.text, match="Team Game-by-Game Schedule")
    table = win_loss[0]
    table.to_csv(output_csv)
    data1 = pd.read_csv(output_csv, index_col=0)
    data1 = data1[data1['Unnamed: 2'].str.contains('boxscore', na=False)]
    
    data1['Date'] = data1['Date'].astype(str)
    data1['Date'] = data1['Date'].str.extract(r'(\w+, \w+ \d+)')[0] + f' {year}'
    data1['Date'] = pd.to_datetime(data1['Date'], format='%A, %b %d %Y')
    data1.to_csv(output_csv)
    

In [185]:
urls = [
    "https://www.baseball-reference.com/teams/NYY/2024-schedule-scores.shtml",
    "https://www.baseball-reference.com/teams/NYY/2023-schedule-scores.shtml",
    "https://www.baseball-reference.com/teams/NYY/2022-schedule-scores.shtml"
]

In [186]:
output_csvs = [
    "nyy_2024.csv",
    "nyy_2023.csv",
    "nyy_2022.csv"
]

In [187]:
for url, output_csv in zip(urls, output_csvs):
    scrape_and_format(url, output_csv)

  win_loss = pd.read_html(data.text, match="Team Game-by-Game Schedule")
  win_loss = pd.read_html(data.text, match="Team Game-by-Game Schedule")
  win_loss = pd.read_html(data.text, match="Team Game-by-Game Schedule")


In [188]:
data_2022 = pd.read_csv("nyy_2022.csv", index_col=0)

# Read the 2023 and 2024 CSV files without headers (skipping the first row)
data_2023 = pd.read_csv("nyy_2023.csv", index_col=0)
data_2024 = pd.read_csv("nyy_2024.csv", index_col=0)

# Combine the data frames chronologically
combined_data = pd.concat([data_2022, data_2023, data_2024])
combined_data

Unnamed: 0,Gm#,Date,Unnamed: 2,Tm,Unnamed: 4,Opp,W/L,R,RA,Inn,...,GB,Win,Loss,Save,Time,D/N,Attendance,cLI,Streak,Orig. Scheduled
0,1,2022-04-08,boxscore,NYY,,BOS,W-wo,6,5,11.0,...,Tied,King,Crawford,,3:56,D,46097.0,0.98,+,
1,2,2022-04-09,boxscore,NYY,,BOS,W,4,2,,...,Tied,Luetge,Pivetta,Chapman,2:58,D,46882.0,1.01,++,
2,3,2022-04-10,boxscore,NYY,,BOS,L,3,4,,...,1.0,Crawford,Schmidt,Diekman,3:40,N,40108.0,1.02,-,
3,4,2022-04-11,boxscore,NYY,,TOR,L,0,3,,...,1.0,Manoah,Taillon,Romano,3:03,N,26211.0,1.04,--,
4,5,2022-04-12,boxscore,NYY,,TOR,W,4,0,,...,1.0,Holmes,Kikuchi,,3:07,N,25068.0,0.99,+,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,94,2024-07-10,boxscore,NYY,@,TBR,W,2,1,,...,2.0,Hill,Eflin,Holmes,2:49,N,19246.0,1.21,+,
98,95,2024-07-11,boxscore,NYY,@,TBR,L,4,5,,...,2.0,Kelly,CortÃ©s,Fairbanks,2:57,N,23438.0,1.18,-,
99,96,2024-07-12,boxscore,NYY,@,BAL,W,4,1,,...,1.0,Cole,Povich,Holmes,3:02,N,39566.0,1.75,+,
100,97,2024-07-13,boxscore,NYY,@,BAL,W,6,1,,...,Tied,Gil,Rodriguez,,2:35,D,44018.0,1.79,++,


In [189]:
combined_data['home_away'] = combined_data['Unnamed: 4'].astype('category').cat.codes
combined_data['Date'] = pd.to_datetime(combined_data['Date'], format='%Y-%m-%d')
combined_data['opposition'] = combined_data['Opp'].astype('category').cat.codes
combined_data['day or night'] = data1['D/N'].astype('category').cat.codes
combined_data['target'] = combined_data['W/L'].isin(['W', 'W-wo']).astype("int")

In [190]:
combined_data.dtypes

Gm#                         int64
Date               datetime64[ns]
Unnamed: 2                 object
Tm                         object
Unnamed: 4                 object
Opp                        object
W/L                        object
R                           int64
RA                          int64
Inn                       float64
W-L                        object
Rank                        int64
GB                         object
Win                        object
Loss                       object
Save                       object
Time                       object
D/N                        object
Attendance                float64
cLI                       float64
Streak                     object
Orig. Scheduled            object
home_away                    int8
opposition                   int8
day or night              float64
target                      int64
dtype: object

In [191]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)
training_data = combined_data[combined_data["Date"] < '2024-3-28']
testing_data = combined_data[combined_data["Date"] > '2024-3-28']
predictors = ['home_away', 'opposition', 'day or night', 'R', 'RA', 'Inn', 'Attendance']

In [192]:
rf.fit(training_data[predictors], training_data['target'])

In [193]:
 preds = rf.predict(testing_data[predictors])

In [194]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(testing_data['target'], preds)
acc

0.9587628865979382

In [195]:
combined = pd.DataFrame(dict(actual=testing_data['target'], predicted=preds))

In [196]:
pd.crosstab(index=combined["actual"], columns=combined["predicted"])

predicted,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,37,3
1,1,56


In [197]:
from sklearn.metrics import precision_score

In [198]:
precision_score(testing_data["target"], preds)

np.float64(0.9491525423728814)

In [199]:
combined

Unnamed: 0,actual,predicted
1,1,1
2,1,1
3,1,1
5,1,1
6,0,0
...,...,...
97,1,1
98,0,0
99,1,1
100,1,1


In [200]:
combined = combined.merge(combined_data[['Opp', 'Date','W/L']],left_index = True, right_index = True)

In [201]:
combined

Unnamed: 0,actual,predicted,Opp,Date,W/L
1,1,1,BOS,2022-04-09,W
1,1,1,HOU,2024-03-29,W
2,1,1,BOS,2022-04-10,L
2,1,1,SFG,2023-04-01,L
2,1,1,HOU,2024-03-30,W
...,...,...,...,...,...
100,1,1,LAA,2023-07-19,L
100,1,1,BAL,2024-07-13,W
101,0,0,NYM,2022-07-27,L-wo
101,0,0,KCR,2023-07-21,W


In [182]:
combined_data.to_csv('combined_data.csv')