In [9]:
import os
import pandas as pd
from bs4 import BeautifulSoup

In [None]:
HTML_DIR = "data/scores"
OUTPUT_CSV = "data/line_scores.csv"

# Output column -> value of the `data-stat` attribute on the matching <td>.
# NOTE(review): only periods 1-4 and the final total are read; if the table
# also carries overtime cells they are ignored, so `total` may exceed
# Q1 + Q2 + Q3 + Q4 for some games -- confirm this is intended.
STAT_COLUMNS = {"Q1": "1", "Q2": "2", "Q3": "3", "Q4": "4", "total": "T"}


def _cell_int(tr, stat_name):
    """Return the integer held by the row's ``<td data-stat=stat_name>``.

    Args:
        tr: A BeautifulSoup ``<tr>`` tag.
        stat_name: Value of the ``data-stat`` attribute to look up.

    Returns:
        The cell content as ``int``, or ``None`` when the cell is missing or
        contains no text (a blank cell used to abort the run with ValueError).

    Raises:
        ValueError: If the cell holds non-empty text that is not an integer.
    """
    cell = tr.find("td", {"data-stat": stat_name})
    if cell is None:
        return None
    text = cell.get_text().strip()
    return int(text) if text else None


def parse_line_score(soup, game_id):
    """Extract one record per team row from a page's ``line_score`` table.

    Args:
        soup: Parsed BeautifulSoup document of one saved page.
        game_id: Identifier stored in every returned record.

    Returns:
        A list of dicts with keys ``game_id``, ``team`` and the keys of
        ``STAT_COLUMNS``. The list is empty when the page has no
        ``<table id="line_score">`` or that table has no ``<tbody>``.
    """
    table = soup.find("table", id="line_score")
    if table is None:
        return []

    tbody = table.find("tbody")
    if tbody is None:
        # html.parser does not synthesise a <tbody>; treat it like a missing
        # table instead of crashing on `None.find_all`.
        return []

    records = []
    for tr in tbody.find_all("tr"):
        team_cell = tr.find("th", {"data-stat": "team"})
        if team_cell is None:
            # Not a team row (e.g. a spacer/header row): nothing to record.
            continue

        record = {"game_id": game_id, "team": team_cell.get_text().strip()}
        for column, stat_name in STAT_COLUMNS.items():
            record[column] = _cell_int(tr, stat_name)
        records.append(record)
    return records


rows = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the resulting CSV reproducible between runs and machines.
for filename in sorted(os.listdir(HTML_DIR)):
    # splitext strips only the trailing extension, unlike str.replace which
    # would also remove ".html" occurring elsewhere in the name.
    game_id, extension = os.path.splitext(filename)
    if extension != ".html":
        continue

    file_path = os.path.join(HTML_DIR, filename)
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    rows.extend(parse_line_score(soup, game_id))


In [8]:
# Turn the scraped records into a DataFrame, report how many rows were
# collected, then persist them to OUTPUT_CSV without the index column.
df = pd.DataFrame(data=rows)
n_rows = df.shape[0]
print("Filas extraídas:", n_rows)

df.to_csv(path_or_buf=OUTPUT_CSV, index=False)
print("CSV creado:", OUTPUT_CSV)

Filas extraídas: 8854
CSV creado: data/line_scores.csv


In [13]:
# Reload the line scores from the CSV on disk. The path is spelled out here
# so this cell can run without re-executing the scraping cell first.
line_scores_csv = "data/line_scores.csv"
df = pd.read_csv(line_scores_csv)

In [34]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns, one row of `df` per team per game.
df.describe()

Unnamed: 0,Q1,Q2,Q3,Q4,total
count,8854.0,8854.0,8854.0,8854.0,8854.0
mean,28.675288,28.513553,28.809013,27.46815,114.119155
std,6.030547,6.001787,6.071694,5.970318,12.627639
min,8.0,8.0,8.0,8.0,67.0
25%,25.0,24.0,25.0,23.0,105.0
50%,29.0,28.0,29.0,27.0,114.0
75%,33.0,33.0,33.0,32.0,123.0
max,55.0,51.0,52.0,50.0,176.0


In [35]:
# Minimum, mean and maximum of every quarter score and of the total, per team.
score_cols = ["Q1", "Q2", "Q3", "Q4", "total"]
team_scores = df.groupby("team")[score_cols]
team_scores.agg(["min", "mean", "max"])

Unnamed: 0_level_0,Q1,Q1,Q1,Q2,Q2,Q2,Q3,Q3,Q3,Q4,Q4,Q4,total,total,total
Unnamed: 0_level_1,min,mean,max,min,mean,max,min,mean,max,min,mean,max,min,mean,max
team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
ATL,16,29.709343,45,16,30.065744,49,8,29.404844,43,14,28.197232,50,89,118.200692,152
BOS,15,30.404908,46,14,29.779141,51,11,28.766871,45,12,26.920245,46,76,116.763804,155
BRK,11,27.541219,47,11,27.738351,45,12,27.139785,44,12,26.55914,43,67,109.379928,147
CHI,14,27.943262,40,11,28.141844,46,16,29.535461,44,14,28.035461,44,89,114.673759,152
CHO,12,26.559567,51,10,26.711191,44,9,27.433213,45,10,27.267148,44,82,108.444043,145
CLE,14,29.629508,50,12,28.304918,44,10,28.816393,45,12,27.727869,47,79,115.108197,149
DAL,14,28.933993,47,8,28.607261,47,14,28.752475,51,11,27.610561,43,86,114.508251,148
DEN,8,30.139319,49,15,29.095975,43,14,29.811146,47,9,26.950464,43,70,116.743034,149
DET,10,28.265018,47,15,28.042403,46,13,28.64311,45,14,27.113074,43,81,112.614841,148
GSW,10,28.401316,55,11,28.717105,50,15,30.016447,48,13,28.164474,43,83,115.835526,157


In [25]:
# Descriptive statistics of the score columns computed within each game.
described_cols = ["Q1", "Q2", "Q3", "Q4", "total"]
rows_by_game = df.groupby("game_id")
stats = rows_by_game[described_cols].describe()


In [24]:
# One row per game: each score column summed over all team rows of that game,
# with `game_id` restored as a regular column.
summed_cols = ["Q1", "Q2", "Q3", "Q4", "total"]
per_game_totals = df.groupby("game_id")[summed_cols].sum()
games = per_game_totals.reset_index()


In [29]:
# Games ordered from the lowest to the highest combined first-quarter score
# (ascending order is the sort_values default).
games.sort_values(by="Q1")

Unnamed: 0,game_id,Q1,Q2,Q3,Q4,total
574,202301040TOR,25,52,45,72,205
3931,202505110DEN,25,53,54,47,179
565,202301040CLE,29,47,47,55,178
2278,202403100NYK,30,38,51,33,152
4301,202512060CLE,30,51,53,59,193
...,...,...,...,...,...,...
3822,202504080OKC,86,66,43,61,256
4085,202511070MIA,86,55,41,52,234
1224,202304090NYK,86,58,63,70,277
3466,202502210HOU,86,49,56,45,236


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) of the per-game
# sums held in `games`.
games.describe()

Unnamed: 0,Q1,Q2,Q3,Q4,total
count,4427.0,4427.0,4427.0,4427.0,4427.0
mean,57.350576,57.027106,57.618026,54.9363,228.23831
std,8.461202,8.552656,8.493546,8.972923,20.097591
min,25.0,28.0,27.0,24.0,152.0
25%,52.0,51.0,52.0,49.0,215.0
50%,57.0,57.0,57.0,55.0,228.0
75%,63.0,63.0,63.0,61.0,241.0
max,89.0,86.0,88.0,90.0,351.0
