In [11]:
import praw
from datetime import datetime
import os
from dotenv import load_dotenv

In [12]:
load_dotenv()
reddit = praw.Reddit(
    client_id=os.getenv('REDDIT_CLIENT_ID'),
    client_secret=os.getenv('REDDIT_CLIENT_SECRET'),
    user_agent=os.getenv('REDDIT_USER_AGENT', 'nba-sentiment-research'),
    check_for_async=False,
)

print("Read-only:", reddit.read_only)



Read-only: False


In [10]:
subreddits = ['nba', 'nbadiscussion', 'NBAtalk']
start_date = datetime(2024, 5, 1).timestamp()   # Example: May 1st, 2024
end_date   = datetime(2024, 8, 31).timestamp()  # Example: August 31st, 2024
limit = 500


In [None]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pandas as pd
import re
from urllib.parse import urljoin


In [8]:
BASE = "https://www.basketball-reference.com"


def get_signings_for_offseason(year: int):
    """
    year = season end year on basketball-reference.
    Example: 2024 -> https://www.basketball-reference.com/leagues/NBA_2024_transactions.html
    """
    url = f"{BASE}/leagues/NBA_{year}_transactions.html"

    # make it look like a real browser
    session = requests.Session()
    session.headers.update({
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.basketball-reference.com/",
        "Connection": "keep-alive",
    })

    resp = session.get(url)
    resp.raise_for_status()  # will raise if still 403

    soup = BeautifulSoup(resp.text, "html.parser")

    # transactions are usually in the main content under <ul> lists
    all_items = soup.select("div#content li")

    signings = []
    for li in all_items:
        text = li.get_text(" ", strip=True)

        # match lines like "July 5, 2024 – The Lakers signed ..."
        m = re.match(r"([A-Za-z]+ \d{1,2}, \d{4})\s[–-]\s(.*)", text)
        if not m:
            continue

        date_str, desc = m.groups()

        try:
            tx_date = datetime.strptime(date_str, "%B %d, %Y")
        except ValueError:
            continue

        # keep only May–August
        if tx_date.month not in (5, 6, 7, 8):
            continue

        # filter for signings
        desc_l = desc.lower()
        signing_keywords = (
            "signed",
            "re-signed",
            "agreed to a two-way contract",
            "signed to a two-way contract",
            "signed to a multi-year contract",
            "signed a multi-year contract",
        )
        if not any(k in desc_l for k in signing_keywords):
            continue

        signings.append({
            "date": tx_date.date().isoformat(),
            "description": desc,
        })

    return signings


    

In [None]:
year = 2025   # change to the offseason you want
data = get_signings_for_offseason(year)

df = pd.DataFrame(data)
print(df)
# df.to_csv(f"nba_signings_may_aug_{year}.csv", index=False)
print(f"saved {len(df)} signings")

HTTPError: 403 Client Error: Forbidden for url: https://www.basketball-reference.com/leagues/NBA_2023_transactions.html

In [None]:
import requests
import pandas as pd
from datetime import datetime

YEAR = 2025
URL = f"https://www.spotrac.com/nba/free-agents/signed/_/year/{YEAR}/"

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://www.spotrac.com/",
    "Connection": "keep-alive",
}

resp = requests.get(URL, headers=headers)
resp.raise_for_status()

# Spotrac normally has one main table on that page
tables = pd.read_html(resp.text)
df = tables[0]

# ---- DEBUG: see what Spotrac actually named the columns
print(df.columns)

# --------------- OPTIONAL DATE FILTER ----------------
# I don't know for sure what they call the date column on your side
# so let's try a few common possibilities:
possible_date_cols = ["Signed", "Updated", "Date", "Contract Date"]

date_col = None
for c in df.columns:
    if c in possible_date_cols:
        date_col = c
        break

if date_col:
    # keep only May–August
    def in_offseason(x):
        try:
            d = datetime.strptime(str(x), "%m/%d/%y")  # you may need "%m/%d/%Y"
        except ValueError:
            return False
        return d.month in (5, 6, 7, 8)

    df = df[df[date_col].apply(in_offseason)]
else:
    # Spotrac might not show dates on the signed-FA view.
    # In that case you’ll get *all* signed FAs for that year here
    print("No date column found – keeping all signed free agents for", YEAR)

# save
out_path = f"spotrac_signed_free_agents_{YEAR}.csv"
df.to_csv(out_path, index=False)
print(f"Saved {len(df)} rows to {out_path}")


  tables = pd.read_html(resp.text)


Index(['From  To', 'Player (129)', 'Pos', 'Yrs', 'Value  $1,406,729,444',
       'AAV  $6,998,654', 'Unnamed: 6'],
      dtype='object')
No date column found – keeping all signed free agents for 2025
Saved 129 rows to spotrac_signed_free_agents_2025.csv


In [30]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

URL = "https://bleacherreport.com/articles/25218772-nba-free-agent-tracker-2025-and-updates-all-latest-contract-signings-trades"

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": "https://bleacherreport.com/",
}

resp = requests.get(URL, headers=headers)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")

# ---------- helper functions ----------

def classify_action(text: str) -> str:
    t = text.lower()
    if "re-sign" in t or "re-signs" in t or "re-signed" in t:
        return "re-sign"
    if "extension" in t or "extends" in t:
        return "extension"
    if "signs" in t or "to sign" in t or "agrees to contract" in t or "agrees to deal" in t:
        return "sign"
    return "other"

def extract_amount(text: str):
    # look for $X million
    # examples:
    # $12 million
    # $3.5 million
    # $120 million
    m = re.search(r"\$([\d\.]+)\s*million", text, flags=re.IGNORECASE)
    if m:
        return float(m.group(1))
    # sometimes they write the years then money, we can add more patterns later
    return None

def extract_player_from_title(title: str) -> str:
    """
    Try to get player name from titles like:
    - 'Chris Paul Agrees to Contract with Clippers'
    - 'Dru Smith Signs with Heat'
    - 'Lakers Re-Sign Austin Reaves'
    We'll just take text before the first verb.
    """
    parts = re.split(r"\b(signs?|re-signs?|agrees|agreed|extends?|extension)\b", title, flags=re.IGNORECASE)
    if len(parts) >= 2:
        player = parts[0].strip(" -–:")
        return player
    return title  # fallback

# ---------- main scrape ----------

rows = []

keep_keywords = [
    "sign", "signs", "to sign", "agrees to contract", "agrees to deal",
    "re-sign", "re-signs", "re-signed", "extension", "extends"
]
drop_keywords = [
    "trade", "traded", "acquired", "acquire", "waivers", "claimed"
]

for h in soup.find_all(["h2", "h3"]):
    title = h.get_text(strip=True)
    if not title:
        continue

    t_lower = title.lower()

    # drop trades etc.
    if any(bad in t_lower for bad in drop_keywords):
        continue

    # keep only contract-ish things
    if not any(good in t_lower for good in keep_keywords):
        continue

    # get description (paragraphs) just under this heading

    action = classify_action(title)
    player = extract_player_from_title(title)

    rows.append({
        "player": player,
        "action": action,
        "title": title,
    })

df = pd.DataFrame(rows)
df.to_csv("bleacherreport_2025_signings_structured.csv", index=False)
print(f"saved {len(df)} rows")
print(df.head(10))


saved 98 rows
                      player action  \
0                  Dru Smith   sign   
1             Ron Harper Jr.   sign   
2  Victor Oladipo Reportedly   sign   
3                Amir Coffey   sign   
4           Javonte Green to   sign   
5           Ricky Council IV   sign   
6              Johnny Juzang   sign   
7            TyTy Washington   sign   
8                Colby Jones   sign   
9               Jared Butler   sign   

                                               title  
0                          Dru Smith Signs with Heat  
1                  Ron Harper Jr. Signs With Celtics  
2  Victor Oladipo Reportedly Signs Contract to En...  
3                       Amir Coffey Signs with Bucks  
4                 Javonte Green to Sign with Pistons  
5                   Ricky Council IV Signs with Nets  
6              Johnny Juzang Signs with Timberwolves  
7                TyTy Washington Signs with Clippers  
8                     Colby Jones Signs with Pistons  
9     

In [33]:

salaries = pd.read_csv("br_player_salaries.csv")
signings = pd.read_csv("bleacherreport_2025_signings_structured.csv")

def normalize_name(name):
    if pd.isna(name):
        return ""
    name = str(name)
    name = re.sub(r"[^\w\s]", "", name)   # remove punctuation
    name = re.sub(r"\s+", " ", name)      # collapse spaces
    return name.strip().lower()

salaries["player_key"] = salaries["Player"].apply(normalize_name)
signings["player_key"] = signings["player"].apply(normalize_name)

merged = pd.merge(
    signings,
    salaries,
    how="left",
    on="player_key",
    suffixes=("_signing", "_salary")
)

# keep only the columns we want
# adjust these if your CSV uses slightly different headers
cols_to_keep = ["player", "action", "Tm", "2025-26"]
existing_cols = [c for c in cols_to_keep if c in merged.columns]

result = merged[["player_key"] + existing_cols]

# drop rows where salary for 2025-26 is missing
if "2025-26" in result.columns:
    result = result.dropna(subset=["2025-26"])

# save
result.to_csv("signings_with_salaries.csv", index=False)
print(result.head(20))


           player_key             player     action   Tm    2025-26
0           dru smith          Dru Smith       sign  MIA   $2378870
3         amir coffey        Amir Coffey       sign  MIL   $2296274
5    ricky council iv   Ricky Council IV       sign  PHI   $2221677
10   lindy waters iii   Lindy Waters III       sign  SAS   $2296274
11        josh okogie        Josh Okogie       sign  HOU   $2296274
12       blake wesley       Blake Wesley       sign  WAS   $7022602
13       blake wesley       Blake Wesley       sign  POR   $7022602
14         chris paul         Chris Paul       sign  LAC   $2296274
17     damian lillard     Damian Lillard       sign  MIL  $68230450
18     damian lillard     Damian Lillard       sign  POR  $68230450
19        cam spencer        Cam Spencer       sign  MEM   $2537989
21     isaiah jackson     Isaiah Jackson    re-sign  IND   $7600000
23  marvin bagley iii  Marvin Bagley III       sign  WAS   $2296274
25  jordan mclaughlin  Jordan McLaughlin    re-s

## adding date of signing


In [None]:
import re
import datetime as dt

import requests
import pandas as pd
from bs4 import BeautifulSoup
import spacy

# -----------------------------
# Config
# -----------------------------
FOX_URL = "https://www.foxsports.com/stories/nba/nba-free-agency-signings-tracker"
SIGNINGS_CSV = "data/player_data/signings_with_salaries.csv"   # path to your existing file
OUTPUT_CSV = "data/player_data/fox_free_agency_signings_2025.csv"
SIGNING_YEAR = 2025  # this tracker is for 2025 free agency

signings = pd.read_csv(SIGNINGS_CSV)

players_to_find = signings["player"].to_list()



MONTHS = {
    "June": 6,
    "July": 7,
    "August": 8,
    "Aug.": 8,
    "Sept.": 9,
    "September": 9,
    "Oct.": 10,
    "October": 10,
}
YEAR_MAPPING = {
    "One": 1, 
    "Two": 2, 
    "Three": 3, 
    "Four": 4, 
    "Five": 5
}

date_line_re = re.compile(r"^(?:##\s*)?([A-Za-z]+\.?)\s+(\d{1,2})$")

money_re = re.compile(r"(\d+(?:\.\d+)?)\s*(million)", re.IGNORECASE)

def parse_date_line(line: str):
    """
    Parse lines like '## Oct. 15' or 'July 3' into a datetime.date.
    Returns None if it's not a date line.
    """
    line = line.strip()
    m = date_line_re.match(line)
    if not m:
        return None

    month_name, day_str = m.groups()
    month = MONTHS.get(month_name)
    if not month:
        return None

    return dt.date(SIGNING_YEAR, month, int(day_str))

def find_player(line: str):
    for potential_player in players_to_find:
        if potential_player in line:
            return potential_player
    return ""


def find_amount_and_years(line: str) -> tuple[int, float]:
    found_year: int = 0
    value: float = 0.0
    year: str
    for year in YEAR_MAPPING.keys():
        if year in line:
            found_year = YEAR_MAPPING[year]
            break
    m = money_re.search(line)
    if m:
        value = float(m.group(1))
    return (found_year, value)






# -----------------------------
# 1) Download & extract article text
# -----------------------------
headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get(FOX_URL, headers=headers)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")

# Try to isolate the story body; fall back to whole page if needed
article = (
    soup.find("div", attrs={"data-testid": "story-body"})
    or soup.find("article")
    or soup
)

full_text = article.get_text("\n", strip=True)

start_marker = "FOX Sports is keeping track of every signing that could shake up the association in our NBA free agency tracker:"
end_marker = "Check back for updates."

start_idx = full_text.find(start_marker)
end_idx = full_text.find(end_marker, start_idx if start_idx != -1 else 0)

if start_idx != -1 and end_idx != -1:
    tracker_text = full_text[start_idx:end_idx]
else:
    # If markers fail for some reason, just use the full text.
    tracker_text = full_text
print(tracker_text)
lines = [ln.strip() for ln in tracker_text.splitlines()]

# -----------------------------
# 2) Build list of events (date + summary line + details line)
# -----------------------------
records = []
current_date = None
current_player = None

for i, line in enumerate(lines):
    if not line:
        continue

    maybe_date = parse_date_line(line)
    if maybe_date:
        current_date = maybe_date
        continue
    
    maybe_player = find_player(line)
    if maybe_player:
        current_player = maybe_player
        continue

    year, value = find_amount_and_years(line)
    if (year != 0 or value != 0.0):

        records.append(
            {
                "years_of_contract": year,
                "value_of_contract":value,
                "player_key": current_player,
                "signing_date": current_date,
            }
        )
        current_player = None


# ----------------------------
df_article = pd.DataFrame(records)





FOX Sports is keeping track of every signing that could shake up the association in our NBA free agency tracker:
Oct. 15
Kings
sign G
Russell Westbrook
Details:
One year, $3.6 million
Oct. 1
Heat
extend F
Nikola Jović
Details:
Four years, $62.4 million
76ers
re-sign G
Quentin Grimes
Details:
One year, $8.7 million (qualifying offer)
Sept. 24
Heat
sign F
Precious Achiuwa
Details:
One year
Sept. 23
Pacers
sign G
Monte Morris
Details:
One year
Cavaliers
sign C
Thomas Bryant
Details:
One year
Sept. 15
Spurs
sign C
Bismack Biyombo
Details:
One year
Sept. 12
Knicks
sign G
Malcolm Brogdon
Details:
One year
Sept. 11
Knicks
sign G
Garrison Mathews
Details:
One year
Knicks
re-sign G
Landry Shamet
Details:
One year
Sept. 9
Bulls
re-sign G
Josh Giddey
Details:
Four years, $100 million
Sept. 4
Cam Thomas
signs $6 million
qualifying offer
with
Nets
Details:
Thomas will be an unrestricted free agent after 2025-26 season.
2024-25 stats:
24.0 points per game, while shooting 43.8/34.9/88.1 (25 games)
Au

In [29]:
def normalize_name(name):
    if pd.isna(name):
        return ""
    name = str(name)
    name = re.sub(r"[^\w\s]", "", name)   # remove punctuation
    name = re.sub(r"\s+", " ", name)      # collapse spaces
    return name.strip().lower()

In [30]:
df_article["player_key"] = df_article["player_key"].apply(normalize_name)
merged = df_article.merge(
    signings,
    on="player_key",
    how="right"
)
print(merged.head())


merged.to_csv(OUTPUT_CSV, index=False)

print(f"Saved {len(merged)} rows to {OUTPUT_CSV}")


   years_of_contract  value_of_contract        player_key signing_date  \
0                NaN                NaN         dru smith          NaN   
1                NaN                NaN       amir coffey          NaN   
2                NaN                NaN  ricky council iv          NaN   
3                NaN                NaN  lindy waters iii          NaN   
4                1.0                3.1       josh okogie   2025-07-22   

             player action   Tm   2025-26  
0         Dru Smith   sign  MIA  $2378870  
1       Amir Coffey   sign  MIL  $2296274  
2  Ricky Council IV   sign  PHI  $2221677  
3  Lindy Waters III   sign  SAS  $2296274  
4       Josh Okogie   sign  HOU  $2296274  
Saved 72 rows to data/player_data/fox_free_agency_signings_2025.csv
