In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


In [14]:
URL = "https://fantasy.premierleague.com/news/"
URL_scout = "https://fantasy.premierleague.com/the-scout"
headers = {"User-Agent": "Mozilla/5.0 (compatible; FPL-Agent/1.0)"}

resp = requests.get(URL, headers=headers)
resp.raise_for_status()
html = resp.text
soup = BeautifulSoup(html, "lxml")


In [13]:
# See what kind of elements contain article links/titles
for a in soup.select("a[href*='/news/']")[:20]:
    print(a.get("href"), "|", a.get_text(strip=True))


https://www.premierleague.com/en/news/1820580/off-the-bench-can-you-ignore-cunha-in-fpl | FPL Podcast
https://www.premierleague.com/en/news/4345793/sign-up-now-to-mypremier-league/ | myPremierLeague


In [4]:
articles = []
seen = set()

for a in soup.select("a[href*='/news/']"):
    href = a.get("href", "")
    if not href or href in seen or "/news/" not in href:
        continue
    seen.add(href)
    title = a.get_text(" ", strip=True)
    if title:
        articles.append({
            "url": href if href.startswith("http") else f"https://fantasy.premierleague.com{href}",
            "title": title
        })

df_articles = pd.DataFrame(articles).drop_duplicates("url")
df_articles.head(10)


Unnamed: 0,url,title
0,https://www.premierleague.com/en/news/1820580/...,FPL Podcast
1,https://www.premierleague.com/en/news/4345793/...,myPremierLeague


In [6]:
df_articles

Unnamed: 0,url,title
0,https://www.premierleague.com/en/news/1820580/...,FPL Podcast
1,https://www.premierleague.com/en/news/4345793/...,myPremierLeague


In [8]:
first_url = df_articles.iloc[0]["url"]
print("Testing:", first_url)

r = requests.get(first_url, headers=headers)
r.raise_for_status()
soup_article = BeautifulSoup(r.text, "lxml")

# Try common article text containers
paras = soup_article.find_all("p")
text = "\n".join(p.get_text(strip=True) for p in paras)
print(text[:1000])  # print first 1000 characters


Testing: https://www.premierleague.com/en/news/1820580/off-the-bench-can-you-ignore-cunha-in-fpl
FPL experts Gianni Buttice and Prasun Singhal reflect on the key talking points from Gameweek 9
After a third successive win, is it now time to invest in Manchester United players inFantasy Premier League?
FPL experts Gianni Buttice and Prasun Singhal discuss the form of a number of their players such asBryan Mbeumo(£8.1m), who scored twice last weekend, andBruno Fernandes(£8.9m).
They also discuss if it is time to bring in AFC Bournemouth'sJunior Kroupi(£4.6m) following his third goal in the last two Gameweeks.
Running order:- Liverpool dilemma (02m 40s)- Magical Mateta (13m 58s)- Selling Gyokeres for Arsenal defence (18m 14s)- Man Utd options (24m 16s)- Cooling on Man City (31m 54s)- Keen on Kroupi (38m 08s)




In [9]:
records = []
for idx, row in df_articles.head(5).iterrows():
    url = row["url"]
    try:
        r = requests.get(url, headers=headers, timeout=15)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "lxml")
        text = " ".join(p.get_text(strip=True) for p in soup.find_all("p"))
        records.append({
            "title": row["title"],
            "url": url,
            "text_excerpt": text[:400]
        })
    except Exception as e:
        print("Error:", e)

pd.DataFrame(records)


Unnamed: 0,title,url,text_excerpt
0,FPL Podcast,https://www.premierleague.com/en/news/1820580/...,FPL experts Gianni Buttice and Prasun Singhal ...
1,myPremierLeague,https://www.premierleague.com/en/news/4345793/...,Join now for free to set your preferences and ...


In [10]:
df_articles.to_csv("../data/fpl_latest_articles.csv", index=False)
print("Saved to data/fpl_latest_articles.csv")


Saved to data/fpl_latest_articles.csv


# The Scout 

In [15]:
resp = requests.get(URL_scout, headers=headers)
resp.raise_for_status()
html = resp.text
soup = BeautifulSoup(html, "lxml")

In [17]:
soup

<!DOCTYPE html>
<html lang="en">
<head>
<!-- OneTrust Cookies Consent Notice start for premierleague.com -->
<script charset="UTF-8" data-domain-script="cdf436a9-4615-4772-a4b4-7660a91cc3a2" src="https://cdn-ukwest.onetrust.com/scripttemplates/otSDKStub.js" type="text/javascript"></script>
<script defer="" src="https://cc-embed.adobe.com/sdk/v4/CCEverywhere.js"></script>
<script async="" src="https://assets.adobedtm.com/e7ad550d4f82/693c01019a2b/launch-a4f579b5dcc4.min.js"></script>
<script id="ism-environment-paths">
            if (!window.PULSE) {
                window.PULSE = {};
            }
            window.PULSE.envPaths = {
                label: "production",
                domain: [
                    "web25.premierleague.pulselive.com",
                    "www.premierleague.com",
                ],
                cdn: "//www.premierleague.com/resources/v1.28.10/",
                api: "//api.premierleague.com",
                preferences:
                    "https:

In [18]:
# See what kind of elements contain article links/titles
for a in soup.select("a[href*='/news/']")[:20]:
    print(a.get("href"), "|", a.get_text(strip=True))

https://www.premierleague.com/en/news/1820580/off-the-bench-can-you-ignore-cunha-in-fpl | FPL Podcast
https://www.premierleague.com/en/news/4345793/sign-up-now-to-mypremier-league/ | myPremierLeague


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_scout_article_links(base_url="https://fantasy.premierleague.com/the-scout"):
    headers = {"User-Agent": "Mozilla/5.0 (compatible; FPL-Agent/1.0)"}
    r = requests.get(base_url, headers=headers, timeout=20)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "lxml")

    links = []
    for a in soup.find_all("a", href=True):
        href = a["href"].strip()
        if "/news/" in href:   # only links containing “the-scout”
            full_url = urljoin(base_url, href)
            if full_url not in links:
                links.append(full_url)
    print(f"✅ Found {len(links)} total links")
    return links

links = get_scout_article_links()
for l in links[:10]:
    print(l)


✅ Found 0 total links
