In [148]:
# !git add .
# !git commit -m "added web scraping"
# !git pull
# !git push -u origin main

## Sports Misinformation Classification Tool

### Connect to the Database

In [1]:
import mysql.connector
from sqlalchemy import create_engine
import pandas as pd

import os
from urllib.parse import quote_plus

# Connection settings come from the environment so real credentials never have
# to be written into the notebook. The fallbacks are the original local
# development values, so an unconfigured machine behaves exactly as before.
username = os.environ.get('SPORTS_DB_USER', 'root')
password = os.environ.get('SPORTS_DB_PASSWORD', 'password')
host = os.environ.get('SPORTS_DB_HOST', 'localhost')
database = os.environ.get('SPORTS_DB_NAME', 'sports_news_db')

# Direct driver connection to the MySQL database.
connection = mysql.connector.connect(
    host=host,
    user=username,
    password=password,
    database=database
)

if connection.is_connected():
    print("Connected to the database successfully!")

# SQLAlchemy engine handed to pandas for queries. User and password are
# URL-quoted because characters such as '@', ':' or '/' inside them would
# otherwise be parsed as part of the URL structure.
engine = create_engine(
    f"mysql+mysqlconnector://{quote_plus(username)}:{quote_plus(password)}@{host}/{database}",
    echo=False,
)

Connected to the database successfully!


#### Use this format to insert into the database

INSERT INTO articles (team_or_player, source, publication_date, content, trust_score, classification, link)
VALUES
('New York Yankees, Los Angeles Lakers', 'ESPN', '2024-10-27', 'Yankees article content example.', 85.00, 'credible', 'https://example.com/article1'),
('Los Angeles Lakers', 'Twitter', '2024-10-27', 'Lakers article content example.', 60.00, 'uncredible', 'https://example.com/article2');


#### Table fields 
Table Name: articles

Fields: 

     id INT AUTO_INCREMENT PRIMARY KEY,
     
     team_or_player VARCHAR(500), (Holds the article title, which is the field we query; titles usually include team or player names.)
     
     source VARCHAR(200),
     
     publication_date DATE,
     
     content TEXT,
     
     trust_score DECIMAL(5, 2), 
     
     classification ENUM('credible', 'uncredible', 'unknown') DEFAULT 'unknown',

     link VARCHAR(255)

### Simulate Tool Working

In [118]:
from sqlalchemy import text

# Free-text search term typed by the user; surrounding whitespace is dropped.
team_or_player = input("Enter the team or player's name: ").strip()

# The search term is passed as a bound parameter rather than spliced into the
# SQL text, so quotes or SQL fragments in the input cannot alter the statement.
# Note: '%' and '_' typed by the user still act as LIKE wildcards.
query = text("SELECT * FROM articles WHERE team_or_player LIKE :pattern")  # match anywhere in the title
df_result = pd.read_sql(query, con=engine, params={"pattern": f"%{team_or_player}%"})

#get results
if not df_result.empty:
    print(f"Articles related to {team_or_player}:")
    display(df_result)
else:
    print(f"No articles found for {team_or_player}.")

Enter the team or player's name:  x


No articles found for x.


### Get Article Entries from RSS Feeds (*not continuous*)

In [134]:
# # reddit API - collected 1876 posts
# import csv
# import praw
# from datetime import datetime


# reddit = praw.Reddit(
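#     client_id=os.environ["REDDIT_CLIENT_ID"],  # credentials redacted: load them from the environment (needs `import os`) and rotate the previously committed keys
#     client_secret=os.environ["REDDIT_CLIENT_SECRET"],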
#     user_agent='sports_misinfo_script'  
# )

# subreddit = reddit.subreddit('sports+fantasyfootball') #2 subreddits 

# recent_posts = []
# for post in subreddit.new(limit= 5000):
#     created_date = datetime.utcfromtimestamp(post.created_utc).strftime('%Y-%m-%d %H:%M:%S')
#     recent_posts.append({
#         'title': post.title,
#         'score': post.score,
#         'url': post.url,
#         'id': post.id,
#         'author': str(post.author),
#         'text': post.selftext,
#         'created_date': created_date,
#         'num_comments': post.num_comments,
#         'subreddit': post.subreddit.display_name,

#     })

# print(f"Fetched {len(recent_posts)} posts")

# for i, post in enumerate(recent_posts[:5]):
#     print(f"{i+1}. Title: {post['title']} | Score: {post['score']} | URL: {post['url']}")


# #-------------------------------
# #Save the data to a csv file
# fieldnames = ['title', 'score', 'url', 'id', 'author', 'text', 'created_date', 'num_comments', 'subreddit']

# with open('recent_sports_reddit_posts.csv', mode='w', newline='', encoding='utf-8') as csv_file:
#     writer = csv.DictWriter(csv_file, fieldnames=fieldnames)

#     writer.writeheader()

#     for post in recent_posts:
#         writer.writerow(post)

# print("Data successfully saved to 'recent_sports_reddit_posts.csv'")
# print("Data saved")


In [15]:
import feedparser
import requests
from bs4 import BeautifulSoup

# Feeds to collect; add a URL to one of the groups below to parse another source.
# The Fox Sports feeds share one endpoint and differ only in the `tags` filter.
_FOX_RSS = "https://api.foxsports.com/v2/content/optimized-rss?partnerKey=MB0Wehpmuj2lUhuRhQaafhBjAJqaPU244mlTDK1i&size=30"

_mainstream_feeds = [
    f"{_FOX_RSS}&tags=fs/nfl",  # fox NFL
    _FOX_RSS,  # labelled "fox MLB" originally, but carries no tags filter - TODO confirm
    f"{_FOX_RSS}&tags=fs/nba",  # fox NBA
    "https://www.sportscollectorsdaily.com/feed/",  # Sports Collectors Daily
    "https://www.espn.com/espn/rss/news",  # ESPN top headlines
    "https://deadspin.com/rss/",  # Deadspin
    "https://news.sportslogos.net/feed/",  # SportsLogos.Net
]

# questionable sources or medium credibility
_questionable_feeds = [
    "https://notthebee.com/feed",  # not the bee
    "https://uproxx.com/sports/feed/",  # uproxx
    "https://www.vibe.com/c/news/sports/feed/",  # The vibe - medium cred
]

url_list = _mainstream_feeds + _questionable_feeds

entries = list()  # one dictionary per collected feed entry

#--------------DEFINE FUNCTION TO SCRAPE-------------------
def scrape_article_content(url, headers=None):
    """Download a page and return the text of its paragraphs as one string.

    Args:
        url: Address of the article page to fetch.
        headers: Optional mapping of extra HTTP headers; entries override the
            default User-Agent set below.

    Returns:
        The paragraph text joined with single spaces. Returns
        "No content found" when the page yields no text, and
        "Failed to fetch content" when the request or parsing fails
        (the error is printed, not raised, so a batch run keeps going).
    """
    # Earlier runs got "403 Forbidden" from some sites when no User-Agent was
    # supplied. A browser-like agent is the usual remedy, though it is not
    # guaranteed to satisfy every site.
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
    }
    if headers:
        request_headers.update(headers)

    try:
        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        #extract content from common tags; the first container found wins
        article_body = (
            soup.find("article") or
            soup.find("div", {"class": "post-content"}) or
            soup.find("div", class_="article-body") or
            soup.find("div", class_="article-content") or
            soup.find("section", class_="article-section") or
            soup.find("div", class_="main-content") or
            soup.find("div", class_="content-body")
        )

        # Fall back to every <p> on the page when no known container exists.
        paragraphs = (article_body or soup).find_all("p")

        #join all paragraphs into a single string
        # Strip BEFORE testing for emptiness so whitespace-only pages report
        # "No content found" instead of returning an empty string.
        article_content = " ".join(p.get_text() for p in paragraphs).strip()
        return article_content or "No content found"

    except Exception as e:
        # Deliberately broad: one bad page must not abort a long scraping run.
        print(f"Failed to scrape content from {url}: {e}")
        return "Failed to fetch content"


#run the function to collect feeds and scrape
# Walk every feed, scrape each linked article and collect one record per entry.
for url in url_list:
    feed = feedparser.parse(url)
    # A failed or malformed feed yields a result without these fields, and
    # attribute access on a missing field raises AttributeError, so every
    # optional field is read with .get() and a fallback.
    feed_title = feed.feed.get("title", url)

    if not feed.entries:
        print(f"No entries parsed from {url}")
        continue

    for entry in feed.entries:
        entry_link = entry.get("link")
        if not entry_link:
            # Nothing to scrape or store without a link.
            continue

        entry_title = entry.get("title", "")
        entry_published_date = entry.get("published", "")
        entry_summary = entry.get("summary", "")
        entry_content = scrape_article_content(entry_link) #scrape

        entries.append({
            "feed_title": feed_title,
            "entry_title": entry_title,
            "entry_link": entry_link,
            "entry_published_date": entry_published_date,
            "entry_summary": entry_summary,
            "entry_content": entry_content,
        })

# Explicit columns keep the CSV header stable even if no entry was collected.
df = pd.DataFrame(
    entries,
    columns=[
        "feed_title",
        "entry_title",
        "entry_link",
        "entry_published_date",
        "entry_summary",
        "entry_content",
    ],
)
# Show a short preview instead of dumping the whole frame into the output.
print(f"Collected {len(df)} entries from {len(url_list)} feeds")
print(df.head())

df.to_csv('RSS_sports_feeds_11-12.csv', index=False)

Failed to scrape content from https://www.espn.com/nfl/story/_/id/42345908/shane-waldron-fired-bears-offensive-coordinator-sources-say: 403 Client Error: Forbidden for url: https://www.espn.com/nfl/story/_/id/42345908/shane-waldron-fired-bears-offensive-coordinator-sources-say
Failed to scrape content from https://www.espn.com/nfl/story/_/id/42346615/cowboys-dak-prescott-season-ending-surgery-jones-says: 403 Client Error: Forbidden for url: https://www.espn.com/nfl/story/_/id/42346615/cowboys-dak-prescott-season-ending-surgery-jones-says
Failed to scrape content from https://www.espn.com/soccer/story/_/id/42345125/ecuador-cincy-player-marco-angulo-dies-crash-injuries: 403 Client Error: Forbidden for url: https://www.espn.com/soccer/story/_/id/42345125/ecuador-cincy-player-marco-angulo-dies-crash-injuries
Failed to scrape content from https://www.espn.com/nhl/story/_/id/42338657/wait-jeremy-roenick-enters-hockey-hall-fame: 403 Client Error: Forbidden for url: https://www.espn.com/nhl/st

In [11]:
import requests
from bs4 import BeautifulSoup
import pandas as pd


# Pages from low-credibility outlets, one URL per line; splitting the block on
# whitespace yields the list of addresses to scrape.
uncredible_urls = """
    https://www.tellerreport.com/sports
    https://www.newsbreak.com/mountain-view-ca-sports
    https://newsrnd.com/sports
    https://baltimorecitywire.com/stories/tag/53-sports
""".split()

# Function to scrape article details
def scrape_article(url, headers=None):
    """Fetch a page and extract its title, publication date and a content preview.

    Args:
        url: Address of the page to scrape.
        headers: Optional mapping of extra HTTP headers; entries override the
            default User-Agent set below.

    Returns:
        A dict with the keys "Title", "Publication Date", "Content" (at most
        500 characters, followed by "..." only when text was cut off) and
        "Link"; or None when the request or parsing fails (the error is
        printed, not raised).
    """
    # An earlier run got "403 Forbidden" from one site when no User-Agent was
    # supplied. A browser-like agent is the usual remedy, though it is not
    # guaranteed to satisfy every site.
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        )
    }
    if headers:
        request_headers.update(headers)

    try:
        response = requests.get(url, headers=request_headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        # Extract the article title: prefer the first <h1>, then <title>.
        # A page with neither no longer fails the whole scrape.
        heading = soup.find("h1") or soup.title
        title = heading.get_text() if heading else "No title found"

        # Extract the publication date (common in <time> or meta tags)
        date = soup.find("time")
        if date:
            publication_date = date.get("datetime") or date.get_text()
        else:
            # Open Graph publishes this value under the `property` attribute;
            # the `name` form is kept as a fallback for pages that use it.
            date_meta = (
                soup.find("meta", attrs={"property": "article:published_time"}) or
                soup.find("meta", attrs={"name": "article:published_time"})
            )
            publication_date = date_meta.get("content") if date_meta else None
        publication_date = (publication_date or "").strip() or "No date found"

        # Extract the article content
        content_container = (
            soup.find("article") or
            soup.find("div", class_=["post-content", "article-body", "article-content", "content-body"])
        )
        # Fall back to every <p> on the page when no known container exists.
        paragraphs = (content_container or soup).find_all("p")

        content = "\n".join(p.get_text() for p in paragraphs).strip()
        # Only mark the preview as truncated when text was actually dropped.
        preview = content[:500] + "..." if len(content) > 500 else content

        return {
            "Title": title.strip(),
            "Publication Date": publication_date,
            "Content": preview,
            "Link": url
        }

    except Exception as e:
        # Deliberately broad: one bad page must not abort the scraping run.
        print(f"Failed to scrape {url}: {e}")
        return None

# Scrape each page, announcing it first; pages that failed come back falsy
# and are left out of the results.
articles = []
for page_url in uncredible_urls:
    print(f"Scraping: {page_url}")
    scraped = scrape_article(page_url)
    if not scraped:
        continue
    articles.append(scraped)

# Print a readable summary of every page that was scraped.
for item in articles:
    print(
        "\n--- Article ---",
        f"Title: {item['Title']}",
        f"Date: {item['Publication Date']}",
        f"Content Preview: {item['Content']}",
        f"Link: {item['Link']}\n",
        sep="\n",
    )

# Persist the results as CSV.
output_file = "scraped_uncredible_articles.csv"
df = pd.DataFrame(articles)
df.to_csv(output_file, index=False)
print(f"\nResults saved to {output_file}")

Scraping: https://www.tellerreport.com/sports
Scraping: https://www.newsbreak.com/mountain-view-ca-sports
Scraping: https://newsrnd.com/sports
Scraping: https://baltimorecitywire.com/stories/tag/53-sports
Failed to scrape https://baltimorecitywire.com/stories/tag/53-sports: 403 Client Error: Forbidden for url: https://baltimorecitywire.com/stories/tag/53-sports

--- Article ---
Title: Sports - Teller Report
Date: No date found
Content Preview: Now you can see non-English news...
© Communities 2019 - Privacy...
Link: https://www.tellerreport.com/sports


--- Article ---
Title: Mountain View, CA Sports and More | NewsBreak
Date: No date found
Content Preview: Mountain View
This weekend saw a flurry of action in Bay Area high school football, with notable performances shaping the playoff picture. Among the highlights, De La Salle and Pittsburg secured dominant wins, while some ranked teams faced setbacks. The matchups across various leagues showcased high-scoring games and surprising resu

### Format DF to match DB table

In [123]:
from datetime import datetime
from dateutil import parser

# Load the scraped feed entries from the CSV written by the RSS scraping cell
# instead of relying on whichever `df` is left in the kernel: another cell
# rebinds `df` to a frame without the RSS columns.
rss_df = pd.read_csv('RSS_sports_feeds_11-12.csv')


def _parse_publication_date(raw):
    """Return a date for a feed timestamp string, or None if missing or unparseable."""
    if pd.isna(raw) or not str(raw).strip():
        return None
    try:
        # dateutil handles the different date formats used across feeds
        return parser.parse(str(raw)).date()
    except (ValueError, OverflowError):
        return None


# Add the DB-shaped columns next to the scraped ones; every column comes from
# the same frame, so rows cannot be misaligned.
dbdf = rss_df.assign(
    team_or_player=rss_df['entry_title'],
    source=rss_df['feed_title'],
    publication_date=rss_df['entry_published_date'].apply(_parse_publication_date),
    content=rss_df['entry_summary'],
    trust_score=0.00,  #default
    classification='unknown',  #default
    link=rss_df['entry_link'],
)

#make a new df in the format of the DB table for easy inserting
sports_DB_df = dbdf[['team_or_player', 'source', 'publication_date', 'content', 'trust_score', 'classification', 'link']]
#save to a new CSV
sports_DB_df.to_csv('formatted_sports_posts_for_DB.csv', index=False)
print("done")

done




## Define labeling approach to get classification and trust score to update sports_DB_df with ground truths

## Sentiment Analysis Tool Evaluation

In [71]:
#sentiment tool test
from transformers import pipeline

import random  # needed by the sampling below; missing in the original cell

# Fixed seed so the same headlines are drawn on every run. The headlines that
# were hand-labelled earlier came from an unseeded draw and cannot be recovered.
SAMPLE_SEED = 42
SAMPLE_SIZE = 15

df = pd.read_csv('RSS_sports_feeds_11-12.csv') #scraped feeds

pipe_finnews = pipeline("text-classification", model="mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis")
pipe_emotions = pipeline("text-classification", model="bhadresh-savani/distilbert-base-uncased-emotion")

# Drop missing titles and never ask for more items than exist, since
# random.sample raises ValueError in that case.
titles = df["entry_title"].dropna().tolist()
headlines = random.Random(SAMPLE_SEED).sample(titles, min(SAMPLE_SIZE, len(titles)))
# print(headlines)

# for headline in headlines: 
#     result_bert_finnews = pipe_finnews(headline)
#     result_bert_emotions = pipe_emotions(headline)
#     print(headline)
#     print(f"result_bert_finnews: {result_bert_finnews}")
#     # #print(f"result_bert_emotions: {result_bert_emotions}")
#     print()

#check labels
result_bert_finnews = pipe_finnews("Lionel Messi & Inter Miami SHOCKINGLY lose to Atlanta United 3-2 | SOTU")
print(f"result_bert_finnews: {result_bert_finnews}")


result_bert_finnews: [{'label': 'negative', 'score': 0.9906446933746338}]


In [78]:
import numpy as np
import scipy.stats as stats

# Labels attached to the original random sample of headlines.
sample_actual_labels = [0, -1, 0, -1, 1, 0, 0, 0, 0, 0, 1, 1, -1, 1, -1]
print(f"Dataset labels\t: {sample_actual_labels}")

# Hand-assigned labels for the same headlines; they differ from the dataset
# labels at 3 of the 15 positions.
my_sample_labels = [0, -1, 0, 0, 1, 0, 0, 0, 0, -1, 1, 1, -1, -1, -1]
print(f"My labels\t: {my_sample_labels}")

# Pearson correlation taken from the off-diagonal of numpy's correlation matrix.
correlation = np.corrcoef(sample_actual_labels, my_sample_labels)[0, 1]
print(f"Pearson Correlation Coefficient: {correlation}")

# Same coefficient from scipy, which also reports a p-value.
r, p_value = stats.pearsonr(sample_actual_labels, my_sample_labels)
print(f"Pearson Correlation Coefficient: {r:.2f}")
print(f"P-value: {p_value:.4f}")

# Interpretation at the 5% significance level
print(
    "The correlation is statistically significant."
    if p_value < 0.05
    else "The correlation is NOT statistically significant."
)

Dataset labels	: [0, -1, 0, -1, 1, 0, 0, 0, 0, 0, 1, 1, -1, 1, -1]
My labels	: [0, -1, 0, 0, 1, 0, 0, 0, 0, -1, 1, 1, -1, -1, -1]
Pearson Correlation Coefficient: 0.6356845346445097
Pearson Correlation Coefficient: 0.64
P-value: 0.0109
The correlation is statistically significant.
