# Sentiment Analysis

We can crawl headlines from various publisher collections, but I don't yet know how to list which publishers are available.

In [1]:
from fundus import PublisherCollection, Crawler, NewsMap
from flair.data import Sentence
from flair.models import TextClassifier
from tqdm import tqdm

In [2]:
# Initialize the crawler for news publishers based in the US
# (the whole PublisherCollection.us collection, not a single publisher).
crawler = Crawler(PublisherCollection.us)

# Crawl 2 articles and print each one. After the loop, `article` stays bound
# to the last article printed; the following cell reads it.
for article in crawler.crawl(max_articles=2):
    print(article)

Fundus-Article:
- Title: "Armed men fire on Haiti hospital reopening, killing at least 2"
- Text:  "At least two people were killed and others injured on Tuesday when armed men
          opened fire on a group of journalists who gathered for a [...]"
- URL:    https://www.voanews.com/a/armed-men-fire-on-haiti-hospital-reopening-killing-at-least-2-/7912982.html
- From:   Voice Of America (2024-12-24 22:35)
Fundus-Article:
- Title: "Rams takeaways: Improving offensive line difference in win over Jets"
- Text:  "EAST RUTHERFORD, N.J. — The Rams defeated the New York Jets, 19-9, on Sunday at
          MetLife Stadium in East Rutherford, N.J.  What we learned from a [...]"
- URL:    https://www.latimes.com/sports/rams/story/2024-12-23/rams-offensive-line-dominates-jets
- From:   Los Angeles Times (2024-12-23 11:00)


In [3]:
# `article` is the loop variable left over from the previous cell's crawl
# loop, i.e. the last article printed there. Run that cell first.
print(article.title)
print(article.lang) # detects language of article
# Bare last expression: the notebook displays the article's plain text.
article.plaintext

Rams takeaways: Improving offensive line difference in win over Jets 
en


'EAST RUTHERFORD, N.J. — The Rams defeated the New York Jets, 19-9, on Sunday at MetLife Stadium in East Rutherford, N.J.\n\nWhat we learned from a victory that extended the Rams’ win streak to four games and improved their record to 9-6:\n\nMatthew Stafford wins duel with Aaron Rodgers\n\nRams quarterback Matthew Stafford tossed a go-ahead touchdown pass to Tyler Higbee in the fourth quarter and improved his record against Aaron Rodgers to 5-13.\n\nStafford completed 14 of 19 passes for 110 yards and a touchdown, with one interception.\n\nIn the second quarter, safety Tony Adams picked off a pass. The turnover ended at five Stafford’s career best streak of games without an interception.\n\n“I thought Matthew was efficient in the pass game,” coach Sean McVay said.\n\nStafford, a 16th-year pro, has passed for 20 touchdowns, with eight interceptions.\n\nRodgers completed 28 of 42 passes for 256 yards and a touchdown. He lost a fumble that led to Stafford’s touchdown pass to Higbee.\n\nOf

In [4]:
# Initialize the crawler for The New Yorker only.
# NOTE: this rebinds `crawler`, replacing the US-wide crawler created earlier.
crawler = Crawler(PublisherCollection.us.TheNewYorker)

# Crawl 2 articles and print each one (`article` is rebound as well).
for article in crawler.crawl(max_articles=2):
    print(article)

Fundus-Article:
- Title: "What We’re Reading to Start the New Year"
- Text:  "New Yorker writers and contributors on the books keeping them company this
          winter.  The New Yorker’s editors and critics considered [...]"
- URL:    https://www.newyorker.com/books/page-turner/what-were-reading-to-start-the-new-year
- From:   The New Yorker (2024-12-24 06:00)
Fundus-Article:
- Title: "What to Do if You Find Paul Giamatti Hiding in Your Christmas Tree"
- Text:  "Because of his ability to completely lose himself in any role—including that of
          a mildly depressed holiday ornament—it can be days before you [...]"
- URL:    https://www.newyorker.com/humor/shouts-murmurs/what-to-do-if-you-find-paul-giamatti-hiding-in-your-christmas-tree
- From:   The New Yorker (2024-12-24 06:00)


In [5]:
def crawl_headlines(crawlers, name_of_publishers, article_number=20):
    """Collect article titles from each crawler, keyed by publisher name.

    Args:
        crawlers: Iterable of crawler objects; each must expose
            ``crawl(max_articles=...)`` yielding articles with a ``title``.
        name_of_publishers: Iterable of publisher names, paired with
            ``crawlers`` by position.
        article_number: Upper bound on articles requested from each crawler.
            Defaults to 20.

    Returns:
        dict: ``{publisher name: [title, ...]}`` with one entry per
        (crawler, name) pair. Pairing uses ``zip``, so any surplus entries in
        the longer of the two inputs are ignored.
    """
    titles_by_publisher = {}

    for source, label in zip(crawlers, name_of_publishers):
        # tqdm wraps the crawl generator purely to show progress.
        crawled = tqdm(source.crawl(max_articles=article_number))
        titles_by_publisher[label] = [item.title for item in crawled]

    return titles_by_publisher

In [6]:
# One crawler per publisher we want to compare.
crawler1 = Crawler(PublisherCollection.us.CNBC)
crawler2 = Crawler(PublisherCollection.us.TheNation)

# The two lists are paired by position inside crawl_headlines, so the names
# must be listed in the same order as the crawlers.
crawlers = [crawler1, crawler2]
names_of_publishers = ["CNBC", "The Nation"]
# Request at most 10 articles per publisher.
headlines = crawl_headlines(crawlers, names_of_publishers, article_number=10)

10it [00:20,  2.09s/it]
10it [00:21,  2.18s/it]


In [7]:
# Display the crawled headlines: a dict of publisher name -> list of titles.
headlines

{'CNBC': ["As older Americans downsize, over 20 million homes could become available—but they're not where young people want to move",
  "4 phrases even couples in successful relationships need to use more: 'Clear, respectful communication feels good for everyone,' therapist says",
  'Both sides of Amazon scored big wins this year that should carry over into 2025',
  "This couple retired in their 30s in 1991 and have no regrets: 'It just keeps getting better'",
  'These are our top 10 things to watch in the stock market Tuesday',
  'Tech billionaire shares his 5-word piece of advice for a successful future: ‘I get up every morning’ with it in mind',
  'The polite way to gift cash this holiday season, according to etiquette pros—you can even use Venmo',
  'The Juan Soto wealth effect: Soaring athlete salaries attract wealth management firms',
  "What it would cost to live like the 'Home Alone' family today, according to financial advisors",
  "'Returnuary' — after the peak shopping seas

In [8]:
def predict_labels(publisher_headlines):
    """Run the flair 'sentiment' classifier over every headline.

    Args:
        publisher_headlines (dict): Maps each publisher name to a list of
            headline strings.

    Returns:
        dict: Same keys as the input; each value is the list of predicted
        label values, in the same order as that publisher's headlines.

    Note:
        The classifier is loaded via ``TextClassifier.load('sentiment')`` on
        every call to this function.
    """
    classifier = TextClassifier.load('sentiment')

    def _label_of(headline):
        # flair predicts in place: the label is attached to the Sentence.
        wrapped = Sentence(headline)
        classifier.predict(wrapped)
        return wrapped.get_label().value

    return {
        publisher: [_label_of(headline) for headline in titles]
        for publisher, titles in publisher_headlines.items()
    }

In [9]:
# Predict one sentiment label per crawled headline, grouped by publisher.
sentiments_per_publisher=predict_labels(headlines) # getting sentiments

In [11]:
def print_statistics(sentiments_per_publisher, number_of_articles=None):
    """Print, per publisher, how many headlines were labelled positive/negative.

    Args:
        sentiments_per_publisher (dict): Maps each publisher name to an
            iterable of sentiment labels for its headlines.
        number_of_articles (int, optional): Total to report in the
            "out of N" part of each line. When omitted (``None``), the number
            of labels actually counted for that publisher is used, so the
            reported total always matches the data. Pass an explicit value to
            force a specific denominator for every publisher.

    Returns:
        None. The summary is written to standard output.
    """
    for publisher, labels in sentiments_per_publisher.items():
        positive = 0
        negative = 0
        something_else = 0  # any label that is neither POSITIVE nor NEGATIVE

        for label in labels:
            if label == "POSITIVE":
                positive += 1
            elif label == "NEGATIVE":
                negative += 1
            else:
                something_else += 1

        # Derive the total from what was counted unless the caller overrides
        # it; a fixed default would misreport publishers with other counts.
        if number_of_articles is None:
            total = positive + negative + something_else
        else:
            total = number_of_articles

        print(f"{publisher} has {positive} positive and {negative} negative headlines out of {total}.")
        if something_else >= 1:
            print(f"If something got wrong then it has {something_else} something_else headlines.")
        print()

    return


In [12]:
# NOTE(review): the "out of N" total printed for every publisher is taken from
# the CNBC list alone; this assumes all publishers returned the same number of
# headlines. Confirm if crawls can come back short.
print_statistics(sentiments_per_publisher,number_of_articles = len(sentiments_per_publisher['CNBC']))

CNBC has 8 positive and 2 negative headlines out of 10.

The Nation has 5 positive and 5 negative headlines out of 10.



In [13]:
# Manual sanity check of the classifier output for The Nation:
# yep, many of these titles sound negative to me.
headlines['The Nation']

['Hot Air Rises',
 'The Supreme Court Prepares to Take Its First Shot at Contraception',
 'Kash Patel’s First Day',
 'What to Expect With Trump’s Future China Policy',
 'A Far-Right Attacker Kills 5 in a Christmas Market. The German Far Right Takes Advantage.',
 'Novelist on a Deadline: Barry Malzberg, 1939–2024',
 'These Progressives Will Guide Us Through the Darkness',
 'The Best Albums of 2024',
 'A Stunning Year for Student Journalism',
 'The Spending Fiasco Was a Preview of the Trump-Musk Administration']

Let's try to filter by topic, keeping only articles whose body mentions one of a few keywords.

In [14]:
from typing import Dict, Any

# Case-insensitive substrings an article body must contain to be kept.
keywords = ['market', 'stock']


def body_filter(extracted: Dict[str, Any]) -> bool:
    """Extraction filter that keeps articles whose body mentions a keyword.

    Fundus filters use inverted logic: returning ``True`` discards the
    article, returning ``False`` keeps it.

    Args:
        extracted: Mapping of extracted attributes for one article; only the
            ``"body"`` entry is read.

    Returns:
        ``False`` when the case-folded body text contains any entry of
        ``keywords`` as a substring; ``True`` otherwise, including when the
        body is missing or empty.
    """
    body = extracted.get("body")
    if not body:
        return True

    folded_text = str(body).casefold()
    return not any(term in folded_text for term in keywords)

In [15]:
# Rebind `crawler` to the whole US collection again (it was last pointed at a
# single publisher); later cells reuse this crawler.
crawler = Crawler(PublisherCollection.us)

# Pass body_filter as the extraction filter so only matching articles are yielded.
# NOTE(review): the loop variable name `us_themed_article` does not describe
# this keyword filter; presumably carried over from an example. Consider renaming.
for us_themed_article in crawler.crawl(max_articles=2,only_complete=body_filter):
    print(us_themed_article)

Fundus-Article:
- Title: "As older Americans downsize, over 20 million homes could become [...]"
- Text:  "As the U.S. continues to face a shortage of available homes, some may be looking
          at those occupied by "empty nesters" as an incoming source of [...]"
- URL:    https://www.cnbc.com/2024/12/24/zillow-empty-nesters-downsizing-wont-solve-housing-crisis.html
- From:   CNBC (2024-12-24 17:26)
Fundus-Article:
- Title: "A comprehensive list of 2024 tech layoffs"
- Text:  "The tech layoff wave is still going strong in 2024. Following significant
          workforce reductions in 2022 and 2023, this year has more than [...]"
- URL:    https://techcrunch.com/2024/12/24/tech-layoffs-2024-list/
- From:   TechCrunch (2024-12-24 20:30)


In [18]:
import datetime

def date_filter(extracted: Dict[str, Any]) -> bool:
    """Extraction filter that keeps articles published 2024-11-01..2024-12-01.

    Fundus filters use inverted logic: returning ``True`` discards the
    article, returning ``False`` keeps it.

    Args:
        extracted: Mapping of extracted attributes for one article; only the
            ``"publishing_date"`` entry is read, and its ``.date()`` is
            compared against the window.

    Returns:
        ``False`` when the publishing date falls inside the window (both ends
        inclusive); ``True`` when it falls outside or no date is available.
    """
    window_start = datetime.date(2024, 11, 1)
    window_end = datetime.date(2024, 12, 1)

    published = extracted.get("publishing_date")
    if not published:
        return True

    published_day = published.date()
    return published_day < window_start or published_day > window_end

In [19]:
# Reuses the US-wide `crawler` created in the earlier body-filter cell; this
# time only articles passing date_filter are yielded.
for us_themed_article in crawler.crawl(max_articles=2,only_complete=date_filter):
    print(us_themed_article)

Fundus-Article:
- Title: "Biden Has “Pardoned” Eight Turkeys. Will He Spare the Lives of 40 Human Beings?"
- Text:  "Five days after Orlando Hall was executed in the federal death chamber, Donald
          Trump appeared in the White House Rose Garden for the 2020 [...]"
- URL:    https://theintercept.com/2024/11/27/biden-trump-commutations-death-row-executions/
- From:   The Intercept (2024-11-27 16:27)
Fundus-Article:
- Title: "Nike Promo Codes and Deals: Up to 40% Off"
- Text:  "Check out our deals for Nike this month, including up to 40% off select styles.
          In the midst of our reporting on Nike’s turbulent year, we [...]"
- URL:    https://www.wired.com/story/nike-promo-code/
- From:   Wired (2024-11-01 03:00)
