In [23]:
import re
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [40]:
class SearchEngine():
    """Search the Google News RSS feed for a term within a date window.

    The public entry point is :meth:`search`; the remaining methods are the
    individual steps (build URL, fetch, parse) and can be used on their own.
    """

    # Column order of the DataFrame / CSV produced by ``get_news``. Naming the
    # columns explicitly keeps the schema stable when a search has no hits.
    _COLUMNS = ('title', 'link', 'description', 'pub_date', 'source')

    def generate_search_url(self, search_term, from_date=None, to_date=None):
        """
        Build the Google News RSS search URL for a term and a date window.

        Args:
            search_term: Free-text query. It is URL-encoded here, so it may
                contain spaces, ``&``, ``#`` and similar characters.
            from_date: Start of the window (any object with ``strftime`` and
                ``timedelta`` arithmetic, e.g. ``datetime``). Optional.
            to_date: End of the window. Optional.

        When only one bound is given the other is placed 10 days away; when
        neither is given the window is the 10 days ending today.

        Returns:
            The search URL as a string.
        """
        if from_date and not to_date:
            # no to date, default to a timeframe of 10 days
            to_date = from_date + timedelta(days=10)
        elif to_date and not from_date:
            # no from date, default to a timeframe of 10 days
            from_date = to_date - timedelta(days=10)
        elif not to_date and not from_date:
            # neither exist, default to a timeframe of the past 10 days
            to_date = datetime.today()
            from_date = to_date - timedelta(days=10)
        time_filter = f'after:{from_date.strftime("%Y-%m-%d")} before:{to_date.strftime("%Y-%m-%d")}'
        # Encode the whole query so spaces become '+' and reserved characters
        # in the term cannot terminate or extend the query string. ':' is kept
        # literal so the after:/before: operators stay readable.
        query = quote_plus(f'{search_term} {time_filter}', safe=':')
        return f'https://news.google.com/rss/search?q={query}&hl=en-US&gl=US&ceid=US:en'

    def get_text(self, description):
        """
        Extract the text from a description string.

        Returns the content of the first ``<p>...</p>`` element. If there is
        no ``<p>`` tag, the description is returned with any markup tags
        removed; if the closing ``</p>`` is missing, everything after ``<p>``
        is returned. ``None`` or an empty description yields ``''``.
        """
        if not description:
            return ''
        open_at = description.find('<p>')
        if open_at == -1:
            # str.find returned -1: slicing with it would silently chop
            # characters off both ends, so fall back to stripping tags.
            return re.sub(r'<[^>]+>', '', description).strip()
        start = open_at + len('<p>')
        end = description.find('</p>', start)
        if end == -1:
            return description[start:]
        return description[start:end]

    def parse_news_items(self, root):
        """
        Parse the news items from the XML root element.

        Args:
            root: Parsed RSS document (``xml.etree.ElementTree.Element``).

        Returns:
            A list of dicts with keys ``title``, ``link``, ``description``,
            ``pub_date`` and ``source``. A child element missing from an
            ``<item>`` yields ``None`` for that key (``''`` for
            ``description``) instead of raising.
        """
        news_items = []
        for item in root.findall('.//channel/item'):
            # findtext returns None for an absent child rather than raising
            # AttributeError the way ``item.find(tag).text`` does.
            news_items.append({
                'title': item.findtext('title'),
                'link': item.findtext('link'),
                'description': self.get_text(item.findtext('description')),
                'pub_date': pd.to_datetime(item.findtext('pubDate')),
                'source': item.findtext('source'),
            })
        return news_items

    def get_news(self, search_term, from_date=None, to_date=None):
        """
        Search Google News for the given term and date window.

        Prints the request URL, fetches and parses the feed, writes the
        result to ``<search_term>_news.csv`` in the working directory and
        returns it.

        Args:
            search_term: Free-text query.
            from_date: Optional start of the window.
            to_date: Optional end of the window.

        Returns:
            A ``pandas.DataFrame`` with one row per news item.

        Raises:
            Exception: If the HTTP status code is not 200.
        """
        url = self.generate_search_url(search_term, from_date, to_date)
        print(url)
        # A timeout stops an unresponsive server from blocking forever.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            raise Exception('Failed to fetch news items.')
        # Parse the raw bytes so the XML declaration's encoding is honoured.
        root = ET.fromstring(response.content)
        news_items = self.parse_news_items(root)
        df = pd.DataFrame(news_items, columns=list(self._COLUMNS))
        df.to_csv(f'{search_term}_news.csv', encoding='utf-8-sig', index=False)
        return df

    def search(self, search_term, from_date=None, to_date=None):
        """Run a search and return the resulting DataFrame (see ``get_news``)."""
        return self.get_news(search_term, from_date, to_date)

In [42]:
class ArticleParser():
    """Download a single article page and expose its title and text.

    Attributes set by the constructor:
        link:  the URL that was fetched.
        title: the result of ``soup.find('title')`` (the ``<title>`` element,
               or ``None`` when the page has none).
        text:  the text of every ``<p>`` element followed by the text of
               every ``<li>`` element, joined with single spaces.

    The analysis methods are placeholders that raise ``NotImplementedError``.
    """

    def __init__(self, link):
        """Fetch ``link`` and extract the title and paragraph/list text."""
        self.link = link
        # A timeout stops an unresponsive site from blocking forever.
        response = requests.get(self.link, timeout=30)
        # Hand BeautifulSoup the raw bytes so it can detect the page encoding;
        # force-decoding as UTF-8 raises UnicodeDecodeError on other charsets.
        # Naming the parser keeps results independent of which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(response.content, 'html.parser')
        self.title = soup.find('title')
        body = soup.find_all('p')
        lists = soup.find_all('li')
        self.text = ' '.join([p.text for p in body]) + " " + ' '.join([p.text for p in lists])

    def graph(self):
        """Placeholder; not implemented."""
        raise NotImplementedError

    def summarize(self):
        """Placeholder; not implemented."""
        raise NotImplementedError

    def sentiment(self):
        """Placeholder; not implemented."""
        raise NotImplementedError

    def bias(self):
        """Placeholder; not implemented."""
        raise NotImplementedError

    def to_csv(self):
        """Placeholder; not implemented."""
        # An empty method body is a syntax error, so this is stubbed the same
        # way as the other unimplemented methods.
        raise NotImplementedError
        


                                                title  \
0   Kansas City Chiefs superfan who became a fugit...   
1   Kansas City Chiefs superfan 'ChiefsAholic' arr...   
2   K.C. Chiefs superfan went on bank robbery spre...   
3   Atlanta Nail First salon robbery ends with cus...   
4   Nail salon robbery goes awry as customers comp...   
..                                                ...   
87  Man robbed, hit by vehicle after leaving North...   
88  (UPDATE) Police Seek Public's Help in Locating...   
89  MPD seeks public’s help finding Waffle House r...   
90  Armed suspects steal $15k during armored truck...   
91  60 shots, 2 wounded, 1 dead, 6 arrested after ...   

                                                 link  \
0   https://news.google.com/rss/articles/CBMiXGh0d...   
1   https://news.google.com/rss/articles/CBMiigFod...   
2   https://news.google.com/rss/articles/CBMifmh0d...   
3   https://news.google.com/rss/articles/CBMicmh0d...   
4   https://news.google.com/rs