In [1]:
import re
from datetime import datetime
from pathlib import Path

import httpx
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# CBS News "year in review" article used as the source of the 2018 headlines.
url = (
    'https://www.cbsnews.com/news/'
    'year-in-review-top-news-stories-of-2018-month-by-month'
)

In [3]:
# Download the page at `url` and parse it with BeautifulSoup.
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would only show up later as an empty result.
response.raise_for_status()
html_content = BeautifulSoup(response.content, 'html.parser')

In [4]:
# Walk every <li> inside the article body sections and keep the ones whose
# text ends in a date written as m/d/yy; everything else is skipped.
sections = html_content.find_all('section', class_='content__body')
data_with_dates = []

for section in sections:
    list_items = section.find_all('li')

    for item in list_items:
        raw_text = item.get_text()
        # The candidate date is whatever follows the last comma, with the
        # closing parenthesis and any surrounding quotes removed.
        date_str = raw_text.split(',')[-1].strip().strip(')').strip('"')
        try:
            date = datetime.strptime(date_str, "%m/%d/%y")
        except ValueError:
            # Not a dated entry: ignore this list item.
            continue
        # The headline is the text before the final " (" group.
        headline = raw_text.rsplit(" (", 1)[0]
        data_with_dates.append((date, headline))

df = pd.DataFrame(data_with_dates, columns=['date', 'headline'])

In [5]:
# Build the 2018 frame from `df`: parsed dates, chronological order,
# and the date used as the index. `df` itself is left untouched.
df_2018 = (
    df.assign(date=pd.to_datetime(df['date']))
    .sort_values('date')
    .set_index('date')
)

In [6]:
# Download the Wikipedia page for 2019 and parse it with BeautifulSoup.
url = 'https://en.wikipedia.org/wiki/2019'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would only show up later as missing rows.
response.raise_for_status()
html_content = BeautifulSoup(response.content, 'html.parser')

In [7]:
# 2019
# Build `data`, a list of (date string, headline text) rows, from the parsed
# page in `html_content`. One row is appended per matching link, so a list
# item that contains several matching links yields several rows.
year = 2019
soup = html_content
list_items = html_content.find_all('a')  # not read in this cell; kept so the name stays defined
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
wiki_months = [F"/wiki/{m}" for m in months]
# Unanchored and case-insensitive: any href containing "/wiki/<month name>"
# is selected, not only links to day pages.
month_regex = re.compile('|'.join(wiki_months), re.IGNORECASE)
month_links = soup.find_all('a', href=month_regex)

# Row patterns, compiled once up front.
date_pattern = re.compile(r"(\w+)\s(\d{1,2})")
leading_date_pattern = re.compile(r"^(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,4}")
citation_pattern = re.compile(r"\[\d+\]")

# Scraping stops after the row whose cleaned text contains this string.
stop_string = 'Iraqi militiamen and protestersbreachthe front gate checkpoint of theUnited States embassy in Baghdadfollowing a U.S.military operationthat targeted an Iraqi militia on December 29.'
data = []

# `date` is only reassigned when a row matches `date_pattern`. Initialise it
# so a leading non-matching row cannot pick up a stale `date` left behind by
# an earlier cell (or raise NameError when no such value exists).
date = None

for link in month_links:
    parent_li = link.find_parent('li')
    if not parent_li:
        # Link is not inside a list item: nothing to record.
        continue

    row = parent_li.get_text(strip=True)
    # NOTE(review): this takes the first "<word> <1-2 digits>" anywhere in the
    # row, which is not necessarily a month and day; the manual overrides in
    # the following cells appear to compensate for that.
    match = date_pattern.search(row)
    if match:
        month = match.group(1)
        day = match.group(2)
        date = f"{year} {month} {day}"
    # No match: the row keeps the date carried over from the previous row.

    # Drop a leading "<Month> <digits>" prefix and replace [n] citation markers.
    text = leading_date_pattern.sub("", row)
    text = citation_pattern.sub(" ", text)

    data.append((date, text))

    if stop_string in text:
        break

In [8]:
# Manual override: replace row 77 of `data` with a hand-written
# (date string, headline) pair.
data[77] = (
    '2019 June 9',
    'Hong Kong protests: Over 10million people in Hong Kong protest against proposed legislation regarding extradition to mainland China. It is the largest protest in Hong Kong since the 1997 handover. A large explosive eruption of Mount Sinabung in Indonesia sends a 7,000-meter ash column into the air, generating a pyroclastic flow 3–3.5 kilometers long towards the south and southeast of the mountain.',
)

In [9]:
# Manual override: replace row 131 of `data` with a hand-written
# (date string, headline) pair.
data[131] = (
    '2019 September 4',
    'Hong Kong protests: Hong Kong Chief Executive Carrie Lam announces the official withdrawal of the controversial Fugitive Offenders and Mutual Legal Assistance in Criminal Matters Legislation (Amendment) Bill 2019, and setting up of an independent study to probe social and economic inequality within the territory. In the United States the Federal Trade Commission threatens to fine YouTube and Googleup to $170 million for violation of collecting personal information from children under 13. Google tracked childrens YouTube history to regulate targeted advertising and the FTC took notice and took action.',
)

In [10]:
# Manual override: replace row 157 of `data` with a hand-written
# (date string, headline) pair.
data[157] = (
    '2019 October 8',
    'Ecuadorian protests: The Government of Ecuador, headed by President Lenín Moreno, moves to Guayaquil as the Carondelet Palace in Quitois overtaken by protesters and chaos persists in the capital. About 200 Extinction Rebellion activists block the gates of Leinster House(parliament) in the Republic of Ireland.',
)

In [11]:
# Manual override: set the date of row 158 by hand and keep the headline
# already stored in that row.
data[158] = (
    '2019 October 9',
    data[158][1],
)

In [12]:
# Turn the rows currently held in `data` into a frame whose index is the
# parsed date.
df = pd.DataFrame(data, columns=['date', 'headline'])
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')

In [13]:
# Keep an independent snapshot of the current `df` under a year-specific name,
# since `df` is rebuilt further down.
df_2019 = df.copy(deep=True)

In [14]:
#2020
# Download the Wikipedia page for the selected year and parse it.
year = 2020
url = f'https://en.wikipedia.org/wiki/{year}'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would only show up later as missing rows.
response.raise_for_status()
html_content = BeautifulSoup(response.content, 'html.parser')

In [15]:
#2020
# Build `data`, a list of (date string, headline text) rows, from the parsed
# page in `html_content`; `year` is expected to be set by the fetch cell.
# One row is appended per matching link, so a list item that contains several
# matching links yields several rows.
soup = html_content
list_items = html_content.find_all('a')  # not read in this cell; kept so the name stays defined
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
wiki_months = [F"/wiki/{m}" for m in months]
# Unanchored and case-insensitive: any href containing "/wiki/<month name>"
# is selected, not only links to day pages.
month_regex = re.compile('|'.join(wiki_months), re.IGNORECASE)
month_links = soup.find_all('a', href=month_regex)

# Row patterns, compiled once up front.
date_pattern = re.compile(r"(\w+)\s(\d{1,2})")
leading_date_pattern = re.compile(r"^(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,4}")
citation_pattern = re.compile(r"\[\d+\]")

# Scraping stops after the row whose cleaned text contains this string.
stop_string = "– Thetransition periodfollowingthe United Kingdom's exit from the European Unionon January 31, 2020, expires."
data = []

# `date` is only reassigned when a row matches `date_pattern`. Initialise it
# so a leading non-matching row cannot pick up a stale `date` left behind by
# an earlier cell (for example a value from another year).
date = None

for link in month_links:
    parent_li = link.find_parent('li')
    if not parent_li:
        # Link is not inside a list item: nothing to record.
        continue

    row = parent_li.get_text(strip=True)
    # NOTE(review): this takes the first "<word> <1-2 digits>" anywhere in the
    # row, which is not necessarily a month and day; the manual overrides in
    # the following cells appear to compensate for that.
    match = date_pattern.search(row)
    if match:
        month = match.group(1)
        day = match.group(2)
        date = f"{year} {month} {day}"
    # No match: the row keeps the date carried over from the previous row.

    # Drop a leading "<Month> <digits>" prefix and replace [n] citation markers.
    text = leading_date_pattern.sub("", row)
    text = citation_pattern.sub(" ", text)
    data.append((date, text))

    if stop_string in text:
        break

In [16]:
# Manual override: set the date of row 52 by hand and keep the headline
# already stored in that row.
data[52] = (
    '2020 March 27',
    data[52][1],
)

In [17]:
# Manual override: set the date of row 156 by hand and keep the headline
# already stored in that row.
data[156] = (
    '2020 October 20',
    data[156][1],
)

In [18]:
# Manual override: replace row 188 of `data` with a hand-written
# (date string, headline) pair.
data[188] = (
    '2020 November 30',
    'A penumbral lunar eclipseoccurs; the last of four lunar eclipses in 2020.',
)

In [19]:
# Turn the rows currently held in `data` into a frame whose index is the
# parsed date.
df = pd.DataFrame(data, columns=['date', 'headline'])
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')

In [20]:
# Keep an independent snapshot of the current `df` under a year-specific name,
# since `df` is rebuilt further down.
df_2020 = df.copy(deep=True)

In [21]:
#2021
# Download the Wikipedia page for the selected year and parse it.
year = 2021
url = f'https://en.wikipedia.org/wiki/{year}'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would only show up later as missing rows.
response.raise_for_status()
html_content = BeautifulSoup(response.content, 'html.parser')

In [22]:
#2021
# Build `data`, a list of (date string, headline text) rows, from the parsed
# page in `html_content`; `year` is expected to be set by the fetch cell.
# One row is appended per matching link, so a list item that contains several
# matching links yields several rows.
soup = html_content
list_items = html_content.find_all('a')  # not read in this cell; kept so the name stays defined
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
wiki_months = [F"/wiki/{m}" for m in months]
# Unanchored and case-insensitive: any href containing "/wiki/<month name>"
# is selected, not only links to day pages.
month_regex = re.compile('|'.join(wiki_months), re.IGNORECASE)
month_links = soup.find_all('a', href=month_regex)

# Row patterns, compiled once up front.
date_pattern = re.compile(r"(\w+)\s(\d{1,2})")
leading_date_pattern = re.compile(r"^(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,4}")
citation_pattern = re.compile(r"\[\d+\]")

# Scraping stops after the row whose cleaned text contains this string.
stop_string = '–NASA,ESA, theCanadian Space Agencyand theSpace Telescope Science Institutelaunch theJames Webb Space Telescope, the successor of theHubble Space Telescope.'
data = []

# `date` is only reassigned when a row matches `date_pattern`. Initialise it
# so a leading non-matching row cannot pick up a stale `date` left behind by
# an earlier cell (for example a value from another year).
date = None

for link in month_links:
    parent_li = link.find_parent('li')
    if not parent_li:
        # Link is not inside a list item: nothing to record.
        continue

    row = parent_li.get_text(strip=True)
    # NOTE(review): this takes the first "<word> <1-2 digits>" anywhere in the
    # row, which is not necessarily a month and day; the manual overrides in
    # the following cells appear to compensate for that.
    match = date_pattern.search(row)
    if match:
        month = match.group(1)
        day = match.group(2)
        date = f"{year} {month} {day}"
    # No match: the row keeps the date carried over from the previous row.

    # Drop a leading "<Month> <digits>" prefix and replace [n] citation markers.
    text = leading_date_pattern.sub("", row)
    text = citation_pattern.sub(" ", text)
    data.append((date, text))

    if stop_string in text:
        break

In [23]:
# Manual override: set the date of row 108 by hand and keep the headline
# already stored in that row.
data[108] = (
    '2021 August 4',
    data[108][1],
)

In [24]:
# Turn the rows currently held in `data` into a frame whose index is the
# parsed date.
df = pd.DataFrame(data, columns=['date', 'headline'])
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')

In [25]:
# Keep an independent snapshot of the current `df` under a year-specific name,
# since `df` is rebuilt further down.
df_2021 = df.copy(deep=True)

In [26]:
#2022
# Download the Wikipedia page for the selected year and parse it.
year = 2022
url = f'https://en.wikipedia.org/wiki/{year}'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would only show up later as missing rows.
response.raise_for_status()
html_content = BeautifulSoup(response.content, 'html.parser')

In [27]:
#2022
# Build `data`, a list of (date string, headline text) rows, from the parsed
# page in `html_content`; `year` is expected to be set by the fetch cell.
# One row is appended per matching link, so a list item that contains several
# matching links yields several rows.
soup = html_content
list_items = html_content.find_all('a')  # not read in this cell; kept so the name stays defined
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
wiki_months = [F"/wiki/{m}" for m in months]
# Unanchored and case-insensitive: any href containing "/wiki/<month name>"
# is selected, not only links to day pages.
month_regex = re.compile('|'.join(wiki_months), re.IGNORECASE)
month_links = soup.find_all('a', href=month_regex)

# Row patterns, compiled once up front.
date_pattern = re.compile(r"(\w+)\s(\d{1,2})")
leading_date_pattern = re.compile(r"^(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,4}")
citation_pattern = re.compile(r"\[\d+\]")

# Scraping stops after the row whose cleaned text contains this string.
stop_string = "– FormerPope Benedict XVIdiesat the age of 95, with his funeral being held inSt. Peter's Square, presided over byPope FrancisandCardinal Giovanni Battista Re."
data = []

# `date` is only reassigned when a row matches `date_pattern`. Initialise it
# so a leading non-matching row cannot pick up a stale `date` left behind by
# an earlier cell (for example a value from another year).
date = None

for link in month_links:
    parent_li = link.find_parent('li')
    if not parent_li:
        # Link is not inside a list item: nothing to record.
        continue

    row = parent_li.get_text(strip=True)
    # NOTE(review): this takes the first "<word> <1-2 digits>" anywhere in the
    # row, which is not necessarily a month and day; the manual overrides in
    # the following cells appear to compensate for that.
    match = date_pattern.search(row)
    if match:
        month = match.group(1)
        day = match.group(2)
        date = f"{year} {month} {day}"
    # No match: the row keeps the date carried over from the previous row.

    # Drop a leading "<Month> <digits>" prefix and replace [n] citation markers.
    text = leading_date_pattern.sub("", row)
    text = citation_pattern.sub(" ", text)
    data.append((date, text))

    if stop_string in text:
        break

In [28]:
# Manual override: replace row 37 of `data` with a hand-written
# (date string, headline) pair.
data[37] = (
    '2022 March 9',
    "2022 South Korean presidential election:People Power PartycandidateYoon Suk-yeolis narrowly elected President of South Korea. Russian invasion of Ukraine: Russia is condemned by world leaders following anair strike in Mariupolthat destroys a hospital including a maternity and children's ward. ",
)

In [29]:
# Manual override: replace row 157 of `data` with a hand-written
# (date string, headline) pair.
data[157] = (
    '2022 October 29',
    'A double car bombing by al-Shabaab in Mogadishu, Somalia kills at least 121 people and injures around 300.',
)

In [30]:
# Manual override: replace row 158 of `data` with a hand-written
# (date string, headline) pair.
data[158] = (
    '2022 October 29',
    'A double car bombing by al-Shabaab in Mogadishu, Somalia kills at least 121 people and injures around 300.',
)

In [31]:
# Turn the rows currently held in `data` into a frame whose index is the
# parsed date.
df = pd.DataFrame(data, columns=['date', 'headline'])
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')

In [32]:
# Keep an independent snapshot of the current `df` under a year-specific name,
# since `df` is rebuilt further down.
df_2022 = df.copy(deep=True)

In [33]:
#2023
# Download the Wikipedia page for the selected year and parse it.
year = 2023
url = f'https://en.wikipedia.org/wiki/{year}'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would only show up later as missing rows.
response.raise_for_status()
html_content = BeautifulSoup(response.content, 'html.parser')

In [34]:
#2023
# Build `data`, a list of (date string, headline text) rows, from the parsed
# page in `html_content`; `year` is expected to be set by the fetch cell.
# One row is appended per matching link, so a list item that contains several
# matching links yields several rows.
soup = html_content
list_items = html_content.find_all('a')  # not read in this cell; kept so the name stays defined
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
wiki_months = [F"/wiki/{m}" for m in months]
# Unanchored and case-insensitive: any href containing "/wiki/<month name>"
# is selected, not only links to day pages.
month_regex = re.compile('|'.join(wiki_months), re.IGNORECASE)
month_links = soup.find_all('a', href=month_regex)

# Row patterns, compiled once up front.
date_pattern = re.compile(r"(\w+)\s(\d{1,2})")
leading_date_pattern = re.compile(r"^(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,4}")
citation_pattern = re.compile(r"\[\d+\]")

# Scraping stops after the row whose cleaned text contains this string.
stop_string = "–Queen Margrethe IIofDenmarkannouncesher abdicationeffective January 14, 2024, after 52 years on the throne."
data = []

# `date` is only reassigned when a row matches `date_pattern`. Initialise it
# so a leading non-matching row cannot pick up a stale `date` left behind by
# an earlier cell (for example a value from another year).
date = None

for link in month_links:
    parent_li = link.find_parent('li')
    if not parent_li:
        # Link is not inside a list item: nothing to record.
        continue

    row = parent_li.get_text(strip=True)
    # NOTE(review): this takes the first "<word> <1-2 digits>" anywhere in the
    # row, which is not necessarily a month and day; the manual overrides in
    # the following cells appear to compensate for that.
    match = date_pattern.search(row)
    if match:
        month = match.group(1)
        day = match.group(2)
        date = f"{year} {month} {day}"
    # No match: the row keeps the date carried over from the previous row.

    # Drop a leading "<Month> <digits>" prefix and replace [n] citation markers.
    text = leading_date_pattern.sub("", row)
    text = citation_pattern.sub(" ", text)
    data.append((date, text))

    if stop_string in text:
        break

In [35]:
# Manual override: replace row 35 of `data` with a hand-written
# (date string, headline) pair.
data[35] = (
    '2023 March 4',
    'Kivu conflict: Burundi deploys 100 troops to the Democratic Republic of the Congoto help fight insurgencies by militias, including M23.',
)

In [36]:
# Manual override: row 37 is overwritten with the (date, headline) tuple
# currently stored at row 35, so both rows end up identical.
data[37] = data[35]

In [37]:
# Manual override: replace row 76 of `data` with a hand-written
# (date string, headline) pair.
data[76] = (
    '2023 May 21',
    'The May 2023 Greek legislative electionis held; the ruling New Democracy wins a plurality of seats in the Hellenic Parliament. Just days later incumbent prime minister Kyriakos Mitsotakis, called for another snap election to be held in June.',
)

In [38]:
# Manual override: row 78 is overwritten with the (date, headline) tuple
# currently stored at row 76, so both rows end up identical.
data[78] = data[76]

In [39]:
# Manual override: replace row 97 of `data` with a hand-written
# (date string, headline) pair.
data[97] = (
    '2023 July 3',
    'In the largest incursion by Israel into the West Bank since the Second Intifada, the Israeli military deploys ground forces and armed drones into the Jenin camp, killing 13 and injuring more than 100. An attack claimed by Hamas as retaliation for the incursion, occurs in Tel Aviv the following day, injuring nine.',
)

In [40]:
# Manual override: replace row 98 of `data` with a hand-written
# (date string, headline) pair.
data[98] = (
    '2023 July 3',
    'In the largest incursion by Israel into the West Bank since the Second Intifada, the Israeli military deploys ground forces and armed drones into the Jenin camp, killing 13 and injuring more than 100. An attack claimed by Hamas as retaliation for the incursion, occurs in Tel Aviv the following day, injuring nine.',
)

In [41]:
# Manual override: row 99 is overwritten with the (date, headline) tuple
# currently stored at row 98, so both rows end up identical.
data[99] = data[98]

In [42]:
# Manual override: row 100 is overwritten with the (date, headline) tuple
# currently stored at row 99, so both rows end up identical.
data[100] = data[99]

In [43]:
# Manual override: replace row 137 of `data` with a hand-written
# (date string, headline) pair.
data[137] = (
    '2023 October 3',
    'Elected on 8 January, Kevin McCarthy is removed as Speaker of the United States House of Representatives, with Mike Johnson being elected new Speaker on 25 October.',
)

In [44]:
# Manual override: replace row 138 of `data` with a hand-written
# (date string, headline) pair.
data[138] = (
    '2023 October 3',
    'Elected on 8 January, Kevin McCarthy is removed as Speaker of the United States House of Representatives, with Mike Johnson being elected new Speaker on 25 October.',
)

In [45]:
# Manual override: replace row 139 of `data` with a hand-written
# (date string, headline) pair.
data[139] = (
    '2023 October 7',
    '2023 Israel–Hamas war:Hamas launches an incursion into southern Israel from the Gaza Strip, prompting a military response from the Israel Defense Forces. Israel launches numerous air strikes on Lebanon after rockets are fired by Hezbollah and further attempts are made to penetrate Israel. A series of earthquakes occur in Herat Province in Afghanistan, killing over 1,000 people and injuring nearly 2,000, with tremors felt in Iran and Turkmenistan. The earthquakes are the deadliest in the country since 1998.',
)

In [46]:
# Manual override: replace row 140 of `data` with a hand-written
# (date string, headline) pair.
data[140] = (
    '2023 October 7',
    '2023 Israel–Hamas war:Hamas launches an incursion into southern Israel from the Gaza Strip, prompting a military response from the Israel Defense Forces. Israel launches numerous air strikes on Lebanon after rockets are fired by Hezbollah and further attempts are made to penetrate Israel. A series of earthquakes occur in Herat Province in Afghanistan, killing over 1,000 people and injuring nearly 2,000, with tremors felt in Iran and Turkmenistan. The earthquakes are the deadliest in the country since 1998.',
)

In [47]:
# Manual override: row 141 is overwritten with the (date, headline) tuple
# currently stored at row 140, so both rows end up identical.
data[141] = data[140]

In [48]:
# Manual override: row 142 is overwritten with the (date, headline) tuple
# currently stored at row 141, so both rows end up identical.
data[142] = data[141]

In [49]:
# Manual override: set the date of row 143 by hand and keep the headline
# already stored in that row.
data[143] = (
    '2023 October 7',
    data[143][1],
)

In [50]:
# Manual override: replace row 160 of `data` with a hand-written
# (date string, headline) pair.
data[160] = (
    '2023 November 19',
    '2023 Argentine presidential election: Following the first round on 22 October 2023,Javier Mileiwins in the second round of the2023 Argentine presidential election, assuming office on10 DecemberwithVictoria Villarruelas hisvice president. ',
)

In [51]:
# Manual override: row 162 is overwritten with the (date, headline) tuple
# currently stored at row 160, so both rows end up identical.
data[162] = data[160]

In [52]:
#2023
# Turn the rows currently held in `data` into a frame whose index is the
# parsed date.
df = pd.DataFrame(data, columns=['date', 'headline'])
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')

In [53]:
#2023
# Keep an independent snapshot of the current `df` under a year-specific name,
# since `df` is rebuilt further down.
df_2023 = df.copy(deep=True)

In [54]:
#2024
# Download the Wikipedia page for the selected year and parse it.
year = 2024
url = f'https://en.wikipedia.org/wiki/{year}'
# The timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error (4xx/5xx) instead of silently parsing an error
# page, which would only show up later as missing rows.
response.raise_for_status()
html_content = BeautifulSoup(response.content, 'html.parser')

In [55]:
#2024
# Build `data`, a list of (date string, headline text) rows, from the parsed
# page in `html_content`; `year` is expected to be set by the fetch cell.
# One row is appended per matching link, so a list item that contains several
# matching links yields several rows.
soup = html_content
list_items = html_content.find_all('a')  # not read in this cell; kept so the name stays defined
months = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"]
wiki_months = [F"/wiki/{m}" for m in months]
# Unanchored and case-insensitive: any href containing "/wiki/<month name>"
# is selected, not only links to day pages.
month_regex = re.compile('|'.join(wiki_months), re.IGNORECASE)
month_links = soup.find_all('a', href=month_regex)

# Row patterns, compiled once up front.
date_pattern = re.compile(r"(\w+)\s(\d{1,2})")
leading_date_pattern = re.compile(r"^(January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,4}")
citation_pattern = re.compile(r"\[\d+\]")

# Scraping stops after the row whose cleaned text contains this string.
stop_string = "– Assuming thenext United Kingdom general electionhas not already taken place, theParliament elected in 2019will automatically bedissolved, with the next general election taking place no later than January 28, 2025."
data = []

# `date` is only reassigned when a row matches `date_pattern`. Initialise it
# so a leading non-matching row cannot pick up a stale `date` left behind by
# an earlier cell (for example a value from another year).
date = None

for link in month_links:
    parent_li = link.find_parent('li')
    if not parent_li:
        # Link is not inside a list item: nothing to record.
        continue

    row = parent_li.get_text(strip=True)
    # NOTE(review): this takes the first "<word> <1-2 digits>" anywhere in the
    # row, which is not necessarily a month and day; the manual overrides in
    # the following cells appear to compensate for that.
    match = date_pattern.search(row)
    if match:
        month = match.group(1)
        day = match.group(2)
        date = f"{year} {month} {day}"
    # No match: the row keeps the date carried over from the previous row.

    # Drop a leading "<Month> <digits>" prefix and replace [n] citation markers.
    text = leading_date_pattern.sub("", row)
    text = citation_pattern.sub(" ", text)
    data.append((date, text))

    if stop_string in text:
        break

In [56]:
# Manual override: set the date of row 15 by hand and keep the headline
# already stored in that row.
data[15] = (
    '2024 January 12',
    data[15][1],
)

In [57]:
# Manual override: set the date of row 19 by hand and keep the headline
# already stored in that row.
data[19] = (
    '2024 February 8',
    data[19][1],
)

In [58]:
#2024
# Turn the rows currently held in `data` into a frame whose index is the
# parsed date.
df = pd.DataFrame(data, columns=['date', 'headline'])
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')

In [59]:
#2024
# Keep an independent snapshot of the current `df` under a year-specific name.
df_2024 = df.copy(deep=True)

In [60]:
# Stack the per-year frames row-wise, in year order, into one frame.
dfs = [
    df_2018,
    df_2019,
    df_2020,
    df_2021,
    df_2022,
    df_2023,
    df_2024,
]
market_sentiment = pd.concat(dfs, axis='index')

In [61]:
# Write the combined headlines (date index included) to disk.
# The target path is unchanged; the parent directory is created first so the
# write does not fail on a fresh checkout where `data/` does not exist yet.
output_path = Path('data/headlines_2018_2024')
output_path.parent.mkdir(parents=True, exist_ok=True)
market_sentiment.to_csv(output_path, index=True)