# Use Beautiful Soup for Web Scraping News Headlines

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import date
# Record the run date once and echo it in MM-DD-YY form.
today = date.today()
d = format(today, "%m-%d-%y")
print("date =", d)

date = 06-17-22


## Function to get the date range covered by a news dataset

In [None]:
from datetime import datetime
from dateutil import parser

def get_date_range(date_list):
  """Return the earliest and latest date strings found in ``date_list``.

  Entries equal to ``'N/A'`` are ignored. Every other string is parsed once
  with ``dateutil.parser.parse(..., fuzzy=True)``; strings are ordered by
  their parsed value and the original, unparsed strings are returned.

  Args:
    date_list: iterable of date strings, possibly containing ``'N/A'``.
      Any iterable works, including a one-shot iterator.

  Returns:
    A ``(first, last)`` tuple holding the chronologically earliest and
    latest strings. When several strings parse to the same moment, the one
    that appears first in ``date_list`` is returned.

  Raises:
    ValueError: if no entry other than ``'N/A'`` is present, or if an entry
      cannot be parsed as a date.
  """
  # Parse each usable entry exactly once and keep it paired with its text.
  parsed = [(parser.parse(raw, fuzzy=True), raw) for raw in date_list if raw != 'N/A']
  if not parsed:
    raise ValueError("date_list contains no usable dates")

  def parsed_value(pair):
    return pair[0]

  f = min(parsed, key=parsed_value)[1]
  t = max(parsed, key=parsed_value)[1]
  return (f,t)

# Scrape news from BBC.CO.UK

In [None]:
# Fetch the first page of BBC search results for "COVID-19" and parse its HTML.
bbc = "https://www.bbc.co.uk/search?q=COVID-19&page=1"
res = requests.get(bbc)
# res.content is the raw response body; 'html.parser' selects Python's built-in parser.
soup = BeautifulSoup(res.content, 'html.parser')

In [None]:
# Candidate headline elements: every <span aria-hidden="false"> on the page.
headlines = soup.find_all('span',{'aria-hidden':'false'})
# Metadata snippets, matched by their CSS class string.
# NOTE(review): this rebinds `date`, hiding the `datetime.date` class imported
# in the first cell; later cells re-import it before calling date.today().
date = soup.find_all('span',{'class':"ssrcss-8g95ls-MetadataSnippet ecn1o5v2"})

# Show how many elements each selector matched.
len(headlines),len(date)

(10, 28)

### Scrape headlines from BBC.CO.UK

In [None]:
# Show each scraped headline next to its position on the page.
for i, headline in enumerate(headlines):
  print(i, headline.text)

0 Origins: Hunting the Source of Covid-19
1 Totally Under Control: Trump and Covid-19
2 Newsday: Mass Covid-19 testing in Beijing
3 Covid-19 in the UK
4 Coronavirus: More than 100 TfL workers died from Covid-19
5 Datganiad COVID-19
6 Rygbi Cymru a COVID-19
7 Back to school and Covid-19 in the East of England
8 Covid-19 in the East: Your questions answered
9 Covid-19 visiting rules relaxed at Worcestershire hospitals


### Scrape dates from BBC.CO.UK

In [None]:
from dateutil import parser
# For every category label ('Programmes', 'News', 'Sport') look at the snippet
# just before it and record it when dateutil can read a date out of it.
date_list=[]
for i, snippet in enumerate(date):
  # Check if the string contains a date or not
  if snippet.text in ['Programmes','News','Sport']:
    # NOTE(review): for i == 0 the index i-1 wraps around to the last snippet.
    previous_text = date[i-1].text
    try:
      parser.parse(previous_text, fuzzy=True)
    except (ValueError, OverflowError):
      # dateutil could not read a date from the preceding snippet.
      date_list.append("N/A")
      print(i,"N/A")
    else:
      # The original text is kept, not the parsed value; relative strings
      # such as "3 days ago" also pass the fuzzy parse and are stored as-is.
      print(i, previous_text)
      date_list.append(previous_text)

1 30 October 2021
4 1 November 2020
7 3 days ago
10 6 March 2020
13 3 days ago
15 N/A
18 27 March 2020
20 N/A
23 14 October 2020
26 3 days ago


### 10 News per page, set range to 1000 to get 10k news

In [None]:
# 10 News per page, set range to 1000 to get 10k news
# Walk the BBC search result pages, collecting every headline and, for each
# category label, the date snippet that precedes it.
# NOTE(review): pages are requested from index 0, while the exploratory cell
# used page=1 — confirm that page 0 is a distinct, valid results page.
category_labels = ['Programmes','News','Sport']
date_list=[]
headlines_ls = []
for page in range(1000):
  print(page)
  bbc = "https://www.bbc.co.uk/search?q=COVID-19&page="+str(page)
  res = requests.get(bbc)
  soup = BeautifulSoup(res.content, 'html.parser')

  headlines = soup.find_all('span',{'aria-hidden':'false'})
  snippets = soup.find_all('span',{'class':"ssrcss-8g95ls-MetadataSnippet ecn1o5v2"})

  headlines_ls.extend(headline.text for headline in headlines)

  for pos, snippet in enumerate(snippets):
    # Check if the string contains a date or not
    if snippet.text not in category_labels:
      continue
    # NOTE(review): for pos == 0 the index pos-1 wraps around to the last snippet.
    previous_text = snippets[pos-1].text
    try:
      parser.parse(previous_text, fuzzy=True)
    except (ValueError, OverflowError):
      # No date in front of the label; two identical labels in a row add nothing.
      if snippet.text != previous_text:
        date_list.append("N/A")
    else:
      # Store the original text, not the parsed value.
      date_list.append(previous_text)

  # The CSV is rewritten after every page, presumably as a checkpoint.
  # NOTE(review): pd.DataFrame needs both lists to have the same length.
  dict_news = {'headlines': headlines_ls, 'date': date_list}
  df = pd.DataFrame(dict_news)
  df.to_csv('bbc_news.csv', index=False)

In [None]:
# Tag every BBC headline as real news from source "bbc" and save the table.
n_rows = len(headlines_ls)
dict_news = {
  'label': ["TRUE"] * n_rows,
  'source': ["bbc"] * n_rows,
  'title': headlines_ls,
  'date': date_list,
}
df = pd.DataFrame(dict_news)

path = './New Dataset/'
df.to_csv(path+'bbc_real_news.csv', index=False)

# Scrape news from axios.com

In [None]:
# First page of Axios search results for "COVID-19".
link = 'https://www.axios.com/results?q=COVID-19&page=1'

### Scrape headlines and dates from axios.com

In [None]:
# Download and parse the page stored in `link`.
res = requests.get(link)
soup = BeautifulSoup(res.content, 'html.parser')

# Print the text of every <span data-cy="time-rubric"> element.
for i in soup.find_all("span",{'data-cy':"time-rubric"}):
   print(i.text)

Updated Mar 20, 2022 - COVID
Jun 10, 2022 - COVID
Updated Dec 28, 2021 - Health
Mar 25, 2021 - Technology
Updated Dec 6, 2021 - World
Updated Nov 30, 2020 - Health
Updated Apr 15, 2022 - Health
Updated May 13, 2022 - Health
Updated Apr 28, 2022 - Health
Updated May 20, 2021 - Axios Events


In [None]:
# Headline links and time rubrics are both selected by their data-cy attribute.
headlines = soup.find_all("a",{'data-cy':"headline"})
# NOTE(review): rebinding `date` hides the `datetime.date` class if it was imported earlier.
date = soup.find_all("span",{'data-cy':"time-rubric"})

# Show how many elements each selector matched.
len(headlines),len(date)

(10, 10)

### Get 5*10 samples, 95 max from this website

In [None]:
# Scrape 10 Axios search result pages (the original note says the site
# yields at most 95 results), then save the table twice: under a fixed name
# after every page and, at the end, under a name carrying the date range.
from dateutil import parser
from datetime import date
today = date.today().strftime("%m-%d-%y")
path = '/content/drive/MyDrive/CoVerifi&MedVerifi/JMIR ML Models/New Dataset/'
date_list=[]
headlines_ls = []
# NOTE(review): pages start at index 0 here, while the exploratory cell used page=1.
for page in range(10):

  link = 'https://www.axios.com/results?q=COVID-19&page='+str(page)

  res = requests.get(link)
  soup = BeautifulSoup(res.content, 'html.parser')

  headlines = soup.find_all("a",{'data-cy':"headline"})
  # Own name, so the `date` class imported above is not overwritten.
  date_tags = soup.find_all("span",{'data-cy':"time-rubric"})

  headlines_ls.extend(tag.text for tag in headlines)

  for tag in date_tags:
    try:
      date_list.append(parser.parse(tag.text, fuzzy=True).strftime("%m-%d-%y"))
    except (ValueError, OverflowError):
      # Unreadable rubric: fall back to the scrape date.
      date_list.append(today)
  print(len(date_list))

  # NOTE(review): pd.DataFrame needs both lists to have the same length.
  dict_news = {'headlines': headlines_ls, 'date': date_list}
  df = pd.DataFrame(dict_news)
  df.to_csv(path+'axios_95.csv', index=False)
f,t = get_date_range(date_list)
df.to_csv(path+'axios_95_from_'+f+'_to_'+t+'.csv', index=False)

In [None]:
# Display the earliest and latest entries currently held in date_list.
get_date_range(date_list)

('06-11-20', '06-17-22')

# Scrape news from CBS NEWS

In [None]:
# First page of the CBS News "covid-19" tag listing.
link = 'https://www.cbsnews.com/tag/covid-19/1/'

### Scrape headlines and dates from CBS NEWS

In [None]:
# Download and parse the page stored in `link`.
res = requests.get(link)
soup = BeautifulSoup(res.content, 'html.parser')

# Print the text of every <h3> element to see what that tag is used for.
for i in soup.find_all("h3"):
   print(i.text)

Latest News
U.S.
MoneyWatch
Politics
Health
World
Entertainment
Technology
Science
Crime
Space
Latest Galleries
Latest Videos


In [None]:
# 15 news per page
# NOTE(review): both result lists are capped at 24 items, which does not match
# the "15 per page" remark above — confirm the real page size.
headlines = soup.find_all("h4")[:24]
# NOTE(review): rebinding `date` hides the `datetime.date` class if it was imported earlier.
date = soup.find_all("li",{'class':"item__date"})[:24]

# Show how many elements each selector kept.
len(headlines),len(date)

(97, 94)

### Get 24*200 samples

In [None]:
# Scrape up to 24 items from each of 200 CBS News tag pages (at most 4800
# rows, matching the "4k8" in the output file names).
from dateutil import parser
from datetime import date
today = date.today().strftime("%m-%d-%y")
path = './New Dataset/'
date_list=[]
headlines_ls = []
for page in range(1,201):
  # Progress marker every tenth page.
  if page%10 ==0:
    print(page)

  link = 'https://www.cbsnews.com/tag/covid-19/'+str(page)+'/'

  res = requests.get(link)
  soup = BeautifulSoup(res.content, 'html.parser')

  headlines = soup.find_all("h4")[:24]
  # Own name, so the `date` class imported above is not overwritten.
  date_tags = soup.find_all("li",{'class':"item__date"})[:24]

  headlines_ls.extend(tag.text for tag in headlines)

  for tag in date_tags:
    try:
      date_list.append(parser.parse(tag.text, fuzzy=True).strftime("%m-%d-%y"))
    except (ValueError, OverflowError):
      # Unreadable date text: fall back to the scrape date.
      date_list.append(today)
  print(len(date_list))

  # NOTE(review): pd.DataFrame needs both lists to have the same length.
  dict_news = {'headlines': headlines_ls, 'date': date_list}
  df = pd.DataFrame(dict_news)
  df.to_csv(path+'cbsnews_4k8.csv', index=False)
f,t = get_date_range(date_list)
df.to_csv(path+'cbsnews_4k8_from_'+f+'_to_'+t+'.csv', index=False)

# Scrape news from The Globe and Mail

In [None]:
# Globe and Mail search results for "COVID-19", sorted by relevance (S=relevant).
link = 'https://www.theglobeandmail.com/search/?q=COVID-19&mode=all&S=relevant'

### Scrape headlines and dates from The Globe and Mail

In [None]:
# Download and parse the page stored in `link`.
res = requests.get(link)
soup = BeautifulSoup(res.content, 'html.parser')

# Print the muted card label of every result (section names such as "Canada").
for i in soup.find_all("span",{"class":"c-card__label c-card__label--muted"}):
   print(i.text)

Report on Business
Music
Canada
Canada
World
Health & Fitness
Canada
U.S. Politics
Sponsor Content
British Columbia


In [None]:
# Headline text blocks and timestamps, each matched by CSS class.
headlines = soup.find_all("div",{"class":"c-card__hed-text"})
# NOTE(review): rebinding `date` hides the `datetime.date` class if it was imported earlier.
date = soup.find_all("time",{"class":"c-timestamp"})

# Show how many elements each selector matched.
len(headlines),len(date)

(10, 10)

### Get samples from up to 449 result pages

In [None]:
# Scrape Globe and Mail search result pages 1 to 449, skipping headlines
# whose card label marks them as sponsored or event content.
from dateutil import parser
from datetime import date
today = date.today().strftime("%m-%d-%y")
path = './New Dataset/'
skipped_sections = ['Sponsor Content', 'Paid Post','Globe and Mail Events Content']
date_list=[]
headlines_ls = []
for page in range(1,450):
  # Progress marker every tenth page.
  if page%10 ==0:
    print(page)

  link = 'https://www.theglobeandmail.com/search/?q=COVID-19&mode=all&page='+str(page)+'&S=relevant'

  res = requests.get(link)
  soup = BeautifulSoup(res.content, 'html.parser')

  headlines = soup.find_all("div",{"class":"c-card__hed-text"})
  # Own name, so the `date` class imported above is not overwritten.
  date_tags = soup.find_all("time",{"class":"c-timestamp"})
  source = soup.find_all("span",{"class":"c-card__label c-card__label--muted"})

  # Labels are paired with headlines purely by position.
  for pos, headline in enumerate(headlines):
    if source[pos].text not in skipped_sections:
      headlines_ls.append(headline.text)

  # NOTE(review): dates are not filtered by section, so the two lists can end
  # up with different lengths — confirm skipped cards carry no timestamp.
  for tag in date_tags:
    try:
      date_list.append(parser.parse(tag.text, fuzzy=True).strftime("%m-%d-%y"))
    except (ValueError, OverflowError):
      # Unreadable timestamp: fall back to the scrape date.
      date_list.append(today)
  print('len(date_list):', len(date_list))

  dict_news = {'headlines': headlines_ls, 'date': date_list}
  df = pd.DataFrame(dict_news)
  df.to_csv(path+'theglobeandmail_5k.csv', index=False)
f,t = get_date_range(date_list)
df.to_csv(path+'theglobeandmail_5k_from_'+f+'_to_'+t+'.csv', index=False)

In [None]:
# Save the current `df` again, this time under a "4k4" file name that carries
# the date range of date_list.
f,t = get_date_range(date_list)
df.to_csv(path+'theglobeandmail_4k4_from_'+f+'_to_'+t+'.csv', index=False)

In [None]:
# Build date_list2 from date_list: any date later than 06-17-2022 keeps its
# month and day but is re-stamped with the year suffix "-21"; all other
# entries are copied unchanged.
# NOTE(review): presumably the future dates come from source text without a
# year, which dateutil completes with the current year — confirm.
cutoff = parser.parse('06-17-2022')
date_list2 = []
for raw_date in date_list:
  parsed = parser.parse(raw_date)
  if parsed > cutoff:
    corrected = parsed.strftime("%m-%d")+'-21'
    print(raw_date)
    date_list2.append(corrected)
    print(corrected)
  else:
    date_list2.append(raw_date)


## Scrape news from New Scientist


In [None]:
# Second page of New Scientist search results for "COVID-19".
link = 'https://www.newscientist.com/search/?q=COVID-19&page=2'

### Scrape headlines and dates from New Scientist

In [None]:
# Download and parse the page stored in `link`.
res = requests.get(link)
soup = BeautifulSoup(res.content, 'html.parser')

# Print the text of every published-date span.
for i in soup.find_all("span",{"class":"published-date font-sans-serif-xxs--regular"}):
   print(i.text)

18 May 2022
23 April 2020
13 January 2022
2 March 2020
5 April 2022
5 March 2021
17 March 2021
11 March 2022
28 July 2020
13 April 2021


In [None]:
# Card headings and published-date spans, each matched by CSS class.
headlines = soup.find_all("h2",{"class":"card__heading"})
# NOTE(review): rebinding `date` hides the `datetime.date` class if it was imported earlier.
date = soup.find_all("span",{"class":"published-date font-sans-serif-xxs--regular"})

# Show how many elements each selector matched.
len(headlines),len(date)

(10, 10)

### Get max 1337

In [None]:
#Get max 1337
# Scrape New Scientist search result pages 1 to 139, collecting each card
# heading and its published date.
from dateutil import parser
from datetime import date
today = date.today().strftime("%m-%d-%y")
path = '../New Dataset/'
date_list=[]
headlines_ls = []
for page in range(1,140):
  # Progress marker every tenth page.
  if page%10 ==0:
    print(page)

  link = 'https://www.newscientist.com/search/?q=COVID-19&page='+str(page)

  res = requests.get(link)
  soup = BeautifulSoup(res.content, 'html.parser')

  headlines = soup.find_all("h2",{"class":"card__heading"})
  # Own name, so the `date` class imported above is not overwritten.
  date_tags = soup.find_all("span",{"class":"published-date font-sans-serif-xxs--regular"})

  headlines_ls.extend(tag.text for tag in headlines)

  for tag in date_tags:
    try:
      date_list.append(parser.parse(tag.text, fuzzy=True).strftime("%m-%d-%y"))
    except (ValueError, OverflowError):
      # Unreadable date text: fall back to the scrape date.
      date_list.append(today)
  print(len(date_list))

  # NOTE(review): pd.DataFrame needs both lists to have the same length.
  dict_news = {'headlines': headlines_ls, 'date': date_list}
  df = pd.DataFrame(dict_news)
  df.to_csv(path+'newscientist_1k3.csv', index=False)
f,t = get_date_range(date_list)
df.to_csv(path+'newscientist_1k3_from_'+f+'_to_'+t+'.csv', index=False)

# Scrape Fake News from poynter.org

In [None]:
# Landing page of Poynter's IFCN COVID-19 misinformation listing.
link = 'https://www.poynter.org/ifcn-covid-19-misinformation/'

### Scrape headlines and dates from poynter.org

In [None]:
# Print the text of every span with class "entry-title--red".
# NOTE(review): `link` is not fetched in this cell, so `soup` is whatever
# page was parsed by the previously executed cell — confirm that is intended.
for i in soup.find_all("span",{"class":"entry-title--red"}):
   print(i.text)

<html>
<head><title>403 Forbidden</title></head>
<body>
<center><h1>403 Forbidden</h1></center>
<hr/><center>nginx</center>
</body>
</html>

In [None]:
from urllib.request import Request, urlopen

# Request the page with a browser-like User-Agent header
# (presumably to avoid the 403 a plain request receives — confirm).
req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
# The context manager closes the HTTP response once the body has been read.
with urlopen(req) as response:
  webpage = response.read()

soup = BeautifulSoup(webpage, 'html.parser')

In [None]:
# Collect labels, claims and dates from the page currently held in `soup`.
date_list = []
headlines_ls = []
labels = []
for title_tag in soup.find_all("span",{"class":"entry-title--red"}):
  # The red span holds the label; the node right after it is kept as the headline.
  headlines_ls.append(title_tag.next_sibling)
  labels.append(title_tag.text)
for paragraph in soup.find_all('p',{"class":'entry-content__text'}):
  try:
    date_list.append(parser.parse(paragraph.text, fuzzy=True).strftime("%m-%d-%y"))
  except (ValueError, OverflowError):
    # Paragraphs without a readable date are skipped.
    continue
# Show the three list sizes side by side.
len(date_list), len(headlines_ls), len(labels)

(15, 15, 15)

### Get 15*1000

In [None]:
#Get 15*1000
# Scrape Poynter listing pages 1 to 1000, collecting each entry's label, the
# claim text that follows it, and the entry date.
from dateutil import parser
from datetime import date
today = date.today().strftime("%m-%d-%y")
path = '../New Dataset/'
date_list = []
headlines_ls = []
labels = []
for page in range(1,1001):
  # Progress marker every tenth page.
  if page%10 ==0:
    print(page)

  link = 'https://www.poynter.org/ifcn-covid-19-misinformation/page/'+str(page)+'/'

  req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
  # The context manager closes the HTTP response once the body has been read.
  with urlopen(req) as response:
    webpage = response.read()

  soup = BeautifulSoup(webpage, 'html.parser')

  for title_tag in soup.find_all("span",{"class":"entry-title--red"}):
    claim = title_tag.next_sibling
    # Convert to a plain str so the stored value does not keep this page's
    # parse tree alive; a missing sibling stays None.
    headlines_ls.append(claim if claim is None else str(claim))
    labels.append(title_tag.text)
  for paragraph in soup.find_all('p',{"class":'entry-content__text'}):
    # Paragraphs containing 'Fact-Checked' are never treated as dates.
    if 'Fact-Checked' not in paragraph.text:
      try:
        date_list.append(parser.parse(paragraph.text, fuzzy=True).strftime("%m-%d-%y"))
      except (ValueError, OverflowError):
        # Paragraphs without a readable date are skipped.
        continue
  print(len(date_list), len(headlines_ls), len(labels))

  # NOTE(review): pd.DataFrame needs all three lists to have the same length.
  dict_news = {'headlines': headlines_ls, 'date': date_list, 'label':labels}
  df = pd.DataFrame(dict_news)
  df.to_csv(path+'fake_news_Poynter_15k.csv', index=False)
f,t = get_date_range(date_list)
df.to_csv(path+'fake_news_Poynter_15k_from_'+f+'_to_'+t+'.csv', index=False)

# Combine news dataset

In [1]:
import re

def clean_sentence(ls):
    """Return a list with whitespace noise removed from each string in ``ls``.

    For every string, two kinds of match are deleted: a run of spaces or
    newlines at a line start that is followed, at the start of a later line,
    by spaces or tabs; and any spaces or tabs directly before a colon. All
    remaining tab characters are then removed.
    """
    noise = re.compile(r'(^[ \n]+^[ \t]+|[ \t]+(?=:))', flags=re.M)
    cleaned = []
    for sentence in ls:
        without_noise = noise.sub('', sentence)
        cleaned.append(without_noise.replace('\t', ''))
    return cleaned

In [3]:
import numpy as np
import pandas as pd

# Build the external validation set: real headlines (label 1) from the news
# scrapes plus fake headlines (label 0) from the Poynter fact-check scrape.
path = './New Dataset/'
# Every input is read directly from `path`, the directory the scraping cells
# that use './New Dataset/' write to.
# NOTE(review): the Axios, New Scientist and Poynter cells save under other
# base directories — confirm those files are available in `path`.
bbc_path = path+'bbc_real_news.csv'
axios_path = path+'axios_95_from_06-11-20_to_06-17-22.csv'
cbsnews_path = path+'cbsnews_4k8_from_02-16-21_to_06-17-22.csv'
newscientist_path = path+'newscientist_1k3_from_01-21-20_to_06-15-22.csv'
poynter_news_path = path+'fake_news_Poynter_15k_from_03-20-20_to_02-23-22.csv'

# The BBC file names its headline column 'title'; align it with the others.
bbc_news = pd.read_csv(bbc_path, usecols=['title']).rename(columns={'title': 'headlines'})
axios_news = pd.read_csv(axios_path, usecols=['headlines'])
cbsnews = pd.read_csv(cbsnews_path, usecols = ['headlines'])
newscientist_news = pd.read_csv(newscientist_path, usecols = ['headlines'])

dfReal_total = pd.concat([bbc_news, axios_news, cbsnews, newscientist_news])
dfReal_total = dfReal_total.dropna()
dfReal_total = pd.DataFrame({'title':clean_sentence(dfReal_total['headlines']), 'label':1})


poynter_news = pd.read_csv(poynter_news_path)
# Rows without a headline or a label can be neither cleaned nor classified.
poynter_labelled = poynter_news.dropna(subset=['headlines', 'label'])

# All kinds of labels
label_categories = np.array(poynter_labelled['label'].drop_duplicates())

# Collect the labels that contain none of the keywords in fake_labels.
fake_labels = ['false', 'misleading','no evidence','fake']
true_labels = [
  label for label in label_categories
  if not any(fake_label in str(label).lower() for fake_label in fake_labels)
]
#print(true_labels)

# Keep only the rows whose label is not in true_labels.
poynter_news_cleaned = poynter_labelled[~poynter_labelled['label'].isin(true_labels)]

dfFake_total = pd.DataFrame({'title': clean_sentence(poynter_news_cleaned['headlines']), 'label':0})

dfTotal = pd.concat([dfReal_total,dfFake_total])
# Report the sizes of the combined, real and fake sets.
print(len(dfTotal), len(dfReal_total), len(dfFake_total))
dfTotal.to_csv(path+'New_External_Validation_Dataset.csv', index=False)
