# **Scraping Google News Top Stories (No Pagination / 1 Page) - BeautifulSoup4**

### **1. Parsing HTML**

In [342]:
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import requests
import pandas as pd

In [343]:
# URL of the Google News "Top stories" page to scrape (hl=id / gl=ID edition)
url = 'https://news.google.com/topstories?hl=id&gl=ID&ceid=ID:id'

# alternative-1 (online parsing): download the page's HTML
# - timeout keeps the cell from hanging forever on a stalled connection
# - raise_for_status() stops here on an HTTP error (4xx/5xx) instead of
#   silently parsing an error page in the cells below
response = requests.get(url, timeout=30)
response.raise_for_status()
page = response.text

# parse the HTML once; the following cells query this object
soup = BeautifulSoup(page, 'html.parser')

In [None]:
# alternative-2 (offline parsing)
# uncomment the lines below to read the HTML from the local file 'manual.html'
# instead of downloading it; afterwards re-create `soup` from the new `page`
# (BeautifulSoup(page, 'html.parser')) before running the cells that follow
# with open('manual.html', 'r') as f:
#     page = f.read()
# page

### **2. Grab HTML Tag, Class, and Text**

In [344]:
# CSS selector: every element that carries both classes DY5T1d and RZIKme
# and sits anywhere inside an <article> element
headline_selector = 'article .DY5T1d.RZIKme'
result_tl = soup.select(headline_selector)

# preview the text of the first match
result_tl[0].get_text()

'Ajudan Irjen Ferdy Sambo Ditembak, Kapolri Diminta Nonaktifkan Kadiv Propam, IPW: Beliau Saksi Kunci'

In [354]:
# collect the text of every matched headline element, in document order
title = [headline.get_text() for headline in result_tl]
title

['Ajudan Irjen Ferdy Sambo Ditembak, Kapolri Diminta Nonaktifkan Kadiv Propam, IPW: Beliau Saksi Kunci',
 'Kronologi Brigadir J Ditembak Polisi Bharada E di Rumah Irjen Ferdy Sambo',
 'Polisi Tembak Polisi, Keluarga Temukan 4 Bekas Luka Tembakan pada Jenazah Brigpol Yosua',
 'Polisi Tembak Polisi di Rumah Pejabat Polri, Brigadir J dari Propam Tewas',
 '8 Hal yang Diketahui dari Penembakan Maut Ajudan Kadiv Propam',
 'Ibu Pembunuh Shinzo Abe Anggota Gereja Sekte Kristen di Jepang',
 'Pembunuh Shinzo Abe: Identitas dan Motif Tembak Shinzo Abe',
 'Shinzo Abe, Nasionalis Sayap Kanan Tewas di Kota Kuno - Ayo Bandung',
 'Gereja Unifikasi Jepang Jawab soal Dugaan Keanggotaan Shinzo Abe',
 'Fakta Baru Pembunuh Shinzo Abe: dari Motif hingga Senjata',
 'Mengejutkan! 5 Fakta Sosok Mas Bechi Pelaku Pencabulan, Anak Kiai yang Jadi Wakil Rektor PonPes tapi Ogah-ogahan Salat',
 'Pencabulan Anak Kiai Jombang Tak Cukup Sekadar Cabut Izin Pesantren',
 "Sosok 'Sang Raja' Bechi Terjerat Pencabulan-Miliki 

In [346]:
# CSS attribute selector: every element in the page that has a `datetime`
# attribute (the selector itself is not restricted to <article> or <time>)
datetime_selector = '[datetime]'
result_dt = soup.select(datetime_selector)

# preview the first match
result_dt[0]

<time class="WW6dff uQIVzc Sksgp slhocf" datetime="2022-07-11T10:23:24Z">2 jam lalu</time>

In [347]:
# read the `datetime` attribute value of every matched element
timedate = [time_tag['datetime'] for time_tag in result_dt]
timedate

['2022-07-11T10:23:24Z',
 '2022-07-11T10:27:31Z',
 '2022-07-11T10:28:32Z',
 '2022-07-11T08:00:08Z',
 '2022-07-11T10:25:36Z',
 '2022-07-11T07:42:07Z',
 '2022-07-11T08:15:55Z',
 '2022-07-10T05:43:00Z',
 '2022-07-11T10:47:59Z',
 '2022-07-10T22:30:37Z',
 '2022-07-11T04:24:00Z',
 '2022-07-11T04:26:39Z',
 '2022-07-11T02:06:55Z',
 '2022-07-11T04:48:39Z',
 '2022-07-10T17:17:50Z',
 '2022-07-11T04:07:11Z',
 '2022-07-11T06:54:25Z',
 '2022-07-10T11:56:20Z',
 '2022-07-10T23:15:56Z',
 '2022-07-11T07:13:14Z',
 '2022-07-11T05:18:00Z',
 '2022-07-11T08:05:46Z',
 '2022-07-11T06:09:00Z',
 '2022-07-11T05:17:53Z',
 '2022-07-11T05:46:00Z',
 '2022-07-11T06:00:19Z',
 '2022-07-11T03:02:10Z',
 '2022-07-11T07:49:11Z',
 '2022-07-11T03:02:52Z',
 '2022-07-11T06:29:36Z']

In [348]:
# CSS selector: elements inside an <article> that carry all five classes
# wEwyrc, AVN2gc, uQIVzc, Sksgp and slhocf
source_selector = 'article .wEwyrc.AVN2gc.uQIVzc.Sksgp.slhocf'
result_src = soup.select(source_selector)

# preview the text of the first match
result_src[0].get_text()

'Tribunnews.com'

In [349]:
# select the headline elements again; their `href` attribute holds the link
link_selector = 'article .DY5T1d.RZIKme'
result_link = soup.select(link_selector)

# the href shown in the output is a relative URL ('./articles/...'),
# so it has to be turned into an absolute URL before it can be used
result_link[0]['href']

'./articles/CAIiEBAyWcRaEsaLzDtGejLjQO8qGQgEKhAIACoHCAow5o6OCzDE5aADMOO01AY?uo=CAUikAFodHRwczovL3d3dy50cmlidW5uZXdzLmNvbS9uYXNpb25hbC8yMDIyLzA3LzExL2FqdWRhbi1pcmplbi1mZXJkeS1zYW1iby1kaXRlbWJhay1rYXBvbHJpLWRpbWludGEtbm9uYWt0aWZrYW4ta2FkaXYtcHJvcGFtLWlwdy1iZWxpYXUtc2Frc2kta3VuY2nSAQA&hl=id&gl=ID&ceid=ID%3Aid'

In [355]:
# collect the text of every matched source element, in document order
source = [publisher.get_text() for publisher in result_src]
source

['Tribunnews.com',
 'KOMPASTV',
 'Tribunnews.com',
 'detikNews',
 'detikNews',
 'CNN Indonesia',
 'detikNews',
 'Ayo Bandung',
 'CNN Indonesia',
 'detikNews',
 'tvOneNews.com',
 'CNN Indonesia',
 'detikcom',
 'Tribunnews',
 'JPNN.com',
 'detikNews',
 'Suara.com',
 'KOMPASTV',
 'detikFinance',
 'Suara.com',
 'Nasional Kompas.com',
 'Tribunnews',
 'Nasional Kompas.com',
 'detikNews',
 'Nasional Tempo',
 'CNBC Indonesia',
 'CNN Indonesia',
 'Tribunnews',
 'detikNews',
 'CNN Indonesia']

### **3. Turn Relative-URL into Absolute-URL**

In [351]:
# site root against which the relative hrefs are resolved
base_url = 'https://news.google.com/'

# resolve the href of every headline element against base_url and
# gather the resulting absolute URLs in a list, in document order
links = [
    urljoin(base_url, anchor.get('href'))
    for anchor in soup.select('article .DY5T1d.RZIKme')
]

# to print the links one per line instead of as a list, uncomment the lines below
# for inp in links:
#     print(inp)

### **4. Putting All the Data Together into a List**

In [356]:
# Build one (source, title, timedate, link) tuple per headline.
# The timestamp and source of each headline are looked up inside the
# <article> that contains that headline, so a story missing one of those
# fields gets None in that slot. Zipping four independently collected lists
# would instead shift every later row out of alignment (or silently drop rows
# when the lists differ in length).
all_data = []
for headline in soup.select('article .DY5T1d.RZIKme'):
    # the selector guarantees an <article> ancestor; take the nearest one
    article = headline.find_parent('article')
    time_tag = article.select_one('[datetime]')
    source_tag = article.select_one('.wEwyrc.AVN2gc.uQIVzc.Sksgp.slhocf')
    all_data.append((
        source_tag.get_text() if source_tag is not None else None,
        headline.get_text(),
        time_tag['datetime'] if time_tag is not None else None,
        # hrefs are relative ('./articles/...'); resolve them against the site root
        urljoin(base_url, headline.get('href')),
    ))

all_data

[('Tribunnews.com',
  'Ajudan Irjen Ferdy Sambo Ditembak, Kapolri Diminta Nonaktifkan Kadiv Propam, IPW: Beliau Saksi Kunci',
  '2022-07-11T10:23:24Z',
  'https://news.google.com/articles/CAIiEBAyWcRaEsaLzDtGejLjQO8qGQgEKhAIACoHCAow5o6OCzDE5aADMOO01AY?uo=CAUikAFodHRwczovL3d3dy50cmlidW5uZXdzLmNvbS9uYXNpb25hbC8yMDIyLzA3LzExL2FqdWRhbi1pcmplbi1mZXJkeS1zYW1iby1kaXRlbWJhay1rYXBvbHJpLWRpbWludGEtbm9uYWt0aWZrYW4ta2FkaXYtcHJvcGFtLWlwdy1iZWxpYXUtc2Frc2kta3VuY2nSAQA&hl=id&gl=ID&ceid=ID%3Aid'),
 ('KOMPASTV',
  'Kronologi Brigadir J Ditembak Polisi Bharada E di Rumah Irjen Ferdy Sambo',
  '2022-07-11T10:27:31Z',
  'https://news.google.com/articles/CCAiC3ViV29IX0VybHEwmAEB?hl=id&gl=ID&ceid=ID%3Aid'),
 ('Tribunnews.com',
  'Polisi Tembak Polisi, Keluarga Temukan 4 Bekas Luka Tembakan pada Jenazah Brigpol Yosua',
  '2022-07-11T10:28:32Z',
  'https://news.google.com/articles/CAIiEMf747LKG2Pcv0tsIvkkp84qGQgEKhAIACoHCAow5o6OCzDE5aADMO2LqwY?uo=CAUihQFodHRwczovL3d3dy50cmlidW5uZXdzLmNvbS9uYXNpb25hbC8yMDIyLzA

### **5. Convert the List into a DataFrame**

In [353]:
# convert the list of row tuples into a DataFrame with named columns
df = pd.DataFrame(all_data, columns=['source', 'title', 'timedate', 'links'])

# save to csv in the current working directory; a relative path keeps the
# notebook runnable on machines where a user-specific absolute directory
# (e.g. C:\Users\<name>\...) does not exist
df.to_csv('manual.csv', index=False)
df.head()

Unnamed: 0,source,title,timedate,links
0,Tribunnews.com,"Ajudan Irjen Ferdy Sambo Ditembak, Kapolri Dim...",2022-07-11T10:23:24Z,https://news.google.com/articles/CAIiEBAyWcRaE...
1,KOMPASTV,Kronologi Brigadir J Ditembak Polisi Bharada E...,2022-07-11T10:27:31Z,https://news.google.com/articles/CCAiC3ViV29IX...
2,Tribunnews.com,"Polisi Tembak Polisi, Keluarga Temukan 4 Bekas...",2022-07-11T10:28:32Z,https://news.google.com/articles/CAIiEMf747LKG...
3,detikNews,"Polisi Tembak Polisi di Rumah Pejabat Polri, B...",2022-07-11T08:00:08Z,https://news.google.com/articles/CBMicGh0dHBzO...
4,detikNews,8 Hal yang Diketahui dari Penembakan Maut Ajud...,2022-07-11T10:25:36Z,https://news.google.com/articles/CBMiZWh0dHBzO...
