## Google Scholar scraping

Import all the necessary libraries:

In [1]:
import requests
from bs4 import BeautifulSoup
import scrapy
from scrapy.crawler import CrawlerProcess
import csv

Send test requests to make sure that resource is reachable:

In [2]:
urls = ['https://scholar.google.com/scholar?start=0&q=web+parsing&hl=en&as_sdt=0,5',
        'https://scholar.google.com/scholar?start=10&q=web+parsing&hl=en&as_sdt=0,5',
        'https://scholar.google.com/scholar?start=20&q=web+parsing&hl=en&as_sdt=0,5']
for url in urls:
    try:
        data = requests.get(url, timeout=3)
        data.raise_for_status()
    except requests.exceptions.HTTPError as errh:
        print("Http Error: ", errh)
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting: ",errc)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error: ", errt)
    except requests.exceptions.RequestException as err:
        print("Another Error:", err)
    else: 
        print(data.status_code)

200
200
200


Create scraping class with parsing function:

In [3]:
articles_list = []

class ArticleSpider(scrapy.Spider):
    name = 'article'

    def start_requests(self):
        urls = ['https://scholar.google.com/scholar?start=0&q=web+parsing&hl=en&as_sdt=0,5',
                'https://scholar.google.com/scholar?start=10&q=web+parsing&hl=en&as_sdt=0,5',
                'https://scholar.google.com/scholar?start=20&q=web+parsing&hl=en&as_sdt=0,5']
        return [scrapy.Request(url=url, callback=self.parse) for url in urls]

    def parse(self, response):
        bs = BeautifulSoup(response.text, 'html.parser')
        articles = bs.find_all('div', {'class': 'gs_ri'})

        for article in articles:
            article_title = article.find('h3').find('a').text.encode("ascii", "ignore").decode()
            article_authors = article.find('div', {'class': 'gs_a'}).text
            article_authors = article_authors[:article_authors.find('-') - 1].encode("ascii", "ignore").decode()
            article_pubication_year = article.find('div', {'class': 'gs_a'}).text
            article_pubication_year = article_pubication_year[(article_pubication_year.rindex('- ') - 5):(article_pubication_year.rindex('- '))]
            article_partial_text = article.find('div', {'class': 'gs_rs'}).text.replace('\n', '').encode("ascii", "ignore").decode()
            
            articles_list.append([article_title, article_authors, article_pubication_year, article_partial_text])

Start scraping process:

In [4]:
process = CrawlerProcess()
process.crawl(ArticleSpider)
process.start()

2022-10-12 17:02:46 [scrapy.utils.log] INFO: Scrapy 2.6.3 started (bot: scrapybot)
2022-10-12 17:02:46 [scrapy.utils.log] INFO: Versions: lxml 4.9.1.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 2.0.1, Twisted 22.8.0, Python 3.10.4 (tags/v3.10.4:9d38120, Mar 23 2022, 23:13:41) [MSC v.1929 64 bit (AMD64)], pyOpenSSL 22.0.0 (OpenSSL 1.1.1n  15 Mar 2022), cryptography 36.0.2, Platform Windows-10-10.0.19044-SP0
2022-10-12 17:02:46 [scrapy.crawler] INFO: Overridden settings:
{}
2022-10-12 17:02:46 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-10-12 17:02:46 [scrapy.extensions.telnet] INFO: Telnet Password: 95e7282c6d6cc5a9
2022-10-12 17:02:46 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2022-10-12 17:02:46 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMi

Export data into the file:

In [5]:
with open ('articles.csv', 'w') as file:
    writer = csv.writer(file, delimiter=';')
    writer.writerow('title;authors;publication_year;partial_text'.split(';'))
    for row in articles_list:
        writer.writerow(';'.join(row).split(';'))