In [1]:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time

In [2]:
class Adahunse:
    """
    Selenium-driven collector that walks the article list of a news site,
    opens each article in a new window and writes its text, one sentence
    per line, to a UTF-8 text file.

    Typical use: build an instance, then call `start()` once. `start()` owns
    the browser session and always closes the output file and the browser
    when it returns or fails.
    """

    # Number of trailing <p> elements on an article page that are skipped
    # when collecting text (treated as adverts).
    TRAILING_AD_PARAGRAPHS = 12

    def __init__(self, no_articles_to_collect, website=None, articles_collected=None, chrome_path=None, prefs=None, filename=None):
        """
        Store the scraping configuration and open the output file.

        Args:
            no_articles_to_collect: target number of articles; `start()` loops
                until `articles_collected` reaches it.
            website: URL of the article list page.
                Defaults to "https://hausa.legit.ng".
            articles_collected: number of articles already collected, i.e. the
                offset to resume from. Defaults to 0.
            chrome_path: path to the chromedriver executable.
            prefs: Chrome "prefs" dict passed as an experimental option.
                Defaults to a profile that disables image loading.
            filename: output file path. Defaults to a timestamped name under
                "../data/legitng/" that also embeds the starting offset.

        Note: the output file is opened (and truncated) here, not in `start()`.
        """
        self.no_articles_to_collect = no_articles_to_collect
        self.website = website or "https://hausa.legit.ng"
        self.articles_collected = articles_collected or 0

        self.driver = None
        # NOTE(review): machine-specific default; pass chrome_path explicitly
        # when running anywhere else.
        self.chrome_path = chrome_path or 'C:\\Users\\NiniolaAdegboyega\\Downloads\\chromedriver_win32\\chromedriver.exe'
        # disable images in browser. For faster loading.
        self.prefs = prefs or {"profile.managed_default_content_settings.images" : 2}

        # XPath template of an article link; `{}` is filled with the article's
        # position in the list.
        self.article_path = "/html/body/div[3]/div/div[2]/div[2]/section/section[2]/div[2]/article[{}]/div/a"
        # XPath of the "Load More" link below the article list.
        self.scroll_button_path = "/html/body/div[3]/div/div[2]/div[2]/section/section[2]/a"
        # XPath of the cancel button on the subscription pop-up.
        self.cancel_subscription_alert_path = '//*[@id="onesignal-popover-cancel-button"]'

        self.filename = filename or "../data/legitng/hausa_legit_ng_{}_{}.txt".format(
            time.strftime("%Y%m%d-%H%M%S"), self.articles_collected)
        self.file = open(self.filename, "w+", encoding="utf-8")

    def _find(self, xpath):
        """Return the first element matching `xpath` in the current window."""
        # find_element(By.XPATH, ...) replaces the find_element_by_xpath
        # helper, which newer Selenium releases no longer provide.
        return self.driver.find_element(By.XPATH, xpath)

    def _scroll_into_view(self, xpath):
        """Scroll the element matching `xpath` into the browser viewport."""
        self.driver.execute_script("arguments[0].scrollIntoView();", self._find(xpath))

    def _shutdown(self):
        """Close the output file and end the browser session, if any."""
        if not self.file.closed:
            self.file.close()
        if self.driver is not None:
            try:
                # quit() ends the whole session (all windows and the driver
                # process); close() would only close the current window.
                self.driver.quit()
            finally:
                self.driver = None

    def init_driver(self):
        """
        Create the Chrome webdriver with the configured prefs (images
        disabled by default) and attach a reusable 5-second explicit wait
        to it as `driver.wait`.

        Returns:
            The configured webdriver instance.
        """
        options = Options()
        options.add_experimental_option("prefs", self.prefs)

        driver = webdriver.Chrome(self.chrome_path, options=options)
        # define a generic wait to be used throughout
        driver.wait = WebDriverWait(driver, 5)

        return driver

    def __check_no_of_articles(self):
        """
        Count the articles currently present on the loaded list page.

        Returns:
            Number of headline <span> elements found in the page source.
        """
        content = self.driver.page_source
        # NOTE(review): no parser is named, so bs4 picks whichever one is
        # installed; pin one if results must be reproducible across machines.
        soup = bs(content)
        headlines = soup.find_all('span', attrs={
            'class': 'c-article-card-horizontal__headline-hover-inner'})
        return len(headlines)

    def __write_article_to_text(self, sentences):
        """
        Split `sentences` on "." and write each non-empty sentence on its
        own line of the open output file.

        Fragments are stripped before the emptiness check, so whitespace-only
        pieces (e.g. the tail after a final ". ") do not produce blank lines.
        """
        for fragment in sentences.split("."):
            sentence = fragment.strip()
            if sentence:
                self.file.write(sentence + "\n")

    def scroll_to_article_range(self):
        """
        Click "Load More" until the list holds at least one article beyond
        the next one to collect, then scroll the most recently collected
        article into view. Avoids ElementNotInteractable errors.
        """
        # check number of articles currently on page
        no_articles_on_page = self.__check_no_of_articles()

        # ensure that there is at least one more article on page than our article number
        while no_articles_on_page < self.articles_collected + 2:
            # scroll to "Load More" link and click it
            self._scroll_into_view(self.scroll_button_path)
            time.sleep(2)
            self._find(self.scroll_button_path).click()
            time.sleep(1)

            # check if article we are interested in is now on page
            no_articles_on_page = self.__check_no_of_articles()
            self._scroll_into_view(self.scroll_button_path)
            time.sleep(1)

        # scroll the article into view
        self._scroll_into_view(self.article_path.format(self.articles_collected))

    def on_article_action(self):
        """
        Collect the text of the article loaded in the current window.

        Parses the page source and passes the text of every <p> element,
        except the trailing `TRAILING_AD_PARAGRAPHS` ones, to
        `__write_article_to_text`.
        """
        # NOTE(review): the page source is read without an explicit wait for
        # the article body to be present.
        content = self.driver.page_source
        soup = bs(content)

        # ignore the last p-elements. They are adverts.
        for paragraph in soup.findAll('p')[:-self.TRAILING_AD_PARAGRAPHS]:
            self.__write_article_to_text(paragraph.text)

    def open_article_on_new_page_and_collect(self):
        """
        Open the next uncollected article in a new window, write its text to
        the output file, increment `articles_collected`, then close that
        window and return to the list window.

        Intended to be called repeatedly from the loop in `start()`.
        """
        # get link of article to be collected
        link = self._find(self.article_path.format(
            self.articles_collected + 1)).get_attribute("href")

        # open article in new window and switch to it; the URL is passed as a
        # script argument so quotes in it cannot break the JavaScript.
        self.driver.execute_script("window.open(arguments[0]);", link)
        windows = self.driver.window_handles
        self.driver.switch_to.window(windows[-1])

        # collect the article into file & increment articles_collected
        self.on_article_action()
        self.articles_collected += 1

        # close new window and return to main window
        self.driver.close()
        self.driver.switch_to.window(windows[0])

    def start(self):
        """
        Run the whole collection: start the browser, load the site, dismiss
        the subscription pop-up and collect articles until
        `no_articles_to_collect` is reached.

        Prints a progress line every 100 articles. The output file and the
        browser are closed on success and on failure alike.

        Raises:
            Any exception raised while scraping, re-raised unchanged after
            the number of collected articles has been printed.
        """
        try:
            self.driver = self.init_driver()
            self.driver.get(self.website)
            time.sleep(2)
            # close subscription alert
            self.driver.wait.until(EC.visibility_of_element_located(
                (By.XPATH, self.cancel_subscription_alert_path)))
            self._find(self.cancel_subscription_alert_path).click()

            # while we have not yet collected our target number of articles
            while self.articles_collected < self.no_articles_to_collect:
                if self.articles_collected > 0:
                    # ensure that articles to be collected is on-page
                    self.scroll_to_article_range()

                self.open_article_on_new_page_and_collect()

                # print a message for every 100 articles
                if not self.articles_collected % 100:
                    print("{} articles collected".format(self.articles_collected))

                # after returning to main window, check for element of article just collected
                self.driver.wait.until(EC.visibility_of_element_located(
                    (By.XPATH, self.article_path.format(self.articles_collected))))
        # for any exception, print articles_collected & re-raise
        except Exception:
            print("Failed after {} articles".format(self.articles_collected))
            raise
        finally:
            # close file & browser exactly once, whatever happened above
            self._shutdown()

In [3]:
# Build the scraper: target 1000 articles, counting from zero. All other
# settings (website, chromedriver path, browser prefs, output filename) fall
# back to the class defaults. Constructing the object already opens the output file.
adhns = Adahunse(no_articles_to_collect=1000, articles_collected=0)

In [4]:
# Run the full collection: launches Chrome, loads the site and collects
# articles until the target is reached, printing a progress line every
# 100 articles.
adhns.start()

100 articles collected
200 articles collected
300 articles collected
400 articles collected
500 articles collected
600 articles collected
700 articles collected
800 articles collected
900 articles collected
1000 articles collected
