In [1]:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time

In [2]:
class Awokose:
    """
    Scraper for the label listing pages of hausaloaded.com.

    For every source URL it walks the paginated listing, opens each article in a
    separate browser window and writes the article text to a plain-text file,
    one sentence per line.

    The output file is opened by the constructor and closed at the end of
    ``start``, so one instance supports a single ``start`` call.
    """

    def __init__(self, chrome_path=None, prefs=None, filename=None, unallowed_tokens=None, source_to_start_from=None, page_to_start_from=None, article_to_start_from=None):
        """
        Store the crawl configuration and open the output file.

        Parameters
        ----------
        chrome_path : str, optional
            Path to the chromedriver executable. Falls back to a hard-coded
            local Windows path when omitted.
        prefs : dict, optional
            Chrome "prefs" experimental option. The default disables image
            loading so pages load faster.
        filename : str, optional
            Output file path. The default is a timestamped name under
            ``../data/hausaloaded/`` that also encodes the three start offsets.
        unallowed_tokens : optional
            Accepted for interface compatibility; not used by this class.
        source_to_start_from : int, optional
            Index into ``self.sources`` of the first source to crawl (default 0).
        page_to_start_from : int, optional
            Listing page to resume from in the first crawled source; 0 and 1
            both mean the first page (default 0).
        article_to_start_from : int, optional
            Index of the first article to collect on the first crawled page
            (default 0).

        Notes
        -----
        The output file is opened with mode ``"w+"``, so an existing file with
        the same name is truncated as soon as the object is constructed.
        """
        self.page_to_start_from = page_to_start_from or 0
        self.article_to_start_from = article_to_start_from or 0
        self.source_to_start_from = source_to_start_from or 0

        self.sources = ["https://www.hausaloaded.com/search/label/Wasanni",
                        "https://www.hausaloaded.com/search/label/Labarai",
                        "https://www.hausaloaded.com/search/label/Politics%20Musics",
                        "https://www.hausaloaded.com/search/label/Fadakarwa",
                        "https://www.hausaloaded.com/search/label/kannywood"
                       ]

        self.driver = None
        self.chrome_path = chrome_path or 'C:\\Users\\NiniolaAdegboyega\\Downloads\\chromedriver_win32\\chromedriver.exe'
        # disable images in browser. For faster loading.
        self.prefs = prefs or {"profile.managed_default_content_settings.images" : 2}

        # XPath of the pager link that leads to the next (older) listing page
        self.load_more_button_path = '//*[@id="Blog1_blog-pager-older-link"]'

        self.filename = filename or "../data/hausaloaded/hausaloaded.com_{}_{}_{}_{}.txt".format(
            time.strftime("%Y%m%d-%H%M%S"),
            self.source_to_start_from,
            self.page_to_start_from,
            self.article_to_start_from
        )
        self.file = open(self.filename, "w+", encoding="utf-8")

    def init_driver(self):
        """
        Create the Chrome webdriver with ``self.prefs`` applied.

        A ``WebDriverWait`` with a 5 second timeout is attached to the driver
        as ``driver.wait``; the other methods of this class do not use it.

        Returns
        -------
        The configured Chrome webdriver instance.
        """
        options = Options()
        options.add_experimental_option("prefs", self.prefs)

        # NOTE(review): the driver path is passed positionally; confirm this
        # signature is still accepted by the installed Selenium version.
        driver = webdriver.Chrome(self.chrome_path, options=options)
        # define a generic wait to be used throughout
        driver.wait = WebDriverWait(driver, 5)

        return driver

    def __check_no_of_articles(self):
        """
        Return the number of article titles found on the current page.

        Titles are the ``h2`` elements with class ``post-title entry-title``.
        """
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = bs(self.driver.page_source, "html.parser")
        return len(soup.find_all('h2', attrs={'class': 'post-title entry-title'}))

    def __write_article_to_text(self, sentences):
        """
        Split ``sentences`` on "." and write each sentence on its own line.

        Fragments that are empty after stripping whitespace are skipped, so
        runs of periods or trailing whitespace do not produce blank lines.
        """
        for fragment in sentences.split("."):
            sentence = fragment.strip()
            if sentence:
                self.file.write(sentence + "\n")

    def __on_article_action(self):
        """
        Write the text of the article shown in the current window to the file.

        Each ``div`` with class ``post-body entry-content`` is handled
        separately by ``__write_article_to_text``.
        """
        soup = bs(self.driver.page_source, "html.parser")

        for paragraph in soup.find_all('div', attrs={'class': "post-body entry-content"}):
            self.__write_article_to_text(paragraph.text)

    def __click_next_page(self, pause=0):
        """
        Scroll the pager link into view and click it.

        Scrolling first avoids clicking an element that is outside the
        viewport. ``pause`` is the number of seconds to sleep between
        scrolling and clicking. Raises whatever Selenium raises when the link
        cannot be found or clicked.
        """
        locator = (By.XPATH, self.load_more_button_path)
        self.driver.execute_script("arguments[0].scrollIntoView();",
                                   self.driver.find_element(*locator))
        if pause:
            time.sleep(pause)
        self.driver.find_element(*locator).click()

    def __scroll_to_article_range(self):
        """
        Advance the listing to the page given by ``self.page_to_start_from``.

        Reaching page N takes N - 1 clicks on the pager link. Afterwards the
        counter holds the number of pages already skipped, so the increment
        done in ``start`` after each collected page keeps it absolute instead
        of restarting from 1.
        """
        pages_to_skip = max(self.page_to_start_from - 1, 0)
        for _ in range(pages_to_skip):
            self.__click_next_page()
            time.sleep(1)
        self.page_to_start_from = pages_to_skip

    def __open_article_on_new_page_and_collect(self, href):
        """
        Open ``href`` in a new window, collect its text and close the window.

        ``self.article_to_start_from`` is incremented only after the article
        was written successfully. The article window is closed and focus
        returns to the listing window even when collection raises.
        """
        listing_window = self.driver.current_window_handle
        # Pass the URL as a script argument instead of formatting it into the
        # JavaScript source, so quotes in the URL cannot break the script.
        self.driver.execute_script("window.open(arguments[0]);", href)
        self.driver.switch_to.window(self.driver.window_handles[-1])

        try:
            self.__on_article_action()
            self.article_to_start_from += 1
        finally:
            self.driver.close()
            self.driver.switch_to.window(listing_window)

    def __collect_page(self):
        """
        Collect every article linked from the current listing page.

        The hrefs of all elements with class ``readmore2`` are gathered first,
        then each one from index ``self.article_to_start_from`` onwards is
        opened in a new window and written to the file.
        """
        # Read all hrefs up front: the listing elements are not touched again
        # once article windows start opening.
        href_list = [element.get_attribute("href")
                     for element in self.driver.find_elements(By.CLASS_NAME, "readmore2")]

        for href in href_list[self.article_to_start_from:]:
            self.__open_article_on_new_page_and_collect(href)
            time.sleep(1)

    def start(self):
        """
        Crawl every source from ``self.source_to_start_from`` onwards.

        For each source, listing pages are collected one after another until
        clicking the pager link fails (for example because there is no further
        page). The source index and page counter are then printed and the
        crawl moves on to the next source.

        Exceptions raised while collecting articles are not caught here. The
        output file and the browser are released in every case, and
        ``self.page_to_start_from`` / ``self.article_to_start_from`` keep the
        values reached at the point of failure.
        """
        try:
            self.driver = self.init_driver()

            remaining_sources = enumerate(self.sources[self.source_to_start_from:],
                                          start=self.source_to_start_from)
            for source_index, source in remaining_sources:
                print("Starting Source {}".format(source))
                self.driver.get(source)
                if self.page_to_start_from:
                    self.__scroll_to_article_range()

                while True:
                    self.__collect_page()
                    self.article_to_start_from = 0 # reinitialize article_to_start_from
                    self.page_to_start_from += 1 # increment the number of pages collected

                    try:
                        # scroll Next button into view and click it
                        self.__click_next_page(pause=1)
                    except Exception:
                        print("Error occured at Source {} Page {} Article {}".format(source_index,
                                                                                     self.page_to_start_from,
                                                                                     self.article_to_start_from))
                        break

                    time.sleep(1)
                self.page_to_start_from = 0 # reinitialize page_to_start_from
        finally:
            # close file & driver, also when the crawl aborts with an exception
            self.file.close()
            if self.driver is not None:
                # quit() ends the whole browser session, not just one window
                self.driver.quit()

In [3]:
# Start at index 3 of Awokose.sources (the "Fadakarwa" label), skipping the
# earlier sources. Constructing the object also opens the output file.
awks = Awokose(source_to_start_from=3)

In [4]:
# Launch Chrome and crawl the remaining sources. A line is printed when each
# source starts and when clicking its pager link fails.
awks.start()

Starting Source https://www.hausaloaded.com/search/label/Fadakarwa
Error occured at Source 3 Page 5 Article 0
Starting Source https://www.hausaloaded.com/search/label/kannywood
Error occured at Source 4 Page 111 Article 0
