## Blick Webdriver

In [None]:
#import packages
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException,NoSuchWindowException
import time
import configparser

In [None]:
class blick_webdriver:
    """
    Selenium-based scraper that collects article links and article texts from blick.ch.

    Attributes set by the methods:
    driver: the Chrome webdriver instance, created in __init__
    link_list: article links of the sector last passed to get_sector_links
    text: text of the article last passed to get_text_from_link
    """

    # Root of the scraped website; sector pages are built relative to it.
    BASE_URL = "https://www.blick.ch/"
    # XPath of every anchor inside the main area of a sector page.
    ARTICLE_LINKS_XPATH = r'/html/body/div/div/div/main//a'
    # XPath of every paragraph of an article page.
    PARAGRAPH_XPATH = r'//p'
    # Article used by iterate_over_link to check that PARAGRAPH_XPATH still yields text.
    PROBE_ARTICLE_URL = "https://www.blick.ch/ausland/der-deutsche-bahn-konkurrent-flixtrain-startet-zur-sommerzeit-drei-neue-linien-id17478680.html"

    def __init__(self, webdriver_path: str) -> None:
        """
        Start Chrome and open the blick.ch start page.

        Arguments:
        webdriver_path: Webdriver Path to activate the Chrome Driver in the form r"path"

        Raises:
        WebDriverException: the Chrome driver could not be started
        NoSuchWindowException: the blick.ch start page could not be opened
        """
        try:
            # Start the Webdriver at the executable path
            self.driver = self._start_chrome(webdriver_path)
        except WebDriverException as wde:
            print(wde)
            # re-raise the original error so its message and traceback are kept
            raise
        # Check if the blick Website is still Available
        try:
            self.driver.get(self.BASE_URL)
        except Exception as exc:
            print("website no longer exists")
            raise NoSuchWindowException("website no longer exists") from exc

    @staticmethod
    def _start_chrome(webdriver_path: str):
        """Create the Chrome driver with whichever constructor form the installed Selenium accepts."""
        try:
            # Selenium 4 passes the driver executable through a Service object.
            from selenium.webdriver.chrome.service import Service
            return webdriver.Chrome(service=Service(executable_path=webdriver_path))
        except TypeError:
            # Selenium 3 does not know the `service` keyword.
            return webdriver.Chrome(executable_path=webdriver_path)

    @staticmethod
    def _filter_article_links(hrefs) -> list:
        """Keep the href values that end in "html"; anchors without an href (None) are skipped."""
        return [href for href in hrefs if href and href.endswith("html")]

    def save_links_of_website(self, link: str) -> list:
        """
        Open a page and return the article links found on it.

        Arguments:
        link: link for saving all the links.

        Output: list of all hrefs under ARTICLE_LINKS_XPATH that end in "html"

        Raises:
        WebDriverException: the page could not be opened or searched
        """
        try:
            # open the link of the sector website
            self.driver.get(link)
            # find all links of the sector
            anchors = self.driver.find_elements(By.XPATH, self.ARTICLE_LINKS_XPATH)
            return self._filter_article_links(
                anchor.get_attribute('href') for anchor in anchors
            )
        except WebDriverException:
            print("The Webdriver have been closed, or there was no link under the xpath")
            print(link)
            raise

    def get_sector_links(self, sector_name: str, page_count: int = 20) -> None:
        """
        Collect the article links of a sector into self.link_list.

        Arguments:
        sector_name: name of a sector
        page_count: number of overview pages to read, the first page included

        Raises:
        ValueError: one of the overview pages yielded no link; the driver
            window is closed before raising
        """
        sector_url = self.BASE_URL + sector_name + "/"
        # get all link of the website
        self.link_list = self.save_links_of_website(sector_url)
        # Check if there was a link
        if not self.link_list:
            print("Didn't found a link under this Website or the xpath of the function (save_links_of_website) is wrong")
            self.driver.close()
            raise ValueError("no link found on " + sector_url)

        # loop over the remaining overview pages of the sector
        for page in range(2, page_count + 1):
            page_url = sector_url + "page" + str(page) + "/"
            page_links = self.save_links_of_website(page_url)
            # check the links of THIS page, not an entry of the accumulated list
            if not page_links:
                print("Didn't found a link under this Website")
                self.driver.close()
                raise ValueError("no link found on " + page_url)
            self.link_list.extend(page_links)

    def get_text_from_link(self, link: str) -> str:
        """
        Open an article and store the text of all its paragraphs in self.text.

        Arguments:
        link: link from a article

        Output: the article text (also stored in self.text); the paragraphs
        are concatenated without a separator

        Raises:
        WebDriverException: the page could not be opened or searched
        """
        try:
            # open a article
            self.driver.get(link)
            # get the text of an article
            paragraphs = self.driver.find_elements(By.XPATH, self.PARAGRAPH_XPATH)
            self.text = "".join(paragraph.text for paragraph in paragraphs)
        except WebDriverException:
            print("The Webdriver closed or the xpath changed")
            raise
        return self.text

    def iterate_over_link(self) -> list:
        """
        Download the text of every link in self.link_list.

        Output: return the text of every article from a sector

        Raises:
        WebDriverException: the probe article yielded no text, or an article
            could not be opened

        The driver is left open; the caller is responsible for closing it.
        """
        # Check if the xpath of the function get_text_from_link is wrong.
        self.get_text_from_link(self.PROBE_ARTICLE_URL)
        if not self.text:
            print("The xpath of the function get_text_from_link is wrong")
            raise WebDriverException("The xpath of the function get_text_from_link is wrong")

        # iterate over the link list
        output = []
        for link in self.link_list:
            self.get_text_from_link(link)
            output.append(self.text)
        return output

In [None]:
# choose the webdriver path
webdriver_path = r"C:/Users/j/OneDrive/wdb/chromedriver_win32/chromedriver.exe"
# select all sectors for the iteration
iter_list = ["politik","ausland","wirtschaft","meinung"]
# intialize blick webdriver class
blick_w = blick_webdriver(webdriver_path)
# maps each sector name to the list of article texts downloaded for it
blick_output = {}

try:
    # loop over each sector: first collect its links, then download every article
    for sector in iter_list:
        blick_w.get_sector_links(sector)
        blick_output[sector] = blick_w.iterate_over_link()
finally:
    # Always shut the browser down, also when a sector fails halfway through.
    # quit() ends the whole session including the chromedriver process,
    # whereas close() only closes the current window.
    blick_w.driver.quit()

In [None]:
# Write the string representation of the collected sector texts
# to the file "blick_out" (UTF-8 encoded, overwriting any existing file).
with open("blick_out", "w", encoding="utf-8") as out_file:
    serialized_output = str(blick_output)
    out_file.write(serialized_output)