In [26]:
#import packages
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException,NoSuchWindowException
import time
import configparser

In [11]:
class nzz_webdriver:
    def __init__(self,webdriver_path:str,e_mail_login:str,password_login:str):
        """
        Arguments:
        webdriver_path: Webdriver Path to acivate the Chrome Driver in the form r"path"
        e_mail_login: e-mail Adress of the nzz account
        password_login: password of the nzz account
        """
        try:
            # without this option the webdriver crashes sometimes (Stackoverflow: https://stackoverflow.com/questions/53902507/unknown-error-session-deleted-because-of-page-crash-from-unknown-error-cannot)
            options = webdriver.ChromeOptions()
            options.add_argument("--no-sandbox")
            # Start the Webdriver at the executable path
            self.driver = webdriver.Chrome(executable_path=webdriver_path,options=options)
        except WebDriverException as wde:
            print(wde.__context__)
            raise WebDriverException
        except Exception as e:
            print(e.__context__)
            raise e
        # Check if the nzz Website is still Available
        try:
            self.driver.get("https://www.nzz.ch/")
    
        except:
            print("website no longer exists")
            raise NoSuchWindowException
   
        # open the register url of nzz, fill in the e-mail adress and click on the login field
        try:
            # open the register url
            self.driver.get("https://abo.nzz.ch/registrieren/?sso=1&target=https%3A%2F%2Fwww.nzz.ch%2F")
            # find the e-mail field with the XPath and fill it with the e-mail adress on it.
            self.driver.find_element(by=By.XPATH, value="//*[@id='c1-login-field']").send_keys(e_mail_login)
            # find the login field with the XPath and click on it.
            self.driver.find_element(by=By.XPATH, value="/html/body/main/div[2]/form/section[1]/div/div/button").click()
            # Wait 5s for the password field to be presented.
            WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.ID, "c1-password-field")))
        # If the password field is not present the password field changed.
        except TimeoutException as toe:
            print("It was not possibel to log in, e-mail is not correct or the XPath of the Password Field Changed")
            raise toe
        except WebDriverException as wde:
            print("The XPath changed, the register url changed or the webdriver closed.")
            raise wde
        except Exception as e:
            print(e.__context__)
            raise e
    
        # Click on remember me box, fill in password and click on the "Weiter" button.
        try:
            # find the remember me checkbox with the XPath and click on it.
            self.driver.find_element(by=By.XPATH, value="//*[@id='c1-remember_me-checkbox']").click()
            # find the password field with the XPath and fill in the password.
            self.driver.find_element(by=By.XPATH, value="//*[@id='c1-password-field']").send_keys(password_login)
            time.sleep(0.5)
            # find the "Weiter" button with the XPath and click on it.
            self.driver.find_element(by=By.XPATH, value="/html/body/main/div/div[3]/form/div[5]/button").click()
            # wait 5 seconds until the ID "__nzz" is present. Otherwise raise a Timeoutexception.
            WebDriverWait(self.driver, 5).until(EC.presence_of_element_located((By.ID, "__nzz")))   
        # If the Timeoutexception was raised of the webdriverwaitm the password wasn't filled correctly.
        except TimeoutException as toe:
            print("It was not possibel to log in, password was not correct")
            raise toe
        # Raise a webdriverexception if the XPATH of the Login Button or the remember me Checkbock changed
        except WebDriverException as wde:
            print(wde.__context__)
            print("The XPATH of the Login Button or the remember me Checkbock changed")
        except Exception as e:
            print(e.__context__)
            raise e

    def get_sector_links(self,link_sector:str) -> list:
        """
        Arguments:
        link_sector: link of a nzz sector
        
        Output:
        link_list: list with all links of a sector
        """
        try:
            self.driver.get(link_sector)
            driver_links = self.driver.find_elements_by_xpath(r'//article/div/div/a')
        except WebDriverException as wde:
            print(wde.__context__)
            raise WebDriverException
        except Exception as e:
            print(e.__context__)
            raise e

        # if there are no links in one sector the XPath is wrong
        if len(driver_links) == 0:
            print("No links under this link: "+link_sector+ " available or the XPATH is wrong")
            raise ValueError
        link_list = []
        # Save all links in one list
        for link in driver_links:
            link_list.append(link.get_attribute('href'))
        return link_list
    def get_text(self,sector_links:list)-> list:
        """
        Arguments:
        sector_links: all articles of one nzz sector
        
        Output:
        return all texts of a the sector links
        """
        #loop over the links to get the Text of each Article
        output = []
        for link in sector_links:
            # Check if ld is in the link, otherwise it is not an article. It is only a warning, so no error is raised.
            if "ld" not in link:
                print("This Link: " + link + " is not a article.")
            # get the Text of the links.
            try:
                self.driver.get(link)
                infos = self.driver.find_elements_by_xpath(r'//p')
            # Error if there is a exception with the webdriver.
            except WebDriverException as WDE:
                print("The Webdriver closed or the XPath changed")
                print(link)
                raise WDE
            # All other exceptions (then webdriver exceptions)
            except Exception as e:
                print(e.__context__)
                raise e


            # Add all Text parts of the Article to one Text.
            text = ""
            for p in range(len(infos)):
                # Call a single Text
                value = infos[p].text
                # If the value is None then the webdriver have been closed.
                if value is None:
                    print("The Webdriver closed.")
                    raise WebDriverException
                # Add a single text
                text += value
            # Check if there was a text under a article. Some of Text have only cartoons, but they should be controlled.
            if len(text) == 0:
                print("There was no Text under this website: " + link)
            # Add the text of an article to a list.
            output.append(text)
        return output
    def log_out(self):
        # Log Out, because the Accounts are Limited
        try:
            #Click on the user button.
            self.driver.find_element(by=By.XPATH, value="/html/body/div[3]/div/div/div[4]/div[4]/div[1]/div[2]/div/div[1]/div/div/div[3]/div/div/div[1]").click()
            #Click on the "Abmelden" button.
            self.driver.find_element(by=By.XPATH, value="/html/body/div[3]/div/div/div[4]/div[4]/div[1]/div[2]/div/div[1]/div/div/div[3]/div/div/div[2]/nav/ul[3]/span/li/div/a").click()
            #At the end of the code close the program.
            self.driver.close()
        # Raise and Exception, because the Account is limited to 15 Persons.
        except NoSuchElementException as nsee:
            print("couldn't log out correctly, pay attention the number of accounts are limited")
            raise nsee
        except Exception as e:
            print(e.__context__)
            raise e


In [12]:
nzz_main_links= ["https://www.nzz.ch/international","https://www.nzz.ch/wirtschaft","https://www.nzz.ch/schweiz","https://www.nzz.ch/zuerich","https://www.nzz.ch/sport","https://www.nzz.ch/technologie","https://www.nzz.ch/finanzen","https://www.nzz.ch/wissenschaft","https://www.nzz.ch/gesellschaft","https://www.nzz.ch/mobilitaet"]

In [14]:
config = configparser.ConfigParser()
config.read('login.ini')
webdriver_path = r"C:/Users/j/OneDrive/wdb/chromedriver_win32/chromedriver.exe"
webdriver_c=nzz_webdriver(webdriver_path,config["Login"]["e-mail"],config["Login"]["password"])
nzz_dic = {}
for sector_links in nzz_main_links:
    links = webdriver_c.get_sector_links(sector_links)
    nzz_dic[sector_links] =webdriver_c.get_text(links)
webdriver_c.log_out()

  self.driver = webdriver.Chrome(executable_path=webdriver_path,options=options)
  driver_links = self.driver.find_elements_by_xpath(r'//article/div/div/a')
  infos = self.driver.find_elements_by_xpath(r'//p')


In [25]:
# export the file
with open("nzz_out", "w", encoding="utf-8") as f:
    f.write(str(nzz_dic))