In [None]:
import re
import time
from abc import ABC, abstractmethod
from pathlib import Path
from urllib.parse import quote, urlencode, urlparse, urlunparse
from urllib.request import urlretrieve

import matplotlib.pyplot as plt
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [None]:
class Scraper(ABC):
    """Abstract base for Selenium-driven auction-archive scrapers.

    An instance owns one Chrome WebDriver session. Subclasses provide the
    site-specific navigation and extraction; this class provides the shared
    helpers (CSV export of links, scrolling, shutting the browser down).

    Instances can be used as context managers so the browser is closed even
    when scraping raises::

        with SomeScraper() as scraper:
            scraper.link_of_coins_to_csv("keyword", "links/keyword.csv")
    """

    def __init__(self, base_url):
        """Remember ``base_url`` and start a new Chrome WebDriver session."""
        self._base_url = base_url
        self._driver = webdriver.Chrome()

    def __enter__(self):
        """Return the scraper itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the browser on leaving a ``with`` block; never suppresses errors."""
        self.quit()
        return False

    @abstractmethod
    def open_search_page(self, keyword: str):
        """Navigate the browser to the search results for ``keyword``."""
        pass

    @abstractmethod
    def get_list_of_coin_urls(self, keyword: str):
        """Return the list of item URLs found when searching for ``keyword``."""
        pass

    @abstractmethod
    def scrap_coin_data(self, link: str):
        """Scrape item data; the exact contract is defined by each subclass."""
        pass

    @abstractmethod
    def scrap_coin_data_from_page(self, link):
        """Scrape the data of the single item page at ``link``."""
        pass

    def link_of_coins_to_csv(self, keyword, csv_path):
        """Search for ``keyword`` and save the resulting links to ``csv_path``.

        The file has one link per row, with no header and no index column.
        Missing parent directories of ``csv_path`` are created first.

        Returns:
            The ``DataFrame`` that was written (single column ``"link"``).
        """
        links = self.get_list_of_coin_urls(keyword)
        df = pd.DataFrame({"link": links})
        # Only filesystem paths have a parent directory to create; anything
        # else (e.g. an open buffer) is handed to pandas untouched.
        if isinstance(csv_path, (str, Path)):
            Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(csv_path, header=False, index=False)
        return df

    def scroll_to_bottom(self):
        """Scroll the current page to the bottom of the document body."""
        self._driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    def quit(self):
        """Shut down the WebDriver session."""
        self._driver.quit()

In [None]:
class NiemsczykScraper(Scraper):
    """Scraper for the archiwum.niemczyk.pl auction archive."""

    def __init__(self):
        """Start a browser session bound to the archive's base URL."""
        super().__init__("https://archiwum.niemczyk.pl")

    def open_search_page(self, keyword: str, page = None):
        """Open the search results for ``keyword``.

        Args:
            keyword: Search phrase; URL-encoded into the ``search`` parameter.
            page: Optional result page number. When falsy, no ``page``
                parameter is sent.
        """
        query = {"search": keyword}
        if page:
            query["page"] = page
        search_url = urlparse(self._base_url + "/search")._replace(query=urlencode(query))
        self._driver.get(urlunparse(search_url))

    def get_list_of_coin_urls(self, keyword: str):
        """Collect item links from every result page for ``keyword``.

        Pages are visited from 1 upwards. Collection stops when the
        ``post-data`` container does not become visible within one second,
        or when a page yields no links at all.

        Returns:
            List of ``href`` values in the order they were encountered.
        """
        driver = self._driver
        wait = WebDriverWait(driver, 1)
        all_links = []
        page = 1
        self.open_search_page(keyword, page)
        try:
            while wait.until(EC.visibility_of_element_located((By.ID, "post-data"))):
                anchors = driver.find_elements(By.XPATH, "//div[@id='post-data']//h3/a")
                if not anchors:
                    # A visible but empty result container adds nothing;
                    # stop instead of requesting further pages indefinitely.
                    break
                all_links.extend(anchor.get_attribute("href") for anchor in anchors)
                page += 1
                self.open_search_page(keyword, page)
        except TimeoutException:
            # Expected way out: the requested page has no result container.
            pass
        print("Fetched {} links".format(len(all_links)))
        return all_links

    def scrap_coin_data_from_page(self, link):
        """Open the item page at ``link`` and extract its fields.

        Returns:
            Dict with keys ``title``, ``date``, ``description``, ``images``
            (list of ``href`` values) and ``link``. A missing description
            becomes ``""``; every newline in the description is replaced by a
            single backslash.

        Raises:
            NoSuchElementException: if the sale date or title is not found.
        """
        driver = self._driver
        driver.get(link)
        date = driver.find_element(By.XPATH, "//div[text()='Data sprzedaży:']/../div[2]").text
        title = driver.find_element(By.XPATH, "//div[@class='row']//h2/span[1]").text
        # find_elements returns an empty list when nothing matches, so a page
        # without a gallery simply yields no images.
        images = [
            anchor.get_attribute("href")
            for anchor in driver.find_elements(By.XPATH, "//div[@id='links']/a")
        ]
        try:
            description = driver.find_element(
                By.XPATH, "//div[@class='tab-content']//div[@class='container']"
            ).text
        except NoSuchElementException:
            description = ""

        description = description.replace("\n","\\")
        return { "title": title, "date": date, "description": description, "images": images, "link": link }

    def scrap_coin_data(self, links, csv_path):
        """Scrape every page in ``links`` and write the result to ``csv_path``.

        The CSV is written once, after all pages were scraped, without an
        index column.

        Returns:
            ``DataFrame`` with one row per link.
        """
        # Collect plain records and build the frame once: DataFrame.append
        # no longer exists in pandas 2.x and was quadratic when used in a loop.
        records = [self.scrap_coin_data_from_page(link) for link in links]
        df = pd.DataFrame(records)
        df.to_csv(csv_path, index=False)
        return df

    def scrap_coin_data_dev(self, links, csv_path):
        """Scrape ``links``, appending to and re-saving ``csv_path`` after each page.

        Rows already stored in ``csv_path`` (``|``-separated) are loaded first
        and kept. The file is rewritten after every scraped page, so progress
        survives an interrupted run.

        Returns:
            ``DataFrame`` holding the previously stored and the new rows.
        """
        try:
            df = pd.read_csv(csv_path, sep='|')
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Nothing stored yet. Any other read failure propagates rather
            # than silently replacing an existing file with fresh data.
            df = pd.DataFrame()
        for link in links:
            row = pd.DataFrame([self.scrap_coin_data_from_page(link)])
            df = row if df.empty else pd.concat([df, row], ignore_index=True)
            df.to_csv(csv_path, index=False, sep='|')
        return df

In [None]:
# Close the browser left over from a previous run of this cell, if any.
# On a fresh kernel `niemczyk` is not defined yet, and calling quit()
# unconditionally raised NameError.
if "niemczyk" in globals():
    niemczyk.quit()
niemczyk = NiemsczykScraper()

keyword = "Sztandar 1930"
filename = "{}.csv".format(keyword.lower().replace(" ","_"))
links_filename = "niemczyk/links/" + filename
data_filename = "niemczyk/data/" + filename

# Index of the first collected link to scrape.
# NOTE(review): presumably a resume offset from an interrupted earlier run;
# set to 0 to scrape every link. TODO confirm.
RESUME_FROM_INDEX = 334

# Make sure both output directories exist before anything is written.
for output_path in (links_filename, data_filename):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

niemczyk.link_of_coins_to_csv(keyword, links_filename)
df = pd.read_csv(links_filename, header=None)
niemczyk.scrap_coin_data_dev(list(df[0])[RESUME_FROM_INDEX:], data_filename)

In [None]:
class MarciniakScraper(Scraper):
    """Scraper for the aukcje.gndm.pl auction archive."""

    # Number of "scrollpartN" blocks that marks a result page as full; a page
    # with any other count ends the crawl.
    # NOTE(review): presumably the site loads these blocks lazily while
    # scrolling, ten per page. TODO confirm against the live site.
    _SCROLLPARTS_PER_PAGE = 10

    def __init__(self):
        """Start a browser session bound to the archive's base URL."""
        super().__init__("https://aukcje.gndm.pl/pl/archive")

    def open_search_page(self, keyword: str, page = 1):
        """Open result page ``page`` of the archive search for ``keyword``.

        The keyword is percent-encoded so that spaces and URL-reserved
        characters stay inside the last path segment.
        """
        url = "{}/{}/0/0/0/{}".format(self._base_url, page, quote(keyword, safe=""))
        self._driver.get(url)

    def get_list_of_coin_urls(self, keyword: str):
        """Collect item links from every result page for ``keyword``.

        On each page the blocks ``scrollpart1``, ``scrollpart2``, ... are
        awaited in turn (3 s each); after each one the page is scrolled to the
        bottom. The next page is requested only when exactly
        ``_SCROLLPARTS_PER_PAGE`` blocks appeared; otherwise the crawl ends.

        Returns:
            List of ``href`` values in the order they were encountered.
        """
        driver = self._driver
        wait = WebDriverWait(driver, 3)
        all_links = []
        page = 1

        while True:
            self.open_search_page(keyword, page)
            parts_seen = 0
            try:
                while True:
                    part_id = "scrollpart{}".format(parts_seen + 1)
                    wait.until(EC.visibility_of_element_located((By.ID, part_id)))
                    anchors = driver.find_elements(
                        By.XPATH, "//div[@id='{}']//a[@class='title']".format(part_id)
                    )
                    all_links.extend(anchor.get_attribute("href") for anchor in anchors)
                    self.scroll_to_bottom()
                    parts_seen += 1
            except TimeoutException:
                # Expected way out of the inner loop: the next block never
                # became visible on this page.
                pass
            if parts_seen != self._SCROLLPARTS_PER_PAGE:
                break
            page += 1

        print("Fetched {} links".format(len(all_links)))
        return all_links

    def scrap_coin_data(self, link: str):
        """Not implemented for this site yet; does nothing and returns ``None``."""
        pass

    def scrap_coin_data_from_page(self, link):
        """Not implemented for this site yet; does nothing and returns ``None``."""
        pass


In [None]:
# Close the browser left over from a previous run of this cell, if any.
# On a fresh kernel `marciniak` is not defined yet, and calling quit()
# unconditionally raised NameError.
if "marciniak" in globals():
    marciniak.quit()
marciniak = MarciniakScraper()

keyword = "Sztandar 1930"
filename = "{}.csv".format(keyword.lower().replace(" ","_"))
links_filename = "marciniak/links/" + filename
data_filename = "marciniak/data/" + filename

# Make sure both output directories exist before anything is written.
for output_path in (links_filename, data_filename):
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

marciniak.link_of_coins_to_csv(keyword, links_filename)