diff --git a/Cargo.lock b/Cargo.lock index e5541e4..4df7706 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1366,6 +1366,7 @@ dependencies = [ "scraper", "serde", "serde_json", + "slog", "url", ] @@ -1375,6 +1376,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c111b5bd5695e56cffe5129854aa230b39c93a305372fdbb2668ca2394eea9f8" +[[package]] +name = "slog" +version = "2.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8347046d4ebd943127157b94d63abb990fcf729dc4e9978927fdf4ac3c998d06" + [[package]] name = "smallvec" version = "1.6.0" diff --git a/common-scraper/src/common_scraper.rs b/common-scraper/src/common_scraper.rs index 274814c..afb102b 100644 --- a/common-scraper/src/common_scraper.rs +++ b/common-scraper/src/common_scraper.rs @@ -60,15 +60,15 @@ where if let Ok(mut potential_listing_share) = potential_listing_share_mutex.lock() { match potential_listing_share.get() { SemaphoreShareResult::Red => { - println!("Found RED on thread {}", thread_index); + // println!("Found RED on thread {}", thread_index); break; } SemaphoreShareResult::Green(potential_listing) => { - println!("Found GREEN on thread {}", thread_index); + // println!("Found GREEN on thread {}", thread_index); potential_listing_option = Some(potential_listing); } SemaphoreShareResult::Yellow => { - println!("Found YELLOW on thread {}", thread_index); + // println!("Found YELLOW on thread {}", thread_index); continue; } } diff --git a/common-scraper/src/potential_listing.rs b/common-scraper/src/potential_listing.rs index fb982cb..70fbe3a 100644 --- a/common-scraper/src/potential_listing.rs +++ b/common-scraper/src/potential_listing.rs @@ -1,6 +1,6 @@ use url::Url; -#[derive(Clone)] +#[derive(Debug, Clone)] pub struct PotentialListing { listing_url: Url, listing_page_url: Url, diff --git a/skelbiu-lt-scraper/Cargo.toml b/skelbiu-lt-scraper/Cargo.toml index f8ccda4..337e0a3 100644 --- a/skelbiu-lt-scraper/Cargo.toml +++ b/skelbiu-lt-scraper/Cargo.toml @@ -12,3 +12,4 @@ reqwest = { version = "0.11.5", features = ["blocking"]} scraper = "0.12.0" serde = { version = "1.0.117", features = ["derive"] } serde_json = "1.0" +slog = "2.7.0" diff --git a/skelbiu-lt-scraper/src/lib.rs b/skelbiu-lt-scraper/src/lib.rs index 4c49dd3..8659336 100644 --- a/skelbiu-lt-scraper/src/lib.rs +++ b/skelbiu-lt-scraper/src/lib.rs @@ -1,3 +1,6 @@ +#[macro_use] +extern crate slog; + mod skelbiu_lt_listing; mod skelbiu_lt_listing_scraper; mod skelbiu_lt_scraper; diff --git a/skelbiu-lt-scraper/src/skelbiu_lt_listing_scraper.rs b/skelbiu-lt-scraper/src/skelbiu_lt_listing_scraper.rs index e66a4be..95423f1 100644 --- a/skelbiu-lt-scraper/src/skelbiu_lt_listing_scraper.rs +++ b/skelbiu-lt-scraper/src/skelbiu_lt_listing_scraper.rs @@ -1,9 +1,10 @@ use crate::skelbiu_lt_listing::SkelbiuLtListing; use common_scraper::{ListingScraper, PotentialListing}; use scraper::Selector; -use std::ops::Index; +use slog::Logger; pub struct SkelbiuLtListingScraper { + logger: Logger, id_selector: Selector, title_selector: Selector, description_selector: Selector, @@ -17,6 +18,7 @@ pub struct SkelbiuLtListingScraper { impl SkelbiuLtListingScraper { pub fn new( + logger: Logger, id_selector: &str, title_selector: &str, description_selector: &str, @@ -38,6 +40,7 @@ impl SkelbiuLtListingScraper { let price_selector = Selector::parse(price_selector).unwrap(); Self { + logger, id_selector, title_selector, description_selector, @@ -53,9 +56,14 @@ impl SkelbiuLtListingScraper { impl ListingScraper for SkelbiuLtListingScraper { fn scrape_listing(&self, potential_listing: &PotentialListing) -> Option { + debug!(self.logger, "Started logging {:?}", &potential_listing); + let listing_url = potential_listing.listing_url().to_string(); if let Ok(response) = reqwest::blocking::get(&listing_url) { + debug!(self.logger, "Got response from {}", &listing_url); + let html = scraper::Html::parse_document(response.text().unwrap().as_str()); + debug!(self.logger, "Parsed html for {}", &listing_url); let title = html .select(&self.title_selector) @@ -65,6 +73,7 @@ impl ListingScraper for SkelbiuLtListingScraper { .collect::() .trim() .to_string(); + debug!(self.logger, "Found title for {}", &listing_url); let description = html .select(&self.description_selector) @@ -74,6 +83,7 @@ impl ListingScraper for SkelbiuLtListingScraper { .collect::() .trim() .to_string(); + debug!(self.logger, "Found description for {}", &listing_url); let id = html .select(&self.id_selector) @@ -84,6 +94,7 @@ impl ListingScraper for SkelbiuLtListingScraper { .replace("ID: ", "") .trim() .to_string(); + debug!(self.logger, "Found id for {}", &listing_url); let views = html .select(&self.view_selector) @@ -93,6 +104,7 @@ impl ListingScraper for SkelbiuLtListingScraper { .collect::() .trim() .to_string(); + debug!(self.logger, "Found views for {}", &listing_url); let updated_at = html .select(&self.updated_at_selector) @@ -102,6 +114,7 @@ impl ListingScraper for SkelbiuLtListingScraper { .collect::() .trim() .replace("Atnaujintas ", ""); + debug!(self.logger, "Found updated_at for {}", &listing_url); let liked_amount = html .select(&self.liked_amount_selector) @@ -111,6 +124,7 @@ impl ListingScraper for SkelbiuLtListingScraper { .collect::() .trim() .replace("Įsimintas ", ""); + debug!(self.logger, "Found liked_amount for {}", &listing_url); let mut location = html .select(&self.location_selector) @@ -118,18 +132,29 @@ impl ListingScraper for SkelbiuLtListingScraper { .unwrap_or_else(|| panic!("Could not find location for {}", &listing_url)) .text() .collect::(); - location.truncate(location.find("Siųsti siuntą vos nuo").unwrap()); + if let Some(send_index) = location.find("Siųsti siuntą vos nuo") { + location.truncate(send_index); + } location = location.trim().to_string(); + debug!(self.logger, "Found location for {}", &listing_url); let quality = if let Some(quality) = html.select(&self.quality_selector).next() { + debug!(self.logger, "Found quality for {}", &listing_url); + Some(quality.text().collect::().trim().to_string()) } else { + debug!(self.logger, "Could not find quality for {}", &listing_url); + None }; let price = if let Some(price) = html.select(&self.price_selector).next() { + debug!(self.logger, "Found price for {}", &listing_url); + Some(price.text().collect::().trim().to_string()) } else { + debug!(self.logger, "Could not find price for {}", &listing_url); + None }; diff --git a/skelbiu-lt-scraper/src/skelbiu_lt_scraper.rs b/skelbiu-lt-scraper/src/skelbiu_lt_scraper.rs index f7931e0..2265cb9 100644 --- a/skelbiu-lt-scraper/src/skelbiu_lt_scraper.rs +++ b/skelbiu-lt-scraper/src/skelbiu_lt_scraper.rs @@ -3,14 +3,19 @@ use crate::skelbiu_lt_listing_scraper::SkelbiuLtListingScraper; use common_scraper::{ CommonPageScraper, CommonScrapper, ListingScraper, PageScraper, ScraperSettings, }; +use slog::Logger; pub struct SkelbiuLtScraper { + logger: Logger, scraper_settings: ScraperSettings, } impl SkelbiuLtScraper { - pub fn new(scraper_settings: ScraperSettings) -> Self { - Self { scraper_settings } + pub fn new(logger: Logger, scraper_settings: ScraperSettings) -> Self { + Self { + logger, + scraper_settings, + } } } @@ -26,6 +31,7 @@ impl CommonScrapper for SkelbiuLtScraper { fn get_listing_scraper(&self) -> Box> { // TODO: Refactor this to use DI & clone Box::new(SkelbiuLtListingScraper::new( + self.logger.clone(), ".id", "h1[itemprop=name]", ".description",