/
news_scraper.rs
137 lines (122 loc) · 4.95 KB
/
news_scraper.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
use std::fmt;
use colored::*;
use scraper::{ElementRef, Html};
// Scraper for Hacker News
/// A single story entry scraped from a Hacker News listing page.
///
/// The optional fields are `None` when the corresponding element is not
/// present in the story's metadata row.
#[derive(Debug)]
struct NewsHeadline {
/// Text of the story's title link.
headline: String,
/// Value of the title link's `href` attribute, stored as found in the page.
link: String,
/// Text of the `.age` element in the metadata row.
time: String,
/// Leading number parsed from the `.score` element's text.
num_points: Option<u32>,
/// Text of the last link inside `.subline`, kept verbatim as a string
/// (presumably the comment count — confirm against the live markup).
num_comments: Option<String>,
/// Text of the `.hnuser` element.
author: Option<String>,
}
impl fmt::Display for NewsHeadline {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(&format!("{}", self.headline.bold(),))?;
f.write_str(&format!(
"\n\t{} {} {} | {}",
self.num_points
.map(|num_points| format!("{} points", num_points))
.map(|num_points| num_points.magenta())
.unwrap_or_default(),
self.author
.clone()
.map(|author| format!("by {}", author))
.map(|author| author.bright_black())
.unwrap_or_default(),
self.time.dimmed(),
self.num_comments
.clone()
.map(|num_comments| num_comments.to_string())
.map(|num_comments| num_comments.green())
.unwrap_or_default()
))?;
f.write_str(&format!("\n\t{}", self.link.underline().blue()))?;
Ok(())
}
}
/// Collects headlines extracted from Hacker News listing pages and renders
/// them as a single formatted string.
pub struct NewsScraper {
/// Headlines in the order they were scraped.
news: Vec<NewsHeadline>,
}
impl NewsScraper {
pub fn new() -> Self {
NewsScraper { news: Vec::new() }
}
pub fn scrape(&mut self, page: String) {
// Parse the page into a DOM tree
let document = Html::parse_document(&page);
// Get all the elements with class "athing"
let athing_selector = scraper::Selector::parse(".athing").unwrap();
let athing_elements = document.select(&athing_selector);
// traverse the elements
for athing_element in athing_elements {
// on the athing element get the span with `titleline`
let titleline_selector = scraper::Selector::parse(".titleline").unwrap();
let titleline_element = athing_element.select(&titleline_selector).next().unwrap();
// get the first anchor element
let anchor_selector = scraper::Selector::parse("a").unwrap();
let anchor_element = titleline_element.select(&anchor_selector).next().unwrap();
// get the text of the anchor element
let headline = anchor_element.text().collect::<Vec<_>>().join("");
// get the href attribute of the anchor element
let link = anchor_element.value().attr("href").unwrap().to_string();
// get the next sibling of the athing element
let athing_next_sibling = athing_element.next_sibling().unwrap();
// convert the node to an element
let athing_next_sibling_element = ElementRef::wrap(athing_next_sibling).unwrap();
// get the span with class "age"
let age_selector = scraper::Selector::parse(".age").unwrap();
let time = athing_next_sibling_element
.select(&age_selector)
.next()
.map(|elem| elem.text().collect::<String>());
// get the text of the age element
// get the score element
let score_selector = scraper::Selector::parse(".score").unwrap();
let num_points = athing_next_sibling_element
.select(&score_selector)
.next()
.map(|elem| elem.text().collect::<String>());
// get the text of the score element
// get the anchor element with class "hnuser"
let hnuser_selector = scraper::Selector::parse(".hnuser").unwrap();
let author = athing_next_sibling_element
.select(&hnuser_selector)
.next()
.map(|elem| elem.text().collect::<String>());
// get the last child of subline element to get the number of comments
let subline_selector = scraper::Selector::parse(".subline > a:last-child").unwrap();
let num_comments = athing_next_sibling_element
.select(&subline_selector)
.next()
.map(|elem| elem.text().collect::<String>());
self.news.push(NewsHeadline {
headline,
link,
time: time.unwrap(),
num_points: num_points.map(|num_points| {
num_points
.split_whitespace()
.next()
.unwrap()
.parse()
.unwrap()
}),
num_comments,
author,
});
}
}
pub fn get_news(&self) -> String {
let mut news = String::new();
news.push_str(&format!(
"\n{}\n",
" Hacker News ".bold().on_bright_green().black()
));
for (i, news_headline) in self.news.iter().enumerate() {
news.push_str(&format!("\n{}. {}", i + 1, news_headline));
}
news
}
}