Skip to content

Commit

Permalink
Add scrape command; fix #310
Browse files Browse the repository at this point in the history
  • Loading branch information
devth committed Dec 28, 2016
1 parent 9378987 commit 77487d3
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 16 deletions.
18 changes: 7 additions & 11 deletions doc/CHANGELOG.md
@@ -1,23 +1,19 @@
# yetibot changelog

## 0.4.1

- Added `scrape` command [#310](https://github.com/devth/yetibot/issues/310)

## 0.4.0

0.4.0 brings non-backward compatible changes, particularly around configuration
refactoring.

### Non-backward Compatible Changes

- *Config*: main config is now immutable, and can be provided in
12-Factor-compatible methods, such as env-vars. As a result, it is also flat
KV pairs now, which are exploded into nested maps by
[dec](https://github.com/devth/dec).

- new [profiles.sample.clj](https://github.com/devth/yetibot.core/blob/master/profiles.sample.clj)
- new [Configuration docs](https://github.com/devth/yetibot.core/blob/master/docs/CONFIGURATION.md)

- *Mutable config*: mutable config, such as which IRC rooms to join and
channel-specific settings, has been extracted into a separate file that is
managed by Yetibot.
- See [yetibot.core
CHANGELOG](https://github.com/devth/yetibot.core/blob/master/doc/CHANGELOG.md#040)
for info on configuration changes.

- Upgraded to Clojure 1.8.0

Expand Down
11 changes: 6 additions & 5 deletions project.clj
Expand Up @@ -3,7 +3,6 @@
:url "https://github.com/devth/yetibot"
:license {:name "Eclipse Public License"
:url "http://www.eclipse.org/legal/epl-v10.html"}
:lein-release {:deploy-via :clojars}
:deploy-repositories [["releases" :clojars]]
:profiles {:dev {:source-paths ["dev"]}
:uberjar {:uberjar-name "yetibot.jar"
Expand All @@ -14,19 +13,21 @@
:welcome (println "Welcome to the yetibot development REPL!")}
:jvm-opts ["-server"]
:dependencies [[org.clojure/clojure "1.8.0"]
[yetibot.core "0.4.1"]
[yetibot.core "0.4.2"]

; apis
[twitter-api "0.7.6"]
[clj-aws-s3 "0.3.10" :exclusions [joda-time]]

; scraping
[org.jsoup/jsoup "1.10.1"]

; utils
[org.flatland/useful "0.11.5"]
[org.clojure/tools.cli "0.3.1"]

;for polling
[robert/bruce "0.8.0"]
]
; polling
[robert/bruce "0.8.0"]]
:plugins [[lein-exec "0.3.5"]
[lein-environ "1.0.3"]
[lein-cloverage "1.0.7-SNAPSHOT"]
Expand Down
42 changes: 42 additions & 0 deletions src/yetibot/commands/scrape.clj
@@ -0,0 +1,42 @@
(ns yetibot.commands.scrape
(:require
[taoensso.timbre :refer [debug info warn error]]
[yetibot.core.hooks :refer [cmd-hook]])
(:import
(org.jsoup Jsoup)
(org.jsoup.select Elements)
(org.jsoup.nodes Element)))

(defn get-page
  "Opens a jsoup connection to `url`, sets the user agent to a curl
  identifier, and returns the result of the connection's `.get` call."
  [url]
  (-> (Jsoup/connect url)
      (.userAgent "curl/7.51.0")
      (.get)))

(defn get-elems
  "Runs `selector` against `page` via the page's `select` method and returns
  whatever that call yields."
  [page selector]
  (. page select selector))

(defn get-attr
  "Extracts a value from `element` according to `attr`:
  - \"text\" returns the element's `.text`
  - \"html\" returns the element's `.html`
  - anything else is treated as an attribute name and returns that
    attribute's value, or nil when the element does not carry it."
  [element attr]
  (case attr
    "text" (.text element)
    "html" (.html element)
    ;; jsoup's `.attr` answers "" (never nil) for an absent attribute, so
    ;; callers that filter with `nil?` could not drop elements lacking it.
    ;; Check `.hasAttr` first so a missing attribute is reported as nil.
    (when (.hasAttr element attr)
      (.attr element attr))))

(defn scrape
  "Fetches `url`, selects the elements matching `selector`, and returns a
  lazy sequence of each element's `attr` value (as produced by `get-attr`)
  with nil values removed."
  [url selector attr]
  (let [elements (get-elems (get-page url) selector)]
    (->> elements
         (map #(get-attr % attr))
         (remove nil?))))

(defn scrape-cmd
  "scrape <url> <selector-path> <attr> # scrape a url and select elements' text, html or attributes via jsoup. <attr> can be:
- text - inner text of the element(s)
- html - html of the element(s)
- anything else - an attribute of the element(s)"
  {:yb/cat #{:util}}
  [{[_ url selector-and-attr] :match}]
  ;; The last whitespace-separated token is the attr; everything before it is
  ;; the selector. `\S+` (instead of `\w+`) accepts hyphenated or namespaced
  ;; attributes such as data-id, aria-label or abs:href, and `\S.*?` requires
  ;; a non-empty selector. Surrounding whitespace is ignored.
  (let [[_ selector attr] (re-find #"^\s*(\S.*?)\s+(\S+)\s*$"
                                   (str selector-and-attr))]
    (if (and selector attr)
      (scrape url selector attr)
      ;; Previously a failed parse handed nil (or an empty selector) to
      ;; jsoup's select, which throws; tell the user what is expected instead.
      "Usage: scrape <url> <selector-path> <attr>")))

;; Registers scrape-cmd through yetibot.core.hooks/cmd-hook. The first regex is
;; presumably the command name to match (confirm against cmd-hook itself).
;; The second regex captures the first whitespace-free token and the remainder
;; of the input; scrape-cmd destructures those groups from :match as `url` and
;; `selector-and-attr`.
(cmd-hook #"scrape"
#"(\S+)(.*)" scrape-cmd)
8 changes: 8 additions & 0 deletions test/yetibot/test/commands/scrape.clj
@@ -0,0 +1,8 @@
(ns yetibot.test.commands.scrape
(:require
[clojure.test :refer :all]
[yetibot.commands.scrape :refer :all]))

;; NOTE: this test calls scrape against the live imgflip.com site, so it needs
;; network access and depends on that page's current markup.
(deftest test-scrape
  (testing "Imgflip"
    (let [srcs (scrape "https://imgflip.com" ".base-img[src!='']" "src")]
      (is (seq srcs)))))

0 comments on commit 77487d3

Please sign in to comment.