diff --git a/doc/CHANGELOG.md b/doc/CHANGELOG.md
index 1d6e74df..476fdcf9 100644
--- a/doc/CHANGELOG.md
+++ b/doc/CHANGELOG.md
@@ -1,5 +1,9 @@
 # yetibot changelog
 
+## 0.4.1
+
+- Added `scrape` command [#310](https://github.com/devth/yetibot/issues/310)
+
 ## 0.4.0
 
 0.4.0 brings non-backward compatible changes, particularly around configuration
@@ -7,17 +11,9 @@ refactoring.
 
 ### Non-backward Compatible Changes
 
-- *Config*: main config is now immutable, and can be provided in
-  12-Factor-compatible methods, such as env-vars. As a result, is it also flat
-  KV pairs now, which are exploded into nested maps by
-  [dec](https://github.com/devth/dec).
-
-  - new [profiles.sample.clj](https://github.com/devth/yetibot.core/blob/master/profiles.sample.clj)
-  - new [Configuration docs](https://github.com/devth/yetibot.core/blob/master/docs/CONFIGURATION.md)
-
-- *Mutable config*: mutable config, such as which IRC rooms to join and
-  channel-specific settings has been extracted into a separate file that is
-  managed by Yetibot.
+- See [yetibot.core
+  CHANGELOG](https://github.com/devth/yetibot.core/blob/master/doc/CHANGELOG.md#040)
+  for info on configuration changes.
 
 - Upgraded to Clojure 1.8.0
 
diff --git a/project.clj b/project.clj
index 29743611..a7ceb975 100644
--- a/project.clj
+++ b/project.clj
@@ -3,7 +3,6 @@
   :url "https://github.com/devth/yetibot"
   :license {:name "Eclipse Public License"
             :url "http://www.eclipse.org/legal/epl-v10.html"}
-  :lein-release {:deploy-via :clojars}
   :deploy-repositories [["releases" :clojars]]
   :profiles {:dev {:source-paths ["dev"]}
              :uberjar {:uberjar-name "yetibot.jar"
@@ -14,19 +13,21 @@
                         :welcome (println "Welcome to the yetibot development REPL!")}
   :jvm-opts ["-server"]
   :dependencies [[org.clojure/clojure "1.8.0"]
-                 [yetibot.core "0.4.1"]
+                 [yetibot.core "0.4.2"]
 
                  ; apis
                  [twitter-api "0.7.6"]
                  [clj-aws-s3 "0.3.10" :exclusions [joda-time]]
 
+                 ; scraping
+                 [org.jsoup/jsoup "1.10.1"]
+
                  ; utils
                  [org.flatland/useful "0.11.5"]
                  [org.clojure/tools.cli "0.3.1"]
 
-                 ;for polling
-                 [robert/bruce "0.8.0"]
-                 ]
+                 ; polling
+                 [robert/bruce "0.8.0"]]
   :plugins [[lein-exec "0.3.5"]
             [lein-environ "1.0.3"]
             [lein-cloverage "1.0.7-SNAPSHOT"]
diff --git a/src/yetibot/commands/scrape.clj b/src/yetibot/commands/scrape.clj
new file mode 100644
index 00000000..325c89bc
--- /dev/null
+++ b/src/yetibot/commands/scrape.clj
@@ -0,0 +1,69 @@
+(ns yetibot.commands.scrape
+  (:require
+    [clojure.string :as s]
+    [taoensso.timbre :refer [debug info warn error]]
+    [yetibot.core.hooks :refer [cmd-hook]])
+  (:import
+    (org.jsoup Jsoup)
+    (org.jsoup.select Elements)
+    (org.jsoup.nodes Element)))
+
+(defn get-page
+  "Fetch `url` with jsoup, sending a curl user agent; returns the parsed page."
+  [url]
+  (-> (Jsoup/connect url)
+      (.userAgent "curl/7.51.0")
+      (.get)))
+
+(defn get-elems
+  "Return the elements of `page` that match the CSS `selector`."
+  [page selector]
+  (.select page selector))
+
+(defn get-attr
+  "Extract `attr` from `element`: \"text\" and \"html\" call the element's
+   .text and .html; any other value is looked up as an attribute."
+  [element attr]
+  (condp = attr
+    "text" (.text element)
+    "html" (.html element)
+    (.attr element attr)))
+
+(defn scrape-page
+  "Return the non-blank `attr` values of the elements of an already parsed
+   `page` that match `selector`."
+  [page selector attr]
+  ;; jsoup's .attr gives "" rather than nil when the attribute is not set, so
+  ;; filter on blank? instead of nil?.
+  (remove s/blank?
+          (map #(get-attr % attr) (get-elems page selector))))
+
+(defn scrape
+  "Fetch `url` and return the non-blank `attr` values of every element that
+   matches `selector`."
+  [url selector attr]
+  (scrape-page (get-page url) selector attr))
+
+(defn parse-selector-and-attr
+  "Split \"<selector> <attr>\" on its last run of whitespace. Returns
+   [selector attr], or nil when either part is missing."
+  [selector-and-attr]
+  ;; \S+ rather than \w+ so that attrs such as data-src or abs:href parse.
+  (when-let [[_ selector attr] (re-find #"^(.*\S)\s+(\S+)$"
+                                        (s/trim (str selector-and-attr)))]
+    [selector attr]))
+
+(defn scrape-cmd
+  "scrape <url> <selector> <attr> # scrape a url and select elements' text, html or attributes via jsoup.
+   <attr> can be:
+   - text - inner text of the element(s)
+   - html - html of the element(s)
+   - anything else - an attribute of the element(s)"
+  {:yb/cat #{:util}}
+  [{[_ url selector-and-attr] :match}]
+  (if-let [[selector attr] (parse-selector-and-attr selector-and-attr)]
+    (scrape url selector attr)
+    "Usage: scrape <url> <selector> <attr>"))
+
+(cmd-hook #"scrape"
+  #"(\S+)(.*)" scrape-cmd)
diff --git a/test/yetibot/test/commands/scrape.clj b/test/yetibot/test/commands/scrape.clj
new file mode 100644
index 00000000..c36c1d39
--- /dev/null
+++ b/test/yetibot/test/commands/scrape.clj
@@ -0,0 +1,35 @@
+(ns yetibot.test.commands.scrape
+  (:require
+    [clojure.test :refer :all]
+    [yetibot.commands.scrape :refer :all])
+  (:import
+    (org.jsoup Jsoup)))
+
+;; Parsed from a literal so the tests need neither the network nor a
+;; third-party site's markup.
+(def page
+  (Jsoup/parse
+    (str "<div>"
+         "<img class=\"base-img\" src=\"/a.png\" data-src=\"/lazy.png\">"
+         "<img class=\"base-img\">"
+         "<p>hi <b>there</b></p>"
+         "</div>")))
+
+(deftest test-scrape-page
+  (testing "attribute values, skipping elements where the attribute is not set"
+    (is (= ["/a.png"] (scrape-page page ".base-img" "src"))))
+  (testing "hyphenated attribute names"
+    (is (= ["/lazy.png"] (scrape-page page "img" "data-src"))))
+  (testing "text"
+    (is (= ["hi there"] (scrape-page page "p" "text"))))
+  (testing "html"
+    (is (re-find #"<b>there</b>" (first (scrape-page page "p" "html"))))))
+
+(deftest test-parse-selector-and-attr
+  (testing "splits on the last run of whitespace"
+    (is (= [".base-img" "src"] (parse-selector-and-attr " .base-img src")))
+    (is (= ["div img" "data-src"] (parse-selector-and-attr " div img  data-src")))
+    (is (= ["a" "abs:href"] (parse-selector-and-attr " a abs:href"))))
+  (testing "nil when the selector or the attr is missing"
+    (is (nil? (parse-selector-and-attr " img")))
+    (is (nil? (parse-selector-and-attr "")))))