Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge branch 'master' of github.com:clj-sys/webmine

  • Loading branch information...
commit ef712a6eea6a595b14df8c72858cb85e273517b0 2 parents 8be15ab + 8b02276
@aria42 aria42 authored
Showing with 43 additions and 38 deletions.
  1. +6 −2 README.md
  2. +23 −22 src/webmine/feeds.clj
  3. +14 −14 src/webmine/parser.clj
View
8 README.md
@@ -1,6 +1,8 @@
# webmine
-A web mining toolkit for Clojure. A swiss army knife for processing text, images, and feeds from HTML. The goal is to give you the most common tools you need out of the box, but give you enough to customize your processing.
+A web mining toolkit for Clojure. A swiss army knife for processing text, images, and feeds from HTML.
+
+The library gives you the most common tools you need out of the box, but is fine-grained enough to let you build your own custom processing tools.
## link extraction and url fu
@@ -93,5 +95,7 @@ For leiningen:
## Authors
-Bradford Cross, Matt Revelle, and Aria Haghighi
+- Copyright (c) Bradford Cross, Matt Revelle, and Aria Haghighi released under the MIT License (http://www.opensource.org/licenses/mit-license.php).
+
+
View
45 src/webmine/feeds.clj
@@ -61,28 +61,29 @@
(defn- item-node-to-entry [item]
(let [item-root (zip/xml-zip item)
- get-text (fn [k] (xml-zip/xml1-> item-root k xml-zip/text))
- entry
- (FeedEntry.
- ; title
- (get-text :title)
- ; link
- (get-text :link)
- ; content
- (apply max-key count
- (map get-text [:content :description :content:encoded]))
- ; des
- (first (filter identity
- (map get-text [:description :content :content:encoded])))
- ; date
- (try (first (for [k [:pubDate :date :updatedDate]
- :let [s (get-text k)]
- :when k] (if s (compact-date-time s)
- nil)))
- (catch Exception e (log/error e)))
- ; author
- (get-text :author))]
- (mk-des entry)))
+ get-text (fn [k] (xml-zip/xml1-> item-root k xml-zip/text))
+ entry (FeedEntry.
+ ;; title
+ (get-text :title)
+ ;; link
+ (get-text :link)
+ ;; content
+ (apply max-key count
+ (map get-text [:content :description :content:encoded]))
+ ;; des
+ (first (filter identity
+ (map get-text [:description :content :content:encoded])))
+ ;; date
+ (try (first (for [k [:pubDate :date :updatedDate]
+ :let [s (get-text k)]
+ :when k] (if s (compact-date-time s)
+ nil)))
+ (catch Exception e (log/error e)))
+ ;; author
+ (get-text :author))]
+ (try (mk-des entry)
+ (catch Exception _
+ entry))))
(defn parse-feed [source]
"returns record Feed representing a snapshot of a feed. Supports keys
View
28 src/webmine/parser.clj
@@ -22,20 +22,20 @@
"html string -> dom using TagSoup.
the features we set on the parser come from different implementations that I found in nutch, HtmlParser, as well as other parsers."
(try
- (let [result (org.apache.xalan.xsltc.trax.SAX2DOM.)
- input (if (instance? java.net.URL source)
- (.openStream source)
- (StringReader. source))
- parser (doto (Parser.)
- (.setContentHandler result)
- (.setFeature Parser/namespacesFeature false)
- (.setFeature Parser/namespacePrefixesFeature false)
- (.setFeature Parser/bogonsEmptyFeature false)
- (.setFeature Parser/ignoreBogonsFeature true)
- (.parse (InputSource. input)))]
- (cast Document (.getDOM result)))
- (catch org.w3c.dom.DOMException _ )
- (catch java.io.IOException _ ))) ;;pushback buffer overflow
+ (let [result (org.apache.xalan.xsltc.trax.SAX2DOM.)
+ input (if (instance? java.net.URL source)
+ (.openStream source)
+ (StringReader. source))
+ parser (doto (Parser.)
+ (.setContentHandler result)
+ (.setFeature Parser/namespacesFeature false)
+ (.setFeature Parser/namespacePrefixesFeature false)
+ (.setFeature Parser/bogonsEmptyFeature false)
+ (.setFeature Parser/ignoreBogonsFeature true)
+ (.parse (InputSource. input)))]
+ (cast Document (.getDOM result)))
+ (catch org.w3c.dom.DOMException _ )
+ (catch java.io.IOException _ ))) ;;pushback buffer overflow
; const unsigned short ELEMENT_NODE = 1;
; const unsigned short ATTRIBUTE_NODE = 2;
Please sign in to comment.
Something went wrong with that request. Please try again.