Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 166 lines (148 sloc) 5.256 kb
aebb5bd @havocp Add IndexerActor
havocp authored
1 package com.typesafe.webwords.indexer
2
3 import scala.collection.JavaConverters._
575cf97 @havocp discard invalid urls when saving links
havocp authored
4 import akka.actor.{ Index => _, _ }
52a06b6 @havocp import actorOf rather than using Actor.actorOf
havocp authored
5 import akka.actor.Actor.actorOf
aebb5bd @havocp Add IndexerActor
havocp authored
6 import com.typesafe.webwords.common.CPUBoundActorPool
7 import java.net.URL
575cf97 @havocp discard invalid urls when saving links
havocp authored
8 import java.net.URI
9 import java.net.URISyntaxException
10 import java.net.MalformedURLException
aebb5bd @havocp Add IndexerActor
havocp authored
11 import org.jsoup.Jsoup
12 import org.jsoup.nodes.Document
df2b9eb @havocp use aggregate() not foldLeft() to count words
havocp authored
13 import scala.collection.parallel._
7dc81c1 @havocp move Index class to common
havocp authored
14 import com.typesafe.webwords.common.Index
aebb5bd @havocp Add IndexerActor
havocp authored
15
70f2c89 @havocp test IndexerActor and fix a couple minor things
havocp authored
/** Messages that can be sent to an IndexerActor. */
sealed trait IndexerRequest

/**
 * Request to index one HTML document.
 *
 * @param url the URL the document was retrieved from
 * @param doc the HTML text of the document
 */
final case class IndexHtml(url: URL, doc: String) extends IndexerRequest

/** Messages an IndexerActor sends back in response to an IndexerRequest. */
sealed trait IndexerReply

/**
 * Reply to an IndexHtml request.
 *
 * @param index the index computed from the document
 */
final case class IndexedHtml(index: Index) extends IndexerReply
21
29ce50c @havocp Cosmetic changes, comments and variable names
havocp authored
/**
 * IndexerActor is a CPU-bound actor which parses HTML with the jsoup
 * library and scrapes some data out of it. The code in here illustrates
 * algorithmic code in Scala, in a functional style, including use of
 * parallel collections.
 *
 * The actual work is done by the private Worker actor; this class only
 * supplies worker instances to the pool and uses the pool's `_route` as
 * its message handler.
 */
class IndexerActor
    extends Actor
    with CPUBoundActorPool {

    // actorOf[Worker] doesn't work on nested classes
    override def instance = actorOf(new Worker)

    override def receive = _route

    private class Worker extends Actor {
        import IndexerActor._

        /**
         * Returns `Some(raw)` if `raw` is non-empty and accepted by both
         * `java.net.URI` and `java.net.URL`, otherwise `None`.
         *
         * Be paranoid here and we don't have to worry about invalid
         * links anywhere else in the code.
         */
        private def validHref(raw: String): Option[String] = {
            if (raw.isEmpty) {
                None
            } else {
                try {
                    // constructed only for their validation side effect
                    new URI(raw)
                    new URL(raw)
                    Some(raw)
                } catch {
                    case _: URISyntaxException => None
                    case _: MalformedURLException => None
                }
            }
        }

        /**
         * Collects the `<a>` elements of the document as (link text -> absolute
         * href) pairs, sorted by link text. Anchors with empty text or with an
         * empty/invalid href are dropped. Because the pairs go through a Map
         * keyed by text, a later anchor with the same text replaces an earlier one.
         */
        private def links(doc: Document) = {
            val byText = Map.newBuilder[String, String]
            for (anchor <- doc.select("a").asScala) {
                val text = anchor.text
                validHref(anchor.attr("abs:href")) foreach { href =>
                    if (text.nonEmpty)
                        byText += (text -> href)
                }
            }
            byText.result.toSeq.sortBy(_._1)
        }

        /**
         * Returns up to 50 (word, count) pairs for the document body text,
         * most frequent first, with boring words filtered out. Returns an
         * empty sequence if the document has no `<body>` element.
         */
        private def wordCounts(doc: Document) = {
            // select("body").first is null when nothing matches (jsoup gives
            // frameset documents no <body>), so guard against it rather than
            // letting the worker die with a NullPointerException.
            Option(doc.select("body").first) match {
                case Some(body) =>
                    // splitWords creates a parallel collection so this is multithreaded!
                    // in a real app you'd want to profile and see if this makes sense;
                    // it may well not depending on workload, number of cores, etc.
                    // but it's interesting to see how to do it.
                    val words = splitWords(body.text) filter { !boring(_) }
                    wordCount(words).toSeq.sortBy(0 - _._2) take 50
                case None =>
                    Seq.empty[(String, Int)]
            }
        }

        /** Parses the HTML of an IndexHtml request and replies with the resulting index. */
        override def receive = {
            case request: IndexerRequest => request match {
                case IndexHtml(url, docString) =>
                    // the URL is passed as base URI so "abs:href" can resolve relative links
                    val doc = Jsoup.parse(docString, url.toExternalForm)
                    val index = Index(links(doc), wordCounts(doc))
                    self.tryReply(IndexedHtml(index))
            }
        }
    }
}
87
/**
 * Pure helper functions used by the indexer workers: splitting text into
 * words, counting words in parallel, and filtering uninteresting words.
 */
object IndexerActor {
    // matches any single non-word character; used as the word separator
    private val notWordRegex = """\W""".r

    /**
     * Splits text into words, returned as a parallel sequence.
     * The text is first split into lines, then each line is split on
     * non-word characters; empty fragments are discarded.
     */
    private[indexer] def splitWords(s: String): ParSeq[String] = {
        // ".par" is the magic that gives us a parallel algorithm
        s.split("\\n").toSeq.par flatMap { line =>
            notWordRegex.split(line) filter { _.nonEmpty }
        }
    }

    /**
     * Merges two word-count maps, summing the counts of words present in both.
     */
    private[indexer] def mergeCounts(a: Map[String, Int], b: Map[String, Int]): Map[String, Int] = {
        val (inBoth, onlyInA) = a partition { case (word, _) => b contains word }
        val onlyInB = b filterNot { case (word, _) => a contains word }

        val merged = Map.newBuilder[String, Int]
        // inBoth only holds keys that b contains, so b(word) cannot fail
        merged ++= inBoth.iterator.map { case (word, count) => word -> (count + b(word)) }
        merged ++= onlyInB
        merged ++= onlyInA
        merged.result
    }

    /**
     * Counts how many times each word occurs in the given parallel sequence.
     */
    private[indexer] def wordCount(words: ParSeq[String]) = {
        // using foldLeft avoids the need for mergeCounts,
        // but foldLeft is inherently sequential.
        // You'd have to benchmark to see which is faster.

        def addWord(counts: Map[String, Int], word: String): Map[String, Int] =
            counts + (word -> (counts.getOrElse(word, 0) + 1))

        words.aggregate(Map.empty[String, Int])(addWord, mergeCounts)
    }

    // not very scientific or internationalized ;-)
    private val boringEnglishWords = Set(
        "a",
        "also",
        "an",
        "and",
        "are",
        "as",
        "at",
        "be",
        "been",
        "by",
        "can",
        "for",
        "from",
        "has",
        "have",
        "in",
        "it",
        "is",
        "may",
        "not",
        "of",
        "on",
        "or",
        "such",
        "that",
        "the",
        "this",
        "to",
        "was",
        "which",
        "with")

    /**
     * Returns true for words that should not be counted: single letters and
     * super-high-frequency English words (compared case-insensitively).
     */
    private[indexer] def boring(word: String) = {
        // Lower-case with a fixed locale: with the default locale the result
        // depends on the JVM's settings (e.g. under a Turkish locale "IT"
        // becomes "ıt" with a dotless i and would not match "it").
        word.length == 1 ||
            boringEnglishWords.contains(word.toLowerCase(java.util.Locale.ROOT))
    }
}
Something went wrong with that request. Please try again.