Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 150 lines (134 sloc) 4.636 kB
aebb5bd @havocp Add IndexerActor
havocp authored
1 package com.typesafe.webwords.indexer
2
3 import scala.collection.JavaConverters._
575cf97 @havocp discard invalid urls when saving links
havocp authored
4 import akka.actor.{ Index => _, _ }
aebb5bd @havocp Add IndexerActor
havocp authored
5 import com.typesafe.webwords.common.CPUBoundActorPool
6 import java.net.URL
575cf97 @havocp discard invalid urls when saving links
havocp authored
7 import java.net.URI
8 import java.net.URISyntaxException
9 import java.net.MalformedURLException
aebb5bd @havocp Add IndexerActor
havocp authored
10 import org.jsoup.Jsoup
11 import org.jsoup.nodes.Document
12 import scala.collection.parallel.ParSeq
7dc81c1 @havocp move Index class to common
havocp authored
13 import com.typesafe.webwords.common.Index
aebb5bd @havocp Add IndexerActor
havocp authored
14
70f2c89 @havocp test IndexerActor and fix a couple minor things
havocp authored
/** Messages accepted by the indexer. Sealed so that matches over requests are checked for exhaustiveness. */
sealed trait IndexerRequest

/**
 * Request to index one HTML document.
 *
 * @param url the URL the document came from; its external form is passed
 *            to the HTML parser as the base URI
 * @param doc the raw HTML text of the document
 */
final case class IndexHtml(url: URL, doc: String) extends IndexerRequest
17
/** Messages sent back by the indexer. Sealed so that matches over replies are checked for exhaustiveness. */
sealed trait IndexerReply

/**
 * Reply carrying the result of indexing one HTML document.
 *
 * @param index the index built from the document
 */
final case class IndexedHtml(index: Index) extends IndexerReply
20
29ce50c @havocp Cosmetic changes, comments and variable names
havocp authored
/**
 * IndexerActor is a CPU-bound actor which parses HTML with the jsoup
 * library and scrapes some data out of it. The code in here illustrates
 * algorithmic code in Scala, in a functional style, including use of
 * parallel collections.
 *
 * The outer actor does no parsing itself: it forwards every message to
 * `_route` and creates a private `Worker` for each pool instance.
 */
class IndexerActor
    extends Actor
    with CPUBoundActorPool {

    /** Creates one pool member; each member is an independent Worker. */
    override def instance = Actor.actorOf(new Worker())

    /** All incoming messages are handed to `_route`. */
    override def receive = _route

    /** Does the actual work: parse the HTML, extract links and word counts, reply. */
    private class Worker extends Actor {
        import IndexerActor._

        /**
         * Returns the href only if it is non-empty and accepted by both
         * java.net.URI and java.net.URL. Being paranoid here means nothing
         * else in the code has to worry about invalid links.
         */
        private def validHref(raw: String): Option[String] =
            if (raw.isEmpty)
                None
            else try {
                // constructed only for validation; the instances are discarded
                new URI(raw)
                new URL(raw)
                Some(raw)
            } catch {
                case _: URISyntaxException => None
                case _: MalformedURLException => None
            }

        /**
         * Collects (link text -> absolute href) for every `a` element that
         * has non-empty text and a valid href, sorted by link text. When
         * several anchors share the same text the last one wins.
         */
        private def links(doc: Document) = {
            val builder = Map.newBuilder[String, String]
            for (anchor <- doc.select("a").asScala) {
                val text = anchor.text
                if (text.nonEmpty)
                    // "abs:href" asks jsoup to resolve the href against the base URI
                    validHref(anchor.attr("abs:href")) foreach { href =>
                        builder += (text -> href)
                    }
            }
            builder.result.toSeq.sortBy(_._1)
        }

        /**
         * Returns up to 50 (word, count) pairs for the body text, most
         * frequent first, with boring words filtered out.
         */
        private def wordCounts(doc: Document) = {
            // select(...).first is null when nothing matches (e.g. a page
            // with no <body>); index such a page as having no words
            // instead of failing the worker with a NullPointerException.
            val bodyText = Option(doc.select("body").first).map(_.text).getOrElse("")
            // splitWords creates a parallel collection so this is multithreaded!
            // in a real app you'd want to profile and see if this makes sense;
            // it may well not depending on workload, number of cores, etc.
            // but it's interesting to see how to do it.
            val words = splitWords(bodyText) filter { !boring(_) }
            wordCount(words).toSeq.sortBy(0 - _._2) take 50
        }

        override def receive = {
            // the nested match is over the sealed IndexerRequest type
            case request: IndexerRequest => request match {
                case IndexHtml(url, docString) =>
                    val doc = Jsoup.parse(docString, url.toExternalForm)
                    val index = Index(links(doc), wordCounts(doc))
                    self.tryReply(IndexedHtml(index))
            }
        }
    }
}
85
/**
 * Pure text helpers used by the indexer workers. They live in the
 * companion object for ease of unit testing.
 */
object IndexerActor {
    // matches any single non-word character; used as the word separator
    private val notWordRegex = """\W""".r

    /**
     * Splits text into words: the input is cut into lines, the lines are
     * processed as a parallel collection, and each line is split on
     * non-word characters with empty fragments dropped.
     */
    private[indexer] def splitWords(s: String): ParSeq[String] = {
        // ".par" is the magic that gives us a parallel algorithm
        val parallelLines = s.split("\\n").toSeq.par
        parallelLines flatMap { line =>
            notWordRegex.split(line).filter(_.nonEmpty)
        }
    }

    /**
     * Counts how many times each word occurs. Words are compared exactly
     * as given, so differently-cased spellings are counted separately.
     */
    private[indexer] def wordCount(words: ParSeq[String]) =
        words.foldLeft(Map.empty[String, Int]) { (counts, word) =>
            counts.updated(word, counts.getOrElse(word, 0) + 1)
        }

    // not very scientific or internationalized ;-)
    private val boringEnglishWords = Set(
        "a", "also", "an", "and", "are", "as", "at",
        "be", "been", "by",
        "can",
        "for", "from",
        "has", "have",
        "in", "it", "is",
        "may",
        "not",
        "of", "on", "or",
        "such",
        "that", "the", "this", "to",
        "was", "which", "with")

    /**
     * True for words not worth indexing: single letters and
     * super-high-frequency English words (matched on the lower-cased word).
     */
    private[indexer] def boring(word: String) =
        if (word.length == 1) true
        else boringEnglishWords(word.toLowerCase)
}
Something went wrong with that request. Please try again.