Permalink
Browse files

Get UK & US front info by scraping them

Closes #14
  • Loading branch information...
1 parent f0be0fc commit c1c7e6f7a0a67facd896953ef6ac679b0623f61f @tackley committed Mar 7, 2012
@@ -4,4 +4,14 @@
.published-content {
.pub-date { width: 100px }
-}
+}
+
+/* Flag icons shown next to content that is currently linked from a network front. */
+/* NOTE(review): background-position is presumably reset to 0 0 to override an icon-sprite offset inherited from the shared icon styles — confirm. */
+.icon-uk {
+  background-image:url(../img/uk.png);
+  background-position:0 0;
+}
+.icon-us {
+  background-image:url(../img/us.png);
+  background-position:0 0;
+}
+
@@ -30,6 +30,9 @@ object Application extends Controller {
}
val currentHits = Api.fullData
+ val ukFrontLinks = Backend.ukFrontLinkTracker.links()
+ val usFrontLinks = Backend.usFrontLinkTracker.links()
+
Backend.publishedContent.map { c =>
PublishedContent(
c.webPublicationDate, c.webUrl, c.webTitle,
@@ -39,7 +42,9 @@ object Application extends Controller {
c.tags,
currentHits.get(c.webUrl),
altTextOfMainImageFor(c),
- c.isLead.getOrElse(false)
+ c.isLead.getOrElse(false),
+ ukFrontLinks.contains(c.id),
+ usFrontLinks.contains(c.id)
)
}
}
@@ -65,7 +70,9 @@ case class PublishedContent(
tags: List[Tag],
hitReport: Option[HitReport],
altText: Option[String],
- isLead: Boolean
+ isLead: Boolean,
+ onUkFront: Boolean,
+ onUsFront: Boolean
) {
lazy val cpsCssClass = hitsPerSec match {
case "0" => "zero"
@@ -74,15 +81,4 @@ case class PublishedContent(
case "trace" => ""
case _ => "high"
}
-
- lazy val rowCssClass = if (hasNetworkFrontReferrer) "front-referral" else ""
-
- lazy val networkFrontTooltip =
- if (hasNetworkFrontReferrer) "Have seen referrals from the UK network front"
- else "No clicks to this page from the UK network front have been seen"
-
- lazy val networkFrontText = if (hasNetworkFrontReferrer) "NF" else ""
-
- lazy val hasNetworkFrontReferrer =
- hitReport map { _.referrers contains "http://www.guardian.co.uk/" } getOrElse false
}
@@ -19,12 +19,17 @@ object Backend {
val latestContent = new LatestContent
+ val ukFrontLinkTracker = new LinkTracker("http://www.guardian.co.uk")
+ val usFrontLinkTracker = new LinkTracker("http://www.guardiannews.com")
+
val mqReader = new MqReader(listener :: searchTerms :: Nil)
def start() {
system.scheduler.schedule(1 minute, 1 minute, listener, ClickStreamActor.TruncateClickStream)
system.scheduler.schedule(5 seconds, 5 seconds, listener, ClickStreamActor.SendClickStreamTo(calculator))
system.scheduler.schedule(5 seconds, 30 seconds) { latestContent.refresh() }
+ system.scheduler.schedule(1 seconds, 20 seconds) { ukFrontLinkTracker.refresh() }
+ system.scheduler.schedule(20 seconds, 60 seconds) { usFrontLinkTracker.refresh() }
spawn {
mqReader.start()
@@ -75,7 +75,7 @@ class LatestContent(implicit sys: ActorSystem) {
val contentMissingLeadStatus = contentList.filter(_.isLead.isEmpty)
val leadSections = contentMissingLeadStatus.flatMap(_.sectionId).sorted.distinct
- log.info("Need to find lead content status for " + leadSections)
+ log.info("Getting lead content status for " + leadSections)
val leadItemsPromise = for {
section <- leadSections
@@ -84,7 +84,7 @@ class LatestContent(implicit sys: ActorSystem) {
section -> leadContentForTag(sectionTag)
}
- log.info("leadItemsPromise = " + leadItemsPromise)
+ log.debug("leadItemsPromise = " + leadItemsPromise)
// now redeem those promises
val leadItems = leadItemsPromise.flatMap {
@@ -98,7 +98,7 @@ class LatestContent(implicit sys: ActorSystem) {
)
}.toMap
- log.info("leadItems = " + leadItems)
+ log.debug("leadItems = " + leadItems)
val result = contentList.map {
case c if c.isLead.isDefined => c
@@ -109,13 +109,14 @@ class LatestContent(implicit sys: ActorSystem) {
val isLead = leadList contains c.id
- log.info("%s (%s) -> isLead = %s" format (c.id, section, isLead))
- log.info("available lead content for this section: %s" format leadList.mkString("\t\n"))
+ log.debug("%s (%s) -> isLead = %s" format (c.id, section, isLead))
+ log.debug("available lead content for this section: %s" format leadList.mkString("\t\n"))
c.copy(isLead = Some(isLead))
}
-
-
+
+ log.info("Lead content processing complete")
+
result
}
}
@@ -0,0 +1,46 @@
+package lib
+
+import akka.actor.ActorSystem
+import akka.event.Logging._
+import akka.agent.Agent._
+import akka.agent.Agent
+import akka.event.Logging
+import play.api.libs.ws.WS
+import play.api.http.HeaderNames
+import org.jsoup.Jsoup
+import collection.JavaConversions._
+import java.net.URL
+
+/**
+ * Scrapes a front page and remembers which guardian.co.uk content paths it links to.
+ *
+ * @param url      address of the front page to scrape
+ * @param actorSys actor system used to create the logger and the agent holding the links
+ */
+class LinkTracker(url: String)(implicit actorSys: ActorSystem) {
+  private val log = Logging(actorSys, this.getClass)
+
+  // the current outgoing links on this front, stored as URL paths with leading slashes removed
+  val links = Agent[List[String]](Nil)
+
+  /**
+   * Fetches `url`, extracts every `http:` anchor whose host ends with "guardian.co.uk"
+   * and replaces the contents of `links` with the paths of those anchors.
+   *
+   * If the page cannot be fetched or parsed, the failure is logged and the previously
+   * stored links are kept, so one bad scrape does not wipe the known state.
+   */
+  def refresh() {
+    import HeaderNames._
+    import collection.JavaConverters._
+    import java.net.MalformedURLException
+    import scala.util.control.Exception.catching
+
+    links sendOff { current =>
+      try {
+        val retrievedLinksPromise = WS.url(url)
+          .withHeaders(USER_AGENT -> "SEO live dashboard; contact graham.tackley@guardian.co.uk")
+          .get()
+          .map { r =>
+            val doc = Jsoup.parse(r.body, url)
+
+            doc.select("a[href^=http:]").asScala
+              .map(_.attr("href"))
+              // a single unparseable href must not abort the whole scrape, so skip it
+              .flatMap(href => catching(classOf[MalformedURLException]).opt(new URL(href)))
+              .filter(_.getHost.endsWith("guardian.co.uk"))
+              .map(_.getPath.dropWhile(_ == '/'))
+              .toList
+          }
+
+        // wait for the response here: the agent's new value must be returned from this function
+        val retrievedLinks = retrievedLinksPromise.await.get
+
+        log.info("%d links on %s" format (retrievedLinks.size, url))
+
+        retrievedLinks
+      } catch {
+        case e: Exception =>
+          log.error(e, "Could not refresh links on %s; keeping the previous %d links" format (url, current.size))
+          current
+      }
+    }
+  }
+
+}
@@ -3,16 +3,21 @@
<table class="published-content table table-striped">
@for(c <- content) {
-<tr class="@c.rowCssClass">
+<tr>
<td class="pub-date">@c.publicationDate.toString("d MMM HH:mm:ss")</td>
<td class="cps">
<span class="label percent-cps @c.cpsCssClass" title="Average hits per second over the last 15 minutes">
@c.hitsPerSec
</span>
</td>
- <td class="front-referral-status" title="@c.networkFrontTooltip">
- @if(c.hasNetworkFrontReferrer) {
- <i class="icon-th-list"></i>
+ <td class="uk-front-referral-status">
+ @if(c.onUkFront) {
+ <i class="icon-uk" title="On www.guardian.co.uk"></i>
+ }
+ </td>
+ <td class="us-front-referral-status">
+ @if(c.onUsFront) {
+ <i class="icon-us" title="On www.guardiannews.com"></i>
}
</td>
<td class="lead-status">
View
@@ -13,6 +13,7 @@ object ApplicationBuild extends Build {
"com.gu.openplatform" %% "content-api-client" % "1.13",
"com.typesafe.akka" % "akka-agent" % "2.0-RC1",
"org.joda" % "joda-convert" % "1.1" % "provided",
+ "org.jsoup" % "jsoup" % "1.6.1",
"net.liftweb" %% "lift-json" % "2.4-M4",
"net.liftweb" %% "lift-json-ext" % "2.4-M4",
"org.specs2" %% "specs2" % "1.6.1" % "test"
View
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
View
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit c1c7e6f

Please sign in to comment.