From d643f38124e3d5299e1a15f7141e614935c0dd4f Mon Sep 17 00:00:00 2001
From: Wai Hon Law
Date: Sun, 29 Mar 2015 17:58:31 -0700
Subject: [PATCH] Integrate with CoreNLP via the `gangeli/CoreNLP-Scala`
 bindings. Tokenize the sentence and parse it into a Tree structure, then
 calculate the sentiment over that tree.

---
 .gitignore                                 |   4 +
 build.sbt                                  |   9 +-
 data/import_eventserver.py                 |   8 +-
 data/send_query.py                         |  18 ++
 engine.json                                |   4 +-
 src/main/scala/Algorithm.scala             |  49 ++++
 src/main/scala/CoreNLP-Scala/Makefile      |  44 +++
 src/main/scala/CoreNLP-Scala/README.md     |   3 +
 .../src/edu/stanford/nlp/Berkeley.scala    |  90 ++++++
 .../src/edu/stanford/nlp/Classify.scala    | 272 ++++++++++++++++++
 .../src/edu/stanford/nlp/Document.scala    |  55 ++++
 .../src/edu/stanford/nlp/Magic.scala       |  76 +++++
 .../src/edu/stanford/nlp/NLP.scala         | 196 +++++++++++++
 .../src/edu/stanford/nlp/NLPConfig.scala   |  44 +++
 .../src/edu/stanford/nlp/Optimize.scala    | 157 ++++++++++
 .../src/edu/stanford/nlp/Sentence.scala    | 246 ++++++++++++++++
 .../src/edu/stanford/nlp/TokensRegex.scala |  82 ++++++
 src/main/scala/DataSource.scala            |   6 +-
 src/main/scala/DummyAlgorithm.scala        |  34 ---
 src/main/scala/DummyModel.scala            |   3 -
 src/main/scala/Engine.scala                |   2 +-
 src/main/scala/Model.scala                 |  73 +++++
 22 files changed, 1427 insertions(+), 48 deletions(-)
 create mode 100644 data/send_query.py
 create mode 100644 src/main/scala/Algorithm.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/Makefile
 create mode 100644 src/main/scala/CoreNLP-Scala/README.md
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Classify.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Document.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Magic.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLP.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLPConfig.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Optimize.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Sentence.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/TokensRegex.scala
 delete mode 100644 src/main/scala/DummyAlgorithm.scala
 delete mode 100644 src/main/scala/DummyModel.scala
 create mode 100644 src/main/scala/Model.scala

diff --git a/.gitignore b/.gitignore
index 5f1f0a2..2abf820 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,8 @@ manifest.json
 pio.log
 /pio.sbt
 target/
+data/*.csv
+data/*.tsv
+data/*.zip
+data/gen_submission.py
 *~
diff --git a/build.sbt b/build.sbt
index 473124e..361b159 100644
--- a/build.sbt
+++ b/build.sbt
@@ -6,7 +6,14 @@ name := "template-scala-sentiment-analysis"
 
 organization := "io.prediction"
 
+excludeFilter in unmanagedSources := "Berkeley.scala"
+
 libraryDependencies ++= Seq(
   "io.prediction"    %% "core"          % pioVersion.value % "provided",
   "org.apache.spark" %% "spark-core"    % "1.2.0" % "provided",
-  "org.apache.spark" %% "spark-mllib"   % "1.2.0" % "provided")
+  "org.apache.spark" %% "spark-mllib"   % "1.2.0" % "provided",
+  "edu.stanford.nlp" % "stanford-corenlp" % "3.4",
+  "edu.stanford.nlp" % "stanford-corenlp" % "3.4" classifier "models",
+  "edu.stanford.nlp" % "stanford-parser" % "3.4"
+)
+
diff --git a/data/import_eventserver.py b/data/import_eventserver.py
index 444106a..49cd2a0 100644
--- a/data/import_eventserver.py
+++ b/data/import_eventserver.py
@@ -13,11 +13,11 @@ def import_events(client, file):
     data = line.rstrip('\r\n').split("\t")
line.rstrip('\r\n').split("\t") if True: client.create_event( - event="rate", + event="train", entity_type="user", entity_id=data[0], properties= { - "sentence" : str(data[2]), + "phrase" : str(data[2]), "sentiment" : float(data[3]) } ) @@ -41,6 +41,6 @@ def import_events(client, file): client = predictionio.EventClient( access_key=args.access_key, url=args.url, - threads=5, - qsize=500) + threads=10, + qsize=1000) import_events(client, args.file) diff --git a/data/send_query.py b/data/send_query.py new file mode 100644 index 0000000..72f2e39 --- /dev/null +++ b/data/send_query.py @@ -0,0 +1,18 @@ +""" +Send sample query to prediction engine +""" + +import predictionio +client = predictionio.EngineClient(url="http://localhost:8000") + +def test(s): + print s + ' : ' + str(client.send_query({"s": s})['sentiment']) + +test('sad') +test('happy') +test('oh') +test('not') +test('not sad') +test('very sad') +test('very happy') +test('not very sad') diff --git a/engine.json b/engine.json index c52c636..bf61e5a 100644 --- a/engine.json +++ b/engine.json @@ -9,9 +9,9 @@ }, "algorithms": [ { - "name": "dummy", + "name": "nlpparse", "params": { - + "baseWeight": 1 } } ] diff --git a/src/main/scala/Algorithm.scala b/src/main/scala/Algorithm.scala new file mode 100644 index 0000000..98e3a76 --- /dev/null +++ b/src/main/scala/Algorithm.scala @@ -0,0 +1,49 @@ +package org.template.sentimentanalysis + +import io.prediction.controller.PAlgorithm +import io.prediction.controller.Params +import io.prediction.data.storage.BiMap + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD + +import edu.stanford.nlp.Magic._ + +import grizzled.slf4j.Logger + +case class AlgorithmParams( + val baseWeight: Double +)extends Params + +class Algorithm(val ap: AlgorithmParams) + extends PAlgorithm[PreparedData, Model, Query, PredictedResult] { + + @transient lazy val logger = Logger[this.type] + + def train(sc: SparkContext, data: PreparedData): Model = { + require(!data.sentiments.take(1).isEmpty, + s"RDD[sentiments] in PreparedData cannot be empty." + + " Please check if DataSource generates TrainingData" + + " and Preprator generates PreparedData correctly.") + + val itemSets: RDD[(String, Double)] = data.sentiments.map( + s => (s.phrase.toLowerCase(), s.sentiment) + ).cache() + + val rules = itemSets.groupByKey + .mapValues( + // assume the last training data is the most up-to-date + iter => iter.toVector.last + ) + .collectAsMap.toMap + + new Model(rules) + } + + def predict(model: Model, query: Query): PredictedResult = { + new PredictedResult( + model.getSentiment(query.s, ap) + ) + } +} diff --git a/src/main/scala/CoreNLP-Scala/Makefile b/src/main/scala/CoreNLP-Scala/Makefile new file mode 100644 index 0000000..d833b82 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/Makefile @@ -0,0 +1,44 @@ +# +# To Build: +# 1. Set CORENLP_HOME to the root of CoreNLP +# 2. [optional] Set BERKELEY to the path to the Berkeley parser +# 3. Build using either 'make stanford' or 'make berkeley' (if the Berkeley parser is configured) +# + +CORENLP=$(CORENLP_HOME)/classes:$(CORENLP_HOME)/lib/joda-time.jar:$(CORENLP_HOME)/lib/jollyday-0.4.7.jar +BERKELEY=$(CORENLP_HOME)/../more/lib/BerkeleyParser.jar + +JAVAC=javac +SCALAC=scalac + +SRC=src +SOURCES = $(wildcard src/edu/stanford/nlp/*.scala) +TEST_SRC=test/src +LIB=lib +BUILD=classes +TEST_BUILD=test/classes +DIST=dist + +dist: stanford + mkdir -p ${DIST} + jar cf ${DIST}/corenlp-scala.jar -C $(BUILD) . 
+	jar uf ${DIST}/corenlp-scala.jar -C $(SRC) .
+
+berkeley: stanford
+	$(SCALAC) -cp $(CORENLP):${BERKELEY} -d $(BUILD) `find $(SRC) -name "*.scala"`
+
+stanford: ${SOURCES}
+	mkdir -p $(BUILD)
+	sed -e 's/BerkeleyUtil.berkeleyParser/throw new IllegalStateException("Could not find parser model (and was not compiled to run with Berkeley parser)")/g' ${SRC}/edu/stanford/nlp/NLP.scala > /tmp/NLP_stanfordonly.scala
+	$(SCALAC) -cp $(CORENLP) -d $(BUILD) `find $(SRC) -name "*.scala" ! -name "*Berkeley.scala" ! -name "NLP.scala"` /tmp/NLP_stanfordonly.scala
+	rm /tmp/NLP_stanfordonly.scala
+
+default: stanford
+
+clean:
+	rm -r $(BUILD)
+	rm -r ${DIST}
+
+
+cmd:
+	@echo "scala -J-Xmx4G -cp $(CORENLP):$(BUILD)":${HOME}/lib/corenlp-models.jar
diff --git a/src/main/scala/CoreNLP-Scala/README.md b/src/main/scala/CoreNLP-Scala/README.md
new file mode 100644
index 0000000..7f5ae08
--- /dev/null
+++ b/src/main/scala/CoreNLP-Scala/README.md
@@ -0,0 +1,3 @@
+Since gangeli/CoreNLP-Scala is not published as an artifact that can be
+pulled in through build.sbt, its sources are copied here from
+https://github.com/gangeli/CoreNLP-Scala.
diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala
new file mode 100644
index 0000000..4d52f79
--- /dev/null
+++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala
@@ -0,0 +1,90 @@
+package edu.stanford.nlp;
+
+import scala.collection.JavaConversions._
+import scala.concurrent.Lock
+
+import edu.stanford.nlp.trees.Tree
+import edu.stanford.nlp.trees.Trees
+import edu.stanford.nlp.trees.LabeledScoredTreeNode
+import edu.stanford.nlp.ling.HasWord
+import edu.stanford.nlp.ling.Word
+
+import edu.berkeley.nlp.PCFGLA._
+import edu.berkeley.nlp.util.Numberer
+
+import NLPConfig._
+
+object BerkeleyUtil {
+  type BerkeleyTree = edu.berkeley.nlp.syntax.Tree[String]
+
+  implicit def stanfordTree2BerkeleyTree(btree:BerkeleyTree):Tree = {
+    val roots = TreeAnnotations.unAnnotateTree(btree).getChildren;
+    if (roots.isEmpty) {
+      new LabeledScoredTreeNode();
+    } else {
+      def convert(src:BerkeleyTree):Tree = {
+        val dst:Tree = new LabeledScoredTreeNode
+        if (src.getLabel != null) dst.setLabel(new Word(src.getLabel))
+        dst.setChildren(src.getChildren.map( convert(_) ).toArray)
+        dst
+      }
+      new LabeledScoredTreeNode(new Word("TOP"),
+        List[Tree](convert(roots.get(0))))
+    }
+  }
+
+  lazy val berkeleyParser = {
+    // (function to create parser)
+    def mkParser = {
+      // (setup parser)
+      val pData = ParserData.Load(parse.model)
+      if (pData == null) throw new RuntimeException("Failed to load Berkeley parser model")
+      val grammar = pData.getGrammar();
+      val lexicon = pData.getLexicon();
+      Numberer.setNumberers(pData.getNumbs());
+      // (create parser object)
+      val parser = new CoarseToFineMaxRuleParser(
+        grammar, lexicon, 1.0, -1, false, false, false,
+        false, false, true, true)
+      // (set binarization)
+      try {
+        val binarizationField = classOf[ConstrainedArrayParser].getDeclaredField("binarization");
+        binarizationField.setAccessible(true);
+        binarizationField.set(parser, pData.getBinarization());
+        binarizationField.setAccessible(false);
+      } catch { case (e:Exception) => throw new RuntimeException(e) }
+      // (parser object)
+      new {
+        def parse(words:List[String], pos:List[String]):Tree = {
+          var parsedTree:BerkeleyTree
+            = parser.getBestConstrainedParse(words, pos, null);
+          if (parsedTree.getChildren().isEmpty()) {
+            parsedTree = parser.getBestConstrainedParse(words, null, null);
+          }
+          parsedTree
+        }
+      }
+    }
+    // (create parsers)
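+    // One (parser, lock) pair is created per available thread; parse() below
+    // picks the first parser whose lock is free, or sleeps and retries until
+    // one frees up.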
+ val parsers = (0 until numThreads).map{ x => (mkParser, new Lock) }.toList + // (multithreaded implementation) + new { + def parse(words:List[String], pos:List[String]):Tree = { + def tryParse:Tree = { + val validParser = parsers.indexWhere{ + (pair:({def parse(words:List[String],pos:List[String]):Tree},Lock)) => + pair._2.available + } + if (validParser >= 0) { // case: [likely] found parser to run + val (parser, lock) = parsers(validParser) + lock.acquire + val rtn = parser.parse(words, pos) + lock.release + rtn + } else { Thread.sleep(1000); tryParse } // case: no parser found + } + tryParse + } + } + } +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Classify.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Classify.scala new file mode 100644 index 0000000..f3a6866 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Classify.scala @@ -0,0 +1,272 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.io.IOUtils +import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ +import NLP._ + +// ---------- +// Classifiers +// ---------- +@SerialVersionUID(1l) +class Classifier[I,O]( + regression:I=>Map[O,Double], + val data:Map[I,(O,Float)]) extends Function1[I,O] with Serializable { + override def apply(in:I):O = { + regression(in).maxBy(_._2)._1 + } +} + +class Mapping[I,O](map:Map[I,(O,Float)]) { + import Mapping.{toCounter,defaultFeatures} + + def scorer[F](featurizer:I=>Iterable[F]):I=>Map[O,Double] = { + // -- Create Dataset + val weights = new Array[Float](map.size) + val dataset = new RVFDataset[O, F](map.size) + map.zipWithIndex.foreach{ + case ((input:I, (output:O, weight:Float)),i:Int) => + weights(i) = weight + dataset.add( new RVFDatum[O, F](toCounter(featurizer(input)), output) ) + } + // -- Train + val prior = new LogPrior(LogPrior.LogPriorType.QUADRATIC) + val factory = new LinearClassifierFactory[O,F]() + val classifier = factory.trainClassifier(dataset, weights, prior) + // -- Return + (input:I) => { + val scores = classifier.scoresOf( + new 
RVFDatum[O, F](toCounter(featurizer(input)), null.asInstanceOf[O])) + scores.keySet.map{ x => (x, scores.getCount(x)) }.toMap + } + } + def scorer:I=>Map[O,Double] = scorer(defaultFeatures(_, map.size)) + + def classifier[F](featurizer:I=>Iterable[F]):Classifier[I,O] + = new Classifier(scorer(featurizer), map) + def classifier:Classifier[I,O] + = classifier(defaultFeatures(_, map.size)) +} + +object Mapping { + def toCounter[X,F](map:Iterable[X]):Counter[F] = { + val counts = new ClassicCounter[F] + map.foreach{ (x:X) => x match { + case (feat:F, n:Number) => counts.incrementCount(feat, n.doubleValue) + case (feat:F) => counts.incrementCount(feat, 1) + case _ => throw new IllegalStateException("Type mismatch in toCounter") + } } + return counts + } + + def apply[I,O,X](map:Map[I,X]):Mapping[I,O] = { + new Mapping(map.map{ case (i:I, x:X) => x match { + case (o:O, n:Number) => (i, (o, n.floatValue)) + case (o:O) => (i, (o, 1.0.asInstanceOf[Float])) + case _ => throw new IllegalStateException("Type mismatch in toCounter") + } }) + } + + def defaultFeatures[I](input:I, datasetSize:Int):Iterable[(String,Float)] = { + def ngram[A](seq:List[A], n:Int, tail:List[A] = Nil):List[String] = { + if (seq.isEmpty) Nil + else (seq.head :: tail.slice(0, n-1)).reverse.mkString("_") :: ngram(seq.tail, n, seq.head :: tail) + } + input match { + case (sent:Sentence) => + val n:Int = (scala.math.log10(datasetSize) / 3.0).toInt + 1 + // N-grams + (ngram(sent.words.toList, n) ::: + ngram(sent.words.toList.map( _.toLowerCase ), n) ::: + ngram(sent.lemma.toList, n) ::: + ngram(sent.ner.toList, n) ::: + ngram(sent.pos.toList, n) ::: + // Bag-of-words + { if (n > 1) + sent.words.toList ::: + sent.words.toList.map( _.toLowerCase ) ::: + sent.lemma.toList ::: + sent.ner.toList ::: + sent.pos.toList + else Nil } + ).map{ (_, 1.0.toFloat) } + case (str:String) => + val tokens = str.split(" ") + val n:Int = (scala.math.log10(datasetSize) / 3.0).toInt + 1 + if (tokens.length <= 1) { + // Case: a single word + (tokens(0) :: // memorize + ngram(str.toCharArray.toList, n) ::: // literal n-grams + ngram(str.toLowerCase.toCharArray.toList, n) // case-insensitive n-grams + ).map{ (_, 1.0.toFloat) } + } else { + // Case: a phrase + (ngram(tokens.toList, n) ::: // literal n-grams + ngram(tokens.toList.map( _.toLowerCase), n) // case-insensitive n-grams + ).map{ (_, 1.0.toFloat) } + } + case (seq:Iterable[Any]) => + seq.map{ (x:Any) => x match { + case (feat:Any, n:Number) => (feat.toString, n.floatValue) + case (feat:Any) => (feat.toString, 1.0.toFloat) + case _ => (x.toString, 1.0.toFloat) + } } + case _ => List[(String,Float)]( (input.toString, 1.0.toFloat) ) + } + } +} + +// ---------- +// Ensemble Classifiers +// ---------- + +class Ensemble[I](members:Seq[I=>Boolean], dat:Option[Map[I,(Boolean,Float)]]) { + // -- Get Data + if (!dat.isDefined) { + members.foldLeft(Option[Map[I,(Boolean,Float)]](null)){ + (dat:Option[Map[I,(Boolean,Float)]], fn:I=>Boolean) => + fn match { + case (classifier:Classifier[I,Boolean]) => + dat match { + case Some(existingData) => + if (classifier.data != existingData) { + warn("Classifiers trained on different data; taking union") + Some(classifier.data ++ existingData) + } else { + Some(existingData) + } + case None => Some(classifier.data) + } + case _ => dat + } + } + } + + // -- Methods + def data(d:Map[I,(Boolean,Float)]):Ensemble[I] = new Ensemble(members, Some(d)) + def data(d:Seq[(I,Boolean)]):Ensemble[I] + = data( d.map( x => (x._1, (x._2, 1.0f)) ).toMap ) + + /** + * Implementation of 
AdaBoost. + * Taken from http://en.wikipedia.org/wiki/AdaBoost + */ + def boost(data:Map[I,(Boolean,Float)]):Classifier[I,Boolean] = { + if (data.isEmpty) throw new IllegalArgumentException("No data to train on!") + // -- Cache + startTrack("Running Weak Learners") + val dataAsArray = data.toArray + val gold = dataAsArray.map( _._2._1 ) + val predictions:Array[(I=>Boolean,Array[(Boolean, Float)])] + = members.toList.par.map{ (h:I=>Boolean) => + log("running " + h.toString) + (h, dataAsArray.map{ case (in:I, (out:Boolean, weight:Float)) => + (h(in), weight) + }) + }.toArray + endTrack("Running Weak Learners") + // -- Error Rate + def error(predictions:Array[(Boolean,Float)], + gold:Array[Boolean], + d:Array[Double] = (0 until data.size).map( x => 1.0 / data.size ).toArray + ):Double = { + predictions.zip(gold).zip(d).foldLeft(0.0){ + case (sum:Double, + (( (guess:Boolean, weight:Float), + gold:Boolean), + di:Double)) => + if(guess == gold) sum else sum + di * weight + } + } + def regressor(coefficients:Seq[(Double, I=>Boolean)] + ):(I => Map[Boolean, Double]) = (in:I) => { + val sum = coefficients.foldLeft(0.0){ + case (sum:Double, (alpha:Double, h:(I=>Boolean))) => + sum + alpha * { if(h(in)) 1.0 else -1.0 } + } + Map[Boolean, Double]( true -> {if(sum >= 0.0) 1.0 else 0.0 }, + false -> {if(sum >= 0.0) 0.0 else 1.0 } ) + } + // -- Run an Iteration + def iter(t:Int, + predictions:Array[(I=>Boolean, Array[(Boolean,Float)])], + gold:Array[Boolean], + soFar:List[(Double, I=>Boolean)], + d:Array[Double] = data.map( x => 1.0 / data.size.toDouble ).toArray, + tolerance:Double = NLPConfig.classify.tolerance + ):List[(Double, I=>Boolean)] = { + startTrack("Iteration " + t) + // (get errors) + val errors = predictions.map{ case (h, pred:Array[(Boolean,Float)]) => + ( h, pred, error(pred, gold, d) ) + } + val (hOpt, predOpt, et) = errors.maxBy( x => scala.math.abs(0.5 - x._3) ) + // (compute update) + log("optimal classifier: " + hOpt) + log("e_t: " + et) + val at = 0.5 * scala.math.log( (1.0 - et) / et ) + val newD = predOpt.zip(gold).zip(d).map{ + case (((guess:Boolean, weight:Float), gold:Boolean), di:Double) => + di * scala.math.exp(- {if (guess == gold) 1.0 else -1.0} * at) + } + val sumD = newD.sum + for (i <- 0 until newD.length) { newD(i) /= sumD } + // (update coefficients) + val coeffs = (at, hOpt) :: soFar + log("a_t: " + at) + endTrack("Iteration " + t) + // (recurse) + if ( scala.math.abs(0.5 - et) < tolerance || + t >= NLPConfig.classify.iterations) { + coeffs + } else { + iter(t+1, predictions, gold, coeffs, newD, tolerance) + } + } + // -- Construct Classifier + startTrack("Boosting over " + members.length + " classifier and " + data.size + " examples") + val fn = regressor(iter(1, predictions, gold, Nil)) + endTrack("Boosting over " + members.length + " classifier and " + data.size + " examples") + new Classifier(fn, data) + } + + def boost:Classifier[I,Boolean] + = boost(dat.getOrElse(Map[I,(Boolean,Float)]())) +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Document.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Document.scala new file mode 100644 index 0000000..0659a50 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Document.scala @@ -0,0 +1,55 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import 
java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.io.IOUtils +import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ +import NLP._ + + +object Document { +} + + +@SerialVersionUID(1l) +case class Document(sentences:Array[String]) { + // TODO(gabor) coreference +} + + diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Magic.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Magic.scala new file mode 100644 index 0000000..692ac76 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Magic.scala @@ -0,0 +1,76 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.io.IOUtils +import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ + + +object Magic { + import NLP._ + + /* + * Implicit Conversions + */ + implicit def seq2nlpseq(seq:Seq[String]):Sentence = new 
Sentence(seq) + implicit def string2nlpseq(gloss:String):Sentence = new Sentence(gloss) + + implicit def map2mapping[I,O,X](map:Map[I,X]):Mapping[I,O] = Mapping(map) + + implicit def seq2ensemble[I](seq:Seq[I=>Boolean]):Ensemble[I] = new Ensemble(seq, None) + + implicit def fn2optimizable( + fn:Array[Double]=>Double):OptimizableFunction = { + optimize.algorithm.toLowerCase match { + case "lbfgs" => LBFGSOptimizableApproximateFunction(fn, None) + case "braindead" => BraindeadGradientDescent(fn, None) + case _ => throw new IllegalStateException("Unknown algorithm: " + optimize.algorithm) + } + } + implicit def fnPair2optimizable( + pair:(Array[Double]=>Double,Array[Double]=>Array[Double])):OptimizableFunction = { + optimize.algorithm.toLowerCase match { + case "lbfgs" => LBFGSOptimizableApproximateFunction(pair._1, Some(pair._2)) + case "braindead" => BraindeadGradientDescent(pair._1, Some(pair._2)) + case _ => throw new IllegalStateException("Unknown algorithm: " + optimize.algorithm) + } + } + + implicit def string2tokensregex(str:String):TokensRegex = new TokensRegex(str) +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLP.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLP.scala new file mode 100644 index 0000000..16cc267 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLP.scala @@ -0,0 +1,196 @@ +package edu.stanford.nlp + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.ling.CoreAnnotations._ +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.ie.crf.CRFBiasedClassifier +import edu.stanford.nlp.io.IOUtils +import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ + +object NLP { + implicit def list2hasWordList(lst:Seq[String]):java.util.List[_<:HasWord] + = lst.map( new Word(_) ).toList + + // ---------- + // Parsers + // ---------- + lazy val stanfordParser = { + val parser = LexicalizedParser.loadModel(parse.model) + new { + def parse(words:List[String], pos:List[String]):Tree = { + parser.parseStrings(words); + } + } + } + lazy val parser = stanfordParser + // ---------- + // Stanford CoreNLP 
Components + // ---------- + lazy val tagger = new MaxentTagger(pos.model) + + lazy val collinsHeadFinder = new CollinsHeadFinder() + + lazy val morph:((Morphology=>Any)=>Any) = { + val morph = new Morphology() + val morphLock = new Lock() + val f = { (fn:Morphology=>Any) => + morphLock.acquire; + val rtn = fn(morph); + morphLock.release + rtn + } + f + } + + lazy val nerCRF:(Array[String], Array[String])=>Array[String] = { + val classifier = new NERClassifierCombiner(ner.model, ner.aux); + (words:Array[String], pos:Array[String]) => { + val offsets:List[Int] = words.foldLeft( (List[Int](), 0) ){ + case ((offsetsSoFar:List[Int], offset:Int), word:String) => + (offset :: offsetsSoFar, offset + word.length + 1) + }._1.reverse + // (construct CoreLabel sentence) + val coreSentence = new java.util.ArrayList[CoreLabel](words.length) + words.zip(pos).zip(offsets)foreach{ + case ((word:String, pos:String), offset:Int) => + val label = new CoreLabel + label.setWord(word) + label.setOriginalText(word) + label.setTag(pos) + label.setBeginPosition(offset) + label.setEndPosition(offset + word.length) + coreSentence.add(label) + } + // (classify) + classifier.classifySentence(coreSentence) + val output:java.util.List[CoreLabel] = classifier.classifySentence(coreSentence); + // (convert back) + output.map{ (label:CoreLabel) => + label.ner() + }.toArray + } + } + + /** + * The TrueCase classifier implementation. + * Takes as input an array of tokens, POS tags, and lemmas, + * and returns as output the tokens with their true case applied. + * The length of the tokens, POS tags, and lemmas must match. + * @return An array of tokens (words as Strings) of the same length + * as the input tokens, but with their inferred true case. + */ + lazy val trueCaser:(Array[String], Array[String], Array[String])=>Array[String] = { + // Create classifier + val props:Properties = { + val p = new Properties + p.setProperty("loadClassifier", NLPConfig.truecase.model) + p.setProperty("mixedCaseMapFile", NLPConfig.truecase.disambiguation_list) + p.setProperty("classBias", NLPConfig.truecase.bias) + p + } + val classifier = new CRFBiasedClassifier[CoreLabel](props); + classifier.loadClassifierNoExceptions(NLPConfig.truecase.model, props); + // Set classifier biases + NLPConfig.truecase.bias.split(",").foreach{ (bias:String) => + val terms = bias.split(":") + classifier.setBiasWeight(terms(0), terms(1).toDouble) + } + // Get mixed case map + val mixedCaseMap:Map[String,String] + = scala.io.Source.fromInputStream(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(NLPConfig.truecase.disambiguation_list)) + .getLines + .map( _.trim.split("""\s+""") ) + .map{ case Array(a:String, b:String) => (a ,b) } + .toMap + // Return function + (words:Array[String], pos:Array[String], lemma:Array[String]) => { + // (mock offsets) + val offsets:List[Int] = words.foldLeft( (List[Int](), 0) ){ + case ((offsetsSoFar:List[Int], offset:Int), word:String) => + (offset :: offsetsSoFar, offset + word.length + 1) + }._1.reverse + // (construct CoreLabel sentence) + val coreSentence = new java.util.ArrayList[CoreLabel](words.length) + words.zip(pos).zip(offsets)foreach{ + case ((word:String, pos:String), offset:Int) => + val label = new CoreLabel + label.setWord(word.toLowerCase) + label.setOriginalText(word) + label.setTag(pos) + label.setBeginPosition(offset) + label.setEndPosition(offset + word.length) + coreSentence.add(label) + } + // (classify) + val output:java.util.List[CoreLabel] = classifier.classifySentence(coreSentence); + // (convert back) 
+ output.map{ (label:CoreLabel) => + val word:String = label.word + label.get(classOf[AnswerAnnotation]) match { + case "UPPER" => word.toUpperCase + case "LOWER" => word.toLowerCase + case "INIT_UPPER" => word.substring(0, 1).toUpperCase + word.substring(1).toLowerCase + case "O" => mixedCaseMap.get(word).getOrElse(word) + case _ => word + } + }.toArray + } + } + + // ---------- + // Methods + // ---------- + def preload(obj: => Any) { new Thread(){ override def run:Unit = obj }.start } +} + +trait CoreLabelSeq extends Seq[CoreLabel] { + // + // Trivial overrides (still have to define apply(Int):CoreLabel and length:Int though) + // + override def iterator:Iterator[CoreLabel] = new Iterator[CoreLabel] { + var index:Int = 0 + override def hasNext:Boolean = index < CoreLabelSeq.this.length + override def next:CoreLabel = { index += 1; apply(index - 1); } + } + + // + // Common Methods + // + def matches(t:TokensRegex) = t.matches(this) +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLPConfig.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLPConfig.scala new file mode 100644 index 0000000..da165e0 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLPConfig.scala @@ -0,0 +1,44 @@ +package edu.stanford.nlp + +import edu.stanford.nlp.pipeline.DefaultPaths._ + +object NLPConfig { + object parse { + var model:String = DEFAULT_PARSER_MODEL + } + + object pos { + var model:String = DEFAULT_POS_MODEL + } + + object ner { + var model:String = DEFAULT_NER_CONLL_MODEL + var aux:String = DEFAULT_NER_MUC_MODEL + } + + object classify { + var tolerance:Double = 1e-5 + var iterations:Double = 40 + } + + object optimize { + var tolerance:Double = 1e-5 + var wiggle:Double = 1e-5 + var algorithm = "LBFGS" // | braindead | ... 
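+    // Read by Magic.fn2optimizable: "lbfgs" wraps CoreNLP's QNMinimizer,
+    // while "braindead" selects the simple gradient descent in Optimize.scala.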
+ } + + object truecase { + var model:String = "edu/stanford/nlp/models/truecase/truecasing.fast.caseless.qn.ser.gz" + var disambiguation_list:String = "edu/stanford/nlp/models/truecase/MixDisambiguation.list" + var bias:String = "INIT_UPPER:-0.7,UPPER:-0.7,O:0" + } + + def caseless:Unit = { + parse.model = "edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz" + pos.model = "edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger" + ner.model = "edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz" + ner.aux = "edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz" + } + + var numThreads = Runtime.getRuntime().availableProcessors(); +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Optimize.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Optimize.scala new file mode 100644 index 0000000..e7c8384 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Optimize.scala @@ -0,0 +1,157 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.io.IOUtils +import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ +import NLP._ +import Optimize._ + +// ---------- +// Optimizers +// ---------- +object Optimize { + def empiricalDerivative(fn:Array[Double]=>Double, + x:Array[Double]):Array[Double] = { + val y0 = fn(x) + def tweak(i:Int, delta:Double):(Double, Double) = { + x(i) += delta + val y1 = fn(x) + x(i) -= delta + if (delta < 1e-5 * optimize.wiggle || delta > 1e5 * optimize.wiggle) { + (y1, delta) + } else { + if (scala.math.abs(y1 - y0) / delta > 1e5) tweak(i, delta / 2.0) + else if (scala.math.abs(y1 - y0) / delta < 1e-5) tweak(i, delta * 2.0) + else (y1, delta) + } + } + {for (i <- 0 until x.length) yield { + val (y1, step) = tweak(i, optimize.wiggle) + (y1 - y0) / step + }}.toArray + } +} + +trait OptimizableFunction { + def minimize(initial:Array[Double]):Array[Double] + def derivative(ddx:Array[Double]=>Array[Double]):OptimizableFunction +} + +/** + * A wrapper for QNMinimizer (L-BFGS) 
+*/ +case class LBFGSOptimizableApproximateFunction( + fn:Array[Double]=>Double, derivative:Option[Array[Double]=>Array[Double]]) + extends OptimizableFunction{ + + override def minimize(initial:Array[Double]):Array[Double] = { + // (define a differentiable function) + val javaFn:DiffFunction = new DiffFunction { + override def domainDimension:Int = initial.length + override def valueAt(x:Array[Double]):Double = fn(x) + override def derivativeAt(x:Array[Double]):Array[Double] = { + derivative match { + case Some(ddx) => ddx(x) + case None => empiricalDerivative(fn, x) + } + } + } + // (optimize using QNMinimizer) + val javaInit = initial.map{ (n:Double) => n } + val optimizer = new QNMinimizer() + optimizer.setRobustOptions() + optimizer.minimize(javaFn, optimize.tolerance, javaInit) + } + + override def derivative(ddx:Array[Double]=>Array[Double]):LBFGSOptimizableApproximateFunction + = new LBFGSOptimizableApproximateFunction(fn, Some(ddx)) +} + +/** + * An optimization algorithm I made up (thus, "braindead"), that tries its + * best to move against the gradient (thus, "gradient descent"). + * The only motivation to use this over L-BFGS is that it's more robust to + * non-convex problems (i.e., won't crash and burn). +*/ +case class BraindeadGradientDescent( + fn:Array[Double]=>Double, derivative:Option[Array[Double]=>Array[Double]]) + extends OptimizableFunction{ + + override def minimize(initial:Array[Double]):Array[Double] = { + // (helpers) + def dx(x:Array[Double], y0:Double):Array[Double] = derivative match { + case Some(ddx) => ddx(x) + case None => empiricalDerivative(fn, x) + } + def move(init:Array[Double], direction:Array[Double], scaling:Double):Array[Double] = { + init.zip(direction).map{ case (a:Double, d:Double) => a + scaling * d} + } + def isImprovementOver(newY:Double, y:Double):Boolean + = newY + optimize.tolerance < y + // (state) + val initialX:Array[Double] = initial + val initialY:Double = fn(initialX) + var x:Array[Double] = initialX + var y:Double = initialY + var numIters = 0 + // (optimization) + while (numIters < 100) { + var step:Double = 1.0 + val dir:Array[Double] = dx(x, y).map( - _ ) + var newX:Array[Double] = move(x, dir, step) + var newY:Double = fn(newX) + while (!isImprovementOver(newY, y) && step > 1e-5) { + step /= 2.0 + newX = move(x, dir, step) + newY = fn(newX) + } + if (step <= 1e-5) return x // convergence + assert(newY < y, "Function value did not decrease!") + x = newX + y = newY + numIters += 1 + } + // (timeout -- no convergence) + return x + } + + override def derivative(ddx:Array[Double]=>Array[Double]):BraindeadGradientDescent + = new BraindeadGradientDescent(fn, Some(ddx)) +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Sentence.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Sentence.scala new file mode 100644 index 0000000..404499a --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Sentence.scala @@ -0,0 +1,246 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.io.IOUtils 
+import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ +import NLP._ + +object Sentence { + val tokenizerFactory = PTBTokenizer.factory + val grammaticalStructureFactory + = new PennTreebankLanguagePack().grammaticalStructureFactory + + def apply(word:Seq[String]):Sentence = new Sentence(word.toArray) + def apply(gloss:String):Sentence = new Sentence(gloss) +} + + +@SerialVersionUID(2l) +case class Sentence(word:Array[String]) extends CoreLabelSeq { + + def this(word:Seq[String]) = this(word.toArray) + + def this(sentence:String) = this( + Sentence.tokenizerFactory.getTokenizer(new java.io.StringReader(sentence)) + .tokenize + .map( _.word ) + .toArray + ) + + // + // Necessary Overrides for Seq[CoreLabel] + // + override def length:Int = word.length + override def apply(index:Int):CoreLabel = { + val label = new CoreLabel(8) + label.setWord(word(index)) + label.setTag(pos(index)) + if (index > 0) { label.setAfter(word(index - 1)) } + if (index < word.length - 1) { label.setBefore(word(index + 1)) } + label.setNER(ner(index)) + label.setLemma(lemma(index)) + label.setIndex(index) + // TODO(gabor) things like character offsets, original text, etc. 
+ label + } + + + + var id:Option[Int] = None + // values + lazy val parse:Tree = { + NLP.parser.parse(word.toList, pos.toList) + } + + lazy val stanfordDependencies:Array[(Int, String)] = { + if (length == 0) { + new Array[(Int, String)](0) + } else { + val depArray = new Array[(Int, String)](length) + // (get dependencies) + val structure:GrammaticalStructure + = Sentence.grammaticalStructureFactory.newGrammaticalStructure(parse) + val deps:java.util.Collection[TypedDependency] + = structure.typedDependencies() + // (fill dependencies) + deps.foreach{ (arc:TypedDependency) => + depArray(arc.dep.index - 1) = + ( arc.gov.index - 1, + arc.reln.getShortName + {if (arc.reln.getSpecific == null) "" else "_" + arc.reln.getSpecific} ) + } + // (pad empty dependencies) + for (i <- 0 until depArray.length) { + if (depArray(i) == null) depArray(i) = (i, "noop") + } + depArray + } + } + + def dependencyRoot:Int + = stanfordDependencies.zipWithIndex.filter( _._1._1 < 0 ).headOption match { + case Some( (dep, index) ) => index + case None => throw new IllegalStateException("Could not find head: '" + + this + "' --- dependencies: " + stanfordDependencies.mkString(" ")) + } + + def dependencyChild(root:Int, depType:String):Option[Int] + = stanfordDependencies.zipWithIndex.filter( x => x._1._1 == root && x._1._2 == depType ) + .map( _._2 ).headOption + + def dependencyChildren(root:Int):Seq[(Int, String)] + = stanfordDependencies.zipWithIndex.filter( _._1._1 == root ).map( x => (x._2, x._1._2) ) + + def dependencyYield(root:Int):Set[Int] = { + def recursiveSearch(root:Int, seen:Set[Int]):Set[Int] = { + val directChildren = dependencyChildren(root).map( _._1 ) + directChildren.foldLeft(seen) { + case (soFar:Set[Int], index:Int) => + if (!soFar(index)) recursiveSearch(index, seen + index) + else soFar + } + } + recursiveSearch(root, Set[Int](root)) + } + + def dependencyPathMonotonic(ancestor:Int, descendent:Int):Option[Seq[Int]] = { + def recurse(ancestor:Int, descendent:Int, lst:List[Int]):Option[List[Int]] = { + if (descendent == ancestor) Some(ancestor :: lst) + else if (descendent < 0) None + else recurse(ancestor, stanfordDependencies(descendent)._1, descendent :: lst) + } + recurse(ancestor, stanfordDependencies(descendent)._1, Nil) + } + + lazy val headIndex:Int = { + if (word.length == 1) { 0 } + else { + val headLeaf = parse.headTerminal(collinsHeadFinder) + val index = parse.getLeaves().indexWhere{ (x:Tree) => x eq headLeaf } + if (index < 0) word.length - 1 else index + } + } + + def headIndex(spanBegin:Int, spanEnd:Int):Int = { + parse.setSpans + val (score, tree) = parse.foldLeft( spanBegin + (length - spanEnd), parse ){ + case ( (smallestDiffSoFar:Int, bestTreeSoFar:Tree), tree:Tree ) => + if (tree != null && tree.getSpan != null) { + val (treeBegin, treeEnd) = (tree.getSpan.getSource, tree.getSpan.getTarget) + val diff = scala.math.abs(spanBegin - treeBegin) + + scala.math.abs(spanEnd - treeEnd) + if (treeBegin >= spanBegin && treeEnd <= spanEnd && + diff < smallestDiffSoFar) { (diff, tree) } + else { (smallestDiffSoFar, bestTreeSoFar) } + } else { (smallestDiffSoFar, bestTreeSoFar) } + } + val headLeaf = tree.headTerminal(collinsHeadFinder) + val index = parse.getLeaves().indexWhere{ (x:Tree) => x eq headLeaf } + if (index < spanBegin || index >= spanEnd) spanEnd - 1 else index + } + + def headWord(spanBegin:Int, spanEnd:Int):String = word(headIndex(spanBegin, spanEnd)) + + lazy val pos:Array[String] + = if (length == 0) new Array[String](0) + else NLP.tagger.apply(word.toList).map( _.tag 
).toArray + + lazy val lemma:Array[String] = word.zip(pos).map{ case (w:String,p:String) => + morph( m => m.lemma(w,p) ).toString + }.toArray + + lazy val ner:Array[String] = nerCRF(word, pos) + + lazy val truecase:Array[String] = trueCaser(word, pos, lemma) + + // helper functions + def words:Array[String] = word + def tags:Array[String] = pos + + def headWord:String = word(headIndex) + def headLemma:String = lemma(headIndex) + def headPOS:String = pos(headIndex) + def namedEntities:Array[(Array[String],String)] = { + // (collect tags) + val nerTags = word.zip(ner).foldLeft(List[(List[String],String)]()){ + case (soFar:List[(List[String],String)], (word:String, tag:String)) => + val (chunk, lastTag) = if (soFar.isEmpty) (List[String](), "O") + else soFar.head + val tailList:List[(List[String],String)] + = if (soFar.isEmpty) Nil else soFar.tail + if (lastTag != tag) { + (List[String](word), tag) :: { + if (lastTag != "O") (chunk.reverse, lastTag) :: tailList + else tailList + } + } else { + (word :: chunk, tag) :: tailList + } + } + // (some cleanup) + val headPair = nerTags.head + (if (headPair._2 == "O") nerTags.tail + else (headPair._1.reverse, headPair._2) :: nerTags.tail) + .reverse + .map{ case (c,t) => (c.toArray,t) } + .toArray + } + + def toSentence:Sentence = this + + override def equals(a:Any):Boolean = { + def seqMatch(s:Seq[String]):Boolean = { + s.length == word.length && s.zip(word).forall{ case (a,b) => a == b } + } + a match { + case (s:Sentence) => + for (id1 <- this.id; + id2 <- s.id) return id1 == id2 + return seqMatch(s.word) + case (s:Seq[String]) => seqMatch(s) + case _ => false + } + } + private var code:Int = 0 + override def hashCode:Int = { + if (code == 0) { word.foreach( w => code = 37 * code + w.hashCode ) } + code + } + override def toString:String = word.mkString(" ") +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/TokensRegex.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/TokensRegex.scala new file mode 100644 index 0000000..2fcfd11 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/TokensRegex.scala @@ -0,0 +1,82 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ + +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.util.CoreMap +import edu.stanford.nlp.ling.tokensregex._ + +import NLPConfig._ + + +case class TokensRegex(override val toString:String) { + val pattern:TokenSequencePattern = TokenSequencePattern.compile(toString) + + + def matches(input:Seq[CoreLabel]):Boolean = pattern.getMatcher(input.toList).matches + + def allMatches(input:Seq[CoreLabel]):Iterator[Seq[CoreLabel]] = { + val matcher = pattern.getMatcher(input.toList) + new Iterator[Seq[CoreLabel]] { + var theNext:Option[Boolean] = None + override def hasNext:Boolean = theNext match { + case Some(x) => x + case None => theNext = Some(matcher.find); theNext.get + } + override def next:Seq[CoreLabel] = { + if (!hasNext) throw new NoSuchElementException + theNext = None + val m:java.util.List[_ <: CoreMap] = matcher.groupNodes + m.map( _ match { + case (x:CoreLabel) => x + case (x:CoreMap) => new CoreLabel(x) + }) + } + } + } + + def unapplySeq(target:Any):Option[Seq[Seq[CoreLabel]]] = target match { + case (input:Seq[CoreLabel]) => + val matcher = pattern getMatcher(input toList) + if (matcher matches) { + Some(for (i <- 1 to matcher.groupCount) yield + matcher groupNodes(i) map( _ match { + case (x:CoreLabel) => x + case (x:CoreMap) => new CoreLabel(x) + })) + } else { None } + case _ => None + } +} + + 
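+// A minimal usage sketch for the wrapper above (illustrative only: it assumes
+// the CoreNLP models are on the classpath, and it leans on the
+// String-to-Sentence and String-to-TokensRegex implicits provided by Magic).
+object TokensRegexExample {
+  import edu.stanford.nlp.Magic._
+
+  def main(args: Array[String]): Unit = {
+    val sentence = "Stanford University is in California".toSentence
+    // Full-sequence match: the first two tokens by word form, then any
+    // number of trailing tokens.
+    println(sentence matches "[{word:/Stanford/}] [{word:/University/}] []*")
+  }
+}
+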
+object TokensRegex { + // Built-in predicates + def word(pattern:String):MarkedString = MarkedString(s"""{word : /$pattern/}""") + def tag(pattern:String):MarkedString = MarkedString(s"""{tag : /$pattern/}""") + def lemma(pattern:String):MarkedString = MarkedString(s"""{lemma : /$pattern/}""") + def ner(pattern:String):MarkedString = MarkedString(s"""{ner : /$pattern/}""") + def normalized(pattern:String):MarkedString = MarkedString(s"""{normalized : /$pattern/}""") + + // Decorate predicates + case class MarkedString(str:String) extends AnyVal { override def toString:String = str } + implicit def stringDecorator(str:MarkedString) = new { + def unary_!():String = s"""!$str""" + } + implicit def string2string(str:MarkedString):String = str.str + + // Create token sequence + implicit def product2tokens(p:Product):Tokens = new Tokens(List[String](p.productIterator.map( _.toString ).mkString(" & "))) + implicit def string2tokens(str:MarkedString):Tokens = new Tokens(List[String](str.str)) + class Tokens(val regexps:List[String]) { + def apply(terms:String*):Tokens = { + new Tokens(terms.mkString(" & ") :: regexps) + } + } + + // Dump to TokensRegex object + implicit def string2tokensregex(str:MarkedString):TokensRegex + = new TokensRegex(s"""[${str.str}]""") + implicit def tokens2tokensregex(tokens:Tokens):TokensRegex + = new TokensRegex(s"""[${tokens.regexps.reverse.mkString("] [")}]""") +} diff --git a/src/main/scala/DataSource.scala b/src/main/scala/DataSource.scala index aca4463..b431ff5 100644 --- a/src/main/scala/DataSource.scala +++ b/src/main/scala/DataSource.scala @@ -27,18 +27,18 @@ class DataSource(val dsp: DataSourceParams) val eventsRDD: RDD[Event] = eventsDB.find( appId = dsp.appId, entityType = Some("user"), - eventNames = Some(List("rate")) + eventNames = Some(List("train")) )(sc) val sentimentsRDD: RDD[Sentiment] = eventsRDD.map { event => val sentiment = try { val sentimentValue: Double = event.event match { - case "rate" => event.properties.get[Double]("sentiment") + case "train" => event.properties.get[Double]("sentiment") case _ => throw new Exception(s"Unexpected event ${event} is read.") } Sentiment( - event.properties.get[String]("phase"), + event.properties.get[String]("phrase"), sentimentValue ) } catch { diff --git a/src/main/scala/DummyAlgorithm.scala b/src/main/scala/DummyAlgorithm.scala deleted file mode 100644 index a1718a3..0000000 --- a/src/main/scala/DummyAlgorithm.scala +++ /dev/null @@ -1,34 +0,0 @@ -package org.template.sentimentanalysis - -import io.prediction.controller.PAlgorithm -import io.prediction.controller.Params -import io.prediction.data.storage.BiMap - -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.RDD - -import grizzled.slf4j.Logger - -case class DummyAlgorithmParams( -)extends Params - -class DummyAlgorithm(val ap: DummyAlgorithmParams) - extends PAlgorithm[PreparedData, DummyModel, Query, PredictedResult] { - - @transient lazy val logger = Logger[this.type] - - def train(sc: SparkContext, data: PreparedData): DummyModel = { - //require(!data.sentiments.take(1).isEmpty, - // s"RDD[sentiments] in PreparedData cannot be empty." 
+
-    //   " Please check if DataSource generates TrainingData" +
-    //   " and Preprator generates PreparedData correctly.")
-    // do nothing in milestone 1
-    new DummyModel()
-  }
-
-  def predict(model: DummyModel, query: Query): PredictedResult = {
-    // always return 2.0 for milestone 1
-    new PredictedResult(2.0)
-  }
-}
diff --git a/src/main/scala/DummyModel.scala b/src/main/scala/DummyModel.scala
deleted file mode 100644
index a864687..0000000
--- a/src/main/scala/DummyModel.scala
+++ /dev/null
@@ -1,3 +0,0 @@
-package org.template.sentimentanalysis
-
-class DummyModel {}
diff --git a/src/main/scala/Engine.scala b/src/main/scala/Engine.scala
index 77142b6..b299f16 100644
--- a/src/main/scala/Engine.scala
+++ b/src/main/scala/Engine.scala
@@ -16,7 +16,7 @@ object SentimentAnalysisEngine extends IEngineFactory {
     new Engine(
       classOf[DataSource],
       classOf[Preparator],
-      Map("dummy" -> classOf[DummyAlgorithm]),
+      Map("nlpparse" -> classOf[Algorithm]),
       classOf[Serving])
   }
 }
diff --git a/src/main/scala/Model.scala b/src/main/scala/Model.scala
new file mode 100644
index 0000000..e6c25c4
--- /dev/null
+++ b/src/main/scala/Model.scala
@@ -0,0 +1,73 @@
+package org.template.sentimentanalysis
+
+import edu.stanford.nlp.Magic._
+import edu.stanford.nlp.trees.Tree
+
+class Model(
+  val rules: Map[String, Double]
+) extends Serializable {
+
+  /**
+   * Returns the sentiment of a word on a [-2, 2] scale.
+   */
+  def getWordSentiment(word: String): Double = {
+    // Scores are stored on the [0, 4] training scale, so shift by -2.0;
+    // words unseen in the training data are assumed to be neutral.
+    rules.get(word.toLowerCase()) match {
+      case Some(score) => score - 2.0
+      case None => 0.0
+    }
+  }
+
+  /**
+   * Parses the input into a tree structure and calculates the sentiment
+   * from the bottom of the tree up to the root.
+   *
+   * A leaf node is always a word token, so its sentiment is looked up
+   * from the training data; a word that never appeared in the training
+   * data is assumed to be neutral.
+   *
+   * A non-leaf node combines the sentiments of its children: the
+   * magnitudes are averaged with weights, and the sign is decided by the
+   * number of negative children. If that number is odd, the phrase is
+   * taken to be negative.
+   */
+  def getSentiment(s: String, ap: AlgorithmParams): Double = {
+    val m = scala.collection.mutable.Map[Tree, Double]()
+    val tree = s.parse
+    val root = tree.preOrderNodeList().get(0)
+    val post_order = tree.postOrderNodeList()
+    var i = 0
+    while (i < post_order.size()) {
+      val cur = post_order.get(i)
+      i = i + 1
+
+      if (cur.isLeaf()) {
+        m(cur) = getWordSentiment(cur.value)
+      } else {
+        val children = cur.children()
+        var weight = 0.0000000001
+        var positive = 1
+        var sentiment = 0.0
+        for (child <- children) {
+          val child_sentiment = m(child)
+
+          // The weight of a child is proportional to the absolute value
+          // of its sentiment, which keeps a polar child from being
+          // neutralized by its neutral siblings.
+          val child_weight = Math.abs(child_sentiment) + ap.baseWeight
+
+          weight = weight + child_weight
+          sentiment = sentiment + child_weight * Math.abs(child_sentiment)
+          if (child_sentiment < -0.0000001) {
+            positive = positive * -1
+          }
+        }
+        m(cur) = (sentiment / weight) * positive
+      }
+    }
+
+    // Shift back to the [0, 4] scale used by the training data.
+    return m(root) + 2.0
+  }
+}
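
A note on the combination rule above: it can be exercised independently of the
CoreNLP parser. The following sketch is illustrative only; the Node/Leaf/Branch
types and the leaf scores are hypothetical stand-ins for the parse tree and the
trained rules, baseWeight is 1.0 as configured in engine.json, and scores stay
on the centered [-2, 2] scale (getSentiment shifts its result back to [0, 4]
before returning).

// Illustrative re-implementation of the bottom-up combination in
// Model.getSentiment; the tree types and scores here are hypothetical.
object SentimentSketch {
  sealed trait Node
  case class Leaf(score: Double) extends Node      // word sentiment in [-2, 2]
  case class Branch(children: Node*) extends Node

  val baseWeight = 1.0 // mirrors "baseWeight": 1 in engine.json

  def score(n: Node): Double = n match {
    case Leaf(s) => s
    case Branch(children @ _*) =>
      val scores = children.map(score)
      // A child's weight grows with the magnitude of its sentiment, so a
      // polar child is not washed out by neutral siblings.
      val weights = scores.map(s => math.abs(s) + baseWeight)
      val magnitude =
        scores.zip(weights).map { case (s, w) => w * math.abs(s) }.sum /
          (weights.sum + 1e-10)
      // An odd number of negative children makes the phrase negative.
      val sign = if (scores.count(_ < -1e-7) % 2 == 1) -1.0 else 1.0
      sign * magnitude
  }

  def main(args: Array[String]): Unit = {
    println(score(Branch(Leaf(-1.5))))             // "sad" alone: ~ -1.5
    // "not sad", assuming "not" trains slightly negative: two negative
    // children give an even count, so the sign stays positive.
    println(score(Branch(Leaf(-0.5), Leaf(-1.5)))) // ~ +1.125
  }
}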