From d643f38124e3d5299e1a15f7141e614935c0dd4f Mon Sep 17 00:00:00 2001
From: Wai Hon Law
Date: Sun, 29 Mar 2015 17:58:31 -0700
Subject: [PATCH] Integrate with CoreNLP via the `gangeli/CoreNLP-Scala`
 bindings. Tokenize the sentence and parse it into a Tree structure, then
 calculate the sentiment over that tree.

---
 .gitignore                                 |   4 +
 build.sbt                                  |   9 +-
 data/import_eventserver.py                 |   8 +-
 data/send_query.py                         |  18 ++
 engine.json                                |   4 +-
 src/main/scala/Algorithm.scala             |  49 ++++
 src/main/scala/CoreNLP-Scala/Makefile      |  44 +++
 src/main/scala/CoreNLP-Scala/README.md     |   3 +
 .../src/edu/stanford/nlp/Berkeley.scala    |  90 ++++++
 .../src/edu/stanford/nlp/Classify.scala    | 272 ++++++++++++++++++
 .../src/edu/stanford/nlp/Document.scala    |  55 ++++
 .../src/edu/stanford/nlp/Magic.scala       |  76 +++++
 .../src/edu/stanford/nlp/NLP.scala         | 196 +++++++++++++
 .../src/edu/stanford/nlp/NLPConfig.scala   |  44 +++
 .../src/edu/stanford/nlp/Optimize.scala    | 157 ++++++++++
 .../src/edu/stanford/nlp/Sentence.scala    | 246 ++++++++++++++++
 .../src/edu/stanford/nlp/TokensRegex.scala |  82 ++++++
 src/main/scala/DataSource.scala            |   6 +-
 src/main/scala/DummyAlgorithm.scala        |  34 ---
 src/main/scala/DummyModel.scala            |   3 -
 src/main/scala/Engine.scala                |   2 +-
 src/main/scala/Model.scala                 |  73 +++++
 22 files changed, 1427 insertions(+), 48 deletions(-)
 create mode 100644 data/send_query.py
 create mode 100644 src/main/scala/Algorithm.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/Makefile
 create mode 100644 src/main/scala/CoreNLP-Scala/README.md
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Classify.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Document.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Magic.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLP.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLPConfig.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Optimize.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Sentence.scala
 create mode 100644 src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/TokensRegex.scala
 delete mode 100644 src/main/scala/DummyAlgorithm.scala
 delete mode 100644 src/main/scala/DummyModel.scala
 create mode 100644 src/main/scala/Model.scala

diff --git a/.gitignore b/.gitignore
index 5f1f0a2..2abf820 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,8 @@ manifest.json
 pio.log
 /pio.sbt
 target/
+data/*.csv
+data/*.tsv
+data/*.zip
+data/gen_submission.py
 *~
diff --git a/build.sbt b/build.sbt
index 473124e..361b159 100644
--- a/build.sbt
+++ b/build.sbt
@@ -6,7 +6,14 @@ name := "template-scala-sentiment-analysis"
 
 organization := "io.prediction"
 
+excludeFilter in unmanagedSources := "Berkeley.scala"
+
 libraryDependencies ++= Seq(
   "io.prediction"    %% "core"          % pioVersion.value % "provided",
   "org.apache.spark" %% "spark-core"    % "1.2.0" % "provided",
-  "org.apache.spark" %% "spark-mllib"   % "1.2.0" % "provided")
+  "org.apache.spark" %% "spark-mllib"   % "1.2.0" % "provided",
+  "edu.stanford.nlp" % "stanford-corenlp" % "3.4",
+  "edu.stanford.nlp" % "stanford-corenlp" % "3.4" classifier "models",
+  "edu.stanford.nlp" % "stanford-parser" % "3.4"
+)
+
diff --git a/data/import_eventserver.py b/data/import_eventserver.py
index 444106a..49cd2a0 100644
--- a/data/import_eventserver.py
+++ b/data/import_eventserver.py
@@ -13,11 +13,11 @@ def import_events(client, file):
     data = line.rstrip('\r\n').split("\t")
line.rstrip('\r\n').split("\t") if True: client.create_event( - event="rate", + event="train", entity_type="user", entity_id=data[0], properties= { - "sentence" : str(data[2]), + "phrase" : str(data[2]), "sentiment" : float(data[3]) } ) @@ -41,6 +41,6 @@ def import_events(client, file): client = predictionio.EventClient( access_key=args.access_key, url=args.url, - threads=5, - qsize=500) + threads=10, + qsize=1000) import_events(client, args.file) diff --git a/data/send_query.py b/data/send_query.py new file mode 100644 index 0000000..72f2e39 --- /dev/null +++ b/data/send_query.py @@ -0,0 +1,18 @@ +""" +Send sample query to prediction engine +""" + +import predictionio +client = predictionio.EngineClient(url="http://localhost:8000") + +def test(s): + print s + ' : ' + str(client.send_query({"s": s})['sentiment']) + +test('sad') +test('happy') +test('oh') +test('not') +test('not sad') +test('very sad') +test('very happy') +test('not very sad') diff --git a/engine.json b/engine.json index c52c636..bf61e5a 100644 --- a/engine.json +++ b/engine.json @@ -9,9 +9,9 @@ }, "algorithms": [ { - "name": "dummy", + "name": "nlpparse", "params": { - + "baseWeight": 1 } } ] diff --git a/src/main/scala/Algorithm.scala b/src/main/scala/Algorithm.scala new file mode 100644 index 0000000..98e3a76 --- /dev/null +++ b/src/main/scala/Algorithm.scala @@ -0,0 +1,49 @@ +package org.template.sentimentanalysis + +import io.prediction.controller.PAlgorithm +import io.prediction.controller.Params +import io.prediction.data.storage.BiMap + +import org.apache.spark.SparkContext +import org.apache.spark.SparkContext._ +import org.apache.spark.rdd.RDD + +import edu.stanford.nlp.Magic._ + +import grizzled.slf4j.Logger + +case class AlgorithmParams( + val baseWeight: Double +)extends Params + +class Algorithm(val ap: AlgorithmParams) + extends PAlgorithm[PreparedData, Model, Query, PredictedResult] { + + @transient lazy val logger = Logger[this.type] + + def train(sc: SparkContext, data: PreparedData): Model = { + require(!data.sentiments.take(1).isEmpty, + s"RDD[sentiments] in PreparedData cannot be empty." + + " Please check if DataSource generates TrainingData" + + " and Preprator generates PreparedData correctly.") + + val itemSets: RDD[(String, Double)] = data.sentiments.map( + s => (s.phrase.toLowerCase(), s.sentiment) + ).cache() + + val rules = itemSets.groupByKey + .mapValues( + // assume the last training data is the most up-to-date + iter => iter.toVector.last + ) + .collectAsMap.toMap + + new Model(rules) + } + + def predict(model: Model, query: Query): PredictedResult = { + new PredictedResult( + model.getSentiment(query.s, ap) + ) + } +} diff --git a/src/main/scala/CoreNLP-Scala/Makefile b/src/main/scala/CoreNLP-Scala/Makefile new file mode 100644 index 0000000..d833b82 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/Makefile @@ -0,0 +1,44 @@ +# +# To Build: +# 1. Set CORENLP_HOME to the root of CoreNLP +# 2. [optional] Set BERKELEY to the path to the Berkeley parser +# 3. Build using either 'make stanford' or 'make berkeley' (if the Berkeley parser is configured) +# + +CORENLP=$(CORENLP_HOME)/classes:$(CORENLP_HOME)/lib/joda-time.jar:$(CORENLP_HOME)/lib/jollyday-0.4.7.jar +BERKELEY=$(CORENLP_HOME)/../more/lib/BerkeleyParser.jar + +JAVAC=javac +SCALAC=scalac + +SRC=src +SOURCES = $(wildcard src/edu/stanford/nlp/*.scala) +TEST_SRC=test/src +LIB=lib +BUILD=classes +TEST_BUILD=test/classes +DIST=dist + +dist: stanford + mkdir -p ${DIST} + jar cf ${DIST}/corenlp-scala.jar -C $(BUILD) . 
+	jar uf ${DIST}/corenlp-scala.jar -C $(SRC) .
+
+berkeley: stanford
+	$(SCALAC) -cp $(CORENLP):${BERKELEY} -d $(BUILD) `find $(SRC) -name "*.scala"`
+
+stanford: ${SOURCES}
+	mkdir -p $(BUILD)
+	sed -e 's/BerkeleyUtil.berkeleyParser/throw new IllegalStateException("Could not find parser model (and was not compiled to run with Berkeley parser)")/g' ${SRC}/edu/stanford/nlp/NLP.scala > /tmp/NLP_stanfordonly.scala
+	$(SCALAC) -cp $(CORENLP) -d $(BUILD) `find $(SRC) -name "*.scala" ! -name "*Berkeley.scala" ! -name "NLP.scala"` /tmp/NLP_stanfordonly.scala
+	rm /tmp/NLP_stanfordonly.scala
+
+default: stanford
+
+clean:
+	rm -r $(BUILD)
+	rm -r ${DIST}
+
+
+cmd:
+	@echo "scala -J-Xmx4G -cp $(CORENLP):$(BUILD)":${HOME}/lib/corenlp-models.jar
diff --git a/src/main/scala/CoreNLP-Scala/README.md b/src/main/scala/CoreNLP-Scala/README.md
new file mode 100644
index 0000000..7f5ae08
--- /dev/null
+++ b/src/main/scala/CoreNLP-Scala/README.md
@@ -0,0 +1,3 @@
+Since gangeli/CoreNLP-Scala is not published as an artifact that can be
+pulled in through build.sbt, its sources are copied here from
+https://github.com/gangeli/CoreNLP-Scala.
diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala
new file mode 100644
index 0000000..4d52f79
--- /dev/null
+++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala
@@ -0,0 +1,90 @@
+package edu.stanford.nlp;
+
+import scala.collection.JavaConversions._
+import scala.concurrent.Lock
+
+import edu.stanford.nlp.trees.Tree
+import edu.stanford.nlp.trees.Trees
+import edu.stanford.nlp.trees.LabeledScoredTreeNode
+import edu.stanford.nlp.ling.HasWord
+import edu.stanford.nlp.ling.Word
+
+import edu.berkeley.nlp.PCFGLA._
+import edu.berkeley.nlp.util.Numberer
+
+import NLPConfig._
+
+object BerkeleyUtil {
+  type BerkeleyTree = edu.berkeley.nlp.syntax.Tree[String]
+
+  implicit def stanfordTree2BerkeleyTree(btree:BerkeleyTree):Tree = {
+    val roots = TreeAnnotations.unAnnotateTree(btree).getChildren;
+    if (roots.isEmpty) {
+      new LabeledScoredTreeNode();
+    } else {
+      def convert(src:BerkeleyTree):Tree = {
+        val dst:Tree = new LabeledScoredTreeNode
+        if (src.getLabel != null) dst.setLabel(new Word(src.getLabel))
+        dst.setChildren(src.getChildren.map( convert(_) ).toArray)
+        dst
+      }
+      new LabeledScoredTreeNode(new Word("TOP"),
+        List[Tree](convert(roots.get(0))))
+    }
+  }
+
+  lazy val berkeleyParser = {
+    // (function to create parser)
+    def mkParser = {
+      // (setup parser)
+      val pData = ParserData.Load(parse.model)
+      if (pData == null) throw new RuntimeException("Failed to load Berkeley parser model")
+      val grammar = pData.getGrammar();
+      val lexicon = pData.getLexicon();
+      Numberer.setNumberers(pData.getNumbs());
+      // (create parser object)
+      val parser = new CoarseToFineMaxRuleParser(
+        grammar, lexicon, 1.0, -1, false, false, false,
+        false, false, true, true)
+      // (set binarization)
+      try {
+        val binarizationField = classOf[ConstrainedArrayParser].getDeclaredField("binarization");
+        binarizationField.setAccessible(true);
+        binarizationField.set(parser, pData.getBinarization());
+        binarizationField.setAccessible(false);
+      } catch { case (e:Exception) => throw new RuntimeException(e) }
+      // (parser object)
+      new {
+        def parse(words:List[String], pos:List[String]):Tree = {
+          var parsedTree:BerkeleyTree
+            = parser.getBestConstrainedParse(words, pos, null);
+          if (parsedTree.getChildren().isEmpty()) {
+            parsedTree = parser.getBestConstrainedParse(words, null, null);
+          }
+          parsedTree
+        }
+      }
+    }
+    // (create parsers)
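+    // One (parser, lock) pair is created per available thread; parse() below
+    // picks the first parser whose lock is free, or sleeps and retries until
+    // one frees up.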
+ val parsers = (0 until numThreads).map{ x => (mkParser, new Lock) }.toList + // (multithreaded implementation) + new { + def parse(words:List[String], pos:List[String]):Tree = { + def tryParse:Tree = { + val validParser = parsers.indexWhere{ + (pair:({def parse(words:List[String],pos:List[String]):Tree},Lock)) => + pair._2.available + } + if (validParser >= 0) { // case: [likely] found parser to run + val (parser, lock) = parsers(validParser) + lock.acquire + val rtn = parser.parse(words, pos) + lock.release + rtn + } else { Thread.sleep(1000); tryParse } // case: no parser found + } + tryParse + } + } + } +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Classify.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Classify.scala new file mode 100644 index 0000000..f3a6866 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Classify.scala @@ -0,0 +1,272 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.io.IOUtils +import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ +import NLP._ + +// ---------- +// Classifiers +// ---------- +@SerialVersionUID(1l) +class Classifier[I,O]( + regression:I=>Map[O,Double], + val data:Map[I,(O,Float)]) extends Function1[I,O] with Serializable { + override def apply(in:I):O = { + regression(in).maxBy(_._2)._1 + } +} + +class Mapping[I,O](map:Map[I,(O,Float)]) { + import Mapping.{toCounter,defaultFeatures} + + def scorer[F](featurizer:I=>Iterable[F]):I=>Map[O,Double] = { + // -- Create Dataset + val weights = new Array[Float](map.size) + val dataset = new RVFDataset[O, F](map.size) + map.zipWithIndex.foreach{ + case ((input:I, (output:O, weight:Float)),i:Int) => + weights(i) = weight + dataset.add( new RVFDatum[O, F](toCounter(featurizer(input)), output) ) + } + // -- Train + val prior = new LogPrior(LogPrior.LogPriorType.QUADRATIC) + val factory = new LinearClassifierFactory[O,F]() + val classifier = factory.trainClassifier(dataset, weights, prior) + // -- Return + (input:I) => { + val scores = classifier.scoresOf( + new 
RVFDatum[O, F](toCounter(featurizer(input)), null.asInstanceOf[O])) + scores.keySet.map{ x => (x, scores.getCount(x)) }.toMap + } + } + def scorer:I=>Map[O,Double] = scorer(defaultFeatures(_, map.size)) + + def classifier[F](featurizer:I=>Iterable[F]):Classifier[I,O] + = new Classifier(scorer(featurizer), map) + def classifier:Classifier[I,O] + = classifier(defaultFeatures(_, map.size)) +} + +object Mapping { + def toCounter[X,F](map:Iterable[X]):Counter[F] = { + val counts = new ClassicCounter[F] + map.foreach{ (x:X) => x match { + case (feat:F, n:Number) => counts.incrementCount(feat, n.doubleValue) + case (feat:F) => counts.incrementCount(feat, 1) + case _ => throw new IllegalStateException("Type mismatch in toCounter") + } } + return counts + } + + def apply[I,O,X](map:Map[I,X]):Mapping[I,O] = { + new Mapping(map.map{ case (i:I, x:X) => x match { + case (o:O, n:Number) => (i, (o, n.floatValue)) + case (o:O) => (i, (o, 1.0.asInstanceOf[Float])) + case _ => throw new IllegalStateException("Type mismatch in toCounter") + } }) + } + + def defaultFeatures[I](input:I, datasetSize:Int):Iterable[(String,Float)] = { + def ngram[A](seq:List[A], n:Int, tail:List[A] = Nil):List[String] = { + if (seq.isEmpty) Nil + else (seq.head :: tail.slice(0, n-1)).reverse.mkString("_") :: ngram(seq.tail, n, seq.head :: tail) + } + input match { + case (sent:Sentence) => + val n:Int = (scala.math.log10(datasetSize) / 3.0).toInt + 1 + // N-grams + (ngram(sent.words.toList, n) ::: + ngram(sent.words.toList.map( _.toLowerCase ), n) ::: + ngram(sent.lemma.toList, n) ::: + ngram(sent.ner.toList, n) ::: + ngram(sent.pos.toList, n) ::: + // Bag-of-words + { if (n > 1) + sent.words.toList ::: + sent.words.toList.map( _.toLowerCase ) ::: + sent.lemma.toList ::: + sent.ner.toList ::: + sent.pos.toList + else Nil } + ).map{ (_, 1.0.toFloat) } + case (str:String) => + val tokens = str.split(" ") + val n:Int = (scala.math.log10(datasetSize) / 3.0).toInt + 1 + if (tokens.length <= 1) { + // Case: a single word + (tokens(0) :: // memorize + ngram(str.toCharArray.toList, n) ::: // literal n-grams + ngram(str.toLowerCase.toCharArray.toList, n) // case-insensitive n-grams + ).map{ (_, 1.0.toFloat) } + } else { + // Case: a phrase + (ngram(tokens.toList, n) ::: // literal n-grams + ngram(tokens.toList.map( _.toLowerCase), n) // case-insensitive n-grams + ).map{ (_, 1.0.toFloat) } + } + case (seq:Iterable[Any]) => + seq.map{ (x:Any) => x match { + case (feat:Any, n:Number) => (feat.toString, n.floatValue) + case (feat:Any) => (feat.toString, 1.0.toFloat) + case _ => (x.toString, 1.0.toFloat) + } } + case _ => List[(String,Float)]( (input.toString, 1.0.toFloat) ) + } + } +} + +// ---------- +// Ensemble Classifiers +// ---------- + +class Ensemble[I](members:Seq[I=>Boolean], dat:Option[Map[I,(Boolean,Float)]]) { + // -- Get Data + if (!dat.isDefined) { + members.foldLeft(Option[Map[I,(Boolean,Float)]](null)){ + (dat:Option[Map[I,(Boolean,Float)]], fn:I=>Boolean) => + fn match { + case (classifier:Classifier[I,Boolean]) => + dat match { + case Some(existingData) => + if (classifier.data != existingData) { + warn("Classifiers trained on different data; taking union") + Some(classifier.data ++ existingData) + } else { + Some(existingData) + } + case None => Some(classifier.data) + } + case _ => dat + } + } + } + + // -- Methods + def data(d:Map[I,(Boolean,Float)]):Ensemble[I] = new Ensemble(members, Some(d)) + def data(d:Seq[(I,Boolean)]):Ensemble[I] + = data( d.map( x => (x._1, (x._2, 1.0f)) ).toMap ) + + /** + * Implementation of 
AdaBoost. + * Taken from http://en.wikipedia.org/wiki/AdaBoost + */ + def boost(data:Map[I,(Boolean,Float)]):Classifier[I,Boolean] = { + if (data.isEmpty) throw new IllegalArgumentException("No data to train on!") + // -- Cache + startTrack("Running Weak Learners") + val dataAsArray = data.toArray + val gold = dataAsArray.map( _._2._1 ) + val predictions:Array[(I=>Boolean,Array[(Boolean, Float)])] + = members.toList.par.map{ (h:I=>Boolean) => + log("running " + h.toString) + (h, dataAsArray.map{ case (in:I, (out:Boolean, weight:Float)) => + (h(in), weight) + }) + }.toArray + endTrack("Running Weak Learners") + // -- Error Rate + def error(predictions:Array[(Boolean,Float)], + gold:Array[Boolean], + d:Array[Double] = (0 until data.size).map( x => 1.0 / data.size ).toArray + ):Double = { + predictions.zip(gold).zip(d).foldLeft(0.0){ + case (sum:Double, + (( (guess:Boolean, weight:Float), + gold:Boolean), + di:Double)) => + if(guess == gold) sum else sum + di * weight + } + } + def regressor(coefficients:Seq[(Double, I=>Boolean)] + ):(I => Map[Boolean, Double]) = (in:I) => { + val sum = coefficients.foldLeft(0.0){ + case (sum:Double, (alpha:Double, h:(I=>Boolean))) => + sum + alpha * { if(h(in)) 1.0 else -1.0 } + } + Map[Boolean, Double]( true -> {if(sum >= 0.0) 1.0 else 0.0 }, + false -> {if(sum >= 0.0) 0.0 else 1.0 } ) + } + // -- Run an Iteration + def iter(t:Int, + predictions:Array[(I=>Boolean, Array[(Boolean,Float)])], + gold:Array[Boolean], + soFar:List[(Double, I=>Boolean)], + d:Array[Double] = data.map( x => 1.0 / data.size.toDouble ).toArray, + tolerance:Double = NLPConfig.classify.tolerance + ):List[(Double, I=>Boolean)] = { + startTrack("Iteration " + t) + // (get errors) + val errors = predictions.map{ case (h, pred:Array[(Boolean,Float)]) => + ( h, pred, error(pred, gold, d) ) + } + val (hOpt, predOpt, et) = errors.maxBy( x => scala.math.abs(0.5 - x._3) ) + // (compute update) + log("optimal classifier: " + hOpt) + log("e_t: " + et) + val at = 0.5 * scala.math.log( (1.0 - et) / et ) + val newD = predOpt.zip(gold).zip(d).map{ + case (((guess:Boolean, weight:Float), gold:Boolean), di:Double) => + di * scala.math.exp(- {if (guess == gold) 1.0 else -1.0} * at) + } + val sumD = newD.sum + for (i <- 0 until newD.length) { newD(i) /= sumD } + // (update coefficients) + val coeffs = (at, hOpt) :: soFar + log("a_t: " + at) + endTrack("Iteration " + t) + // (recurse) + if ( scala.math.abs(0.5 - et) < tolerance || + t >= NLPConfig.classify.iterations) { + coeffs + } else { + iter(t+1, predictions, gold, coeffs, newD, tolerance) + } + } + // -- Construct Classifier + startTrack("Boosting over " + members.length + " classifier and " + data.size + " examples") + val fn = regressor(iter(1, predictions, gold, Nil)) + endTrack("Boosting over " + members.length + " classifier and " + data.size + " examples") + new Classifier(fn, data) + } + + def boost:Classifier[I,Boolean] + = boost(dat.getOrElse(Map[I,(Boolean,Float)]())) +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Document.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Document.scala new file mode 100644 index 0000000..0659a50 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Document.scala @@ -0,0 +1,55 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import 
java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.io.IOUtils +import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ +import NLP._ + + +object Document { +} + + +@SerialVersionUID(1l) +case class Document(sentences:Array[String]) { + // TODO(gabor) coreference +} + + diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Magic.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Magic.scala new file mode 100644 index 0000000..692ac76 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Magic.scala @@ -0,0 +1,76 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.io.IOUtils +import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ + + +object Magic { + import NLP._ + + /* + * Implicit Conversions + */ + implicit def seq2nlpseq(seq:Seq[String]):Sentence = new 
Sentence(seq) + implicit def string2nlpseq(gloss:String):Sentence = new Sentence(gloss) + + implicit def map2mapping[I,O,X](map:Map[I,X]):Mapping[I,O] = Mapping(map) + + implicit def seq2ensemble[I](seq:Seq[I=>Boolean]):Ensemble[I] = new Ensemble(seq, None) + + implicit def fn2optimizable( + fn:Array[Double]=>Double):OptimizableFunction = { + optimize.algorithm.toLowerCase match { + case "lbfgs" => LBFGSOptimizableApproximateFunction(fn, None) + case "braindead" => BraindeadGradientDescent(fn, None) + case _ => throw new IllegalStateException("Unknown algorithm: " + optimize.algorithm) + } + } + implicit def fnPair2optimizable( + pair:(Array[Double]=>Double,Array[Double]=>Array[Double])):OptimizableFunction = { + optimize.algorithm.toLowerCase match { + case "lbfgs" => LBFGSOptimizableApproximateFunction(pair._1, Some(pair._2)) + case "braindead" => BraindeadGradientDescent(pair._1, Some(pair._2)) + case _ => throw new IllegalStateException("Unknown algorithm: " + optimize.algorithm) + } + } + + implicit def string2tokensregex(str:String):TokensRegex = new TokensRegex(str) +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLP.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLP.scala new file mode 100644 index 0000000..16cc267 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLP.scala @@ -0,0 +1,196 @@ +package edu.stanford.nlp + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.ling.CoreAnnotations._ +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.ie.crf.CRFBiasedClassifier +import edu.stanford.nlp.io.IOUtils +import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ + +object NLP { + implicit def list2hasWordList(lst:Seq[String]):java.util.List[_<:HasWord] + = lst.map( new Word(_) ).toList + + // ---------- + // Parsers + // ---------- + lazy val stanfordParser = { + val parser = LexicalizedParser.loadModel(parse.model) + new { + def parse(words:List[String], pos:List[String]):Tree = { + parser.parseStrings(words); + } + } + } + lazy val parser = stanfordParser + // ---------- + // Stanford CoreNLP 
Components + // ---------- + lazy val tagger = new MaxentTagger(pos.model) + + lazy val collinsHeadFinder = new CollinsHeadFinder() + + lazy val morph:((Morphology=>Any)=>Any) = { + val morph = new Morphology() + val morphLock = new Lock() + val f = { (fn:Morphology=>Any) => + morphLock.acquire; + val rtn = fn(morph); + morphLock.release + rtn + } + f + } + + lazy val nerCRF:(Array[String], Array[String])=>Array[String] = { + val classifier = new NERClassifierCombiner(ner.model, ner.aux); + (words:Array[String], pos:Array[String]) => { + val offsets:List[Int] = words.foldLeft( (List[Int](), 0) ){ + case ((offsetsSoFar:List[Int], offset:Int), word:String) => + (offset :: offsetsSoFar, offset + word.length + 1) + }._1.reverse + // (construct CoreLabel sentence) + val coreSentence = new java.util.ArrayList[CoreLabel](words.length) + words.zip(pos).zip(offsets)foreach{ + case ((word:String, pos:String), offset:Int) => + val label = new CoreLabel + label.setWord(word) + label.setOriginalText(word) + label.setTag(pos) + label.setBeginPosition(offset) + label.setEndPosition(offset + word.length) + coreSentence.add(label) + } + // (classify) + classifier.classifySentence(coreSentence) + val output:java.util.List[CoreLabel] = classifier.classifySentence(coreSentence); + // (convert back) + output.map{ (label:CoreLabel) => + label.ner() + }.toArray + } + } + + /** + * The TrueCase classifier implementation. + * Takes as input an array of tokens, POS tags, and lemmas, + * and returns as output the tokens with their true case applied. + * The length of the tokens, POS tags, and lemmas must match. + * @return An array of tokens (words as Strings) of the same length + * as the input tokens, but with their inferred true case. + */ + lazy val trueCaser:(Array[String], Array[String], Array[String])=>Array[String] = { + // Create classifier + val props:Properties = { + val p = new Properties + p.setProperty("loadClassifier", NLPConfig.truecase.model) + p.setProperty("mixedCaseMapFile", NLPConfig.truecase.disambiguation_list) + p.setProperty("classBias", NLPConfig.truecase.bias) + p + } + val classifier = new CRFBiasedClassifier[CoreLabel](props); + classifier.loadClassifierNoExceptions(NLPConfig.truecase.model, props); + // Set classifier biases + NLPConfig.truecase.bias.split(",").foreach{ (bias:String) => + val terms = bias.split(":") + classifier.setBiasWeight(terms(0), terms(1).toDouble) + } + // Get mixed case map + val mixedCaseMap:Map[String,String] + = scala.io.Source.fromInputStream(IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(NLPConfig.truecase.disambiguation_list)) + .getLines + .map( _.trim.split("""\s+""") ) + .map{ case Array(a:String, b:String) => (a ,b) } + .toMap + // Return function + (words:Array[String], pos:Array[String], lemma:Array[String]) => { + // (mock offsets) + val offsets:List[Int] = words.foldLeft( (List[Int](), 0) ){ + case ((offsetsSoFar:List[Int], offset:Int), word:String) => + (offset :: offsetsSoFar, offset + word.length + 1) + }._1.reverse + // (construct CoreLabel sentence) + val coreSentence = new java.util.ArrayList[CoreLabel](words.length) + words.zip(pos).zip(offsets)foreach{ + case ((word:String, pos:String), offset:Int) => + val label = new CoreLabel + label.setWord(word.toLowerCase) + label.setOriginalText(word) + label.setTag(pos) + label.setBeginPosition(offset) + label.setEndPosition(offset + word.length) + coreSentence.add(label) + } + // (classify) + val output:java.util.List[CoreLabel] = classifier.classifySentence(coreSentence); + // (convert back) 
+ output.map{ (label:CoreLabel) => + val word:String = label.word + label.get(classOf[AnswerAnnotation]) match { + case "UPPER" => word.toUpperCase + case "LOWER" => word.toLowerCase + case "INIT_UPPER" => word.substring(0, 1).toUpperCase + word.substring(1).toLowerCase + case "O" => mixedCaseMap.get(word).getOrElse(word) + case _ => word + } + }.toArray + } + } + + // ---------- + // Methods + // ---------- + def preload(obj: => Any) { new Thread(){ override def run:Unit = obj }.start } +} + +trait CoreLabelSeq extends Seq[CoreLabel] { + // + // Trivial overrides (still have to define apply(Int):CoreLabel and length:Int though) + // + override def iterator:Iterator[CoreLabel] = new Iterator[CoreLabel] { + var index:Int = 0 + override def hasNext:Boolean = index < CoreLabelSeq.this.length + override def next:CoreLabel = { index += 1; apply(index - 1); } + } + + // + // Common Methods + // + def matches(t:TokensRegex) = t.matches(this) +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLPConfig.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLPConfig.scala new file mode 100644 index 0000000..da165e0 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/NLPConfig.scala @@ -0,0 +1,44 @@ +package edu.stanford.nlp + +import edu.stanford.nlp.pipeline.DefaultPaths._ + +object NLPConfig { + object parse { + var model:String = DEFAULT_PARSER_MODEL + } + + object pos { + var model:String = DEFAULT_POS_MODEL + } + + object ner { + var model:String = DEFAULT_NER_CONLL_MODEL + var aux:String = DEFAULT_NER_MUC_MODEL + } + + object classify { + var tolerance:Double = 1e-5 + var iterations:Double = 40 + } + + object optimize { + var tolerance:Double = 1e-5 + var wiggle:Double = 1e-5 + var algorithm = "LBFGS" // | braindead | ... 
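+    // Read by Magic.fn2optimizable: "lbfgs" wraps CoreNLP's QNMinimizer,
+    // while "braindead" selects the simple gradient descent in Optimize.scala.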
+ } + + object truecase { + var model:String = "edu/stanford/nlp/models/truecase/truecasing.fast.caseless.qn.ser.gz" + var disambiguation_list:String = "edu/stanford/nlp/models/truecase/MixDisambiguation.list" + var bias:String = "INIT_UPPER:-0.7,UPPER:-0.7,O:0" + } + + def caseless:Unit = { + parse.model = "edu/stanford/nlp/models/lexparser/englishPCFG.caseless.ser.gz" + pos.model = "edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger" + ner.model = "edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz" + ner.aux = "edu/stanford/nlp/models/ner/english.muc.7class.caseless.distsim.crf.ser.gz" + } + + var numThreads = Runtime.getRuntime().availableProcessors(); +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Optimize.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Optimize.scala new file mode 100644 index 0000000..e7c8384 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Optimize.scala @@ -0,0 +1,157 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.io.IOUtils +import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ +import NLP._ +import Optimize._ + +// ---------- +// Optimizers +// ---------- +object Optimize { + def empiricalDerivative(fn:Array[Double]=>Double, + x:Array[Double]):Array[Double] = { + val y0 = fn(x) + def tweak(i:Int, delta:Double):(Double, Double) = { + x(i) += delta + val y1 = fn(x) + x(i) -= delta + if (delta < 1e-5 * optimize.wiggle || delta > 1e5 * optimize.wiggle) { + (y1, delta) + } else { + if (scala.math.abs(y1 - y0) / delta > 1e5) tweak(i, delta / 2.0) + else if (scala.math.abs(y1 - y0) / delta < 1e-5) tweak(i, delta * 2.0) + else (y1, delta) + } + } + {for (i <- 0 until x.length) yield { + val (y1, step) = tweak(i, optimize.wiggle) + (y1 - y0) / step + }}.toArray + } +} + +trait OptimizableFunction { + def minimize(initial:Array[Double]):Array[Double] + def derivative(ddx:Array[Double]=>Array[Double]):OptimizableFunction +} + +/** + * A wrapper for QNMinimizer (L-BFGS) 
+*/ +case class LBFGSOptimizableApproximateFunction( + fn:Array[Double]=>Double, derivative:Option[Array[Double]=>Array[Double]]) + extends OptimizableFunction{ + + override def minimize(initial:Array[Double]):Array[Double] = { + // (define a differentiable function) + val javaFn:DiffFunction = new DiffFunction { + override def domainDimension:Int = initial.length + override def valueAt(x:Array[Double]):Double = fn(x) + override def derivativeAt(x:Array[Double]):Array[Double] = { + derivative match { + case Some(ddx) => ddx(x) + case None => empiricalDerivative(fn, x) + } + } + } + // (optimize using QNMinimizer) + val javaInit = initial.map{ (n:Double) => n } + val optimizer = new QNMinimizer() + optimizer.setRobustOptions() + optimizer.minimize(javaFn, optimize.tolerance, javaInit) + } + + override def derivative(ddx:Array[Double]=>Array[Double]):LBFGSOptimizableApproximateFunction + = new LBFGSOptimizableApproximateFunction(fn, Some(ddx)) +} + +/** + * An optimization algorithm I made up (thus, "braindead"), that tries its + * best to move against the gradient (thus, "gradient descent"). + * The only motivation to use this over L-BFGS is that it's more robust to + * non-convex problems (i.e., won't crash and burn). +*/ +case class BraindeadGradientDescent( + fn:Array[Double]=>Double, derivative:Option[Array[Double]=>Array[Double]]) + extends OptimizableFunction{ + + override def minimize(initial:Array[Double]):Array[Double] = { + // (helpers) + def dx(x:Array[Double], y0:Double):Array[Double] = derivative match { + case Some(ddx) => ddx(x) + case None => empiricalDerivative(fn, x) + } + def move(init:Array[Double], direction:Array[Double], scaling:Double):Array[Double] = { + init.zip(direction).map{ case (a:Double, d:Double) => a + scaling * d} + } + def isImprovementOver(newY:Double, y:Double):Boolean + = newY + optimize.tolerance < y + // (state) + val initialX:Array[Double] = initial + val initialY:Double = fn(initialX) + var x:Array[Double] = initialX + var y:Double = initialY + var numIters = 0 + // (optimization) + while (numIters < 100) { + var step:Double = 1.0 + val dir:Array[Double] = dx(x, y).map( - _ ) + var newX:Array[Double] = move(x, dir, step) + var newY:Double = fn(newX) + while (!isImprovementOver(newY, y) && step > 1e-5) { + step /= 2.0 + newX = move(x, dir, step) + newY = fn(newX) + } + if (step <= 1e-5) return x // convergence + assert(newY < y, "Function value did not decrease!") + x = newX + y = newY + numIters += 1 + } + // (timeout -- no convergence) + return x + } + + override def derivative(ddx:Array[Double]=>Array[Double]):BraindeadGradientDescent + = new BraindeadGradientDescent(fn, Some(ddx)) +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Sentence.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Sentence.scala new file mode 100644 index 0000000..404499a --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Sentence.scala @@ -0,0 +1,246 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ +import scala.collection.MapLike +import scala.collection.Map +import scala.collection.generic.CanBuildFrom +import scala.concurrent.Lock + +import java.io.ObjectInputStream +import java.lang.ref.SoftReference +import java.lang.ref.ReferenceQueue +import java.util.Properties + +import edu.stanford.nlp.classify.LinearClassifierFactory +import edu.stanford.nlp.classify.LogPrior +import edu.stanford.nlp.classify.RVFDataset +import edu.stanford.nlp.ie.NERClassifierCombiner +import edu.stanford.nlp.io.IOUtils 
+import edu.stanford.nlp.ling.HasWord +import edu.stanford.nlp.ling.RVFDatum +import edu.stanford.nlp.ling.Word +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.optimization.DiffFunction +import edu.stanford.nlp.optimization.QNMinimizer +import edu.stanford.nlp.optimization.SGDToQNMinimizer +import edu.stanford.nlp.parser.lexparser.LexicalizedParser +import edu.stanford.nlp.process.Morphology +import edu.stanford.nlp.process.PTBTokenizer +import edu.stanford.nlp.stats.ClassicCounter +import edu.stanford.nlp.stats.Counter +import edu.stanford.nlp.tagger.maxent.MaxentTagger +import edu.stanford.nlp.trees.CollinsHeadFinder +import edu.stanford.nlp.trees.LabeledScoredTreeNode +import edu.stanford.nlp.trees.Tree +import edu.stanford.nlp.trees.Trees +import edu.stanford.nlp.trees.GrammaticalStructureFactory +import edu.stanford.nlp.trees.GrammaticalStructure +import edu.stanford.nlp.trees.PennTreebankLanguagePack +import edu.stanford.nlp.trees.TypedDependency +import edu.stanford.nlp.util.logging.Redwood.Util._ + +import NLPConfig._ +import NLP._ + +object Sentence { + val tokenizerFactory = PTBTokenizer.factory + val grammaticalStructureFactory + = new PennTreebankLanguagePack().grammaticalStructureFactory + + def apply(word:Seq[String]):Sentence = new Sentence(word.toArray) + def apply(gloss:String):Sentence = new Sentence(gloss) +} + + +@SerialVersionUID(2l) +case class Sentence(word:Array[String]) extends CoreLabelSeq { + + def this(word:Seq[String]) = this(word.toArray) + + def this(sentence:String) = this( + Sentence.tokenizerFactory.getTokenizer(new java.io.StringReader(sentence)) + .tokenize + .map( _.word ) + .toArray + ) + + // + // Necessary Overrides for Seq[CoreLabel] + // + override def length:Int = word.length + override def apply(index:Int):CoreLabel = { + val label = new CoreLabel(8) + label.setWord(word(index)) + label.setTag(pos(index)) + if (index > 0) { label.setAfter(word(index - 1)) } + if (index < word.length - 1) { label.setBefore(word(index + 1)) } + label.setNER(ner(index)) + label.setLemma(lemma(index)) + label.setIndex(index) + // TODO(gabor) things like character offsets, original text, etc. 
+ label + } + + + + var id:Option[Int] = None + // values + lazy val parse:Tree = { + NLP.parser.parse(word.toList, pos.toList) + } + + lazy val stanfordDependencies:Array[(Int, String)] = { + if (length == 0) { + new Array[(Int, String)](0) + } else { + val depArray = new Array[(Int, String)](length) + // (get dependencies) + val structure:GrammaticalStructure + = Sentence.grammaticalStructureFactory.newGrammaticalStructure(parse) + val deps:java.util.Collection[TypedDependency] + = structure.typedDependencies() + // (fill dependencies) + deps.foreach{ (arc:TypedDependency) => + depArray(arc.dep.index - 1) = + ( arc.gov.index - 1, + arc.reln.getShortName + {if (arc.reln.getSpecific == null) "" else "_" + arc.reln.getSpecific} ) + } + // (pad empty dependencies) + for (i <- 0 until depArray.length) { + if (depArray(i) == null) depArray(i) = (i, "noop") + } + depArray + } + } + + def dependencyRoot:Int + = stanfordDependencies.zipWithIndex.filter( _._1._1 < 0 ).headOption match { + case Some( (dep, index) ) => index + case None => throw new IllegalStateException("Could not find head: '" + + this + "' --- dependencies: " + stanfordDependencies.mkString(" ")) + } + + def dependencyChild(root:Int, depType:String):Option[Int] + = stanfordDependencies.zipWithIndex.filter( x => x._1._1 == root && x._1._2 == depType ) + .map( _._2 ).headOption + + def dependencyChildren(root:Int):Seq[(Int, String)] + = stanfordDependencies.zipWithIndex.filter( _._1._1 == root ).map( x => (x._2, x._1._2) ) + + def dependencyYield(root:Int):Set[Int] = { + def recursiveSearch(root:Int, seen:Set[Int]):Set[Int] = { + val directChildren = dependencyChildren(root).map( _._1 ) + directChildren.foldLeft(seen) { + case (soFar:Set[Int], index:Int) => + if (!soFar(index)) recursiveSearch(index, seen + index) + else soFar + } + } + recursiveSearch(root, Set[Int](root)) + } + + def dependencyPathMonotonic(ancestor:Int, descendent:Int):Option[Seq[Int]] = { + def recurse(ancestor:Int, descendent:Int, lst:List[Int]):Option[List[Int]] = { + if (descendent == ancestor) Some(ancestor :: lst) + else if (descendent < 0) None + else recurse(ancestor, stanfordDependencies(descendent)._1, descendent :: lst) + } + recurse(ancestor, stanfordDependencies(descendent)._1, Nil) + } + + lazy val headIndex:Int = { + if (word.length == 1) { 0 } + else { + val headLeaf = parse.headTerminal(collinsHeadFinder) + val index = parse.getLeaves().indexWhere{ (x:Tree) => x eq headLeaf } + if (index < 0) word.length - 1 else index + } + } + + def headIndex(spanBegin:Int, spanEnd:Int):Int = { + parse.setSpans + val (score, tree) = parse.foldLeft( spanBegin + (length - spanEnd), parse ){ + case ( (smallestDiffSoFar:Int, bestTreeSoFar:Tree), tree:Tree ) => + if (tree != null && tree.getSpan != null) { + val (treeBegin, treeEnd) = (tree.getSpan.getSource, tree.getSpan.getTarget) + val diff = scala.math.abs(spanBegin - treeBegin) + + scala.math.abs(spanEnd - treeEnd) + if (treeBegin >= spanBegin && treeEnd <= spanEnd && + diff < smallestDiffSoFar) { (diff, tree) } + else { (smallestDiffSoFar, bestTreeSoFar) } + } else { (smallestDiffSoFar, bestTreeSoFar) } + } + val headLeaf = tree.headTerminal(collinsHeadFinder) + val index = parse.getLeaves().indexWhere{ (x:Tree) => x eq headLeaf } + if (index < spanBegin || index >= spanEnd) spanEnd - 1 else index + } + + def headWord(spanBegin:Int, spanEnd:Int):String = word(headIndex(spanBegin, spanEnd)) + + lazy val pos:Array[String] + = if (length == 0) new Array[String](0) + else NLP.tagger.apply(word.toList).map( _.tag 
).toArray + + lazy val lemma:Array[String] = word.zip(pos).map{ case (w:String,p:String) => + morph( m => m.lemma(w,p) ).toString + }.toArray + + lazy val ner:Array[String] = nerCRF(word, pos) + + lazy val truecase:Array[String] = trueCaser(word, pos, lemma) + + // helper functions + def words:Array[String] = word + def tags:Array[String] = pos + + def headWord:String = word(headIndex) + def headLemma:String = lemma(headIndex) + def headPOS:String = pos(headIndex) + def namedEntities:Array[(Array[String],String)] = { + // (collect tags) + val nerTags = word.zip(ner).foldLeft(List[(List[String],String)]()){ + case (soFar:List[(List[String],String)], (word:String, tag:String)) => + val (chunk, lastTag) = if (soFar.isEmpty) (List[String](), "O") + else soFar.head + val tailList:List[(List[String],String)] + = if (soFar.isEmpty) Nil else soFar.tail + if (lastTag != tag) { + (List[String](word), tag) :: { + if (lastTag != "O") (chunk.reverse, lastTag) :: tailList + else tailList + } + } else { + (word :: chunk, tag) :: tailList + } + } + // (some cleanup) + val headPair = nerTags.head + (if (headPair._2 == "O") nerTags.tail + else (headPair._1.reverse, headPair._2) :: nerTags.tail) + .reverse + .map{ case (c,t) => (c.toArray,t) } + .toArray + } + + def toSentence:Sentence = this + + override def equals(a:Any):Boolean = { + def seqMatch(s:Seq[String]):Boolean = { + s.length == word.length && s.zip(word).forall{ case (a,b) => a == b } + } + a match { + case (s:Sentence) => + for (id1 <- this.id; + id2 <- s.id) return id1 == id2 + return seqMatch(s.word) + case (s:Seq[String]) => seqMatch(s) + case _ => false + } + } + private var code:Int = 0 + override def hashCode:Int = { + if (code == 0) { word.foreach( w => code = 37 * code + w.hashCode ) } + code + } + override def toString:String = word.mkString(" ") +} diff --git a/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/TokensRegex.scala b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/TokensRegex.scala new file mode 100644 index 0000000..2fcfd11 --- /dev/null +++ b/src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/TokensRegex.scala @@ -0,0 +1,82 @@ +package edu.stanford.nlp; + +import scala.collection.JavaConversions._ + +import edu.stanford.nlp.ling.CoreLabel +import edu.stanford.nlp.util.CoreMap +import edu.stanford.nlp.ling.tokensregex._ + +import NLPConfig._ + + +case class TokensRegex(override val toString:String) { + val pattern:TokenSequencePattern = TokenSequencePattern.compile(toString) + + + def matches(input:Seq[CoreLabel]):Boolean = pattern.getMatcher(input.toList).matches + + def allMatches(input:Seq[CoreLabel]):Iterator[Seq[CoreLabel]] = { + val matcher = pattern.getMatcher(input.toList) + new Iterator[Seq[CoreLabel]] { + var theNext:Option[Boolean] = None + override def hasNext:Boolean = theNext match { + case Some(x) => x + case None => theNext = Some(matcher.find); theNext.get + } + override def next:Seq[CoreLabel] = { + if (!hasNext) throw new NoSuchElementException + theNext = None + val m:java.util.List[_ <: CoreMap] = matcher.groupNodes + m.map( _ match { + case (x:CoreLabel) => x + case (x:CoreMap) => new CoreLabel(x) + }) + } + } + } + + def unapplySeq(target:Any):Option[Seq[Seq[CoreLabel]]] = target match { + case (input:Seq[CoreLabel]) => + val matcher = pattern getMatcher(input toList) + if (matcher matches) { + Some(for (i <- 1 to matcher.groupCount) yield + matcher groupNodes(i) map( _ match { + case (x:CoreLabel) => x + case (x:CoreMap) => new CoreLabel(x) + })) + } else { None } + case _ => None + } +} + + 
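+// A minimal usage sketch for the wrapper above (illustrative only: it assumes
+// the CoreNLP models are on the classpath, and it leans on the
+// String-to-Sentence and String-to-TokensRegex implicits provided by Magic).
+object TokensRegexExample {
+  import edu.stanford.nlp.Magic._
+
+  def main(args: Array[String]): Unit = {
+    val sentence = "Stanford University is in California".toSentence
+    // Full-sequence match: the first two tokens by word form, then any
+    // number of trailing tokens.
+    println(sentence matches "[{word:/Stanford/}] [{word:/University/}] []*")
+  }
+}
+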
+object TokensRegex { + // Built-in predicates + def word(pattern:String):MarkedString = MarkedString(s"""{word : /$pattern/}""") + def tag(pattern:String):MarkedString = MarkedString(s"""{tag : /$pattern/}""") + def lemma(pattern:String):MarkedString = MarkedString(s"""{lemma : /$pattern/}""") + def ner(pattern:String):MarkedString = MarkedString(s"""{ner : /$pattern/}""") + def normalized(pattern:String):MarkedString = MarkedString(s"""{normalized : /$pattern/}""") + + // Decorate predicates + case class MarkedString(str:String) extends AnyVal { override def toString:String = str } + implicit def stringDecorator(str:MarkedString) = new { + def unary_!():String = s"""!$str""" + } + implicit def string2string(str:MarkedString):String = str.str + + // Create token sequence + implicit def product2tokens(p:Product):Tokens = new Tokens(List[String](p.productIterator.map( _.toString ).mkString(" & "))) + implicit def string2tokens(str:MarkedString):Tokens = new Tokens(List[String](str.str)) + class Tokens(val regexps:List[String]) { + def apply(terms:String*):Tokens = { + new Tokens(terms.mkString(" & ") :: regexps) + } + } + + // Dump to TokensRegex object + implicit def string2tokensregex(str:MarkedString):TokensRegex + = new TokensRegex(s"""[${str.str}]""") + implicit def tokens2tokensregex(tokens:Tokens):TokensRegex + = new TokensRegex(s"""[${tokens.regexps.reverse.mkString("] [")}]""") +} diff --git a/src/main/scala/DataSource.scala b/src/main/scala/DataSource.scala index aca4463..b431ff5 100644 --- a/src/main/scala/DataSource.scala +++ b/src/main/scala/DataSource.scala @@ -27,18 +27,18 @@ class DataSource(val dsp: DataSourceParams) val eventsRDD: RDD[Event] = eventsDB.find( appId = dsp.appId, entityType = Some("user"), - eventNames = Some(List("rate")) + eventNames = Some(List("train")) )(sc) val sentimentsRDD: RDD[Sentiment] = eventsRDD.map { event => val sentiment = try { val sentimentValue: Double = event.event match { - case "rate" => event.properties.get[Double]("sentiment") + case "train" => event.properties.get[Double]("sentiment") case _ => throw new Exception(s"Unexpected event ${event} is read.") } Sentiment( - event.properties.get[String]("phase"), + event.properties.get[String]("phrase"), sentimentValue ) } catch { diff --git a/src/main/scala/DummyAlgorithm.scala b/src/main/scala/DummyAlgorithm.scala deleted file mode 100644 index a1718a3..0000000 --- a/src/main/scala/DummyAlgorithm.scala +++ /dev/null @@ -1,34 +0,0 @@ -package org.template.sentimentanalysis - -import io.prediction.controller.PAlgorithm -import io.prediction.controller.Params -import io.prediction.data.storage.BiMap - -import org.apache.spark.SparkContext -import org.apache.spark.SparkContext._ -import org.apache.spark.rdd.RDD - -import grizzled.slf4j.Logger - -case class DummyAlgorithmParams( -)extends Params - -class DummyAlgorithm(val ap: DummyAlgorithmParams) - extends PAlgorithm[PreparedData, DummyModel, Query, PredictedResult] { - - @transient lazy val logger = Logger[this.type] - - def train(sc: SparkContext, data: PreparedData): DummyModel = { - //require(!data.sentiments.take(1).isEmpty, - // s"RDD[sentiments] in PreparedData cannot be empty." 
+
-    //   " Please check if DataSource generates TrainingData" +
-    //   " and Preprator generates PreparedData correctly.")
-    // do nothing in milestone 1
-    new DummyModel()
-  }
-
-  def predict(model: DummyModel, query: Query): PredictedResult = {
-    // always return 2.0 for milestone 1
-    new PredictedResult(2.0)
-  }
-}
diff --git a/src/main/scala/DummyModel.scala b/src/main/scala/DummyModel.scala
deleted file mode 100644
index a864687..0000000
--- a/src/main/scala/DummyModel.scala
+++ /dev/null
@@ -1,3 +0,0 @@
-package org.template.sentimentanalysis
-
-class DummyModel {}
diff --git a/src/main/scala/Engine.scala b/src/main/scala/Engine.scala
index 77142b6..b299f16 100644
--- a/src/main/scala/Engine.scala
+++ b/src/main/scala/Engine.scala
@@ -16,7 +16,7 @@ object SentimentAnalysisEngine extends IEngineFactory {
     new Engine(
       classOf[DataSource],
       classOf[Preparator],
-      Map("dummy" -> classOf[DummyAlgorithm]),
+      Map("nlpparse" -> classOf[Algorithm]),
       classOf[Serving])
   }
 }
diff --git a/src/main/scala/Model.scala b/src/main/scala/Model.scala
new file mode 100644
index 0000000..e6c25c4
--- /dev/null
+++ b/src/main/scala/Model.scala
@@ -0,0 +1,73 @@
+package org.template.sentimentanalysis
+
+import edu.stanford.nlp.Magic._
+import edu.stanford.nlp.trees.Tree
+
+class Model(
+  val rules: Map[String, Double]
+) extends Serializable {
+
+  /**
+   * Returns the sentiment of a word on a [-2, 2] scale.
+   */
+  def getWordSentiment(word: String): Double = {
+    // Scores are stored on the [0, 4] training scale, so shift by -2.0;
+    // words unseen in the training data are assumed to be neutral.
+    rules.get(word.toLowerCase()) match {
+      case Some(score) => score - 2.0
+      case None => 0.0
+    }
+  }
+
+  /**
+   * Parses the input into a tree structure and calculates the sentiment
+   * from the bottom of the tree up to the root.
+   *
+   * A leaf node is always a word token, so its sentiment is looked up
+   * from the training data; a word that never appeared in the training
+   * data is assumed to be neutral.
+   *
+   * A non-leaf node combines the sentiments of its children: the
+   * magnitudes are averaged with weights, and the sign is decided by the
+   * number of negative children. If that number is odd, the phrase is
+   * taken to be negative.
+   */
+  def getSentiment(s: String, ap: AlgorithmParams): Double = {
+    val m = scala.collection.mutable.Map[Tree, Double]()
+    val tree = s.parse
+    val root = tree.preOrderNodeList().get(0)
+    val post_order = tree.postOrderNodeList()
+    var i = 0
+    while (i < post_order.size()) {
+      val cur = post_order.get(i)
+      i = i + 1
+
+      if (cur.isLeaf()) {
+        m(cur) = getWordSentiment(cur.value)
+      } else {
+        val children = cur.children()
+        var weight = 0.0000000001
+        var positive = 1
+        var sentiment = 0.0
+        for (child <- children) {
+          val child_sentiment = m(child)
+
+          // The weight of a child is proportional to the absolute value
+          // of its sentiment, which keeps a polar child from being
+          // neutralized by its neutral siblings.
+          val child_weight = Math.abs(child_sentiment) + ap.baseWeight
+
+          weight = weight + child_weight
+          sentiment = sentiment + child_weight * Math.abs(child_sentiment)
+          if (child_sentiment < -0.0000001) {
+            positive = positive * -1
+          }
+        }
+        m(cur) = (sentiment / weight) * positive
+      }
+    }
+
+    // Shift back to the [0, 4] scale used by the training data.
+    return m(root) + 2.0
+  }
+}
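
A note on the combination rule above: it can be exercised independently of the
CoreNLP parser. The following sketch is illustrative only; the Node/Leaf/Branch
types and the leaf scores are hypothetical stand-ins for the parse tree and the
trained rules, baseWeight is 1.0 as configured in engine.json, and scores stay
on the centered [-2, 2] scale (getSentiment shifts its result back to [0, 4]
before returning).

// Illustrative re-implementation of the bottom-up combination in
// Model.getSentiment; the tree types and scores here are hypothetical.
object SentimentSketch {
  sealed trait Node
  case class Leaf(score: Double) extends Node      // word sentiment in [-2, 2]
  case class Branch(children: Node*) extends Node

  val baseWeight = 1.0 // mirrors "baseWeight": 1 in engine.json

  def score(n: Node): Double = n match {
    case Leaf(s) => s
    case Branch(children @ _*) =>
      val scores = children.map(score)
      // A child's weight grows with the magnitude of its sentiment, so a
      // polar child is not washed out by neutral siblings.
      val weights = scores.map(s => math.abs(s) + baseWeight)
      val magnitude =
        scores.zip(weights).map { case (s, w) => w * math.abs(s) }.sum /
          (weights.sum + 1e-10)
      // An odd number of negative children makes the phrase negative.
      val sign = if (scores.count(_ < -1e-7) % 2 == 1) -1.0 else 1.0
      sign * magnitude
  }

  def main(args: Array[String]): Unit = {
    println(score(Branch(Leaf(-1.5))))             // "sad" alone: ~ -1.5
    // "not sad", assuming "not" trains slightly negative: two negative
    // children give an even count, so the sign stays positive.
    println(score(Branch(Leaf(-0.5), Leaf(-1.5)))) // ~ +1.125
  }
}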