Integrate with CoreNLP via the gangeli/CoreNLP-Scala bindings.
Tokenize the sentence and parse it into a Tree structure.
Calculate the sentiment over the parse tree.
whhone committed Mar 30, 2015
1 parent e6a398e commit d643f38
Showing 22 changed files with 1,427 additions and 48 deletions.
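
The commit message describes a three-step pipeline: tokenize the sentence, parse it into a tree, and score the tree. Below is a minimal sketch of that flow. It assumes the vendored CoreNLP-Scala Magic implicits give String a .parse method returning a CoreNLP Tree (the sources copied in by this commit suggest this, but it is not a documented API), and the averaging combiner and the PipelineSketch object are illustrative stand-ins, not the template's actual scoring:

import edu.stanford.nlp.Magic._
import edu.stanford.nlp.trees.Tree

object PipelineSketch {
  // Phrase -> sentiment rules of the kind Algorithm.train builds below.
  val rules: Map[String, Double] = Map("sad" -> 0.2, "happy" -> 0.9)

  // Score a parse tree bottom-up: leaves look up their token in the rules
  // (0.5 is an assumed neutral default); inner nodes average their children.
  def score(t: Tree): Double =
    if (t.isLeaf) rules.getOrElse(t.value.toLowerCase, 0.5)
    else {
      val kids = t.children.map(score)
      kids.sum / kids.length
    }

  def main(args: Array[String]): Unit = {
    val tree: Tree = "not very sad".parse // assumed Magic extension method
    println(score(tree))
  }
}
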
4 changes: 4 additions & 0 deletions .gitignore
@@ -2,4 +2,8 @@ manifest.json
 pio.log
 /pio.sbt
 target/
+data/*.csv
+data/*.tsv
+data/*.zip
+data/gen_submission.py
 *~
9 changes: 8 additions & 1 deletion build.sbt
@@ -6,7 +6,14 @@ name := "template-scala-sentiment-analysis"

 organization := "io.prediction"
 
+excludeFilter in unmanagedSources := "Berkeley.scala"
+
 libraryDependencies ++= Seq(
   "io.prediction" %% "core" % pioVersion.value % "provided",
   "org.apache.spark" %% "spark-core" % "1.2.0" % "provided",
-  "org.apache.spark" %% "spark-mllib" % "1.2.0" % "provided")
+  "org.apache.spark" %% "spark-mllib" % "1.2.0" % "provided",
+  "edu.stanford.nlp" % "stanford-corenlp" % "3.4",
+  "edu.stanford.nlp" % "stanford-corenlp" % "3.4" classifier "models",
+  "edu.stanford.nlp" % "stanford-parser" % "3.4"
+)

8 changes: 4 additions & 4 deletions data/import_eventserver.py
@@ -13,11 +13,11 @@ def import_events(client, file):
     data = line.rstrip('\r\n').split("\t")
     if True:
         client.create_event(
-            event="rate",
+            event="train",
             entity_type="user",
             entity_id=data[0],
             properties= {
-                "sentence" : str(data[2]),
+                "phrase" : str(data[2]),
                 "sentiment" : float(data[3])
             }
         )
@@ -41,6 +41,6 @@ def import_events(client, file):
 client = predictionio.EventClient(
     access_key=args.access_key,
     url=args.url,
-    threads=5,
-    qsize=500)
+    threads=10,
+    qsize=1000)
 import_events(client, args.file)
18 changes: 18 additions & 0 deletions data/send_query.py
@@ -0,0 +1,18 @@
"""
Send sample queries to the prediction engine.
"""

import predictionio
client = predictionio.EngineClient(url="http://localhost:8000")

def test(s):
    print s + ' : ' + str(client.send_query({"s": s})['sentiment'])

test('sad')
test('happy')
test('oh')
test('not')
test('not sad')
test('very sad')
test('very happy')
test('not very sad')
4 changes: 2 additions & 2 deletions engine.json
@@ -9,9 +9,9 @@
 },
 "algorithms": [
   {
-    "name": "dummy",
+    "name": "nlpparse",
     "params": {
-
+      "baseWeight": 1
     }
   }
 ]
49 changes: 49 additions & 0 deletions src/main/scala/Algorithm.scala
@@ -0,0 +1,49 @@
package org.template.sentimentanalysis

import io.prediction.controller.PAlgorithm
import io.prediction.controller.Params
import io.prediction.data.storage.BiMap

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD

import edu.stanford.nlp.Magic._

import grizzled.slf4j.Logger

case class AlgorithmParams(
  val baseWeight: Double
) extends Params

class Algorithm(val ap: AlgorithmParams)
  extends PAlgorithm[PreparedData, Model, Query, PredictedResult] {

  @transient lazy val logger = Logger[this.type]

  def train(sc: SparkContext, data: PreparedData): Model = {
    require(!data.sentiments.take(1).isEmpty,
      s"RDD[sentiments] in PreparedData cannot be empty." +
      " Please check if DataSource generates TrainingData" +
      " and Preparator generates PreparedData correctly.")

    val itemSets: RDD[(String, Double)] = data.sentiments.map(
      s => (s.phrase.toLowerCase(), s.sentiment)
    ).cache()

    val rules = itemSets.groupByKey
      .mapValues(
        // assume the last training data is the most up-to-date
        iter => iter.toVector.last
      )
      .collectAsMap.toMap

    new Model(rules)
  }

  def predict(model: Model, query: Query): PredictedResult = {
    new PredictedResult(
      model.getSentiment(query.s, ap)
    )
  }
}
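
Model, Query, and PredictedResult are defined in other files of this commit that are not part of this excerpt. The sketch below is a guess at their likely shape, inferred only from how Algorithm uses them above; in particular, the fallback that scales a neutral 0.5 by baseWeight is an assumption about the parameter's role, not the commit's actual logic:

case class Query(s: String)

case class PredictedResult(sentiment: Double)

class Model(val rules: Map[String, Double]) extends Serializable {
  // Look up the learned sentiment for a phrase; for an unseen phrase,
  // fall back to a neutral 0.5 scaled by baseWeight (an assumption).
  def getSentiment(s: String, ap: AlgorithmParams): Double =
    rules.getOrElse(s.toLowerCase, 0.5 * ap.baseWeight)
}
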
44 changes: 44 additions & 0 deletions src/main/scala/CoreNLP-Scala/Makefile
@@ -0,0 +1,44 @@
#
# To Build:
#   1. Set CORENLP_HOME to the root of CoreNLP
#   2. [optional] Set BERKELEY to the path to the Berkeley parser
#   3. Build using either 'make stanford' or 'make berkeley' (if the Berkeley parser is configured)
#

CORENLP=$(CORENLP_HOME)/classes:$(CORENLP_HOME)/lib/joda-time.jar:$(CORENLP_HOME)/lib/jollyday-0.4.7.jar
BERKELEY=$(CORENLP_HOME)/../more/lib/BerkeleyParser.jar

JAVAC=javac
SCALAC=scalac

SRC=src
SOURCES = $(wildcard src/edu/stanford/nlp/*.scala)
TEST_SRC=test/src
LIB=lib
BUILD=classes
TEST_BUILD=test/classes
DIST=dist

dist: stanford
	mkdir -p ${DIST}
	jar cf ${DIST}/corenlp-scala.jar -C $(BUILD) .
	jar uf ${DIST}/corenlp-scala.jar -C $(SRC) .

berkeley: stanford
	$(SCALAC) -cp $(CORENLP):${BERKELEY} -d $(BUILD) `find $(SRC) -name "*.scala"`

stanford: ${SOURCES}
	mkdir -p $(BUILD)
	sed -e 's/BerkeleyUtil.berkeleyParser/throw new IllegalStateException("Could not find parser model (and was not compiled to run with Berkeley parser)")/g' ${SRC}/edu/stanford/nlp/NLP.scala > /tmp/NLP_stanfordonly.scala
	$(SCALAC) -cp $(CORENLP) -d $(BUILD) `find $(SRC) -name "*.scala" ! -name "*Berkeley.scala" ! -name "NLP.scala"` /tmp/NLP_stanfordonly.scala
	rm /tmp/NLP_stanfordonly.scala

default: stanford

clean:
	rm -r $(BUILD)
	rm -r ${DIST}

cmd:
	@echo "scala -J-Xmx4G -cp $(CORENLP):$(BUILD)":${HOME}/lib/corenlp-models.jar
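
In short, per the targets above: with CORENLP_HOME set, 'make stanford' compiles the vendored sources against CoreNLP alone (sed substitutes an IllegalStateException for the Berkeley hook in NLP.scala), 'make berkeley' additionally compiles Berkeley.scala against BerkeleyParser.jar, and 'make dist' packages the classes into dist/corenlp-scala.jar.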
3 changes: 3 additions & 0 deletions src/main/scala/CoreNLP-Scala/README.md
@@ -0,0 +1,3 @@
Since gangeli/CoreNLP-Scala is not published as an artifact that build.sbt can pull in,
its sources are copied from https://github.com/gangeli/CoreNLP-Scala.

90 changes: 90 additions & 0 deletions src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala
@@ -0,0 +1,90 @@
package edu.stanford.nlp;

import scala.collection.JavaConversions._
import scala.concurrent.Lock

import edu.stanford.nlp.trees.Tree
import edu.stanford.nlp.trees.Trees
import edu.stanford.nlp.trees.LabeledScoredTreeNode
import edu.stanford.nlp.ling.HasWord
import edu.stanford.nlp.ling.Word

import edu.berkeley.nlp.PCFGLA._
import edu.berkeley.nlp.util.Numberer

import NLPConfig._

object BerkeleyUtil {
  type BerkeleyTree = edu.berkeley.nlp.syntax.Tree[String]

  implicit def stanfordTree2BerkeleyTree(btree: BerkeleyTree): Tree = {
    val roots = TreeAnnotations.unAnnotateTree(btree).getChildren;
    if (roots.isEmpty) {
      new LabeledScoredTreeNode();
    } else {
      def convert(src: BerkeleyTree): Tree = {
        val dst: Tree = new LabeledScoredTreeNode
        if (src.getLabel != null) dst.setLabel(new Word(src.getLabel))
        dst.setChildren(src.getChildren.map( convert(_) ).toArray)
        dst
      }
      new LabeledScoredTreeNode(new Word("TOP"),
        List[Tree](convert(roots.get(0))))
    }
  }

  lazy val berkeleyParser = {
    // (function to create parser)
    def mkParser = {
      // (setup parser)
      val pData = ParserData.Load(parse.model)
      if (pData == null) throw new RuntimeException("Failed to load Berkeley parser model")
      val grammar = pData.getGrammar();
      val lexicon = pData.getLexicon();
      Numberer.setNumberers(pData.getNumbs());
      // (create parser object)
      val parser = new CoarseToFineMaxRuleParser(
        grammar, lexicon, 1.0, -1, false, false, false,
        false, false, true, true)
      // (set binarization)
      try {
        val binarizationField = classOf[ConstrainedArrayParser].getDeclaredField("binarization");
        binarizationField.setAccessible(true);
        binarizationField.set(parser, pData.getBinarization());
        binarizationField.setAccessible(false);
      } catch { case (e: Exception) => throw new RuntimeException(e) }
      // (parser object)
      new {
        def parse(words: List[String], pos: List[String]): Tree = {
          var parsedTree: BerkeleyTree
            = parser.getBestConstrainedParse(words, pos, null);
          if (parsedTree.getChildren().isEmpty()) {
            parsedTree = parser.getBestConstrainedParse(words, null, null);
          }
          parsedTree
        }
      }
    }
    // (create parsers)
    val parsers = (0 until numThreads).map{ x => (mkParser, new Lock) }.toList
    // (multithreaded implementation)
    new {
      def parse(words: List[String], pos: List[String]): Tree = {
        def tryParse: Tree = {
          val validParser = parsers.indexWhere{
            (pair: ({def parse(words: List[String], pos: List[String]): Tree}, Lock)) =>
              pair._2.available
          }
          if (validParser >= 0) { // case: [likely] found parser to run
            val (parser, lock) = parsers(validParser)
            lock.acquire
            val rtn = parser.parse(words, pos)
            lock.release
            rtn
          } else { Thread.sleep(1000); tryParse } // case: no parser found
        }
        tryParse
      }
    }
  }
}
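
The berkeleyParser value above keeps numThreads parser instances, each paired with a Lock, and retries with a one-second sleep until one is free. The same pattern in isolation, as a generic sketch (the Pool name and the withResource helper are mine, not part of the source):

import scala.concurrent.Lock

class Pool[A](size: Int, make: () => A) {
  private val slots = List.fill(size)((make(), new Lock))

  // Optimistically pick a slot whose lock looks free, acquire it, run f,
  // and always release; if every slot looks busy, sleep and retry,
  // mirroring the tryParse loop above.
  @annotation.tailrec
  final def withResource[B](f: A => B): B =
    slots.find(_._2.available) match {
      case Some((res, lock)) =>
        lock.acquire
        try f(res) finally lock.release
      case None =>
        Thread.sleep(1000)
        withResource(f)
    }
}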
