Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Integrate with CoreNLP via the
gangeli/CoreNLP-Scala
binding.
Tokenize the sentence and parse it to a Tree structure. Calculate the sentiment on the tree structure.
- Loading branch information
Showing
22 changed files
with
1,427 additions
and
48 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,4 +2,8 @@ manifest.json | |
pio.log | ||
/pio.sbt | ||
target/ | ||
data/*.csv | ||
data/*.tsv | ||
data/*.zip | ||
data/gen_submission.py | ||
*~ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
""" | ||
Send sample query to prediction engine | ||
""" | ||
|
||
import predictionio | ||
client = predictionio.EngineClient(url="http://localhost:8000") | ||
|
||
def test(s): | ||
print s + ' : ' + str(client.send_query({"s": s})['sentiment']) | ||
|
||
test('sad') | ||
test('happy') | ||
test('oh') | ||
test('not') | ||
test('not sad') | ||
test('very sad') | ||
test('very happy') | ||
test('not very sad') |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,9 +9,9 @@ | |
}, | ||
"algorithms": [ | ||
{ | ||
"name": "dummy", | ||
"name": "nlpparse", | ||
"params": { | ||
|
||
"baseWeight": 1 | ||
} | ||
} | ||
] | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,49 @@ | ||
package org.template.sentimentanalysis | ||
|
||
import io.prediction.controller.PAlgorithm | ||
import io.prediction.controller.Params | ||
import io.prediction.data.storage.BiMap | ||
|
||
import org.apache.spark.SparkContext | ||
import org.apache.spark.SparkContext._ | ||
import org.apache.spark.rdd.RDD | ||
|
||
import edu.stanford.nlp.Magic._ | ||
|
||
import grizzled.slf4j.Logger | ||
|
||
/** Parameters for [[Algorithm]].
  *
  * @param baseWeight base weight used when scoring a query's sentiment
  *                   (presumably consumed by Model.getSentiment — confirm there)
  */
case class AlgorithmParams(
  baseWeight: Double  // `val` is implied for case-class parameters
) extends Params
|
||
/** Sentiment-analysis algorithm.
  *
  * Training builds a lookup table from lower-cased phrase to sentiment
  * score; prediction delegates scoring of the query string to the model.
  *
  * @param ap algorithm parameters passed through to the model at
  *           prediction time
  */
class Algorithm(val ap: AlgorithmParams)
  extends PAlgorithm[PreparedData, Model, Query, PredictedResult] {

  @transient lazy val logger = Logger[this.type]

  /** Builds a [[Model]] from the prepared training data.
    *
    * @param sc   active Spark context
    * @param data prepared data carrying an RDD of (phrase, sentiment) records
    * @return model mapping each lower-cased phrase to its sentiment score
    */
  def train(sc: SparkContext, data: PreparedData): Model = {
    // take(1) avoids counting the whole RDD just to detect emptiness.
    require(data.sentiments.take(1).nonEmpty,
      "RDD[sentiments] in PreparedData cannot be empty." +
      " Please check if DataSource generates TrainingData" +
      " and Preparator generates PreparedData correctly.")

    val itemSets: RDD[(String, Double)] = data.sentiments.map(
      s => (s.phrase.toLowerCase(), s.sentiment)
    ).cache()

    val rules = itemSets.groupByKey
      .mapValues(
        // Assume the last training record per phrase is the most
        // up-to-date. NOTE(review): RDD grouping does not guarantee
        // encounter order, so `last` is only reliable if upstream
        // preserves it — consider carrying an explicit timestamp.
        iter => iter.toVector.last
      )
      .collectAsMap.toMap

    new Model(rules)
  }

  /** Scores the query string against the trained model. */
  def predict(model: Model, query: Query): PredictedResult = {
    new PredictedResult(
      model.getSentiment(query.s, ap)
    )
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#
# To Build:
#   1. Set CORENLP_HOME to the root of CoreNLP
#   2. [optional] Set BERKELEY to the path to the Berkeley parser
#   3. Build using either 'make stanford' or 'make berkeley' (if the
#      Berkeley parser is configured)
#
# NOTE: 'dist' is the first rule in this file, so a bare `make` runs
# 'dist', not 'default'.
#

CORENLP=$(CORENLP_HOME)/classes:$(CORENLP_HOME)/lib/joda-time.jar:$(CORENLP_HOME)/lib/jollyday-0.4.7.jar
BERKELEY=$(CORENLP_HOME)/../more/lib/BerkeleyParser.jar

JAVAC=javac
SCALAC=scalac

SRC=src
SOURCES = $(wildcard src/edu/stanford/nlp/*.scala)
TEST_SRC=test/src
LIB=lib
BUILD=classes
TEST_BUILD=test/classes
DIST=dist

# These targets collide with real directory names ('dist', 'classes'),
# so without .PHONY they would be considered up-to-date and silently
# do nothing once those directories exist.
.PHONY: dist berkeley stanford default clean cmd

# Bundle compiled classes plus sources into a single distributable jar.
dist: stanford
	mkdir -p $(DIST)
	jar cf $(DIST)/corenlp-scala.jar -C $(BUILD) .
	jar uf $(DIST)/corenlp-scala.jar -C $(SRC) .

# Full build including the Berkeley parser bindings.
berkeley: stanford
	$(SCALAC) -cp $(CORENLP):$(BERKELEY) -d $(BUILD) `find $(SRC) -name "*.scala"`

# Stanford-only build: rewrite the Berkeley parser reference in NLP.scala
# into a runtime exception so the code compiles without BerkeleyParser.jar.
stanford: $(SOURCES)
	mkdir -p $(BUILD)
	sed -e 's/BerkeleyUtil.berkeleyParser/throw new IllegalStateException("Could not find parser model (and was not compiled to run with Berkeley parser)")/g' $(SRC)/edu/stanford/nlp/NLP.scala > /tmp/NLP_stanfordonly.scala
	$(SCALAC) -cp $(CORENLP) -d $(BUILD) `find $(SRC) -name "*.scala" ! -name "*Berkeley.scala" ! -name "NLP.scala"` /tmp/NLP_stanfordonly.scala
	rm /tmp/NLP_stanfordonly.scala

default: stanford

# -f so cleaning an already-clean tree is not an error.
clean:
	rm -rf $(BUILD)
	rm -rf $(DIST)

# Print the command line for an interactive scala session with the models jar.
cmd:
	@echo "scala -J-Xmx4G -cp $(CORENLP):$(BUILD)":$(HOME)/lib/corenlp-models.jar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
Since gangeli/CoreNLP-Scala cannot be installed as a dependency via build.sbt,
its sources are copied here from https://github.com/gangeli/CoreNLP-Scala.
|
90 changes: 90 additions & 0 deletions
90
src/main/scala/CoreNLP-Scala/src/edu/stanford/nlp/Berkeley.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
package edu.stanford.nlp; | ||
|
||
import scala.collection.JavaConversions._ | ||
import scala.concurrent.Lock | ||
|
||
import edu.stanford.nlp.trees.Tree | ||
import edu.stanford.nlp.trees.Trees | ||
import edu.stanford.nlp.trees.LabeledScoredTreeNode | ||
import edu.stanford.nlp.ling.HasWord | ||
import edu.stanford.nlp.ling.Word | ||
|
||
import edu.berkeley.nlp.PCFGLA._ | ||
import edu.berkeley.nlp.util.Numberer | ||
|
||
import NLPConfig._ | ||
|
||
/** Glue between the Berkeley parser and Stanford's Tree API: converts
  * Berkeley parse trees to Stanford trees and exposes a shared,
  * lock-guarded pool of Berkeley parser instances.
  */
object BerkeleyUtil {
  // Alias to disambiguate from edu.stanford.nlp.trees.Tree.
  type BerkeleyTree = edu.berkeley.nlp.syntax.Tree[String]

  // Converts a Berkeley tree into a Stanford Tree rooted at "TOP".
  // NOTE(review): the name reads backwards — this maps a BerkeleyTree TO a
  // Stanford Tree. As an implicit conversion it is resolved by type, not
  // name, so behavior is unaffected, but the name is misleading.
  implicit def stanfordTree2BerkeleyTree(btree:BerkeleyTree):Tree = {
    // Strip Berkeley's annotation layer before converting.
    val roots = TreeAnnotations.unAnnotateTree(btree).getChildren;
    if (roots.isEmpty) {
      // No parse: return an empty node rather than null.
      new LabeledScoredTreeNode();
    } else {
      // Recursively copy labels and children into Stanford nodes.
      def convert(src:BerkeleyTree):Tree = {
        val dst:Tree = new LabeledScoredTreeNode
        if (src.getLabel != null) dst.setLabel(new Word(src.getLabel))
        dst.setChildren(src.getChildren.map( convert(_) ).toArray)
        dst
      }
      // Only the first root is converted; wrap it under a "TOP" node.
      new LabeledScoredTreeNode(new Word("TOP"),
        List[Tree](convert(roots.get(0))))
    }
  }

  /** Lazily-built pool of `numThreads` Berkeley parsers, each paired with a
    * Lock; `parse` dispatches to whichever instance is currently free.
    */
  lazy val berkeleyParser = {
    // (function to create parser)
    def mkParser = {
      // (setup parser) — `parse.model` comes from NLPConfig (imported above).
      val pData = ParserData.Load(parse.model)
      if (pData == null) throw new RuntimeException("Failed to load Berkeley parser model")
      val grammar = pData.getGrammar();
      val lexicon = pData.getLexicon();
      Numberer.setNumberers(pData.getNumbs());
      // (create parser object)
      val parser = new CoarseToFineMaxRuleParser(
        grammar, lexicon, 1.0, -1, false, false, false,
        false, false, true, true)
      // (set binarization) — the field is private, so reflection is the only
      // way to inject the model's binarization into the parser instance.
      try {
        val binarizationField = classOf[ConstrainedArrayParser].getDeclaredField("binarization");
        binarizationField.setAccessible(true);
        binarizationField.set(parser, pData.getBinarization());
        binarizationField.setAccessible(false);
      } catch { case (e:Exception) => throw new RuntimeException(e) }
      // (parser object) — structural type; result converts to a Stanford
      // Tree via the implicit conversion above.
      new {
        def parse(words:List[String], pos:List[String]):Tree = {
          var parsedTree:BerkeleyTree
            = parser.getBestConstrainedParse(words, pos, null);
          // If the POS-constrained parse fails, retry unconstrained.
          if (parsedTree.getChildren().isEmpty()) {
            parsedTree = parser.getBestConstrainedParse(words, null, null);
          }
          parsedTree
        }
      }
    }
    // (create parsers) — one (parser, lock) pair per configured thread.
    val parsers = (0 until numThreads).map{ x => (mkParser, new Lock) }.toList
    // (multithreaded implementation)
    // NOTE(review): scala.concurrent.Lock is deprecated, and checking
    // `available` before `acquire` is racy — a competing thread may grab the
    // lock in between, making `acquire` block. The result is still correct,
    // just potentially slower than intended; the 1s sleep is a busy-wait
    // fallback when every parser is in use.
    new {
      def parse(words:List[String], pos:List[String]):Tree = {
        def tryParse:Tree = {
          val validParser = parsers.indexWhere{
            (pair:({def parse(words:List[String],pos:List[String]):Tree},Lock)) =>
              pair._2.available
          }
          if (validParser >= 0) { // case: [likely] found parser to run
            val (parser, lock) = parsers(validParser)
            lock.acquire
            val rtn = parser.parse(words, pos)
            lock.release
            rtn
          } else { Thread.sleep(1000); tryParse } // case: no parser found
        }
        tryParse
      }
    }
  }
}
Oops, something went wrong.