Skip to content


Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
tree: b48317f337
Fetching contributors…

Cannot retrieve contributors at this time

46 lines (35 sloc) 2.065 kb
// From the Stanford Topic Modeling Toolbox:
// java -Xmx1024m -jar tmt-0.3.3.jar -Dscalanlp.distributed.hub=socket:// edu.stanford.nlp.tmt.TMTMain "3-infer.scala"
import scalanlp.stage._;
import scalanlp.stage.text._;
import scalanlp.text.tokenize._;
import edu.stanford.nlp.tmt.stage._;
import edu.stanford.nlp.tmt.model.lda._;
import edu.stanford.nlp.tmt.model.llda._;
// Inference script: loads a previously trained CVB0 LDA model, runs it over a
// new CSV dataset, and writes the results next to the model as CSV files.

// The path of the model to load.
val modelPath = file("lda-86a58316-30-2b1a90a6");
val model = LoadCVB0LDA(modelPath);

// A new dataset for inference.
val source = CSVFile("emails.csv") ~> IDColumn(1);

// The text pipeline must reuse the tokenizer stored with the model so that
// the new documents are tokenized the same way as the training data.
// NOTE(review): `.get` presumably fails if the loaded model carries no
// tokenizer -- confirm that is the desired behaviour.
val text = {
  source ~>                            // read from the source file
  Column(2) ~>                         // select column containing text
  TokenizeWith(model.tokenizer.get)    // tokenize with existing model's tokenizer
}

// Base name of the output files to generate: the source file's name with its
// ".csv" extension removed, resolved against modelPath. stripSuffix is used
// instead of replaceAll(".csv", "") because the latter interprets "." as a
// regex wildcard and would also remove matches in the middle of the name.
val output = file(modelPath, source.meta[java.io.File].getName.stripSuffix(".csv"));

// turn the text into a dataset ready to be used with LDA
val dataset = LDADataset(text, termIndex = model.termIndex);

// Each result is written to the file named in the preceding message.
println("Writing document distributions to " + output + "-document-topic-distributions.csv");
val perDocTopicDistributions = InferCVB0DocumentTopicDistributions(model, dataset);
CSVFile(output + "-document-topic-distributions.csv").write(perDocTopicDistributions);

println("Writing topic usage to "+output+"-usage.csv");
val usage = QueryTopicUsage(model, dataset, perDocTopicDistributions);
CSVFile(output + "-usage.csv").write(usage);

println("Estimating per-doc per-word topic distributions");
val perDocWordTopicDistributions = EstimatePerWordTopicDistributions(
  model, dataset, perDocTopicDistributions);

println("Writing top terms to "+output+"-top-terms.csv");
val topTerms = QueryTopTerms(model, dataset, perDocWordTopicDistributions, numTopTerms=100);
CSVFile(output + "-top-terms.csv").write(topTerms);
Jump to Line
Something went wrong with that request. Please try again.