Permalink
Browse files

Make the console nicer: unify Document stuff into the Model class, make more object-oriented, nicer logging, etc.
  • Loading branch information...
1 parent 4d9380f commit 73939011dfbff6d52193fcdeff2b88fdd2e2487e @jakemannix jakemannix committed Apr 5, 2012
Showing with 113 additions and 39 deletions.
  1. +113 −39 src/main/ruby/mahout_console.rb
@@ -62,85 +62,159 @@ def load_dict(path)
load_seqfile(path).inject([]) {|arr, pair| arr[pair.second.get] = pair.first.to_s; arr }
end
# Reads the sequence file at +path+ and returns a Hash that maps each
# record's key (+pair.first.get+) to its value (+pair.second.get+).
# Later records overwrite earlier ones that share the same key.
def load_vector_hash(path)
  vectors_by_key = {}
  load_seqfile(path).each do |pair|
    vectors_by_key[pair.first.get] = pair.second.get
  end
  vectors_by_key
end
# Packs a Hash of row_id => vector (as produced by load_vector_hash) into a
# SparseRowMatrix and returns it.
#
# The matrix gets +m.size+ rows, and its column count is taken from the size of
# the first vector in the hash.
#
# NOTE(review): because the row count is +m.size+, this presumably expects the
# keys to be exactly 0..m.size-1; a larger key would address a row outside the
# matrix. Confirm before using it with sparse ids (e.g. arbitrary doc ids).
#
# Raises ArgumentError when +m+ is nil or empty, since the column count cannot
# be determined without at least one row.
def to_matrix(m)
  if m.nil? || m.empty?
    raise ArgumentError, "cannot build a matrix from an empty set of vectors"
  end

  column_count = m.first[1].size
  matrix = SparseRowMatrix.new(m.size, column_count)
  m.each { |row_id, vector| matrix.assign_row(row_id, vector) }
  matrix
end
# Lists the receiver's public methods, leaving out every name that also
# appears in +Object.public_methods+, so only the interesting ones remain.
def my_methods
  baseline = Object.public_methods
  public_methods.reject { |method_name| baseline.include?(method_name) }
end
+
end
# Console-friendly wrapper around a Mahout topic model together with its term
# dictionary and, optionally, a topic-label dictionary, document vectors and
# pre-inferred document/topic distributions.
#
# Call Model.help for the options accepted by Model.new, and
# Model#mahout_methods for the list of public methods on an instance.
class Model
  include_package "org.apache.mahout.math"
  include_package "org.apache.mahout.clustering.lda.cvb"

  # matrix_hash          - Hash of topic_id => topic vector (fs.load_vector_hash)
  # dict                 - Array of terms indexed by feature id (fs.load_dict)
  # label_dict           - Array of topic labels indexed by topic id, or nil
  # model                - the TopicModel built from topic_feature_counts
  # topic_feature_counts - matrix_hash packed into a matrix (fs.to_matrix)
  # doc_hash             - Hash of doc_id => document vector, or nil
  # doc_topics           - matrix of per-document topic weights, or nil
  attr_reader :matrix_hash, :dict, :label_dict, :model,
              :topic_feature_counts, :doc_hash, :doc_topics

  # Prints a short usage message describing how to build a Model.
  def self.help
    puts "Instantiate via: Model.new(fs, {:model_path => _, :dict_path => _, :label_dict_path => _, :doc_path => _, :doc_topic_path => _})\n" +
         "where :model_path and :dict_path are required, but the rest are only needed if you want to have labeled topics or have documents\n" +
         "model.mahout_methods() will print out useful methods for this class"
  end

  # Returns the sorted names of this object's public methods that are not also
  # in Object.public_methods.
  def mahout_methods
    (public_methods - Object.public_methods).sort
  end

  # Loads everything named in +opts+ through +fs+, logging progress with puts.
  #
  # fs   - loader object providing load_vector_hash, to_matrix and load_dict
  # opts - Hash with the keys:
  #        :model_path      (required) topic/feature count vectors
  #        :dict_path       (required) term dictionary
  #        :label_dict_path (optional) topic label dictionary
  #        :doc_path        (optional) document vectors
  #        :doc_topic_path  (optional) pre-inferred p(topic | doc_id) vectors
  #
  # Raises RuntimeError when a required path is missing. Both required options
  # are checked up front so a missing :dict_path fails before the model is read.
  def initialize(fs, opts = {})
    raise ":model_path required to instantiate a model!" unless opts[:model_path]
    raise ":dict_path required to instantiate a model!" unless opts[:dict_path]

    puts "loading topic model path: #{opts[:model_path]}..."
    @matrix_hash = fs.load_vector_hash(opts[:model_path])
    @topic_feature_counts = fs.to_matrix(@matrix_hash)
    puts "loaded #{@topic_feature_counts.num_rows} topics with #{@topic_feature_counts.num_cols} features"
    # NOTE(review): the positional arguments are presumably
    # (counts, eta, alpha, dictionary, num_threads, model_weight) - confirm
    # against the TopicModel constructor of the Mahout build on the classpath.
    @model = TopicModel.new(@topic_feature_counts, 0.001, 0.001, nil, 1, 1.0)

    puts "loading dictionary path: #{opts[:dict_path]}..."
    @dict = fs.load_dict(opts[:dict_path])
    puts "loaded a #{@dict.size}-term dictionary"

    if opts[:label_dict_path]
      puts "loading topic_id / label dictionary from #{opts[:label_dict_path]}..."
      @label_dict = fs.load_dict(opts[:label_dict_path])
      puts "loaded a #{@label_dict.size}-label dictionary"
    else
      puts "no :label_dict_path specified, all topics must be integer topic_ids"
    end

    if opts[:doc_path]
      puts "loading documents from #{opts[:doc_path]}..."
      @doc_hash = fs.load_vector_hash(opts[:doc_path])
      puts "loaded #{@doc_hash.size} documents"
    else
      puts "no :doc_path specified, will not load any document vectors"
    end

    if opts[:doc_topic_path]
      puts "loading p(topic | doc_id) vectors from #{opts[:doc_topic_path]}..."
      @doc_topics = fs.to_matrix(fs.load_vector_hash(opts[:doc_topic_path]))
      puts "loaded #{@doc_topics.num_rows} doc-topic distributions"
    else
      puts "no :doc_topic_path, will not load pre-inferred p(topic | doc_id) data"
    end
  end

  # Returns [term, weight] pairs for a topic, heaviest first, where each weight
  # is divided by the sum of all the topic's weights.
  #
  # label - a topic label when a label dictionary is loaded, otherwise anything
  #         whose to_i is the topic id
  # limit - maximum number of pairs to return (nil for all)
  #
  # Returns nil when the topic cannot be resolved.
  def features_for_topic(label, limit = nil)
    topic_id = label_dict ? label_dict.index(label) : label.to_i
    features_for_topic_id(topic_id, limit)
  end

  # Returns [topic, weight] pairs for a term, heaviest first, where each weight
  # is divided by the term's total weight across all topics. Topics are given
  # by label when a label dictionary is loaded, otherwise by topic_id.to_s.
  #
  # feature - a term from the dictionary
  # limit   - maximum number of pairs to return (nil for all)
  #
  # Returns nil when the term is not in the dictionary.
  def topics_for_feature(feature, limit = nil)
    feature_id = dict.index(feature)
    return nil if feature_id.nil?

    weights = matrix_hash.map do |topic_id, topic|
      [label_dict ? label_dict[topic_id] : topic_id.to_s, topic.get(feature_id)]
    end
    normalized_top(by_weight_desc(weights), limit) { |topic_name| topic_name }
  end

  # Returns [term, weight] pairs for a loaded document, heaviest first, with
  # weights divided by the document's total weight.
  #
  # Returns nil for an unknown doc_id. Raises RuntimeError when no documents
  # were loaded (no :doc_path given to Model.new).
  def features_for_doc(doc_id, limit = nil)
    raise "no documents loaded; pass :doc_path to Model.new" if doc_hash.nil?
    return nil if doc_id.nil? || doc_hash[doc_id].nil?

    normalized_top(ranked_entries(doc_hash[doc_id]), limit) { |feature_id| dict[feature_id] }
  end

  # Returns [topic, weight] pairs for a document from the pre-inferred
  # doc/topic matrix, heaviest first, with weights divided by the row total.
  # Topics are given by label when a label dictionary is loaded, else by id.
  #
  # Returns nil when doc_id is nil or outside the matrix's rows. Raises
  # RuntimeError when no distributions were loaded (no :doc_topic_path).
  def topics_for_doc(doc_id, limit = nil)
    raise "no doc/topic distributions loaded; pass :doc_topic_path to Model.new" if doc_topics.nil?
    return nil if doc_id.nil? || doc_id < 0 || doc_id >= doc_topics.num_rows

    # doc_topics is a matrix (see initialize), so rows are read with view_row.
    normalized_top(ranked_entries(doc_topics.view_row(doc_id)), limit) do |topic_id|
      label_dict.nil? ? topic_id : label_dict[topic_id]
    end
  end

  # Runs the model's inference on +s+ and returns whatever model.infer returns.
  #
  # s           - either a String of whitespace-separated tokens, or an already
  #               built vector (anything that does not respond to split)
  # convergence - forwarded as the second argument of model.infer
  # max_iters   - forwarded as the third argument of model.infer
  #
  # For a String, tokens are downcased, looked up in the dictionary, and
  # unknown tokens are dropped.
  # NOTE(review): every known token is set to 1.0, so repeated tokens are not
  # counted - confirm that binary presence (not term frequency) is intended.
  def infer(s, convergence = 0, max_iters = 100)
    if s.respond_to?(:split)
      token_ids = s.split.map { |token| dict.index(token.downcase) }.compact
      v = org.apache.mahout.math.SequentialAccessSparseVector.new(dict.size, token_ids.size)
      token_ids.each { |token_id| v.set(token_id, 1.0) }
    else
      v = s
    end
    model.infer(v, convergence, max_iters)
  end

  # Returns up to 10 terms from the i-th highest-weighted topic inferred for
  # +s+ (i = 0 is the top topic). Returns [] when there is no such topic.
  def related(s, i = 0)
    topic_id = top_k(infer(s), i + 1)[i]
    return [] if topic_id.nil?

    (features_for_topic_id(topic_id, 10) || []).map { |pair| pair[0] }
  end

  # Returns the topics (labels when a label dictionary is loaded, else ids)
  # for which feature_freq * topic_freq < count(topic, feature) * model_norm.
  #
  # Returns [] when the term is not in the dictionary.
  def significant_topics(feature)
    fid = dict.index(feature)
    return [] if fid.nil?

    freq = feature_freqs[fid]
    per_topic = topic_freqs
    total = model_norm
    (0...topic_feature_counts.num_rows).map do |tid|
      next unless freq * per_topic[tid] < topic_feature_counts.get(tid, fid) * total
      label_dict ? label_dict[tid] : tid
    end.compact
  end

  # Array indexed by feature id holding norm(1) of each column of
  # topic_feature_counts. Computed on first use and cached.
  def feature_freqs
    @feature_freqs ||= (0...topic_feature_counts.num_cols).map do |f|
      topic_feature_counts.view_column(f).norm(1)
    end
  end

  # Array indexed by topic id holding norm(1) of each row of
  # topic_feature_counts. Computed on first use and cached.
  def topic_freqs
    @topic_freqs ||= (0...topic_feature_counts.num_rows).map do |t|
      topic_feature_counts.view_row(t).norm(1)
    end
  end

  # Sum of feature_freqs. Computed on first use and cached.
  def model_norm
    @model_norm ||= feature_freqs.inject(0.0) { |sum, freq| sum + freq }
  end

  # Returns the keys of the +k+ largest entries of +v+, largest first.
  #
  # v    - an enumerable whose elements either respond to get/index (vector
  #        elements) or are [key, value] pairs
  # k    - number of keys to return
  # dict - optional lookup; when given, each key is mapped through dict[key]
  def top_k(v, k, dict = nil)
    weights = v.inject({}) do |acc, e|
      if e.respond_to?(:get)
        acc[e.index] = e.get
      else
        acc[e[0]] = e[1]
      end
      acc
    end
    by_weight_desc(weights).first(k).map { |pair| dict ? dict[pair[0]] : pair[0] }
  end

  private

  # Term ranking for a topic addressed by numeric id; nil when unknown.
  def features_for_topic_id(topic_id, limit)
    return nil if topic_id.nil? || matrix_hash[topic_id].nil?

    normalized_top(ranked_entries(matrix_hash[topic_id]), limit) { |feature_id| dict[feature_id] }
  end

  # Copies a vector's elements into [index, value] pairs, largest value first.
  def ranked_entries(vector)
    by_weight_desc(vector.map { |e| [e.index, e.get] })
  end

  # Sorts [key, weight] pairs (or a Hash) by descending weight.
  def by_weight_desc(pairs)
    pairs.sort { |a, b| b[1] <=> a[1] }
  end

  # Divides each weight by the total over ALL pairs, keeps at most +limit+
  # pairs (all when nil), and maps every key through the given block.
  def normalized_top(pairs, limit)
    total = pairs.inject(0.0) { |sum, pair| sum + pair[1] }
    kept = limit.nil? ? pairs : pairs.first(limit)
    kept.map { |pair| [yield(pair[0]), pair[1] / total] }
  end
end
module IRB

0 comments on commit 7393901

Please sign in to comment.