Skip to content
Browse files

refactored to get rid of Cache class, improve runtime

  • Loading branch information...
1 parent b346d95 commit 052eeb0ef0e458e9fc23e708a43db6d1c716efb0 Paco NATHAN committed Jul 6, 2009
View
76 src/com/sharethis/textrank/Cache.java
@@ -1,76 +0,0 @@
-/*
-Copyright (c) 2009, ShareThis, Inc. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
-
- * Neither the name of the ShareThis, Inc., nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-package com.sharethis.textrank;
-
-import java.security.MessageDigest;
-
-import java.util.HashMap;
-
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
-
-
-/**
- * A singleton class implementing a cache of scanned sentences.
- *
- * @author paco@sharethis.com
- */
-
-public class
- Cache
- extends HashMap<String, Sentence>
-{
- // logging
-
- private final static Log LOG =
- LogFactory.getLog(Cache.class.getName());
-
-
- /**
- * Public definitions.
- */
-
- public MessageDigest md_sent = null;
-
-
- /**
- * Constructor.
- */
-
- public
- Cache ()
- throws Exception
- {
- md_sent = MessageDigest.getInstance("MD5");
- }
-}
View
5 src/com/sharethis/textrank/NGram.java
@@ -35,6 +35,7 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
+import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -196,13 +197,13 @@ else if (this.getCount() < that.getCount()) {
*/
public static Graph
- collectNGrams (final LanguageModel lang, final Cache cache, final double rank_threshold)
+ collectNGrams (final LanguageModel lang, final List<Sentence> s_list, final double rank_threshold)
throws Exception
{
final Graph ngrams = new Graph();
final LinkedList<Integer> token_span = new LinkedList<Integer>();
- for (Sentence s : cache.values()) {
+ for (Sentence s : s_list) {
boolean span_marked = false;
double max_rank = 0.0D;
View
64 src/com/sharethis/textrank/Sentence.java
@@ -105,64 +105,40 @@ else if (h.length() == 8) {
*/
public void
- mapTokens (final LanguageModel lang, final Cache cache, final Graph graph)
+ mapTokens (final LanguageModel lang, final Graph graph)
throws Exception
{
token_list = lang.tokenizeSentence(text);
- // determine an MD5 signature for this sentence
+ // scan each token to determine part-of-speech
- cache.md_sent.reset();
+ final String[] tag_list = lang.tagTokens(token_list);
- for (int i = 0; i < token_list.length; i++) {
- if (LOG.isDebugEnabled()) {
- LOG.debug("token: " + token_list[i]);
- }
-
- cache.md_sent.update(token_list[i].getBytes());
- }
-
- md5_hash = hexFormat(cache.md_sent.digest());
-
- // use MD5 hash to lookup sentence in the cache
-
- final Sentence cache_hit = cache.get(md5_hash);
-
- if (cache_hit == null) {
- // add another scanned sentence to the cache
+ // create nodes for the graph
- cache.put(md5_hash, this);
+ Node last_node = null;
+ node_list = new Node[token_list.length];
- // scan each token to determine part-of-speech
-
- final String[] tag_list = lang.tagTokens(token_list);
+ for (int i = 0; i < token_list.length; i++) {
+ final String pos = tag_list[i];
- // create nodes for the graph
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("token: " + token_list[i] + " pos tag: " + pos);
+ }
- Node last_node = null;
- node_list = new Node[token_list.length];
+ if (lang.isRelevant(pos)) {
+ final String key = lang.getNodeKey(token_list[i], pos);
+ final KeyWord value = new KeyWord(token_list[i], pos);
+ final Node n = Node.buildNode(graph, key, value);
- for (int i = 0; i < token_list.length; i++) {
- final String pos = tag_list[i];
+ // emit nodes to construct the graph
- if (LOG.isDebugEnabled()) {
- LOG.debug("token: " + token_list[i] + " pos tag: " + pos);
+ if (last_node != null) {
+ n.connect(last_node);
}
- if (lang.isRelevant(pos)) {
- final String key = lang.getNodeKey(token_list[i], pos);
- final KeyWord value = new KeyWord(token_list[i], pos);
- final Node n = Node.buildNode(graph, key, value);
-
- // emit nodes to construct the graph
-
- if (last_node != null) {
- n.connect(last_node);
- }
-
- last_node = n;
- node_list[i] = n;
- }
+ last_node = n;
+ node_list[i] = n;
}
}
}
View
11 src/com/sharethis/textrank/TextRank.java
@@ -43,6 +43,7 @@
import java.io.OutputStreamWriter;
import java.io.Reader;
+import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
@@ -101,7 +102,6 @@
protected String text = null;
protected boolean use_wordnet = false;
- protected Cache cache = null;
protected Graph graph = null;
protected Graph ngram_subgraph = null;
protected Map<NGram, MetricVector> metric_space = null;
@@ -131,8 +131,6 @@
prepCall (final String text, final boolean use_wordnet)
throws Exception
{
-
- cache = new Cache();
graph = new Graph();
ngram_subgraph = null;
metric_space = new HashMap<NGram, MetricVector>();
@@ -159,9 +157,12 @@
// scan sentences to construct a graph of relevant morphemes
+ final ArrayList<Sentence> s_list = new ArrayList<Sentence>();
+
for (String sent_text : lang.splitParagraph(text)) {
final Sentence s = new Sentence(sent_text.trim());
- s.mapTokens(lang, cache, graph);
+ s.mapTokens(lang, graph);
+ s_list.add(s);
if (LOG.isDebugEnabled()) {
LOG.debug("s: " + s.text);
@@ -182,7 +183,7 @@
graph.runTextRank();
graph.sortResults(max_results);
- ngram_subgraph = NGram.collectNGrams(lang, cache, graph.getRankThreshold());
+ ngram_subgraph = NGram.collectNGrams(lang, s_list, graph.getRankThreshold());
markTime("basic_textrank");

0 comments on commit 052eeb0

Please sign in to comment.
Something went wrong with that request. Please try again.