Commit 2479081: v0.1.2
theeluwin committed Nov 19, 2016
1 parent 330fbe0, commit 2479081
Showing 3 changed files with 46 additions and 33 deletions.
lexrankr/lexrankr.py: 73 changes (43 additions, 30 deletions)
@@ -12,7 +12,7 @@
 from collections import Counter
 from gensim.corpora import Dictionary, TextCorpus
 from gensim.models import TfidfModel
-from sklearn.cluster import Birch
+from sklearn.cluster import Birch, DBSCAN, AffinityPropagation
 from sklearn.feature_extraction import DictVectorizer
 from mcl.mcl_clustering import mcl

@@ -23,11 +23,11 @@ class LexRankError(Exception):

 class Sentence(object):

-    def __init__(self, text, keywords=[], index=0):
+    def __init__(self, text, tokens=[], index=0):
         self.index = index
         self.text = text
-        self.keywords = keywords
-        self.counter = Counter(self.keywords)
+        self.tokens = tokens
+        self.counter = Counter(self.tokens)

     def __unicode__(self):
         return self.text
@@ -50,7 +50,7 @@ def __hash__(self):

 class SentenceFactory(object):

-    def __init__(self, tagger, useful_tags, delimiters, min_keyword_length, stopwords, **kwargs):
+    def __init__(self, tagger, useful_tags, delimiters, min_token_length, stopwords, **kwargs):
         if tagger == 'twitter':
             self.tagger = taggers.Twitter()
             self.tagger_options = {
@@ -83,24 +83,24 @@ def __init__(self, tagger, useful_tags, delimiters, min_keyword_length, stopword
         self.useful_tags = useful_tags
         self.delimiters = delimiters
         self.stopwords = stopwords
-        self.min_keyword_length = min_keyword_length
+        self.min_token_length = min_token_length
         self.splitter = self.splitterer()
         self.pos = lambda text: self.tagger.pos(text, **self.tagger_options)

     def splitterer(self):
         escaped_delimiters = '|'.join([re.escape(delimiter) for delimiter in self.delimiters])
         return lambda value: re.split(escaped_delimiters, value)

-    def text2keywords(self, text):
-        keywords = []
+    def text2tokens(self, text):
+        tokens = []
         word_tag_pairs = self.pos(text)
         for word, tag in word_tag_pairs:
             if word in self.stopwords:
                 continue
             if tag not in self.useful_tags:
                 continue
-            keywords.append("{}/{}".format(word, tag))
-        return keywords
+            tokens.append("{}/{}".format(word, tag))
+        return tokens

     def text2sentences(self, text):
         candidates = self.splitter(text.strip())
@@ -111,10 +111,10 @@ def text2sentences(self, text):
             candidate = candidate.strip(' ').strip('.')
             if not candidate:
                 continue
-            keywords = self.text2keywords(candidate)
-            if len(keywords) < self.min_keyword_length:
+            tokens = self.text2tokens(candidate)
+            if len(tokens) < self.min_token_length:
                 continue
-            sentence = Sentence(candidate, keywords, index)
+            sentence = Sentence(candidate, tokens, index)
             sentences.append(sentence)
             index += 1
         return sentences
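The keywords-to-tokens rename above is mechanical, but it clarifies what the factory produces: `word/tag` strings for every word that survives the stopword and POS-tag filters, with sentences dropped when fewer than `min_token_length` tokens remain. A standalone illustration of that filter, using hypothetical tagger output (the real code instantiates `taggers.Twitter()`):

```python
# Hypothetical (word, tag) pairs, as a Twitter-style POS tagger might return.
word_tag_pairs = [('사과', 'Noun'), ('는', 'Josa'), ('맛있다', 'Adjective')]
useful_tags = ['Noun', 'Verb', 'Adjective']
stopwords = set()

# Same filter as text2tokens: drop stopwords and uninteresting tags,
# keep the survivors as "word/tag" strings.
tokens = ["{}/{}".format(word, tag)
          for word, tag in word_tag_pairs
          if word not in stopwords and tag in useful_tags]
print(tokens)  # ['사과/Noun', '맛있다/Adjective']
```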
@@ -128,34 +128,42 @@ def __init__(self, sentences, no_below=3, no_above=0.8, max_size=None):
         self.dictionary = Dictionary(self.get_texts(), prune_at=max_size)
         self.dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=max_size)
         self.dictionary.compactify()
-        self.bows = [self.dictionary.doc2bow(keywords) for keywords in self.get_texts()]
+        self.bows = [self.dictionary.doc2bow(tokens) for tokens in self.get_texts()]

     def get_texts(self):
         for sentence in self.sentences:
-            yield sentence.keywords
+            yield sentence.tokens


 class LexRank(object):

-    def __init__(self, similarity='jaccard', decay_window=15, decay_alpha=0.5, clustering='birch', tagger='twitter', useful_tags=['Noun', 'Verb', 'Adjective'], delimiters=['. ', '\n', '.\n'], min_keyword_length=2, stopwords=stopwords_ko, no_below_word_count=3, no_above_word_portion=0.8, max_dictionary_size=None, min_cluster_size=3, similarity_threshold=0.8, matrix_smoothing=False, birch_threshold=0.05, birch_branching_factor=15, compactify=True, **kwargs):
+    def __init__(self, similarity='jaccard', decay_window=15, decay_alpha=0.5, clustering='birch', tagger='twitter', useful_tags=['Noun', 'Verb', 'Adjective'], delimiters=['. ', '\n', '.\n'], min_token_length=2, stopwords=stopwords_ko, no_below_word_count=3, no_above_word_portion=0.8, max_dictionary_size=None, min_cluster_size=3, similarity_threshold=0.8, matrix_smoothing=False, n_clusters=None, compactify=True, **kwargs):
         self.decay_window = decay_window
         self.decay_alpha = decay_alpha
         if similarity == 'cosine': # very, very slow :(
             self.vectorizer = DictVectorizer()
             self.uniform_sim = self._sim_cosine
         elif similarity == 'jaccard':
             self.uniform_sim = self._sim_jaccard
+        elif similarity == 'normalized_cooccurrence':
+            self.uniform_sim = self._sim_normalized_cooccurrence
         else:
-            raise LexRankError("available similarity functions are: cosine, jaccard")
+            raise LexRankError("available similarity functions are: cosine, jaccard, normalized_cooccurrence")
         self.sim = lambda sentence1, sentence2: self.decay(sentence1, sentence2) * self.uniform_sim(sentence1, sentence2)
-        self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters, min_keyword_length=min_keyword_length, stopwords=stopwords, **kwargs)
+        self.factory = SentenceFactory(tagger=tagger, useful_tags=useful_tags, delimiters=delimiters, min_token_length=min_token_length, stopwords=stopwords, **kwargs)
         if clustering == 'birch':
-            self._birch = Birch(threshold=birch_threshold, branching_factor=birch_branching_factor)
-            self._clusterer = lambda matrix: self._birch.fit_predict(matrix)
+            self._birch = Birch(threshold=0.99, n_clusters=n_clusters)
+            self._clusterer = lambda matrix: self._birch.fit_predict(1 - matrix)
+        elif clustering == 'dbscan':
+            self._dbscan = DBSCAN()
+            self._clusterer = lambda matrix: self._dbscan.fit_predict(1 - matrix)
+        elif clustering == 'affinity':
+            self._affinity = AffinityPropagation()
+            self._clusterer = lambda matrix: self._affinity.fit_predict(1 - matrix)
         elif clustering == 'markov': # not working well :(
             self._clusterer = lambda matrix: (lambda A: mcl(A, expand_factor=1, inflate_factor=1, mult_factor=0))(matrix)[1]
         elif clustering == None:
-            self._clusterer = lambda matrix: [0 for index in matrix.shape[0]]
+            self._clusterer = lambda matrix: [0 for index in range(matrix.shape[0])]
         else:
             raise LexRankError("available clustering algorithms are: birch, markov, no-clustering(use `None`)")
         self.no_below_word_count = no_below_word_count
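The clustering branches are the heart of this commit: every clusterer now receives `1 - matrix`, the pairwise sentence-similarity matrix flipped into a dissimilarity matrix, so row i describes sentence i by its distance to every other sentence. Note that none of the three estimators is told the input is precomputed, so each row is simply treated as an ordinary feature vector. A minimal standalone sketch (toy matrix, not from the repository) of what the new `_clusterer` lambdas do:

```python
import numpy as np
from sklearn.cluster import Birch, DBSCAN, AffinityPropagation

# Toy pairwise similarities for three sentences: 0 and 1 are near-duplicates,
# 2 is unrelated to both.
similarity = np.array([
    [1.0, 0.9, 0.1],
    [0.9, 1.0, 0.2],
    [0.1, 0.2, 1.0],
])
distance = 1 - similarity  # what the lambdas hand to fit_predict

# Mirrors the three branches above; min_samples is lowered from the commit's
# bare DBSCAN() so a three-point toy input is not labeled all-noise.
for clusterer in (Birch(threshold=0.99, n_clusters=None),
                  DBSCAN(min_samples=1),
                  AffinityPropagation()):
    print(type(clusterer).__name__, clusterer.fit_predict(distance))
```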
@@ -199,6 +207,11 @@ def _sim_cosine(self, sentence1, sentence2):
         vector1, vector2 = self.vectorizer.fit_transform([sentence1_tfidf, sentence2_tfidf]).toarray()
         return vector1.dot(vector2)

+    def _sim_normalized_cooccurrence(self, sentence1, sentence2):
+        if sentence1 == sentence2:
+            return 1
+        return len(set(sentence1.tokens) & set(sentence2.tokens)) / (math.log(len(sentence1.tokens)) + math.log(len(sentence2.tokens)))
+
     def decay(self, sentence1, sentence2):
         distance = abs(sentence1.index - sentence2.index)
         closeness = max(self.decay_window - distance, 0) / self.decay_window
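The new `_sim_normalized_cooccurrence` counts the tokens two sentences share and divides by the sum of their log lengths, so longer sentences are not rewarded for sheer size; the early return only pins self-similarity to exactly 1, and the measure itself is not bounded above by 1. A self-contained sketch with hypothetical tokens:

```python
import math

# Hypothetical token lists in the factory's "word/tag" format.
tokens1 = ['apple/Noun', 'pear/Noun', 'buy/Verb']
tokens2 = ['apple/Noun', 'pear/Noun', 'sell/Verb', 'market/Noun']

overlap = len(set(tokens1) & set(tokens2))               # 2 shared tokens
norm = math.log(len(tokens1)) + math.log(len(tokens2))   # log(3) + log(4), about 2.48
print(overlap / norm)                                    # about 0.80
```

Note that a pair of single-token sentences would make the denominator log(1) + log(1) = 0; the default `min_token_length=2` keeps such sentences out of the corpus in the first place.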
@@ -266,8 +279,8 @@ def _compactify(self):
         self.clusters = clusters
         self._clustered()

-    def _verbose(self, summaries):
-        summaries = sorted(summaries, key=lambda sentence: sentence.index)
+    def _verbose(self):
+        summaries = sorted(self.summaries, key=lambda sentence: sentence.index)
         return [sentence.text for sentence in summaries]

     def probe(self, k=None):
@@ -281,19 +294,19 @@ def probe(self, k=None):
             raise LexRankError("this will not give a summarization")
         if k < 1:
             k = int(self.num_sentences * k)
-        summaries = []
+        self.summaries = []
         ends = np.array([len(cluster) for cluster in self.clusters])
         drones = np.zeros(ends.shape)
         for i in range(self.num_clusters):
-            summaries.append(self.clusters[i][0])
+            self.summaries.append(self.clusters[i][0])
             drones[i] += 1
-            if len(summaries) == k:
-                return self._verbose(summaries)
+            if len(self.summaries) == k:
+                return self._verbose()
         while True:
             branch = np.array([drones + 1, ends]).min(axis=0) / ends
             leach = int(branch.argmin())
             drone = int(drones[leach])
-            summaries.append(self.clusters[leach][drone])
+            self.summaries.append(self.clusters[leach][drone])
             drones[leach] += 1
-            if len(summaries) == k:
-                return self._verbose(summaries)
+            if len(self.summaries) == k:
+                return self._verbose()
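`probe` now stashes its picks on `self.summaries`, letting `_verbose` re-sort them by original sentence order without an argument. The selection logic is unchanged: the top sentence of every cluster is taken first, then `branch` scores how proportionally drained each cluster is and `argmin` draws the next sentence from the least-drained one. A toy re-run of that loop (standalone, plain strings in place of clustered `Sentence` objects):

```python
import numpy as np

# Three clusters, sentences already ranked within each.
clusters = [['a1', 'a2', 'a3'], ['b1', 'b2'], ['c1']]
k = 5

summaries = [cluster[0] for cluster in clusters]  # one leader per cluster
ends = np.array([len(cluster) for cluster in clusters], dtype=float)
drones = np.ones(len(clusters))  # one sentence already drawn from each

while len(summaries) < k:
    branch = np.array([drones + 1, ends]).min(axis=0) / ends
    leach = int(branch.argmin())                   # least-drained cluster
    summaries.append(clusters[leach][int(drones[leach])])
    drones[leach] += 1

print(summaries)  # ['a1', 'b1', 'c1', 'a2', 'a3']
```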
setup.py: 2 changes (1 addition, 1 deletion)
@@ -25,7 +25,7 @@

 setup(
     name='lexrankr',
-    version='0.1.1',
+    version='0.1.2',
     license='MIT',
     author='Jamie Seol',
     author_email='theeluwin@gmail.com',
tests/test.py: 4 changes (2 additions, 2 deletions)
@@ -17,8 +17,8 @@ def setUp(self):

     def test_summarized(self):
         self.lexrank.summarize(self.text)
         summaries = self.lexrank.probe()
-        self.assertEqual(len(summaries), 3)
-        self.assertEqual(summaries[0], "사과 배 감 귤")
+        self.assertEqual(len(summaries), 2)
+        self.assertEqual(summaries[0], "배 감 귤 수박")


 if __name__ == '__main__':
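The tightened assertions track the new Birch defaults: the same fixture text now falls into two clusters instead of three, so `probe()` with no argument returns two summary sentences and a different top pick. For reference, the call pattern the test exercises looks like this (a hypothetical standalone run; the import path and fixture text are assumptions, since `setUp` is not shown in this diff):

```python
from lexrankr import LexRank  # assuming the package re-exports LexRank

text = "사과 배 감 귤을 샀다. 배 감 귤 수박은 맛있다."  # hypothetical Korean fixture

lexrank = LexRank()  # v0.1.2 defaults: Birch(threshold=0.99, n_clusters=None)
lexrank.summarize(text)
summaries = lexrank.probe()  # no k given; the test expects one sentence per cluster
for summary in summaries:
    print(summary)
```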