-
Notifications
You must be signed in to change notification settings - Fork 43
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
12 changed files
with
269 additions
and
147 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
import json | ||
import requests | ||
|
||
from typing import List | ||
|
||
from konlpy.tag import Okt | ||
from requests.models import Response | ||
|
||
|
||
class OktTokenizer:
    """
    A POS-tagger-based tokenizer functor for Korean text.

    Note that these are just examples; using the ``phrases`` function
    rather than a mere POS tokenizer seems better.

    Example:
        tokenizer: OktTokenizer = OktTokenizer()
        tokens: List[str] = tokenizer(your_text_here)
    """

    # Shared class-level tagger: Okt startup (JVM) is expensive, so one
    # instance is reused by every OktTokenizer.
    okt: Okt = Okt()

    def __call__(self, text: str) -> List[str]:
        """Extract noun phrases from *text* and return them as a list."""
        return self.okt.phrases(text)
|
||
|
||
class ApiTokenizer:
    """
    An API-based tokenizer functor, assuming that the response body is a
    JSON string containing a list of str tokens.

    Example:
        tokenizer: ApiTokenizer = ApiTokenizer(endpoint)
        tokens: List[str] = tokenizer(your_text_here)
    """

    def __init__(self, endpoint: str, timeout=None) -> None:
        """
        Args:
            endpoint: URL of the tokenizing API (POST, raw UTF-8 body).
            timeout: optional seconds to wait for the server. Defaults to
                None (wait indefinitely), matching the previous behavior;
                pass a number to avoid hanging on an unresponsive endpoint.
        """
        self.endpoint: str = endpoint
        self.timeout = timeout

    def __call__(self, text: str) -> List[str]:
        """
        POST *text* to the endpoint and return the decoded token list.

        Raises:
            requests.HTTPError: if the server responds with an error status
                (previously an error body was fed straight to the JSON
                parser, yielding a confusing decode error).
            ValueError: if the response body is not valid JSON.
        """
        body: bytes = text.encode('utf-8')
        res: Response = requests.post(self.endpoint, data=body, timeout=self.timeout)
        res.raise_for_status()  # fail loudly on 4xx/5xx instead of parsing the error page
        tokens: List[str] = res.json()
        return tokens
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from .sentence import Sentence | ||
from .textrankr import TextRank | ||
from .textrank import TextRank |
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,23 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from collections import Counter | ||
|
||
|
||
class Sentence:
    """
    A lightweight, hashable sentence record.

    The purpose of this class is as follows:
    1. In order to use the 'pagerank' function in the networkx library,
       you need a hashable object.
    2. Summaries should keep the sentence order from its original text
       to improve the verbosity.

    Note that 'bow' stands for 'bag-of-words'.
    """

    def __init__(self, index: int, text: str, bow: Counter) -> None:
        # index: position of the sentence within the source text; doubles
        # as the hash value so ordering can be restored after ranking.
        self.index: int = index
        self.text: str = text
        self.bow: Counter = bow

    def __repr__(self) -> str:
        # Added for debuggability; not relied upon by any program logic.
        return f"Sentence(index={self.index}, text={self.text!r})"

    def __str__(self) -> str:
        return self.text

    def __hash__(self) -> int:
        # NOTE: equality is deliberately left identity-based (object.__eq__),
        # so distinct instances stay distinct graph nodes even if two
        # sentences shared an index; identity-equal objects trivially share
        # a hash, so the eq/hash contract holds.
        return self.index
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
from typing import List | ||
from typing import Dict | ||
from typing import Callable | ||
|
||
from networkx import Graph | ||
from networkx import pagerank | ||
|
||
from .sentence import Sentence | ||
|
||
from .utils import parse_text_into_sentences | ||
from .utils import build_sentence_graph | ||
|
||
|
||
class TextRank:
    """
    A TextRank-based extractive summarizer.

    Args:
        tokenizer: a function or a functor of Callable[[str], List[str]] type.
        tolerance: a threshold for omitting edge weights.

    Example:
        tokenizer: YourTokenizer = YourTokenizer()
        textrank: TextRank = TextRank(tokenizer)
        summaries: str = textrank.summarize(your_text_here)
        print(summaries)
    """

    def __init__(self, tokenizer: Callable[[str], List[str]], tolerance: float = 0.05) -> None:
        self.tokenizer: Callable[[str], List[str]] = tokenizer
        self.tolerance: float = tolerance

    def summarize(self, text: str, num_sentences: int = 3, verbose: bool = True):
        """
        Summarizes the given text, using the textrank algorithm.

        Args:
            text: a raw text to be summarized.
            num_sentences: number of sentences in the summarization results.
            verbose: if True, return one newline-joined str; otherwise
                return a List[str] of sentence texts.

        Returns:
            str if verbose is True, else List[str]. Sentences are returned
            in their original document order, not by rank.
        """
        # Parse the raw text into hashable Sentence objects.
        sentences: List[Sentence] = parse_text_into_sentences(text, self.tokenizer)

        # Build the similarity graph, dropping edges below the tolerance.
        graph: Graph = build_sentence_graph(sentences, tolerance=self.tolerance)

        # Score every sentence with pagerank over the weighted graph.
        pageranks: Dict[Sentence, float] = pagerank(graph, weight='weight')

        # Take the top-k sentences by score, then restore document order
        # for readability (slicing safely handles num_sentences > len).
        ranked: List[Sentence] = sorted(pageranks, key=pageranks.get, reverse=True)
        top: List[Sentence] = sorted(ranked[:num_sentences], key=lambda sentence: sentence.index)

        # Return the summaries in the requested shape.
        summaries: List[str] = [sentence.text for sentence in top]
        return '\n'.join(summaries) if verbose else summaries
Oops, something went wrong.