Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CJK_models #57

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
442 changes: 348 additions & 94 deletions Makefile

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion drafttopic/about.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
__name__ = "drafttopic"
__version__ = "0.3.0"
__version__ = "1.4.0"
__author__ = "Aaron Halfaker, Sumit Asthana"
__author_email__ = "ahalfaker@wikimedia.org, asthana.sumit23@gmail.com"
__description__ = "A library for automatic detection of topics of new " +\
Expand Down
26 changes: 26 additions & 0 deletions drafttopic/feature_lists/jawiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""
Feature list for Japanese Wikipedia topic models.

Builds a single mean-word2vec feature over the CJK tokens of a revision's
text, backed by pre-trained jawiki keyed vectors.
"""
# NOTE: `mappers` was imported in the original but never used (copy-paste
# residue from kowiki.py, which previously applied mappers.lower_case);
# the unused import has been removed.
from revscoring.datasources.meta import vectorizers
from revscoring.features import wikitext
from revscoring.features.meta import aggregators


# Pre-trained gensim keyed vectors for jawiki (snapshot and vocabulary size
# are encoded in the filename).  mmap='r' maps the file read-only so that
# parallel worker processes share one in-memory copy.
jawiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="jawiki-20201201-learned_vectors.50_cell.10k.kv", mmap='r')


def vectorize_words(words):
    """Map each word to its embedding vector using the jawiki keyed vectors."""
    return vectorizers.word2vec.vectorize_words(jawiki_kvs, words)


# Per-token embedding vectors drawn from the revision's CJK token stream
# (no lower-casing is applied -- case is not meaningful for CJK text).
revision_text_vectors = vectorizers.word2vec(
    wikitext.revision.datasources.cjk.cjks,
    vectorize_words,
    name="revision.text.ja_vectors")

# Element-wise mean of the token vectors: one fixed-length vector feature
# per revision.
w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.ja_vectors_mean"
)

# Both model types use the same single-feature list.
drafttopic = [w2v]
articletopic = drafttopic
4 changes: 2 additions & 2 deletions drafttopic/feature_lists/kowiki.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@


kowiki_kvs = vectorizers.word2vec.load_gensim_kv(
filename="kowiki-20200501-learned_vectors.50_cell.10k.kv", mmap='r')
filename="kowiki-20201201-learned_vectors.50_cell.10k.kv", mmap='r')


def vectorize_words(words):
    """Map each word to its embedding vector using the kowiki keyed vectors."""
    return vectorizers.word2vec.vectorize_words(kowiki_kvs, words)


revision_text_vectors = vectorizers.word2vec(
mappers.lower_case(wikitext.revision.datasources.words),
wikitext.revision.datasources.cjk.cjks,
vectorize_words,
name="revision.text.ko_vectors")

Expand Down
26 changes: 26 additions & 0 deletions drafttopic/feature_lists/zhwiki.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""
Feature list for Chinese Wikipedia topic models.

Builds a single mean-word2vec feature over the CJK tokens of a revision's
text, backed by pre-trained zhwiki keyed vectors.
"""
# NOTE: `mappers` was imported in the original but never used (copy-paste
# residue from kowiki.py, which previously applied mappers.lower_case);
# the unused import has been removed.
from revscoring.datasources.meta import vectorizers
from revscoring.features import wikitext
from revscoring.features.meta import aggregators


# Pre-trained gensim keyed vectors for zhwiki (snapshot and vocabulary size
# are encoded in the filename).  mmap='r' maps the file read-only so that
# parallel worker processes share one in-memory copy.
zhwiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="zhwiki-20201201-learned_vectors.50_cell.10k.kv", mmap='r')


def vectorize_words(words):
    """Map each word to its embedding vector using the zhwiki keyed vectors."""
    return vectorizers.word2vec.vectorize_words(zhwiki_kvs, words)


# Per-token embedding vectors drawn from the revision's CJK token stream
# (no lower-casing is applied -- case is not meaningful for CJK text).
revision_text_vectors = vectorizers.word2vec(
    wikitext.revision.datasources.cjk.cjks,
    vectorize_words,
    name="revision.text.zh_vectors")

# Element-wise mean of the token vectors: one fixed-length vector feature
# per revision.
w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.zh_vectors_mean"
)

# Both model types use the same single-feature list.
drafttopic = [w2v]
articletopic = drafttopic
25 changes: 17 additions & 8 deletions drafttopic/utilities/extract_from_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
[--input=<path>]
[--output=<path>]
[--extractors=<num>]
[--tok_strategy=<str>]
[--verbose]
[--debug]

Expand All @@ -25,6 +26,7 @@
[default: <stdout>]
--extractors=<num> The number of parallel extractors to
start [default: <cpu count>]
--tok_strategy=<str> tokenization strategy
--verbose Print dots and stuff to stderr
--debug Print debug logs
"""
Expand All @@ -39,6 +41,9 @@
from revscoring.dependencies import solve
from revscoring.utilities.util import dump_observation, read_observations

from mwtext.content_transformers import Wikitext2Words
forbidden_link_prefixes = [
'category', 'image', 'file']

def main(argv=None):
args = docopt.docopt(__doc__, argv=argv)
Expand Down Expand Up @@ -72,14 +77,17 @@ def main(argv=None):
extractors = int(args['--extractors'])

verbose = args['--verbose']
tok_strategy = str(args['--tok_strategy']) if args['--tok_strategy'] is not None else None
wtpp = Wikitext2Words(forbidden_link_prefixes, tok_strategy=tok_strategy)
sys.stderr.write("tokenization strategy is: " + tok_strategy)
sys.stderr.write("\nnumber of processes: " + str(extractors) + "\n")
run(observations, dependents, output, extractors, wtpp, verbose)

run(observations, dependents, output, extractors, verbose)


def run(labelings, dependents, output, extractors, verbose=False):
def run(labelings, dependents, output, extractors, wtpp, verbose=False):
extractor_pool = Pool(processes=extractors)

extractor = LabelingDependentExtractor(dependents)
extractor = LabelingDependentExtractor(dependents, wtpp)

for observation in extractor_pool.imap(
extractor.extract_and_cache, labelings):
Expand All @@ -100,15 +108,16 @@ def run(labelings, dependents, output, extractors, verbose=False):

class LabelingDependentExtractor:

def __init__(self, dependents):
def __init__(self, dependents, wtpp):
    """
    :param dependents: revscoring dependents (features/datasources) whose
        values will be extracted from each observation's text.
    :param wtpp: a ``Wikitext2Words`` transformer used to preprocess the
        wikitext before feature extraction.
    """
    self.dependents = dependents
    self.wtpp = wtpp

def extract_and_cache(self, observation):
if observation['text'] is None:
return None

values = extract_from_text(
self.dependents, observation['text'],
self.dependents, self.wtpp, observation['text'],
cache=observation.get('cache'))
dependent_cache = {str(d): val
for d, val in zip(self.dependents, values)}
Expand All @@ -121,7 +130,7 @@ def extract_and_cache(self, observation):
return observation


def extract_from_text(dependents, text, cache=None, context=None):
def extract_from_text(dependents, wtpp, text, cache=None, context=None):
"""
Extracts a set of values from a text and returns a cache containing just
those values.
Expand All @@ -135,6 +144,6 @@ def extract_from_text(dependents, text, cache=None, context=None):
A list of extracted feature values
"""
cache = cache if cache is not None else {}
cache[revision_oriented.revision.text] = text
cache[revision_oriented.revision.text] = ' '.join(wtpp.transform(text))

return list(solve(dependents, cache=cache, context=context))
1 change: 0 additions & 1 deletion drafttopic/utilities/fetch_draft_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ def _fetch_text(obs):
rev_doc = page_doc['revisions'][0]
text = rev_doc['slots']['main']['content']
if is_article(text):

obs['text'] = text
obs['title'] = page_doc['title']
obs['rev_id'] = rev_doc['revid']
Expand Down
Loading