In [7]:
import numpy as np
from collections import Counter
from gensim.models import KeyedVectors
from common.basic_encoder import BasicEncoder
from common.space import Space
from text.summarizer import NewsSummarizer

In [10]:
from config import w2v_file, vocab_limit
from config import stop_words

In [5]:
# Load pretrained word2vec vectors stored in binary format at `w2v_file`.
# `limit=vocab_limit` is forwarded to gensim to cap how many vectors are read
# (the log below shows a 1,000,000 x 300 matrix for the configured value).
keyed_vectors = KeyedVectors.load_word2vec_format(w2v_file, limit=vocab_limit, binary=True)

2019-12-20 13:38:42,056 gensim.models.utils_any2vec: INFO loading projection weights from ~/Downloads/GoogleNews-vectors-negative300.bin.gz
2019-12-20 13:39:24,785 gensim.models.utils_any2vec: INFO loaded (1000000, 300) matrix from ~/Downloads/GoogleNews-vectors-negative300.bin.gz


In [9]:
# Wrap the loaded vectors in the project's Space helper; it is used further
# down to build word subspaces and to compare them.
space = Space(keyed_vectors)

In [114]:
class TextEncoder(BasicEncoder):
    """Encodes a text as a word subspace built from its most frequent keywords.

    Keywords are obtained from the supplied summarizer and then handed to the
    supplied space, which builds the subspace.
    """

    def __init__(self, summarizer: NewsSummarizer, space: Space):
        """Store the collaborators used for encoding.

        Args:
            summarizer: object whose ``get_keywords(text=..., top=...)`` is
                called to obtain keyword counts.
            space: object whose ``create_subspace(keywords, dims=...)`` is
                called to build the subspace.
        """
        self.summarizer = summarizer
        self.space = space

    def create_subspace(self, full_text: str, subspace_dim: int = 5):
        """Return a word subspace for ``full_text``.

        Args:
            full_text: the text to encode.
            subspace_dim: forwarded to the space as ``dims``.

        Returns:
            Whatever ``self.space.create_subspace`` returns, or ``None`` when
            no keywords could be extracted from the text.
        """
        keywords = self._get_keywords(full_text)
        if not keywords:
            return None
        return self.space.create_subspace(keywords, dims=subspace_dim)

    def _get_keywords(self, full_text: str, limit: int = 7, max_sents: int = 50) -> Counter:
        """Collect the ``limit`` most common keywords of ``full_text``.

        The summarizer is queried with ``top=1``, then ``top=2`` and so on,
        stopping at the first non-empty result.

        Args:
            full_text: the text to extract keywords from.
            limit: maximum number of keywords to keep.
            max_sents: upper bound for ``top``. Without it, a text that never
                yields keywords (empty, blank, stop words only) would make
                the search loop forever.

        Returns:
            A ``Counter`` with at most ``limit`` entries; empty when the text
            is blank or nothing was found within ``max_sents`` attempts.
        """
        # A blank text cannot contain keywords; skip the summarizer entirely.
        if not full_text or not full_text.strip():
            return Counter()

        keywords = Counter()
        for top_sents in range(1, max_sents + 1):
            keywords = self.summarizer.get_keywords(text=full_text, top=top_sents)
            if keywords:
                break

        if not keywords:
            return Counter()
        return Counter(dict(keywords.most_common(limit)))

    def __repr__(self) -> str:
        return 'Text Encoder'

In [70]:
# Build the keyword summarizer with the configured stop words, then combine it
# with the word-vector space into the encoder defined above.
news_summarizer = NewsSummarizer(stop_words=stop_words)
text_encoder = TextEncoder(summarizer=news_summarizer, space=space)

In [130]:
# Sample news snippets used to exercise the encoder.
# full_text1: the full four-sentence report.
full_text1 = """
Shocking CCTV footage released by Manchester police shows the moment the man wielding 
a large-bladed knife is tackled to the ground by armed officers. At about 11 pm on Tuesday, 
CCTV operators spotted a man waving the butcher’s knife around the Piccadilly Garden’s 
area of Manchester and informed the police. The man can be seen struggling to stand and 
interacts with terrified members of the public, as he continues to wave the knife around.
A 55-year-old man has been arrested on suspicion of affray and remains in police custody 
for questioning."""
# full_text2: the same report without its second sentence (the CCTV operators one).
full_text2 = """
Shocking CCTV footage released by Manchester police shows the moment the man wielding 
a large-bladed knife is tackled to the ground by armed officers.
The man can be seen struggling to stand and interacts with terrified members of the public, 
as he continues to wave the knife around. A 55-year-old man has been arrested on suspicion 
of affray and remains in police custody for questioning.
"""
# full_text3: a different opening sentence followed by the same closing sentence
# about the arrest. NOTE: the misspellings are part of the input text and are
# deliberately left untouched, since changing them would change the keywords.
full_text3 = """
man with a knife attemted to kill the boys of the governore near the school. A 55-year-old 
man has been arrested on suspicion of affray and remains in police custody for questioning.
"""

In [131]:
# Encode each sample text as a word subspace, in the order text1, text2, text3.
ws1, ws2, ws3 = (
    text_encoder.create_subspace(full_text=sample_text)
    for sample_text in (full_text1, full_text2, full_text3)
)

Counter({'man': 4, 'police': 3, 'knife': 3, 'wave': 2, 'cctv': 2, 'manchester': 2, 'inform': 1})
Counter({'man': 3, 'police': 2, 'knife': 2, 'release': 1, 'bladed': 1, 'moment': 1, 'cctv': 1})
Counter({'man': 2, 'custody': 1, 'remain': 1, 'affray': 1, 'question': 1, 'police': 1, 'year': 1})


In [134]:
# Similarity between the subspaces of full_text3 and full_text2; as the last
# expression of the cell, its value is what the notebook displays below.
space.subspaces_similarity(ws3, ws2)

0.107053615