<a href="https://colab.research.google.com/github/kullawattana/thesis_2020_spacy_colab/blob/master/18_hash_tag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import spacy
from spacy.matcher import Matcher
from spacy.language import Language
from spacy.tokenizer import Tokenizer, _get_regex_pattern
from spacy.tokens import Token
from spacy.lang.tokenizer_exceptions import URL_PATTERN

def firstSolutionReplace():
    """Keep hashtags whole by hiding '#' behind a placeholder while tokenizing.

    Uses the module-level ``nlp`` pipeline. Prints, in order: the tokens of
    the raw sentence, the sentence with each hashtag's '#' replaced by a
    placeholder, the tokens of that rewritten sentence, and finally those
    tokens with the placeholder turned back into '#'.
    """
    sentence = "This is my twitter update #MyTopic"

    # Baseline tokenization of the untouched sentence.
    print([tok.text for tok in nlp(sentence)])

    # Replace the '#' of every hashtag with a purely alphanumeric marker.
    new_sentence = re.sub(r'#(\w+)', r'ZZZPLACEHOLDERZZZ\1', sentence)
    print(new_sentence)

    # Tokenize the rewritten sentence once and reuse the token texts.
    token_texts = [tok.text for tok in nlp(new_sentence)]
    print(token_texts)
    # Recorded output: ['This', 'is', 'my', 'twitter', 'update', 'ZZZPLACEHOLDERZZZMyTopic']

    # Put the '#' back in place of the marker.
    print([text.replace('ZZZPLACEHOLDERZZZ', '#') for text in token_texts])
    # Recorded output: ['This', 'is', 'my', 'twitter', 'update', '#MyTopic']

def secondSolutionParse():
    """Re-join hashtags after parsing by merging on character offsets.

    Uses the module-level ``nlp`` pipeline. The sentence is parsed, the
    character span of every ``#word`` is located with a regular expression,
    and each span is merged into a single token. The (text, POS) pairs are
    printed before and after merging.
    """
    my_str = "Tweet hashtags #MyHashOne #MyHashTwo"
    parsed = nlp(my_str)

    print([(x.text, x.pos_) for x in parsed])
    # Recorded output:
    # [('Tweet', 'PROPN'), ('hashtags', 'NOUN'), ('#', 'NOUN'), ('MyHashOne', 'NOUN'), ('#', 'NOUN'), ('MyHashTwo', 'PROPN')]

    # Raw string: in a plain literal '\w' is an invalid escape sequence, which
    # Python reports with a warning and intends to reject in the future.
    indexes = [m.span() for m in re.finditer(r'#\w+', my_str, flags=re.IGNORECASE)]
    print(indexes)
    # [(15, 25), (26, 36)]

    # The spans are character offsets into the text, so merging one hashtag
    # does not invalidate the offsets of the next.
    # NOTE(review): Doc.merge is deprecated in later spaCy 2.x releases in
    # favour of Doc.retokenize - confirm the target spaCy version.
    for start, end in indexes:
        parsed.merge(start_idx=start, end_idx=end)

    print([(x.text, x.pos_) for x in parsed])
    # Recorded output:
    # [('Tweet', 'PROPN'), ('hashtags', 'NOUN'), ('#MyHashOne', 'NOUN'), ('#MyHashTwo', 'PROPN')]

def thirdSolutionMatcher():
    """Merge hashtags found by a token-pattern ``Matcher``.

    Uses the module-level ``nlp`` pipeline. A pattern of a literal '#' token
    followed by an ASCII token is registered under the key 'HASHTAG'; every
    match is turned into a span, each span is merged, and the resulting token
    texts are printed.
    """
    hashtag_matcher = Matcher(nlp.vocab)
    hashtag_matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])

    doc = nlp("This is a #sentence. Here is another #hashtag. #The #End.")

    # Collect every matched span first; merging happens in a second pass.
    hashtag_spans = [doc[begin:stop] for _, begin, stop in hashtag_matcher(doc)]

    for hashtag_span in hashtag_spans:
        hashtag_span.merge()

    print([tok.text for tok in doc])

def hashtag_pipe(doc):
    """Pipeline component that merges each '#' with the token right after it.

    The end of the merged span is taken from the token that directly follows
    the '#' in the text. The previous implementation used the length of the
    '#' token's dependency head, which is not necessarily the following
    token, so the computed span could be wrong.

    Args:
        doc: The document to process. It must be iterable and indexable by
            token position, support ``len()``, and provide
            ``merge(start_char, end_char)`` returning ``None`` on failure;
            tokens must expose ``text`` and ``idx`` (character offset).

    Returns:
        The same ``doc`` object, modified in place.
    """
    merged_hashtag = True
    while merged_hashtag:
        merged_hashtag = False
        for token_index, token in enumerate(doc):
            # Only a '#' that has a token after it can start a hashtag.
            if token.text != '#' or token_index + 1 >= len(doc):
                continue
            following = doc[token_index + 1]
            start_index = token.idx
            # The word must touch the '#': "# tag" is not a hashtag.
            if following.idx != start_index + 1:
                continue
            end_index = following.idx + len(following.text)
            if doc.merge(start_index, end_index) is not None:
                # Merging changes the token sequence being iterated, so
                # restart the scan from the beginning.
                merged_hashtag = True
                break
    return doc

def create_tokenizer(nlp):
    """Build a tokenizer that keeps URLs and hashtags as single tokens.

    Also registers the ``is_hashtag`` token extension (``token._.is_hashtag``),
    which is true when the token text starts with '#'.

    Args:
        nlp: The language pipeline whose vocabulary the tokenizer shares.

    Returns:
        A ``Tokenizer`` built from the ``Language.Defaults`` prefix, suffix
        and infix rules, with a ``token_match`` that accepts either the
        default URL pattern or a whole-string hashtag.
    """
    # spacy defaults: when the standard behaviour is required, they
    # need to be included when constructing the tokenizer
    prefix_re = spacy.util.compile_prefix_regex(Language.Defaults.prefixes)
    infix_re = spacy.util.compile_infix_regex(Language.Defaults.infixes)
    suffix_re = spacy.util.compile_suffix_regex(Language.Defaults.suffixes)

    # extending the default url regex with regex for hashtags with "or" = |
    hashtag_pattern = r'''|^(#[\w_-]+)$'''
    url_and_hashtag = URL_PATTERN + hashtag_pattern
    url_and_hashtag_re = re.compile(url_and_hashtag)

    # Register the custom extension only once: registering a name that is
    # already registered raises, which would break a second call (for example
    # when the notebook cell is re-run).
    if not Token.has_extension('is_hashtag'):
        Token.set_extension(
            'is_hashtag',
            getter=lambda token: token.text.startswith('#'),
        )

    return Tokenizer(nlp.vocab, prefix_search=prefix_re.search,
                     suffix_search=suffix_re.search,
                     infix_finditer=infix_re.finditer,
                     token_match=url_and_hashtag_re.match
                     )

def fiveSolution():
    """Extend the default ``token_match`` so hashtags and hyphenated words stay whole.

    Loads its own 'en' pipeline, combines the default token_match pattern
    with patterns for ``#word`` and ``word-word``, installs the result on the
    tokenizer, and prints the processed sample text.
    """
    nlp = spacy.load('en')

    # get default pattern for tokens that don't get split
    default_pattern = _get_regex_pattern(nlp.Defaults.token_match)

    # add your patterns (here: hashtags and in-word hyphens)
    custom_patterns = r"#\w+|\w+-\w+"

    # Build the alternation by concatenation. The earlier code used a plain
    # string containing "{re_token_match}" without the f-prefix, so that
    # literal text ended up in the regex and the default pattern was lost.
    # NOTE(review): the helper appears to return None when no default
    # token_match is configured - handled defensively here; confirm.
    if default_pattern is None:
        re_token_match = "(" + custom_patterns + ")"
    else:
        re_token_match = "(" + default_pattern + "|" + custom_patterns + ")"

    # overwrite token_match function of the tokenizer
    nlp.tokenizer.token_match = re.compile(re_token_match).match

    text = "@Pete: choose low-carb #food #eatsmart ;-) 😋👍"
    doc = nlp(text)
    print(doc)

#https://stackoverflow.com/questions/43388476/how-could-spacy-tokenize-hashtag-as-a-whole
if __name__ == "__main__":
    # --- Pipeline-component approach -------------------------------------
    # Load the pipeline and add hashtag_pipe as a component.
    nlp = spacy.load('en')
    nlp.add_pipe(hashtag_pipe)

    doc = nlp("twitter #hashtag")
    # Exactly two tokens are expected, with the hashtag kept whole.
    assert [tok.text for tok in doc] == ['twitter', '#hashtag']

    # --- Custom-tokenizer approach ---------------------------------------
    # Swap in the tokenizer that treats hashtags (and URLs) as single tokens.
    nlp.tokenizer = create_tokenizer(nlp)
    doc = nlp("#spreadhappiness #smilemore so_great@good.com https://www.somedomain.com/foo")

    for tok in doc:
        print(tok.text)
        if not tok._.is_hashtag:
            continue
        print("-> matches hashtag")
    # Recorded output: "#spreadhappiness -> matches hashtag #smilemore -> matches hashtag so_great@good.com https://www.somedomain.com/foo"

    # --- token_match override approach -----------------------------------
    fiveSolution()

#spreadhappiness
-> matches hashtag
#smilemore
-> matches hashtag
so_great@good.com
https://www.somedomain.com/foo
@Pete: choose low-carb #food #eatsmart ;-) 😋👍
