In [309]:
import json
import spacy
import pandas as pd
import os
import spacy
from transformers import pipeline
import tqdm

nlp = spacy.load("en_core_web_sm")
unmasker = pipeline('fill-mask', model='xlm-roberta-base')

def load_squad(path):
    """Return the list of paragraph context strings of a SQuAD-format JSON file.

    The extracted contexts are cached next to the source file as
    ``<path>.para.json``. When that cache file already exists it is loaded
    and returned as-is, and ``path`` itself is not opened.

    Args:
        path: Path to a SQuAD-style JSON file with a top-level ``data`` list
            of articles, each holding a ``paragraphs`` list of dicts with a
            ``context`` entry.

    Returns:
        A list of context strings, in article order then paragraph order.
    """
    cache_file = f"{path}.para.json"

    if not os.path.exists(cache_file):
        # Cache miss: parse the full dataset and keep only the contexts.
        with open(path, encoding='utf8') as source:
            squad = json.load(source)
            contexts = [
                entry['context']
                for article in squad['data']
                for entry in article['paragraphs']
            ]
        # Persist the extracted contexts so later calls skip the parse above.
        with open(cache_file, 'w', encoding='utf8') as cache:
            json.dump(contexts, cache)
        return contexts

    with open(cache_file, mode='r', encoding='utf8') as cache:
        return json.load(cache)



def mask_paragraph(paragraph, top_k=10, mask_token="<mask>"):
    """Mask each noun of each sentence in turn and collect fill-mask predictions.

    The paragraph is split into sentences with the module-level spaCy
    pipeline ``nlp``. Every sentence is then parsed again on its own so that
    ``token.idx`` is an offset into the sentence text. For each token whose
    coarse POS tag is ``NOUN`` the token is replaced by ``mask_token`` and
    the module-level ``unmasker`` pipeline is queried for candidates.

    Args:
        paragraph: Raw paragraph text.
        top_k: Number of candidates requested from ``unmasker`` per masked
            noun (default 10, the value previously hard-coded).
        mask_token: Placeholder substituted for the masked noun. It has to be
            the mask token the fill-mask model expects (default ``"<mask>"``).

    Returns:
        A list with one dict per sentence, ``{'sentence': str, 'mask': list}``.
        Each entry of ``'mask'`` has the keys ``mask_word``, ``lemma_word``,
        ``masked_sentence`` and ``output_mask``. ``output_mask`` is the list
        of prediction dicts returned by ``unmasker`` with ``'sequence'``
        removed, ``'score'`` rounded to 4 decimals and ``'lemma'`` added.
    """
    def post_process_mask(output_mask, mask_start):
        """Add a lemma to each prediction, round its score, drop 'sequence'.

        ``mask_start`` is the character offset of the masked word in the
        sentence. It is used to disambiguate when the predicted word also
        occurs elsewhere in the filled-in sentence.
        """
        all_masks = []
        for mask in output_mask:
            predicted = mask['token_str']
            matches = [tok for tok in nlp(mask['sequence']) if tok.text == predicted]
            if matches:
                # Take the occurrence closest to the masked position, so the
                # lemma reflects the predicted word in its actual context
                # rather than an unrelated repeat later in the sentence.
                nearest = min(matches, key=lambda tok: abs(tok.idx - mask_start))
                mask['lemma'] = nearest.lemma_
            else:
                # The prediction did not survive tokenization as a single
                # token; fall back to the raw predicted string.
                mask['lemma'] = predicted
            mask['score'] = round(mask['score'], 4)
            mask.pop('sequence')
            all_masks.append(mask)
        return all_masks

    mask_sentences = []
    doc = nlp(paragraph)
    for sen in doc.sents:
        sen_text = sen.text
        sen_item = {'sentence': sen_text, 'mask': []}
        # Parse the sentence alone so token.idx is relative to sen_text.
        for token in nlp(sen_text):
            if token.pos_ != 'NOUN':
                continue
            start = token.idx
            end = start + len(token)
            masked_sentence = f"{sen_text[:start]}{mask_token}{sen_text[end:]}"
            output_mask = unmasker(masked_sentence, top_k=top_k)
            output_mask = post_process_mask(output_mask, start)
            mask_item = dict(
                mask_word=token.text,
                lemma_word=token.lemma_,
                masked_sentence=masked_sentence,
                output_mask=output_mask,
            )
            sen_item['mask'].append(mask_item)
        mask_sentences.append(sen_item)
    return mask_sentences

In [310]:
# Output directory for the per-chunk JSON files, and the SQuAD paragraphs to process.
save_path = "../data/SQuAD/mask"
paragraphs = load_squad("../data/SQuAD/train-v2.0.json")

# Number of paragraphs covered by each output file.
CHUNK_SIZE = 100

# Create the output directory up front so the first checkpoint write cannot
# fail after a full chunk of paragraphs has already been computed.
os.makedirs(save_path, exist_ok=True)

mask_sentences = []
cnt = 0
# tqdm wraps `paragraphs` itself (not the enumerate object) so it can read
# len() and display a total and an ETA. `cnt` is the 1-based paragraph count.
for cnt, para in enumerate(tqdm.tqdm(paragraphs), start=1):
    try:
        mask_sentences.append(mask_paragraph(para))
    except Exception as e:
        # Best-effort run: report the failing paragraph and keep going, so a
        # single bad input does not abort a multi-hour job.
        print(cnt, e)
        print(para)
    if cnt % CHUNK_SIZE == 0:
        # Checkpoint the finished chunk; the file name carries the index of
        # the last paragraph it covers.
        with open(f"{save_path}/mask_{cnt}.json", "w", encoding="utf8") as f:
            json.dump(mask_sentences, f)
        mask_sentences = []
# Flush the final, partial chunk.
if mask_sentences:
    with open(f"{save_path}/mask_{cnt}.json", "w", encoding="utf8") as f:
        json.dump(mask_sentences, f)

6068it [8:27:33,  5.02s/it]


KeyboardInterrupt: 

In [307]:
mask_sentences

[[{'sentence': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress.',
   'mask': [{'mask_word': 'singer',
     'lemma_word': 'singer',
     'masked_sentence': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American <mask>, songwriter, record producer and actress.',
     'output_mask': [{'score': 0.245,
       'token': 39329,
       'token_str': 'actor',
       'lemma': 'actor'},
      {'score': 0.185, 'token': 5367, 'token_str': 'sing', 'lemma': 'sing'},
      {'score': 0.1265, 'token': 3299, 'token_str': 'model', 'lemma': 'model'},
      {'score': 0.1152,
       'token': 215542,
       'token_str': 'actress',
       'lemma': 'actress'},
      {'score': 0.068, 'token': 19612, 'token_str': 'music', 'lemma': 'music'},
      {'score': 0.0319, 'token': 1346, 'token_str': 'film', 'lemma': 'film'},
      {'score': 0.0319,
       'token': 108558,
       'to

In [None]:
# Scratch probe: call the fill-mask pipeline on an input containing two <mask> tokens, asking for 5 candidates.
unmasker("how are <mask>, I am really <mask> exciting !",top_k=5)

In [301]:
# Back-of-envelope figure: (number of paragraphs / 10) * 68, divided by 3600.
# NOTE(review): presumably a total-runtime estimate in hours assuming ~68 s per 10 paragraphs — confirm where 68 comes from.
((len(paragraphs)/10)*68)/3600

35.955

In [251]:
sen_nlp,token

(Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".,
 .)

In [188]:
s0 =sentences[0]


<spacy.vocab.Vocab at 0x7f77571f2310>

In [181]:
token, masked_sentence

(group,
 "Managed by her father, Mathew Knowles, the  <mask>  became one of the world's best-selling girl  <mask> s of all time.")

In [152]:
mask[1]['mask'][1]

{'mask_word': 'dancing',
 'masked_sentence': "Born and raised in Houston, Texas, she performed in various singing and  <mask>  competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child.",
 'output_mask': [{'score': 0.5852501392364502,
   'token': 82393,
   'token_str': 'dance',
   'sequence': "Born and raised in Houston, Texas, she performed in various singing and dance competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child."},
  {'score': 0.035630419850349426,
   'token': 19612,
   'token_str': 'music',
   'sequence': "Born and raised in Houston, Texas, she performed in various singing and music competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child."},
  {'score': 0.03368469700217247,
   'token': 23718,
   'token_str': 'performance',
   'sequence': "Born and raised in Houston, Texas, she performed in various singing and perform

In [153]:
import spacy
from spacy.lang.en import English
from nltk.stem.porter import PorterStemmer

# NOTE: this rebinds the name `nlp`, which the first cell set to spacy.load("en_core_web_sm").
# Any later call that relies on `nlp` sees this English() object until `nlp` is reassigned.
nlp = English()
# Porter stemmer instance read as a global by stem_tokens.
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Return the stem of each token's text, in input order.

    Each element of ``tokens`` must expose a ``.text`` attribute; stemming is
    done by the module-level ``stemmer``.
    """
    stems = []
    for tok in tokens:
        stems.append(stemmer.stem(tok.text))
    return stems

# Tokenize a sample sentence and print the stem of every token.
doc = nlp("I am running in the park with my dogs")
tokens = doc

stemmed_tokens = stem_tokens(tokens)
print(stemmed_tokens)


['i', 'am', 'run', 'in', 'the', 'park', 'with', 'my', 'dog']


In [156]:
stemmer.stem('run')

'run'

In [137]:
 unmasker(mask[1],top_k=10)

PipelineException: No mask_token (<mask>) found on the input

In [157]:
import spacy

# Rebind `nlp` to the en_core_web_sm pipeline and parse four forms related to "run" so their lemmas can be inspected.
nlp = spacy.load('en_core_web_sm')
doc = nlp("running runs runner ran")


In [158]:
# Print each token of `doc` next to its lemma.
for token in doc:
    print(token.text, token.lemma_)


running run
runs run
runner runner
ran run


In [105]:
text = 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

# Parse the sample paragraph and print every token whose coarse POS tag is 'NOUN'
# (the same filter mask_paragraph uses to choose which words to mask).
doc = nlp(text)
for token in doc:
    if token.pos_=='NOUN':
        print(token.text, token.pos_)

singer NOUN
songwriter NOUN
record NOUN
producer NOUN
actress NOUN
singing NOUN
dancing NOUN
competitions NOUN
child NOUN
fame NOUN
1990s NOUN
lead NOUN
singer NOUN
girl NOUN
group NOUN
Child NOUN
father NOUN
group NOUN
world NOUN
girl NOUN
groups NOUN
time NOUN
hiatus NOUN
release NOUN
album NOUN
solo NOUN
artist NOUN
number NOUN


In [108]:
list(doc)[0].pos_

'PROPN'

In [102]:
paragraphs[0]

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'