In [1]:
import nltk
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
import os
import requests
import json
from bs4 import BeautifulSoup
from collections import defaultdict
# Fetch every NLTK resource this notebook relies on, not only the POS tagger:
# word_tokenize needs the 'punkt' models and WordNetLemmatizer needs 'wordnet'.
# nltk.download() leaves packages that are already up to date untouched.
for nltk_resource in ('averaged_perceptron_tagger', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/YuRong/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# Function definition

In [9]:
NGRAM_API_URI = "https://{0}.linggle.com/query/"
class Linggle:
    """Small client for the Linggle n-gram query endpoint.

    ``client[query]`` is shorthand for ``client.search(query)``.
    """

    def __init__(self, ver='www', timeout=10):
        """Store the connection settings.

        ver: sub-domain substituted into NGRAM_API_URI.
        timeout: seconds handed to requests.get, so a stalled server raises
            an exception instead of blocking the kernel indefinitely.
        """
        self.ver = ver
        self.timeout = timeout

    def __getitem__(self, query):
        """Allow ``client[query]``; delegates to search()."""
        return self.search(query)

    def search(self, query):
        """Send `query` to the service and return the 'ngrams' entry of the JSON reply.

        Returns an empty list when the reply has no 'ngrams' key.
        Exceptions raised by requests (including timeouts) or by JSON
        decoding propagate to the caller.
        """
        # '/' is replaced by '@' before the query is appended to the URL path
        # (presumably so it is not read as a path separator -- confirm with the API).
        query = query.replace('/', '@')
        req = requests.get(NGRAM_API_URI.format(self.ver) + query, timeout=self.timeout)
        results = req.json()
        return results.get('ngrams', [])
    
    
def extract(soup):
    """Collect the text of highly rated, linked terms from a parsed page.

    For every '.pt-list-terms' element, at most three terms are taken.
    A term qualifies when its '.pt-list-terms__item' contains a
    '.pt-list-rating__indicator--high' element and its
    '.pt-thesaurus-card__term-title' contains a '.link--term' element.

    Returns the list of collected term texts, in document order.
    """
    picked = []
    for term_list in soup.select('.pt-list-terms'):
        taken = 0  # the limit of three applies to each term list separately
        for entry in term_list.select('.pt-list-terms__item'):
            if not entry.select('.pt-list-rating__indicator--high'):
                continue
            for heading in entry.select('.pt-thesaurus-card__term-title'):
                link = heading.select_one('.link--term')
                if not link:
                    continue
                if taken == 3:
                    break
                taken += 1
                picked.append(link.text)
    return picked

def crawl(url, timeout=10):
    """Download `url`, parse it as HTML and return what extract() finds in it.

    url: address of the page to fetch.
    timeout: seconds handed to requests.get, so an unresponsive server
        raises an exception instead of blocking the kernel indefinitely.

    Returns the list produced by extract() for the parsed page.
    """
    # A browser-like User-Agent header is sent with the request.
    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' }
    source_code = requests.get(url, headers=headers, timeout=timeout).content
    soup = BeautifulSoup(source_code, 'html.parser')
    return extract(soup)
    
def _neighbour_count(client, term):
    """Sum the counts of the first ten '<term> _' rows and the first ten '_ <term>' rows."""
    following = [row[1] for row in client[term + " _"]][:10]
    preceding = [row[1] for row in client["_ " + term]][:10]
    return sum(following) + sum(preceding)


def synonymsPT(word):
    """Return the most frequent Power Thesaurus synonym of `word`, or `word` itself.

    The synonym candidates come from crawl(). A candidate is eligible only
    when its Linggle count is more than twice the count of `word` and above
    800000; among eligible candidates the one with the highest count wins.
    `word` is returned unchanged when no candidate is eligible.

    Relies on the module-level `ling` client, which must exist before the call.
    """
    r = 'https://www.powerthesaurus.org/'+ word +'/synonyms'
    synonymsList = crawl(r)
    if not synonymsList:
        # Nothing to compare against, so skip the Linggle look-ups entirely.
        return word

    # The count of the original word does not depend on the candidate,
    # so it is queried once instead of once per synonym.
    count = _neighbour_count(ling, word)
    max_count = 0
    max_word = word
    for synonym in synonymsList:
        s_count = _neighbour_count(ling, synonym)
        # Comparing against max_count keeps the best candidate rather than
        # the last one that happens to pass the thresholds.
        if s_count > (2*count) and s_count > 800000 and s_count > max_count:
            max_count = s_count
            max_word = synonym
    return max_word

def crawlCNN(url, timeout=10):
    """Download `url` and return the text of its '.zn-body__paragraph' elements.

    url: address of the article page to fetch.
    timeout: seconds handed to requests.get, so an unresponsive server
        raises an exception instead of blocking the kernel indefinitely.

    Returns one string in which the text of every matching element is
    followed by a newline; the string is empty when nothing matches.
    """
    # A browser-like User-Agent header is sent with the request.
    headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0' }
    source_code = requests.get(url, headers=headers, timeout=timeout).content
    soup = BeautifulSoup(source_code, 'html.parser')

    return ''.join(big_block.text + '\n' for big_block in soup.select('.zn-body__paragraph'))

# Read glove embedding file

In [3]:
# The embedding file is looked up in the current working directory.
glove_path = os.getcwd() + "/glove_300d_word2vec.txt"
model = KeyedVectors.load_word2vec_format(glove_path)

# Read the Oxford 5000 English vocabulary file

In [4]:
# Read the Oxford 5000 vocabulary from file.
# Every line is lower-cased and cut at its first tab; only the part before
# the tab is kept. Each entry stays a one-element list because the
# difficult-word cell reads entries as i[0].
words_5000 = []
with open('5000_voc_Oxford.txt','r') as f:  # closed even if a line fails to parse
    for line in f:
        fields = line.lower().strip('\n').split('\t',1)
        words_5000.append([fields[0]])

## Read the input paragraph from a file (optional; alternatively, fetch the paragraph from a CNN URL in the next section)

In [5]:
# Read the whole input file into `origin`; the with-block closes the handle.
with open('input.txt','r') as f:
    origin = f.read()

## Read input CNN paragraph URL

In [27]:
# Article to simplify; crawlCNN() downloads the page and returns its paragraph text.
# Note: this assignment replaces any `origin` previously read from input.txt.
CNN_URL = 'https://edition.cnn.com/2019/06/14/us/blue-lobster-arnolds-bar-massachussetts-trnd/index.html'
origin = crawlCNN(CNN_URL)

# Preprocess the paragraph

In [28]:
print('Origin paragraph:')
print(origin,'\n')
# paragraph is the original text split on single spaces
paragraph = origin.split(' ')

#token_pos => [('Tokyo', 'NNP'), ('(', '('), ('CNN', 'NNP'), (')', ')'), ('Japan', 'NNP'), ("'s", 'POS'), ....]
token_pos = nltk.pos_tag(nltk.word_tokenize(origin))

#word_token => dictionary with key = index into paragraph, value = word_tokenize(that chunk)
#word_token => {0: ['Tokyo'], 1:['(', 'CNN', ')', 'Japan', "'s"], 2:['85-year-old'], ....}
word_token = {}
for index, token in enumerate(paragraph):
    word_token[index] = nltk.word_tokenize(token)

#list of (index into paragraph, POS tag, original token, simplified token)
#The simplified token is the lower-cased lemma for the verb/noun tags below, otherwise the original token.
#allword => [(0, 'NNP', 'Tokyo', 'Tokyo'), (1, '(', '(', '('), (1, 'NNP', 'CNN', 'CNN'), .....]
allword = []
verb_pos = ['VBD','VBG','VBN','VBP','VBZ']
noun_pos = ['NNS']
# Built once and reused for every token instead of being re-created per token.
lemmatizer = WordNetLemmatizer()
# posIndex walks token_pos in parallel with the per-chunk tokens; this assumes
# both tokenizations produce the same token sequence.
posIndex = 0
for key, value in word_token.items():
    for v in value:
        if posIndex >= len(token_pos):
            # No POS tags left for this chunk.
            break
        current_pos = token_pos[posIndex][1]
        #simplify the word tense of V. and N.
        if current_pos in verb_pos:
            simplified = lemmatizer.lemmatize(v.lower(),'v')
        elif current_pos in noun_pos:
            simplified = lemmatizer.lemmatize(v.lower(),'n')
        else:
            simplified = v
        allword.append((key, current_pos, v, simplified))
        posIndex += 1

Origin paragraph:
 (CNN)Nathan Nickerson III almost couldn't believe his eyes when he saw an unexpected pop of color in his seafood shipment earlier this week. 
It was one of many lobsters ordered by his restaurant, Arnold's Lobster and Clam Bar, in Eastham, Massachusetts -- except this one was blue.
"I said, 'I think we have something special here.' I couldn't believe the color," Nickerson told CNN. "Everyone was circling around it, just wondering, 'How did this happen?'"
The restaurant owner said he plans to keep it on display at the restaurant for about another week until he donates it to an aquarium. By allowing visitors to see the lobster, he hopes to inspire the younger generation's appreciation for marine life.
"I want the children to see ... (and) be interested in marine life and this is one way to get them excited about it," Nickerson said. "Maybe one can become the next marine biologist."
He has a specific aquarium in mind
Nickerson hopes to donate it to the aquarium opening 

# Find difficult words and a simpler word to replace each one
## All difficult words and their simpler replacements are stored in the variable `diffWord`

In [37]:
ling = Linggle()

# Thresholds used below (same values as before, now named).
MAX_DIFFICULT_COUNT = 1500000   # words at or above this Linggle count are not treated as difficult
MIN_SIMILARITY = 0.6            # minimum embedding similarity for a candidate replacement
MIN_EMBEDDING_COUNT = 1200000   # a replacement from the embedding must exceed this Linggle count

#the POS tags for which a word is checked for difficulty.
#('VBZ' used to be written with a leading space, so it could never match.)
care_pos = ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
diffWord = []
dont_care_word = ['``', ',', '.', "'d", "'s", "''"]
# Built once: membership tests against a set replace rebuilding a list for every token.
known_words = {entry[0] for entry in words_5000}


def _context_count(term):
    """Sum the counts of the first ten '<term> _' rows and the first ten '_ <term>' rows from Linggle."""
    following = [row[1] for row in ling[term + " _"]][:10]
    preceding = [row[1] for row in ling["_ " + term]][:10]
    return sum(following) + sum(preceding)


for word in allword:
    if word[2] in dont_care_word or word[3] in dont_care_word:
        continue
    if word[1] not in care_pos:
        continue
    # Words whose simplified form is in the vocabulary list are not difficult.
    if word[3] in known_words or word[3].lower() in known_words:
        continue
    try:
        count = _context_count(word[2])
    except Exception:
        # Best effort: skip words whose Linggle look-up fails.
        continue
    # If the word's Linggle count reaches 1.5 million it is filtered out and not treated as difficult.
    if count >= MAX_DIFFICULT_COUNT:
        continue
    #==================== replace with a synonym found via word embedding ==========================
    try:
        # similar_word => of the ten nearest words in the embedding, those scoring above 0.6
        similar_word = [row[0] for row in model.most_similar(word[2], topn=10) if row[1] > MIN_SIMILARITY]
        max_count = 0
        max_word = word[2]
        for s_word in similar_word:
            s_count = _context_count(s_word)
            # Pick the similar word with the highest Linggle count; it must be more than
            # twice the original word's count and above 1.2 million.
            if s_count > (2*count) and s_count > max_count and s_count > MIN_EMBEDDING_COUNT:
                max_count = s_count
                max_word = s_word
        if word[2] != max_word:
            diffWord.append(word + (max_word, 'word embedding'))
        #=============== if the embedding gave no replacement, look up synonyms on the Power Thesaurus website =================
        else:
            replace_word = synonymsPT(word[2])
            if replace_word != word[2] and replace_word != word[3]:
                diffWord.append(word + (replace_word, 'PT website'))
    except Exception:
        # Best effort: any failure here (embedding look-up, Linggle or
        # Power Thesaurus request) skips the word. KeyboardInterrupt is no
        # longer swallowed.
        continue

  if np.issubdtype(vec.dtype, np.int):


In [38]:
# Show the collected replacements: (paragraph index, POS tag, token, simplified token, replacement, source).
diffWord

[(19, 'NN', 'seafood', 'seafood', 'fish', 'word embedding'),
 (149, 'NN', 'biologist', 'biologist', 'scientist', 'word embedding'),
 (329, 'NN', 'overproduce', 'overproduce', 'produce', 'PT website'),
 (337, 'NNS', 'Lobsters', 'lobster', 'Lobsters', 'no change')]

In [13]:
# Inspect the (paragraph index, POS tag, token, simplified token) tuples built during preprocessing.
# NOTE(review): the saved output below shows a different article than CNN_URL above -- presumably from an earlier run; re-run to refresh.
allword

[(0, 'NNP', 'Tokyo', 'Tokyo'),
 (1, '(', '(', '('),
 (1, 'NNP', 'CNN', 'CNN'),
 (1, ')', ')', ')'),
 (1, 'NNP', 'Japan', 'Japan'),
 (1, 'POS', "'s", "'s"),
 (2, 'JJ', '85-year-old', '85-year-old'),
 (3, 'NN', 'monarch', 'monarch'),
 (4, 'VBD', 'was', 'be'),
 (5, 'VBN', 'born', 'bear'),
 (6, 'DT', 'the', 'the'),
 (7, 'NN', 'son', 'son'),
 (8, 'IN', 'of', 'of'),
 (9, 'DT', 'a', 'a'),
 (10, 'NN', 'deity', 'deity'),
 (10, ',', ',', ','),
 (11, 'CC', 'but', 'but'),
 (12, 'PRP', 'he', 'he'),
 (12, 'VBZ', "'s", "'s"),
 (13, 'VBG', 'retiring', 'retire'),
 (14, 'IN', 'as', 'as'),
 (15, 'DT', 'the', 'the'),
 (16, 'NNS', 'people', 'people'),
 (16, 'POS', "'s", "'s"),
 (17, 'NNP', 'Emperor', 'Emperor'),
 (17, '.', '.', '.'),
 (17, 'IN', 'On', 'On'),
 (18, 'NNP', 'Tuesday', 'Tuesday'),
 (18, ',', ',', ','),
 (19, 'NNP', 'Emperor', 'Emperor'),
 (20, 'NNP', 'Akihito', 'Akihito'),
 (21, 'MD', 'will', 'will'),
 (22, 'VB', 'abdicate', 'abdicate'),
 (23, 'DT', 'the', 'the'),
 (24, 'NNP', 'Chrysanthemum',

In [14]:
# Inspect the (token, POS tag) pairs produced by nltk.pos_tag for the whole text.
token_pos

[('Tokyo', 'NNP'),
 ('(', '('),
 ('CNN', 'NNP'),
 (')', ')'),
 ('Japan', 'NNP'),
 ("'s", 'POS'),
 ('85-year-old', 'JJ'),
 ('monarch', 'NN'),
 ('was', 'VBD'),
 ('born', 'VBN'),
 ('the', 'DT'),
 ('son', 'NN'),
 ('of', 'IN'),
 ('a', 'DT'),
 ('deity', 'NN'),
 (',', ','),
 ('but', 'CC'),
 ('he', 'PRP'),
 ("'s", 'VBZ'),
 ('retiring', 'VBG'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('people', 'NNS'),
 ("'s", 'POS'),
 ('Emperor', 'NNP'),
 ('.', '.'),
 ('On', 'IN'),
 ('Tuesday', 'NNP'),
 (',', ','),
 ('Emperor', 'NNP'),
 ('Akihito', 'NNP'),
 ('will', 'MD'),
 ('abdicate', 'VB'),
 ('the', 'DT'),
 ('Chrysanthemum', 'NNP'),
 ('Throne', 'NNP'),
 ('--', ':'),
 ('the', 'DT'),
 ('oldest', 'JJS'),
 ('continuous', 'JJ'),
 ('hereditary', 'JJ'),
 ('monarchy', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('world', 'NN'),
 ('--', ':'),
 ('becoming', 'VBG'),
 ('the', 'DT'),
 ('first', 'JJ'),
 ('Japanese', 'JJ'),
 ('monarch', 'NN'),
 ('in', 'IN'),
 ('modern', 'JJ'),
 ('history', 'NN'),
 ('to', 'TO'),
 ('do', 'VB'),
 ('so', 

In [12]:
# Inspect the mapping from paragraph index to the tokens of that space-separated chunk.
word_token

{0: ['Tokyo'],
 1: ['(', 'CNN', ')', 'Japan', "'s"],
 2: ['85-year-old'],
 3: ['monarch'],
 4: ['was'],
 5: ['born'],
 6: ['the'],
 7: ['son'],
 8: ['of'],
 9: ['a'],
 10: ['deity', ','],
 11: ['but'],
 12: ['he', "'s"],
 13: ['retiring'],
 14: ['as'],
 15: ['the'],
 16: ['people', "'s"],
 17: ['Emperor', '.', 'On'],
 18: ['Tuesday', ','],
 19: ['Emperor'],
 20: ['Akihito'],
 21: ['will'],
 22: ['abdicate'],
 23: ['the'],
 24: ['Chrysanthemum'],
 25: ['Throne'],
 26: ['--'],
 27: ['the'],
 28: ['oldest'],
 29: ['continuous'],
 30: ['hereditary'],
 31: ['monarchy'],
 32: ['in'],
 33: ['the'],
 34: ['world'],
 35: ['--'],
 36: ['becoming'],
 37: ['the'],
 38: ['first'],
 39: ['Japanese'],
 40: ['monarch'],
 41: ['in'],
 42: ['modern'],
 43: ['history'],
 44: ['to'],
 45: ['do'],
 46: ['so', '.'],
 47: ['His'],
 48: ['son', ','],
 49: ['Crown'],
 50: ['Prince'],
 51: ['Naruhito', ','],
 52: ['59', ','],
 53: ['will'],
 54: ['be'],
 55: ['inaugurated'],
 56: ['as'],
 57: ['the'],
 58: ['12

In [15]:
# Inspect the original text split on single spaces.
paragraph

['Tokyo',
 "(CNN)Japan's",
 '85-year-old',
 'monarch',
 'was',
 'born',
 'the',
 'son',
 'of',
 'a',
 'deity,',
 'but',
 "he's",
 'retiring',
 'as',
 'the',
 "people's",
 'Emperor.\n\nOn',
 'Tuesday,',
 'Emperor',
 'Akihito',
 'will',
 'abdicate',
 'the',
 'Chrysanthemum',
 'Throne',
 '--',
 'the',
 'oldest',
 'continuous',
 'hereditary',
 'monarchy',
 'in',
 'the',
 'world',
 '--',
 'becoming',
 'the',
 'first',
 'Japanese',
 'monarch',
 'in',
 'modern',
 'history',
 'to',
 'do',
 'so.',
 'His',
 'son,',
 'Crown',
 'Prince',
 'Naruhito,',
 '59,',
 'will',
 'be',
 'inaugurated',
 'as',
 'the',
 '126th',
 'emperor',
 'the',
 'next',
 'day,',
 'ushering',
 'in',
 'the',
 'Reiwa',
 'era.\nThe',
 'much-loved',
 'Emperor',
 'Akihito',
 'will',
 'be',
 'remembered',
 'for',
 'connecting',
 'with',
 'his',
 'public',
 'in',
 'a',
 'way',
 'that',
 'no',
 'other',
 'Japanese',
 'monarch',
 'has',
 'done',
 'and',
 'expressing',
 '"deep',
 'remorse"',
 'for',
 'the',
 "country's",
 'actions',
 

In [69]:
# Spot check: of the ten words most similar to 'wooing', keep those with similarity above 0.5.
[row[0] for row in model.most_similar('wooing', topn=10) if row[1] > 0.5]

  if np.issubdtype(vec.dtype, np.int):


['wooed', 'courting', 'luring', 'courted', 'woos', 'persuading']

In [155]:
# Spot check: the ten words most similar to 'telescopes', with their similarity scores.
model.most_similar('telescopes', topn=10)

  if np.issubdtype(vec.dtype, np.int):


[('telescope', 0.8120151162147522),
 ('observatories', 0.6612797975540161),
 ('astronomers', 0.621345043182373),
 ('infrared', 0.6060433387756348),
 ('ground-based', 0.5985158085823059),
 ('microscopes', 0.573469877243042),
 ('binoculars', 0.5701478719711304),
 ('hubble', 0.5597853660583496),
 ('observatory', 0.5295453071594238),
 ('optical', 0.5255236625671387)]

In [114]:
# Spot check of the Power Thesaurus look-up.
# NOTE(review): the saved output below (printed synonyms followed by a TypeError) does not match
# the synonymsPT defined above, which prints nothing -- presumably from an earlier version; re-run to confirm.
synonymsPT('clasped')

gripped
clutched
embraced
clamped
squeezed
tight
cuddled
grasped
included
enclosed


TypeError: 'NoneType' object is not iterable