In [1]:
import numpy as np, pandas as pd
import spacy
from spacy.lang.en import English

import spacy
import nltk
nltk.download('wordnet')

from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
print(spacy.__version__)
print(nltk.__version__)
print(English)

3.0.1
3.5
<class 'spacy.lang.en.English'>


In [3]:
# https://spacy.io/usage/processing-pipelines

# https://nlpforhackers.io/complete-guide-to-spacy/

from nltk.corpus import wordnet as wn
from spacy.tokens import Token
 
from spacy.language import Language

# 形態素解析で特定した品詞を元に、辿るべき上位語を特定する。
def penn_to_wn(tag):
    if tag.startswith('N'):
        return 'n'
 
    if tag.startswith('V'):
        return 'v'
 
    if tag.startswith('J'):
        return 'a'
 
    if tag.startswith('R'):
        return 'r'
 
    return None

def get_hypernyms_all(synset, nup):
    niter = 0
    synset2 = synset.name()
    synset_list = [synset2]
    while True:
        niter += 1
        try:
            print(synset2)
            synset2 = get_hypernyms_iter(synset, niter=niter)
            synset_list.append(synset2)
        except:
            break
    return synset_list[nup]

def get_hypernyms_iter(synset, niter=1):
    synset2 = synset
    for iiter in range(niter):
        synset2 = get_hypernyms(synset2)
    return synset2.name()

def get_hypernyms(synset):
    return synset.hypernyms()[0]

class WordnetPipeline(object):
    def __init__(self, nlp):
#     def __init__(self):
        Token.set_extension('synset', default=None)
        Token.set_extension('hypernym', default=None)
 
    def __call__(self, doc):
        for token in doc:
            wn_tag = penn_to_wn(token.tag_)
            if wn_tag is None:
                continue
 
            ss = wn.synsets(token.text, wn_tag)[0]
            token._.set('synset', ss.name())
            if (wn_tag == 'n'):
                nup = -1
            else:
                nup = -1
            token._.set('hypernym', get_hypernyms_all(ss, nup))
#             token.set_extension()
 
        return doc

# @English.factory('wn_synsets')
@Language.factory('wn_synsets')
def wn_synsets(name, nlp):
    return WordnetPipeline(nlp)

In [4]:
nlp=spacy.load('en_core_web_sm')
stop_words = nlp.Defaults.stop_words
nlp.add_pipe('wn_synsets')

<__main__.WordnetPipeline at 0xffff71538bb0>

In [5]:
text = "Paris is the awesome capital of France. I like tennis very much"

for token in nlp(text):
    print(token, token.tag_, token.pos_, token._.synset, token._.hypernym)
# print([token._.synset for token in nlp(text)])
# print([token.tag_ for token in nlp(text)])
# print([token.pos_ for token in nlp(text)])

paris.n.01
be.v.01
amazing.s.02
capital.n.01
assets.n.01
possession.n.02
relation.n.01
abstraction.n.06
entity.n.01
france.n.01
wish.v.02
desire.v.01
tennis.n.01
court_game.n.01
athletic_game.n.01
game.n.01
activity.n.01
act.n.02
event.n.01
psychological_feature.n.01
abstraction.n.06
entity.n.01
very.r.01
much.r.01
Paris NNP PROPN paris.n.01 paris.n.01
is VBZ AUX be.v.01 be.v.01
the DT DET None None
awesome JJ ADJ amazing.s.02 amazing.s.02
capital NN NOUN capital.n.01 entity.n.01
of IN ADP None None
France NNP PROPN france.n.01 france.n.01
. . PUNCT None None
I PRP PRON None None
like VBP VERB wish.v.02 desire.v.01
tennis NN NOUN tennis.n.01 entity.n.01
very RB ADV very.r.01 very.r.01
much RB ADV much.r.01 much.r.01
