In [6]:
#日本語リーダー
from nltk.corpus.reader.wordnet import WordNetCorpusReader
class JapaneseWordNetCorpusReader(WordNetCorpusReader):
    """WordNet corpus reader extended with a word-to-synset lookup table.

    The table is a UTF-8, tab-separated file whose lines have the form
    ``<offset>-<pos> TAB <word> [TAB <tag>]``.  Each word is mapped to the
    (offset, pos) pairs listed for it, which are then resolved against the
    WordNet corpus found under ``root``.
    """

    def __init__(self, root, filename):
        """Initialise the base reader and load the word table.

        Args:
            root: Corpus root passed through to ``WordNetCorpusReader``.
            filename: Path of the tab-separated word table.

        Raises:
            ValueError: If the first column of a line is not of the form
                ``<integer>-<pos>``.
        """
        # NOTE(review): the literal string 'omw_reader' is passed as the
        # second constructor argument -- confirm this is what the installed
        # NLTK version expects.
        WordNetCorpusReader.__init__(self, root, 'omw_reader')
        self._jword2offset = {}
        # The context manager guarantees the table file is closed, even if a
        # malformed line raises part-way through.
        with open(filename, encoding="utf-8") as table:
            for line in table:
                cells = line.strip().split('\t')
                if len(cells) < 2:
                    # Blank or truncated line: there is no word to index.
                    continue
                offset_pos, word = cells[0], cells[1]
                # An optional third column (a tag) may follow; it is not used.
                offset, pos = offset_pos.split('-')
                self._jword2offset.setdefault(word, []).append(
                    {'offset': int(offset), 'pos': pos}
                )

    def synsets(self, word):
        """Return the synsets recorded for ``word`` in the loaded table.

        Args:
            word: Word exactly as it appears in the second table column.

        Returns:
            A list of synsets, one per table entry for ``word``, or ``None``
            when the word does not occur in the table.
        """
        if word not in self._jword2offset:
            return None
        return [
            WordNetCorpusReader.synset_from_pos_and_offset(
                self, entry['pos'], entry['offset']
            )
            for entry in self._jword2offset[word]
        ]


#英語WordNetから類似度を算出するモジュール
import codecs

#単語リストを2つ受け取って概念リストのリストを返す
def convWords2Synsets(wordList1, wordList2):
    """Look up the synsets of the first word of each of two word lists.

    Only element 0 of each list is consulted.  Every lookup result is
    printed as it is produced.

    Args:
        wordList1: Non-empty list of words; only the first one is used.
        wordList2: Non-empty list of words; only the first one is used.

    Returns:
        A two-element list.  Element ``k`` is a one-element list holding the
        value returned by ``JapaneseWordNetCorpusReader.synsets`` for the
        first word of the ``k``-th input list (``None`` for unknown words).
    """
    # Point the reader at the English WordNet data and the Japanese word table.
    reader = JapaneseWordNetCorpusReader('./wordnet', './wnjpn-ok.tab')
    synLists = []
    for words in (wordList1, wordList2):
        found = [reader.synsets(words[0])]
        print(found)
        synLists.append(found)
    return synLists

#概念リストを2つ受け取って類似度を返す
def calcSim(synList1,synList2):
    """Return the best path similarity between two lists of synset groups.

    For every pair of groups (one from each argument) the maximum truthy
    ``path_similarity`` over all synset pairs is computed; a pair with no
    usable score counts as 0.  The value for the LAST pair of groups visited
    is returned, so with one group per list this is simply the best
    similarity between the two words.

    Args:
        synList1: List of synset groups.  A group is an iterable of objects
            offering ``path_similarity(other)``, or ``None`` when the word
            had no synsets.
        synList2: Same structure as ``synList1``.

    Returns:
        The maximum similarity found, or 0 when nothing could be compared
        (empty lists, ``None`` groups, or only ``None``/zero scores).
    """
    # Default result; also covers the case where either outer list is empty.
    simMatrix = 0
    for group1 in synList1:
        for group2 in synList2:
            # A group may be None (the word lookup returns None for unknown
            # words); treat that the same as "no synsets".
            sims = [
                syn1.path_similarity(syn2)
                for syn1 in (group1 or [])
                for syn2 in (group2 or [])
            ]
            # path_similarity may yield None; drop those (and zero scores).
            sims = [score for score in sims if score]
            simMatrix = max(sims) if sims else 0
    return simMatrix


#２単語間の類似度を計算する    
def similarity(word1,word2):
    """Print and return the similarity score of two words.

    Each word is wrapped in a one-element list, converted to synsets with
    ``convWords2Synsets`` and scored with ``calcSim``.

    Args:
        word1: First word (a string).
        word2: Second word (a string).

    Returns:
        The value produced by ``calcSim`` for the two words.
    """
    print("[類似度]")
    synLists = convWords2Synsets([word1], [word2])  # build the synset lists
    simMatrix = calcSim(synLists[0], synLists[1])  # score them
    report = " - ".join([word1, word2]) + ": " + str(simMatrix)
    print(report)
    return simMatrix

In [7]:
#Wordnetはシソーラス（言葉を同義語や意味上の類似関係、包含関係などによって分類した辞書）で単語間の関係を保持している
#この関係性を利用することで、ある単語の類義語を得ることができる

import sys, sqlite3
from collections import namedtuple
from pprint import pprint

# Module-level SQLite connection used by the query helpers in this cell.
# The path is relative to the current working directory; the visible code
# never closes this connection.
conn = sqlite3.connect("./wnjpn.db")
# One row of the `word` table (assumes the table's column order matches
# these fields -- TODO confirm against the database schema).
Word = namedtuple('Word', ['wordid', 'lang', 'lemma', 'pron', 'pos'])

def getWords(lemma):
    """Return all rows of the ``word`` table whose lemma equals ``lemma``.

    Args:
        lemma: Surface form to look up.

    Returns:
        A list of ``Word`` tuples; empty when nothing matches.
    """
    cursor = conn.execute("select * from word where lemma=?", (lemma,))
    return [Word._make(row) for row in cursor]
 
# One row of the `sense` table (assumes the table's column order matches
# these fields -- TODO confirm against the database schema).
Sense = namedtuple(
    'Sense', ['synset', 'wordid', 'lang', 'rank', 'lexid', 'freq', 'src']
)

def getSenses(word):
    """Return all rows of the ``sense`` table belonging to ``word``.

    Args:
        word: Object with a ``wordid`` attribute (e.g. a ``Word`` tuple).

    Returns:
        A list of ``Sense`` tuples; empty when the word has no senses.
    """
    cursor = conn.execute("select * from sense where wordid=?", (word.wordid,))
    return [Sense._make(row) for row in cursor]

# One row of the `synset` table (assumes the table's column order matches
# these fields -- TODO confirm against the database schema).
Synset = namedtuple('Synset', ['synset', 'pos', 'name', 'src'])

def getSynset(synset):
    """Return the first ``synset`` table row with the given identifier.

    Args:
        synset: Synset identifier to look up.

    Returns:
        A ``Synset`` tuple built from the first matching row.

    Raises:
        TypeError: If no row matches (``fetchone`` returns ``None``, which
            cannot be unpacked).
    """
    cursor = conn.execute("select * from synset where synset=?", (synset,))
    row = cursor.fetchone()
    return Synset(*row)

def getWordsFromSynset(synset, lang):
    """Return the words of language ``lang`` that have a sense in ``synset``.

    Args:
        synset: Synset identifier to match in the ``sense`` table.
        lang: Language code to match in the ``word`` table.

    Returns:
        A list of ``Word`` tuples; empty when nothing matches.
    """
    query = "select word.* from sense, word where synset=? and word.lang=? and sense.wordid = word.wordid;"
    cursor = conn.execute(query, (synset,lang))
    return [Word._make(row) for row in cursor]

def getWordsFromSenses(sense, lang="jpn"):
    """Map each sense's synset name to the lemmas sharing that synset.

    Args:
        sense: Iterable of objects with a ``synset`` attribute
            (e.g. ``Sense`` tuples).
        lang: Language code of the lemmas to collect; defaults to ``"jpn"``.

    Returns:
        A dict from synset name to the list of lemmas found for it.  If two
        senses resolve to the same name, the later one wins.
    """
    lemmas_by_name = {}
    for entry in sense:
        # Query the member words first, then the synset's name.
        lemmas = [member.lemma for member in getWordsFromSynset(entry.synset, lang)]
        lemmas_by_name[getSynset(entry.synset).name] = lemmas
    return lemmas_by_name

def getSynonym(word):
    """Collect the synonyms of ``word`` grouped by synset name.

    Every ``word`` table entry matching ``word`` is expanded to its senses,
    and the per-sense synonym maps are merged; later entries overwrite
    earlier ones on duplicate synset names.

    Args:
        word: Lemma to look up.

    Returns:
        A dict from synset name to a list of lemmas; empty when the word is
        unknown.
    """
    merged = {}
    for entry in getWords(word):
        merged.update(getWordsFromSenses(getSenses(entry)))
    return merged


#単語の類義語を出力する
def synonym(word):
    """Print a header line followed by the pretty-printed synonyms of ``word``.

    Args:
        word: Lemma (a string) whose synonyms are printed.
    """
    header = "".join(["[", word, "の類義語]"])
    print(header)
    pprint(getSynonym(word))
In [8]:
synonym('犬')      # print the synonyms of the word

print()
similarity('猫','犬')    # compute the similarity between the two words

[犬の類義語]
{'canis_familiaris': ['飼い犬', 'ドッグ', '犬', '飼犬', '洋犬', 'イヌ'],
 'spy': ['廻者',
         '間諜',
         '工作員',
         '犬',
         '間者',
         '探',
         '諜報員',
         '諜者',
         '密偵',
         'スパイ',
         '秘密捜査員',
         'いぬ',
         'まわし者',
         '隠密',
         '探り',
         '廻し者',
         '回し者',
         '回者']}

[類似度]
[[Synset('cat.n.01')]]
[[Synset('dog.n.01'), Synset('spy.n.01')]]
猫 - 犬: 0.2


0.2