## Lecture 05 word2vec

### Part 1 Implement a Word2Vec Model using the News Corpus

In [1]:
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec

In [2]:
# Initialize and train a Word2Vec model on a two-sentence toy corpus

sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
# min_count=1 keeps every token; with the default (5) all words of this tiny corpus would be dropped
model = Word2Vec(sentences, min_count=1)

# Parameters for 'Word2Vec' in the gensim package:
# -- sentences (iterable of list of str): Can be simply a list of lists of tokens, but for larger corpora, 
#     consider an iterable that streams the sentences directly from disk/network. 
#     See BrownCorpus, Text8Corpus or LineSentence module for such examples.
# -- min_count: Words below the min_count frequency are dropped before training occurs.
#     The default value is min_count = 5
#
# See documentation here: 
# https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.LineSentence


In [3]:
# Inspect the learned vocabulary: a dict mapping each token to its vocabulary entry
# NOTE(review): `wv.vocab` looks like the gensim 3.x API; newer gensim exposes `wv.key_to_index` instead -- confirm against the installed version
model.wv.vocab

{'cat': <gensim.models.keyedvectors.Vocab at 0x12b904940>,
 'say': <gensim.models.keyedvectors.Vocab at 0x12b904048>,
 'meow': <gensim.models.keyedvectors.Vocab at 0x12b904f98>,
 'dog': <gensim.models.keyedvectors.Vocab at 0x12b9044e0>,
 'woof': <gensim.models.keyedvectors.Vocab at 0x12b904ef0>}

In [4]:
# Check out the most similar words for "cat".
# Query through the KeyedVectors object (`model.wv`): calling most_similar() on the
# Word2Vec model itself is deprecated (it emits a DeprecationWarning) and is not
# available in newer gensim releases.
model.wv.most_similar('cat')

  """Entry point for launching an IPython kernel.


[('say', 0.07369384914636612),
 ('woof', 0.0512646846473217),
 ('dog', 0.043911129236221313),
 ('meow', -0.04825589805841446)]

In [5]:
# Check out the word vector learned for 'dog' (displayed as a NumPy array)
model.wv['dog']

array([ 4.5524435e-03,  4.3167290e-03, -1.9331600e-03,  1.4156078e-03,
        1.5240088e-03, -4.6887097e-04, -1.8123303e-03, -3.1281598e-03,
       -1.0598174e-03, -4.6913023e-03,  4.3256516e-03, -2.4396495e-03,
       -3.2000607e-03,  3.3786276e-03,  4.4470131e-03, -4.1932724e-03,
        2.5538651e-03, -4.8114667e-03,  4.8077307e-03,  1.0756694e-03,
       -2.8313601e-03,  3.3936216e-04, -1.3326895e-03,  2.1480329e-03,
       -1.0317875e-03,  1.0735678e-03, -5.8563414e-04, -3.6414443e-03,
        2.5576167e-04,  2.3993750e-03,  3.0371237e-03,  2.0543532e-03,
       -3.7211739e-03,  4.5596221e-03,  4.7293343e-03,  1.1081680e-03,
       -4.0781870e-03, -2.8813148e-03, -3.4635402e-03, -7.3132112e-05,
        1.6330541e-03, -3.6698077e-03,  3.6049262e-03, -2.3594000e-03,
       -3.0682206e-03,  7.7385473e-04,  8.4273046e-04, -3.7035465e-03,
       -1.3521563e-03,  8.6767157e-04,  4.4918987e-03,  4.1219322e-03,
       -4.8194081e-03,  4.8602805e-03,  2.3068113e-03, -4.4915131e-03,
      

In [8]:
# Let's try word2vec using a larger corpus (the Chinese news corpus in LineSentence format)
# NOTE(review): the path below is absolute and machine-specific -- adjust it before running elsewhere

line_setences_path = '/Users/xinweixu/Dropbox/learn/Comp_Prog/nlp/data/sentences-cut.txt'
# Rebinds `sentences` (previously the toy corpus) to a LineSentence wrapper around the corpus file
sentences = LineSentence(line_setences_path)

In [9]:
# Train a Word2Vec model on the news corpus; words occurring fewer than min_count=5 times are dropped
news_model = Word2Vec(sentences, min_count=5)

In [10]:
# Look up the trained embedding for the token '小米'
news_model.wv['小米']

array([ 0.46606752,  0.0603839 ,  0.12178345, -0.6077574 ,  0.35224456,
        0.1260093 ,  0.2057447 ,  0.02758181,  0.7849377 ,  0.25842643,
       -0.11409037,  0.64268595,  0.17382361,  0.91700697,  2.6077218 ,
        0.29584727, -1.1201617 , -0.34778276,  0.18813357, -0.23707744,
       -0.2701578 , -1.8964182 , -1.1023777 , -0.38531917, -0.94263166,
       -0.5404368 , -0.1292999 ,  0.94657075, -0.5578564 ,  1.5042089 ,
       -1.087818  ,  1.1199632 ,  1.6911533 ,  0.45593718,  0.574618  ,
       -0.33724988,  0.12162794, -0.6187379 , -0.09358543,  0.28237635,
        0.64235187, -0.09404029,  0.11260442, -0.3531185 ,  0.07025079,
       -0.07933626,  0.4417101 , -0.14876136,  0.57109916,  2.2883537 ,
       -1.1443634 , -1.4978299 ,  0.58583254, -1.0405807 , -0.86536634,
        1.2698337 , -1.6695113 , -1.0639921 , -0.10368778, -0.7882075 ,
       -0.49955893,  0.02908174, -0.00261209, -1.4894137 , -0.28617343,
       -0.19373026,  0.18372041, -1.2616925 , -0.64112276, -0.13

In [11]:
# Nearest neighbours of '小米' in the news embedding space.
# Use the KeyedVectors API (`.wv`): most_similar() on the model object is deprecated
# and triggers a DeprecationWarning.
news_model.wv.most_similar('小米')

  """Entry point for launching an IPython kernel.


[('OPPO', 0.7341985106468201),
 ('华为', 0.7296118140220642),
 ('苹果', 0.7029392719268799),
 ('生鲜', 0.700323224067688),
 ('三星', 0.6948364973068237),
 ('S8', 0.6868998408317566),
 ('格力', 0.685062050819397),
 ('家电', 0.6844112277030945),
 ('亚马逊', 0.6718410849571228),
 ('智能手机', 0.6623239517211914)]

In [12]:
# Nearest neighbours of '华为' in the news embedding space.
# Use the KeyedVectors API (`.wv`): most_similar() on the model object is deprecated
# and triggers a DeprecationWarning.
news_model.wv.most_similar('华为')

  """Entry point for launching an IPython kernel.


[('三星', 0.8230892419815063),
 ('西门子', 0.7642229795455933),
 ('中兴', 0.7625566720962524),
 ('OPPO', 0.7476036548614502),
 ('谷歌', 0.7430927753448486),
 ('阿里巴巴', 0.7340729236602783),
 ('上汽', 0.7331277132034302),
 ('蚂蚁', 0.7311336994171143),
 ('小米', 0.7296118140220642),
 ('英特尔', 0.7259373664855957)]

In [13]:
# Nearest neighbours of '亚马逊' in the news embedding space.
# Use the KeyedVectors API (`.wv`): most_similar() on the model object is deprecated
# and triggers a DeprecationWarning.
news_model.wv.most_similar('亚马逊')

  """Entry point for launching an IPython kernel.


[('沃尔玛', 0.776244044303894),
 ('京东', 0.7541892528533936),
 ('谷歌', 0.7526918649673462),
 ('苹果公司', 0.718194842338562),
 ('微软', 0.7116185426712036),
 ('零售商', 0.7107131481170654),
 ('苹果', 0.7026939392089844),
 ('天猫', 0.7006959915161133),
 ('阿里巴巴', 0.696462869644165),
 ('Google', 0.691777229309082)]

In [26]:
# now let's save the word vectors to a local directory for future use!

# first, we can save the full model (NOTE(review): absolute, machine-specific path)
news_model.save('/Users/xinweixu/Dropbox/learn/Comp_Prog/nlp/data/news_model')

# to load the model later, pass the same path that was given to save()
# (this notebook imports the `Word2Vec` class directly, not the `gensim` module):
# news_model = Word2Vec.load('path/to/news_model')

# Advanced users can load a model and continue training it with more sentences.
# NOTE(review): train() is expected to need explicit total_examples/epochs, and new
# words need a vocabulary update first -- confirm against the installed gensim version:
# model = Word2Vec.load('path/to/mymodel')
# model.build_vocab(more_sentences, update=True)
# model.train(more_sentences, total_examples=model.corpus_count, epochs=model.epochs)



In [None]:
# second, we can also save only the word vectors (the model's `.wv` object) for future queries
# NOTE(review): absolute, machine-specific path
filename = '/Users/xinweixu/Dropbox/learn/Comp_Prog/nlp/data/news_model_word_vectors.kv'
word_vectors = news_model.wv
word_vectors.save(filename)

In [29]:
# save the vectors in word2vec binary format (.bin)
# NOTE: relies on `word_vectors` (= news_model.wv) having been defined by the previous cell
word_vectors.save_word2vec_format('/Users/xinweixu/Dropbox/learn/Comp_Prog/nlp/data/news_model_word_vectors.bin', binary=True)

# or as plain text (.txt)
# word_vectors.save_word2vec_format('path/to/<file_name>.txt', binary=False)
# note that .bin takes less space than .txt!

In [27]:
# to load word vectors from a local directory, use the following:
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
# NOTE(review): get_tmpfile appears to be a helper for building temp-file paths in tests;
# it should not be needed to load the files saved above -- confirm and drop the import if unused.
# word_vectors = KeyedVectors.load(filename, mmap='r')  # `filename` = the .kv path passed to word_vectors.save()

# or we can also load vectors stored in word2vec binary format:
# word_vectors = KeyedVectors.load_word2vec_format('path/to/<file_name>.bin', binary=True)


 For more info on storing and querying word vectors, see: https://radimrehurek.com/gensim/models/keyedvectors.html

### Part 2 Named Entity Recognition & Dependency Parsing

#### 1. Using jieba.posseg
posseg = part-of-speech segmentation (word segmentation plus part-of-speech tagging).
`jieba.posseg` 标注句子分词后每个词的词性，采用ictclas兼容的标记法。对于词性注释，见表格：https://blog.csdn.net/suibianshen2012/article/details/53487157

In [14]:
# Sample Chinese news passage (two paragraphs) used as input for jieba's POS tagging below
text = """新华社华盛顿4月26日电 美国总统特朗普26日表示，美国将撤销在《武器贸易条约》上的签字。

特朗普当天在美国印第安纳州首府印第安纳波利斯举行的美国全国步枪协会年会上说，《武器贸易条约》是一个“严重误导的条约”，美国将撤销在该条约上的签字，联合国将很快收到美国正式拒绝该条约的通知。"""

In [15]:
import jieba.posseg as pseg

In [16]:
# Segment `text` and tag each word with its part of speech.
# pseg.cut() yields its (word, flag) pairs lazily, so the result can be iterated only
# once; materialise it into a list so the printing cell below can be re-run and
# still produce output.
words = list(pseg.cut(text))

In [17]:
# Print one "<word> <POS flag>" line per segmented token
for token, tag in words:
    print(token, tag)

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/1y/1btp7xpj7b1f82lnwvn2916h0000gn/T/jieba.cache
Loading model cost 0.683 seconds.
Prefix dict has been built succesfully.


新华社 nt
华盛顿 ns
4 m
月 m
26 m
日电 j
  x
美国 ns
总统 n
特朗普 nr
26 m
日 m
表示 v
， x
美国 ns
将 d
撤销 v
在 p
《 x
武器 n
贸易 vn
条约 n
》 x
上 f
的 uj
签字 v
。 x

 x

 x
特朗普 nr
当天 t
在 p
美国 ns
印第安纳州 ns
首府 n
印第安纳波利斯 ns
举行 v
的 uj
美国 ns
全国 n
步枪 n
协会 n
年 m
会上 t
说 v
， x
《 x
武器 n
贸易 vn
条约 n
》 x
是 v
一个 m
“ x
严重 a
误导 n
的 uj
条约 n
” x
， x
美国 ns
将 d
撤销 v
在 p
该 r
条约 n
上 f
的 uj
签字 v
， x
联合国 nt
将 d
很快 d
收到 v
美国 ns
正式 ad
拒绝 v
该 r
条约 n
的 uj
通知 v
。 x


#### 2. Using pyltp (by HIT)
pyltp 是语言技术平台 （language technology platform）的python 封装。 

github page: https://github.com/HIT-SCIR/pyltp

documentation: https://pyltp.readthedocs.io/zh_CN/latest/api.html#id13

In [18]:
from pyltp import Postagger

ModuleNotFoundError: No module named 'pyltp'

In [None]:
# pyltp install failed on macOS 10.14

#### 3. stanford-corenlp

For a list of python packages using the Stanford-CoreNLP server:
https://stanfordnlp.github.io/CoreNLP/other-languages.html

Here we use `pycorenlp` for its simplicity.

Before running the python interface, we need to launch the CoreNLP server from the terminal:
```bash
cd path/to/CoreNLP/folder
java -mx6g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -timeout 5000 --add-modules java.se.ee
```
The parameter `-mx6g` specifies the amount of memory that CoreNLP is allowed to use. In this case, it’s six gigabytes. The `-timeout 5000` parameter specifies the timeout in milliseconds. `--add-modules java.se.ee` is for java version 9/10/11.



See more info: 
https://towardsdatascience.com/natural-language-processing-using-stanfords-corenlp-d9e64c1e1024


In [1]:
from pycorenlp import StanfordCoreNLP

In [2]:
# Client for the CoreNLP server; the server must already be running on localhost:9000 (see the launch command above)
nlp = StanfordCoreNLP('http://localhost:9000')

In [3]:
# English sample input (three sentences of a movie review); rebinds `text` from the jieba section
text = "This movie was actually neither that funny, nor super witty. I liked watching that movie. If I had a choice, I would not watch that movie again."

In [4]:
# Ask the CoreNLP server for NER and POS annotations of `text`, returned as JSON.
# Other annotators that can be requested: tokenize, ssplit, pos, lemma, ner, parse, dcoref
# NOTE(review): the per-request 'timeout' (1000 ms) is lower than the `-timeout 5000`
# used when launching the server above -- raise it if the request times out.
result = nlp.annotate(
    text,
    properties={
        'annotators': 'ner, pos',
        'outputFormat': 'json',
        'timeout': 1000,
    },
)

In [9]:
# Display the per-sentence annotations (each entry carries 'index', 'entitymentions' and 'tokens')
result["sentences"]

[{'index': 0,
  'entitymentions': [],
  'tokens': [{'index': 1,
    'word': 'This',
    'originalText': 'This',
    'lemma': 'this',
    'characterOffsetBegin': 0,
    'characterOffsetEnd': 4,
    'pos': 'DT',
    'ner': 'O',
    'before': '',
    'after': ' '},
   {'index': 2,
    'word': 'movie',
    'originalText': 'movie',
    'lemma': 'movie',
    'characterOffsetBegin': 5,
    'characterOffsetEnd': 10,
    'pos': 'NN',
    'ner': 'O',
    'before': ' ',
    'after': ' '},
   {'index': 3,
    'word': 'was',
    'originalText': 'was',
    'lemma': 'be',
    'characterOffsetBegin': 11,
    'characterOffsetEnd': 14,
    'pos': 'VBD',
    'ner': 'O',
    'before': ' ',
    'after': ' '},
   {'index': 4,
    'word': 'actually',
    'originalText': 'actually',
    'lemma': 'actually',
    'characterOffsetBegin': 15,
    'characterOffsetEnd': 23,
    'pos': 'RB',
    'ner': 'O',
    'before': ' ',
    'after': ' '},
   {'index': 5,
    'word': 'neither',
    'originalText': 'neither',
  

In [14]:
# Show every token of the first sentence followed by its NER
# (named entity recognition) label, e.g. "movie (O)"

pos = ['{} ({})'.format(token['word'], token['ner'])
       for token in result["sentences"][0]['tokens']]

" ".join(pos)

'This (O) movie (O) was (O) actually (O) neither (O) that (O) funny (O) , (O) nor (O) super (O) witty (O) . (O)'

In [5]:
# In the example above every NER label is 'O' (no entity mentions were found), so
# try another example -- a news excerpt from the Financial Times:

text2 = "Europe and the US have warned Iran against reviving its nuclear programme after Tehran said it would stop complying with parts of a 2015 atomic deal, raising the stakes in the regime’s stand-off with the Trump administration."

In [7]:
# Request NER and POS annotations for the news excerpt, returned as JSON
result2 = nlp.annotate(
    text2,
    properties={
        'annotators': 'ner, pos',
        'outputFormat': 'json',
        'timeout': 1000,
    },
)


# Using the 'sentiment' annotator gives an error!
# Related issue reported here:
# https://github.com/stanfordnlp/CoreNLP/issues/347
# Adding ejml.jar to the classpath did not resolve the error.

In [8]:
# Display the per-sentence annotations; 'entitymentions' now lists the recognised entities
result2['sentences']

[{'index': 0,
  'entitymentions': [{'docTokenBegin': 0,
    'docTokenEnd': 1,
    'tokenBegin': 0,
    'tokenEnd': 1,
    'text': 'Europe',
    'characterOffsetBegin': 0,
    'characterOffsetEnd': 6,
    'ner': 'LOCATION',
    'nerConfidences': {'LOCATION': 0.93099511643904}},
   {'docTokenBegin': 3,
    'docTokenEnd': 4,
    'tokenBegin': 3,
    'tokenEnd': 4,
    'text': 'US',
    'characterOffsetBegin': 15,
    'characterOffsetEnd': 17,
    'ner': 'COUNTRY',
    'nerConfidences': {'LOCATION': 0.99806632305234}},
   {'docTokenBegin': 6,
    'docTokenEnd': 7,
    'tokenBegin': 6,
    'tokenEnd': 7,
    'text': 'Iran',
    'characterOffsetBegin': 30,
    'characterOffsetEnd': 34,
    'ner': 'COUNTRY',
    'nerConfidences': {'LOCATION': 0.99568029031225}},
   {'docTokenBegin': 13,
    'docTokenEnd': 14,
    'tokenBegin': 13,
    'tokenEnd': 14,
    'text': 'Tehran',
    'characterOffsetBegin': 80,
    'characterOffsetEnd': 86,
    'ner': 'LOCATION',
    'nerConfidences': {'LOCATION': 0.

In [9]:
# Pair each token of the first sentence with its NER label
pos = ['{} ({})'.format(token['word'], token['ner'])
       for token in result2["sentences"][0]['tokens']]

" ".join(pos)

"Europe (LOCATION) and (O) the (O) US (COUNTRY) have (O) warned (O) Iran (COUNTRY) against (O) reviving (O) its (O) nuclear (O) programme (O) after (O) Tehran (LOCATION) said (O) it (O) would (O) stop (O) complying (O) with (O) parts (O) of (O) a (O) 2015 (DATE) atomic (O) deal (O) , (O) raising (O) the (O) stakes (O) in (O) the (O) regime (O) 's (O) stand-off (O) with (O) the (O) Trump (PERSON) administration (O) . (O)"

In [19]:
# now we have different NER labels:
# Europe -- LOCATION,
# US -- COUNTRY,
# Iran -- COUNTRY,
# Tehran -- LOCATION,
# 2015 -- DATE,
# Trump -- PERSON

# try another example -- the opening lines of a Joseph Conrad novel:

text3 = "The Nellie, a cruising yawl, swung to her anchor without a flutter of the sails, and was at rest. The flood had made, the wind was nearly calm, and being bound down the river, the only thing for it was to come to and wait for the turn of the tide."

In [None]:
# Request NER and POS annotations for the literary excerpt, returned as JSON
result3 = nlp.annotate(
    text3,
    properties={
        'annotators': 'ner, pos',
        'outputFormat': 'json',
        'timeout': 1000,
    },
)



In [23]:
# Display the per-sentence annotations for the literary excerpt
result3['sentences']

[{'index': 0,
  'entitymentions': [{'docTokenBegin': 1,
    'docTokenEnd': 2,
    'tokenBegin': 1,
    'tokenEnd': 2,
    'text': 'Nellie',
    'characterOffsetBegin': 4,
    'characterOffsetEnd': 10,
    'ner': 'PERSON',
    'nerConfidences': {'PERSON': 0.7205367853984}},
   {'docTokenBegin': 9,
    'docTokenEnd': 10,
    'tokenBegin': 9,
    'tokenEnd': 10,
    'text': 'her',
    'characterOffsetBegin': 38,
    'characterOffsetEnd': 41,
    'ner': 'PERSON'}],
  'tokens': [{'index': 1,
    'word': 'The',
    'originalText': 'The',
    'lemma': 'the',
    'characterOffsetBegin': 0,
    'characterOffsetEnd': 3,
    'pos': 'DT',
    'ner': 'O',
    'before': '',
    'after': ' '},
   {'index': 2,
    'word': 'Nellie',
    'originalText': 'Nellie',
    'lemma': 'Nellie',
    'characterOffsetBegin': 4,
    'characterOffsetEnd': 10,
    'pos': 'NNP',
    'ner': 'PERSON',
    'before': ' ',
    'after': ''},
   {'index': 3,
    'word': ',',
    'originalText': ',',
    'lemma': ',',
    'cha

In [26]:
# NER label of every token in the first sentence of text3

pos = ['{} ({})'.format(token['word'], token['ner'])
       for token in result3["sentences"][0]['tokens']]

" ".join(pos)

'The (O) Nellie (PERSON) , (O) a (O) cruising (O) yawl (O) , (O) swung (O) to (O) her (O) anchor (O) without (O) a (O) flutter (O) of (O) the (O) sails (O) , (O) and (O) was (O) at (O) rest (O) . (O)'

In [27]:
# ... and the POS (part-of-speech) tag of every token in the same sentence

pos = ['{} ({})'.format(token['word'], token['pos'])
       for token in result3["sentences"][0]['tokens']]

" ".join(pos)

'The (DT) Nellie (NNP) , (,) a (DT) cruising (VBG) yawl (NN) , (,) swung (VBN) to (TO) her (PRP$) anchor (NN) without (IN) a (DT) flutter (NN) of (IN) the (DT) sails (NNS) , (,) and (CC) was (VBD) at (IN) rest (NN) . (.)'

To run a server using Chinese properties:

``` bash
java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties StanfordCoreNLP-chinese.properties -port 9000 -timeout 15000
```

In [10]:
# Reconnect to port 9000, where the server is now expected to run with the Chinese properties (see the command above)
nlp = StanfordCoreNLP('http://localhost:9000')

In [11]:
# Chinese sample input: one sentence from a Xinhua news report
text_ch = "新华社福州5月8日电（记者余俊杰、颜之宏）记者从8日闭幕的第二届数字中国建设峰会上获悉，为期3天的峰会共对接数字经济项目587项，总投资额4569亿元，其中签约项目308项，总投资额2520亿元。"

In [12]:
# Request NER and POS annotations for the Chinese sentence, returned as JSON
# NOTE(review): the per-request 'timeout' (1000 ms) is lower than the `-timeout 15000`
# used when launching the Chinese server -- raise it if the request times out.
result_ch = nlp.annotate(
    text_ch,
    properties={
        'annotators': 'ner, pos',
        'outputFormat': 'json',
        'timeout': 1000,
    },
)




In [13]:
# Display the per-sentence annotations for the Chinese text
result_ch['sentences']

[{'index': 0,
  'entitymentions': [{'docTokenBegin': 0,
    'docTokenEnd': 1,
    'tokenBegin': 0,
    'tokenEnd': 1,
    'text': '新华社',
    'characterOffsetBegin': 0,
    'characterOffsetEnd': 3,
    'ner': 'ORGANIZATION',
    'nerConfidences': {'ORGANIZATION': 0.99933014808887}},
   {'docTokenBegin': 1,
    'docTokenEnd': 2,
    'tokenBegin': 1,
    'tokenEnd': 2,
    'text': '福州',
    'characterOffsetBegin': 3,
    'characterOffsetEnd': 5,
    'ner': 'CITY',
    'nerConfidences': {'GPE': 0.99824373714923}},
   {'docTokenBegin': 2,
    'docTokenEnd': 4,
    'tokenBegin': 2,
    'tokenEnd': 4,
    'text': '5月8日',
    'characterOffsetBegin': 5,
    'characterOffsetEnd': 9,
    'ner': 'DATE',
    'normalizedNER': 'XXXX-05-08',
    'nerConfidences': {'DATE': 0.99757596733706}},
   {'docTokenBegin': 6,
    'docTokenEnd': 7,
    'tokenBegin': 6,
    'tokenEnd': 7,
    'text': '记者',
    'characterOffsetBegin': 11,
    'characterOffsetEnd': 13,
    'ner': 'TITLE'},
   {'docTokenBegin': 7,
  

In [14]:
# NER label of every token in the first Chinese sentence

pos = ['{} ({})'.format(token['word'], token['ner'])
       for token in result_ch["sentences"][0]['tokens']]

" ".join(pos)

'新华社 (ORGANIZATION) 福州 (CITY) 5月 (DATE) 8日 (DATE) 电 (O) （ (O) 记者 (TITLE) 余俊杰 (PERSON) 、 (O) 颜之宏 (PERSON) ） (O) 记者 (TITLE) 从 (O) 8 (NUMBER) 日 (MISC) 闭幕 (O) 的 (O) 第二 (ORDINAL) 届 (MISC) 数字 (MISC) 中国 (MISC) 建设 (MISC) 峰会 (MISC) 上 (O) 获悉 (O) ， (O) 为期 (O) 3 (NUMBER) 天 (MISC) 的 (MISC) 峰会 (MISC) 共 (O) 对接 (O) 数字 (O) 经济 (O) 项目 (O) 587 (NUMBER) 项 (MISC) ， (O) 总 (O) 投资额 (O) 4569亿 (MONEY) 元 (MONEY) ， (O) 其中 (O) 签约 (O) 项目 (O) 308 (NUMBER) 项 (MISC) ， (O) 总 (O) 投资额 (O) 2520亿 (MONEY) 元 (MONEY) 。 (O)'

In [15]:
# POS (part-of-speech) tag of every token in the first Chinese sentence

pos = ['{} ({})'.format(token['word'], token['pos'])
       for token in result_ch["sentences"][0]['tokens']]

" ".join(pos)

'新华社 (NR) 福州 (NR) 5月 (NT) 8日 (NT) 电 (NN) （ (PU) 记者 (NN) 余俊杰 (NR) 、 (PU) 颜之宏 (NR) ） (PU) 记者 (NN) 从 (P) 8 (CD) 日 (M) 闭幕 (VV) 的 (DEC) 第二 (OD) 届 (M) 数字 (NN) 中国 (NR) 建设 (VV) 峰会 (NN) 上 (LC) 获悉 (VV) ， (PU) 为期 (VV) 3 (CD) 天 (M) 的 (DEC) 峰会 (NN) 共 (AD) 对接 (VV) 数字 (NN) 经济 (NN) 项目 (NN) 587 (CD) 项 (M) ， (PU) 总 (JJ) 投资额 (NN) 4569亿 (CD) 元 (M) ， (PU) 其中 (NN) 签约 (VV) 项目 (NN) 308 (CD) 项 (M) ， (PU) 总 (JJ) 投资额 (NN) 2520亿 (CD) 元 (M) 。 (PU)'