# 安装spacy

参考：
[离线安装spacy](https://blog.csdn.net/wwmmddz/article/details/115491267)

In [10]:
import spacy
# Load the `en_core_web_sm` pipeline once; the cells below reuse this `nlp` object.
nlp = spacy.load('en_core_web_sm')


# 文本处理

In [5]:
# Tokenization: run the pipeline on a sample text and print one token per line.
doc = nlp('Weather is good, very windy and sunny. We have no classes in the afternoon.')
print('\n'.join(str(tok) for tok in doc))

Weather
is
good
,
very
windy
and
sunny
.
We
have
no
classes
in
the
afternoon
.


In [6]:
# Sentence segmentation: print each detected sentence on its own line.
print('\n'.join(sentence.text for sentence in doc.sents))

Weather is good, very windy and sunny.
We have no classes in the afternoon.


# 词性

In [5]:
# Print every token together with its `pos_` part-of-speech tag.
for tok in doc:
    print(f'{tok}-{tok.pos_}')

Weather-NOUN
is-AUX
good-ADJ
,-PUNCT
very-ADV
windy-ADJ
and-CCONJ
sunny-ADJ
.-PUNCT
We-PRON
have-VERB
no-DET
classes-NOUN
in-ADP
the-DET
afternoon-NOUN
.-PUNCT


# 命名实体识别

关于命名实体识别的讲解：

[NLP-NER 命名实体识别详解之一](https://zhuanlan.zhihu.com/p/88544122)


In [6]:
# Named entities: print each entity's text, character span and label.
doc_2 = nlp("I went to Paris where I met my old friend Jack from uni.")
for entity in doc_2.ents:
    entity_info = (entity.text, entity.start_char, entity.end_char, entity.label_)
    print(*entity_info)

Paris 10 15 GPE
Jack 42 46 PERSON


In [7]:
from spacy import displacy

# Render the entities of the sentence already parsed into `doc_2` in the NER
# cell. Reusing it avoids parsing the identical text a second time and, more
# importantly, avoids overwriting the global `doc` that the sentence-splitting
# and part-of-speech cells read (re-running them after this cell would
# otherwise show a different text than their recorded output).
displacy.render(doc_2, style='ent', jupyter=True)

# 找到书中所有人物名字

In [12]:
def read_file(file_path):
    """Return the entire contents of a UTF-8 text file as a single string.

    Args:
        file_path: Path of the file to read.

    Returns:
        str: The full text of the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Sanity check: the whole book is read into a single `str`.
t = read_file("./data/pride_and_prejudice.txt")
print(type(t))

<class 'str'>


In [18]:
# Load the book text and run the spaCy pipeline over the whole thing.
texts = read_file("./data/pride_and_prejudice.txt")
processed_text = nlp(texts)

# Materialize the sentence spans, report how many there are, then preview the first four.
sentences = list(processed_text.sents)
print(len(sentences))
for position in range(min(4, len(sentences))):
    print(position, sentences[position])

5824
0 The Project Gutenberg EBook of Pride and Prejudice, by Jane Austen

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  
1 You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org


Title: Pride and Prejudice

Author: Jane Austen

Posting Date: August 26, 2008
2 [EBook #1342]
Release Date: June, 1998
Last updated:
3 February 15, 2015]

Language: English


*** START OF THIS PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE ***




Produced by Anonymous Volunteers





PRIDE AND PREJUDICE




In [20]:
# Show the container type of the parsed book's `ents` and how many entities it holds.
print(type(processed_text.ents), len(processed_text.ents))

<class 'tuple'> 5517


In [28]:
from collections import Counter, defaultdict

def find_person(doc, top_n=10, verbose=True):
    """Count PERSON entities in a parsed document and return the most frequent.

    Args:
        doc: A parsed document exposing ``ents``; each entity must provide
            ``label_``, ``label`` and ``lemma_``.
        top_n: How many of the most frequent names to return (default 10).
        verbose: When True (the default, matching the original behaviour),
            print every entity with its label, label id and lemma.

    Returns:
        list[tuple[str, int]]: ``(lemma, count)`` pairs ordered from the most
        to the least frequent name.
    """
    person_counts = Counter()
    # Iterate over the entities of the *argument*. The previous version read
    # the global `processed_text`, so the `doc` parameter was silently ignored
    # and the function could not be used on any other document.
    for ent in doc.ents:
        if verbose:
            print(ent, '-', ent.label_, '-', ent.label, '-', ent.lemma_)
        if ent.label_ == "PERSON":
            person_counts[ent.lemma_] += 1

    # Return the `top_n` most frequent person names.
    return person_counts.most_common(top_n)
        
# Print the most frequent PERSON entities found in the parsed book.
print(find_person(processed_text))

Jane Austen - PERSON - 380 - Jane Austen
eBook - PRODUCT - 386 - eBook
the Project Gutenberg License - ORG - 383 - the Project Gutenberg License
eBook - ORG - 383 - eBook
Jane Austen - PERSON - 380 - Jane Austen
August 26, 2008 - DATE - 391 - August 26, 2008
EBook - ORG - 383 - EBook
1342 - MONEY - 394 - 1342
Release Date: - LOC - 385 - Release Date:
June - DATE - 391 - June
1998 - DATE - 391 - 1998
February 15, 2015 - DATE - 391 - February 15, 2015
English - LANGUAGE - 389 - English
Anonymous Volunteers - PERSON - 380 - Anonymous Volunteers
Jane Austen - PERSON - 380 - Jane Austen
Chapter 1 - LAW - 390 - chapter 1
first - ORDINAL - 396 - first
some one - CARDINAL - 397 - some one
Bennet - PERSON - 380 - Bennet
Netherfield Park - FAC - 9191306739292312949 - Netherfield Park
Bennet - PERSON - 380 - Bennet
Long - PERSON - 380 - Long
Bennet - PERSON - 380 - Bennet
Long - PERSON - 380 - Long
Netherfield - ORG - 383 - Netherfield
England - GPE - 384 - England
Monday - DATE - 391 - Monday
fo

In [5]:
from collections import Counter, defaultdict
# defaultdict demo: collect the values of (key, value) pairs into one set per key.
s = [
    ('color', 'blue'), ('color', 'orange'), ('color', 'yellow'),
    ('fruit', 'banana'), ('fruit', 'orange'), ('fruit', 'banana'),
]
d = defaultdict(set)
for key, value in s:
    # A missing key is created on first access with an empty set.
    d[key].add(value)
print(d)

defaultdict(<class 'set'>, {'color': {'orange', 'yellow', 'blue'}, 'fruit': {'orange', 'banana'}})


# 恐怖袭击分析

In [7]:
def read_file_to_list(file_path):
    """Read a UTF-8 text file and return its lines as a list.

    Args:
        file_path: Path of the file to read.

    Returns:
        list[str]: One entry per line of the file; line endings are kept.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.readlines()

In [13]:
# Each line of the dataset file is treated as one article.
articles = read_file_to_list('./data/rand-terrorism-dataset.txt')
# `nlp.pipe` streams the texts through the pipeline in batches, which is the
# documented way to process many texts and is faster than calling `nlp(text)`
# once per article. The Docs are yielded in the same order as the input texts.
articles_nlp = list(nlp.pipe(articles))

# Lower-cased group names that are matched against the lemmas of PERSON / ORG entities.
# NOTE(review): the spaces around the hyphen in 'al - qaeda' presumably mirror how
# the entity lemma text comes out after tokenization — confirm before "fixing" them.
common_terrorist_groups = [
    'taliban', 'al - qaeda', 'hamas',
    'fatah', 'plo', 'bilad al - rafidayn',
]

# Lower-cased place names that are matched against the lemmas of GPE entities.
common_locations = [
    'iraq', 'baghdad', 'kirkuk', 'mosul', 'afghanistan',
    'kabul', 'basra', 'palestine', 'gaza', 'israel',
    'istanbul', 'beirut', 'pakistan',
]

In [17]:
# group name -> Counter of locations mentioned in the same article.
location_entity_dict = defaultdict(Counter)

for article_doc in articles_nlp:
    groups_found = []
    places_found = []
    # Single pass over the entities: keep only the known groups (PERSON or ORG
    # entities) and the known locations (GPE entities), in order of appearance.
    for ent in article_doc.ents:
        if ent.label_ in ("PERSON", "ORG"):
            name = ent.lemma_.lower()
            if name in common_terrorist_groups:
                groups_found.append(name)
        elif ent.label_ == "GPE":
            name = ent.lemma_.lower()
            if name in common_locations:
                places_found.append(name)
    # Every (group, location) pairing inside one article counts once per
    # mention pair; a group with no location in the article adds no key.
    for group in groups_found:
        for place in places_found:
            location_entity_dict[group][place] += 1

In [18]:
# Display the group -> location co-occurrence counts.
location_entity_dict

defaultdict(collections.Counter,
            {'plo': Counter({'israel': 38,
                      'beirut': 9,
                      'iraq': 8,
                      'palestine': 1}),
             'fatah': Counter({'israel': 21,
                      'gaza': 11,
                      'beirut': 1,
                      'iraq': 1}),
             'hamas': Counter({'gaza': 70, 'israel': 31, 'beirut': 1}),
             'taliban': Counter({'afghanistan': 264,
                      'kabul': 35,
                      'pakistan': 17})})

In [32]:
import pandas as pd
# Turn the nested counters into a table (rows: locations, columns: groups).
# Pairs that never co-occurred come out as NaN and are filled with 0.
# NOTE(review): `loaction` is a misspelling of `location`; the name is kept
# as-is so that any other cell referring to it keeps working.
loaction_entity_df = (
    pd.DataFrame
    .from_dict(dict(location_entity_dict))
    .fillna(value=0)
)

loaction_entity_df

Unnamed: 0,plo,fatah,hamas,taliban
beirut,9.0,1.0,1.0,0.0
israel,38.0,21.0,31.0,0.0
iraq,8.0,1.0,0.0,0.0
palestine,1.0,0.0,0.0,0.0
gaza,0.0,11.0,70.0,0.0
kabul,0.0,0.0,0.0,35.0
pakistan,0.0,0.0,0.0,17.0
afghanistan,0.0,0.0,0.0,264.0
