# 数据读取

## 配置数据路径

In [27]:
import warnings
import jieba

import pandas as pd

warnings.filterwarnings('ignore')

novel_path = 'D:/Code/Python/NLP/word2vec/novel/'
data_path = 'D:/Code/Python/NLP/word2vec/'


D:/Code/Python/NLP/word2vec/novel/


## 读取停用词表

In [28]:
stop_word_file = open(data_path + 'stop_words.txt', 'r', encoding='utf-8')
stop_word = list()
for line in stop_word_file.readlines():
    line = line.strip()
    stop_word.append(line)
stop_word_file.close()

## 读取人物名/武功/门派

In [29]:

people_names_file = open(data_path + "金庸小说全人物.txt", 'r', encoding='utf-8')
people_names = list()
for line in people_names_file.readlines():
    line = line.strip()  # 去掉每行末尾的换行符
    jieba.add_word(line)
    people_names.append(line)
people_names_file.close()

In [30]:
KongFu_file = open(data_path + "金庸小说全武功.txt", 'r', encoding='utf-8')
KongFu = list()
for line in KongFu_file.readlines():
    line = line.strip()  # 去掉每行末尾的换行符
    jieba.add_word(line)
    KongFu.append(line)
KongFu_file.close()

In [31]:
sects_file = open(data_path + "金庸小说全门派.txt", 'r', encoding='utf-8')
sects = list()
for line in sects_file.readlines():
    line = line.strip()  # 去掉每行末尾的换行符
    jieba.add_word(line)
    sects.append(line)
sects_file.close()

# 数据处理

## 分词

In [32]:
import os

novel_names = list(os.listdir(novel_path))

seg_novel = []
for novel_name in novel_names:
    novel = open(novel_path + novel_name, 'r', encoding='utf-8')
    print("Waiting for {}...".format(novel_name))
    line = novel.readline()
    forward_rows = len(seg_novel)
    while line:
        line_1 = line.strip()
        outstr = ''
        line_seg = jieba.cut(line_1, cut_all=False)
        for word in line_seg:
            if word not in stop_word:
                if word != '\t':
                    if word[:2] in people_names:
                        word = word[:2]
                    outstr += word
                    outstr += " "
        if len(str(outstr.strip())) != 0:
            seg_novel.append(str(outstr.strip()).split())
        line = novel.readline()
    novel.close()
    print("{} finished，with {} Row".format(novel_name, (len(seg_novel) - forward_rows)))
    print("-" * 40)
print("-" * 40)
print("-" * 40)
print("All finished，with {} Row".format(len(seg_novel)))

Waiting for 书剑恩仇录.txt...
书剑恩仇录.txt finished，with 3561 Row
----------------------------------------
Waiting for 侠客行.txt...
侠客行.txt finished，with 3514 Row
----------------------------------------
Waiting for 倚天屠龙记.txt...
倚天屠龙记.txt finished，with 7919 Row
----------------------------------------
Waiting for 天龙八部.txt...
天龙八部.txt finished，with 10948 Row
----------------------------------------
Waiting for 射雕英雄传.txt...
射雕英雄传.txt finished，with 7131 Row
----------------------------------------
Waiting for 白马啸西风.txt...
白马啸西风.txt finished，with 597 Row
----------------------------------------
Waiting for 碧血剑.txt...
碧血剑.txt finished，with 3786 Row
----------------------------------------
Waiting for 神雕侠侣.txt...
神雕侠侣.txt finished，with 6999 Row
----------------------------------------
Waiting for 笑傲江湖.txt...
笑傲江湖.txt finished，with 8551 Row
----------------------------------------
Waiting for 越女剑.txt...
越女剑.txt finished，with 197 Row
----------------------------------------
Waiting for 连城诀.txt...
连城诀.tx

# 训练word2vec模型

## 训练skip-gram模型

In [34]:
import gensim.models as w2v

model = w2v.Word2Vec(sentences=seg_novel, vector_size=200, window=10, min_count=5, sg=1)
model.save(data_path + 'data/skip_gram.model')  # 保存模型

## 训练CBOW模型

In [35]:
model = w2v.Word2Vec(sentences=seg_novel, vector_size=200, window=5, min_count=5, sg=0)
model.save(data_path + 'data/cbow.model')

# 关系分析

## 人物间关系分析

In [38]:
def cal_similarity(p1, p2, model_path):
    word2vec_model = w2v.Word2Vec.load(model_path)
    try:
        sim = word2vec_model.wv.similarity(p1, p2)
        return sim
    except Exception:
        return 0


person_list = ["张无忌", "赵敏", "周芷若", "小昭", "殷离", "双儿", "小龙女", "金花婆婆"]

skip_gram_path = "./data/skip_gram.model"
cbow_path = "./data/cbow.model"


### 采用skip_gram模型

In [55]:
import pandas as pd
import itertools

skip_sim_between_person = pd.DataFrame(columns=['person1', 'person2', 'Similarity'])

person_combinations = itertools.combinations(person_list, 2)

for person_pair in person_combinations:
    person1 = person_pair[0]
    person2 = person_pair[1]
    similarity = cal_similarity(person1, person2, skip_gram_path)
    skip_sim_between_person = skip_sim_between_person._append(
        {'person1': person1, 'person2': person2, 'Similarity': similarity}, ignore_index=True)

skip_sim_between_person

Unnamed: 0,person1,person2,Similarity
0,张无忌,赵敏,0.756818
1,张无忌,周芷若,0.697727
2,张无忌,小昭,0.558738
3,张无忌,殷离,0.425713
4,张无忌,双儿,0.200312
5,张无忌,小龙女,0.133034
6,张无忌,金花婆婆,0.383085
7,赵敏,周芷若,0.702674
8,赵敏,小昭,0.558679
9,赵敏,殷离,0.518284


### 采用cbow模型

In [44]:
cbow_sim_between_person = pd.DataFrame(columns=['person1', 'person2', 'Similarity'])

person_combinations = itertools.combinations(person_list, 2)

for person_pair in person_combinations:
    person1 = person_pair[0]
    person2 = person_pair[1]
    similarity = cal_similarity(person1, person2, cbow_path)
    cbow_sim_between_person = cbow_sim_between_person._append(
        {'person1': person1, 'person2': person2, 'sim': similarity}, ignore_index=True)

cbow_sim_between_person

Unnamed: 0,person1,person2,Similarity,sim
0,张无忌,赵敏,,0.748016
1,张无忌,周芷若,,0.675788
2,张无忌,小昭,,0.523435
3,张无忌,殷离,,0.507522
4,张无忌,双儿,,0.508742
5,张无忌,小龙女,,0.663821
6,张无忌,金花婆婆,,0.665959
7,赵敏,周芷若,,0.817713
8,赵敏,小昭,,0.837609
9,赵敏,殷离,,0.787739


## 人物与武功的关系分析

In [45]:
person_list1 = ["段誉", "乔峰", "杨过", "萧峰"]
kongfu_list = ["凌波微步", "北冥神功", "六脉神剑", "一阳指", "降龙十八掌", "打狗棒法", "黯然销魂掌", "寒冰绵掌",
               "独孤九剑"]

### 采用skip_garm模型

In [46]:
skip_garm_sim_person_and_kongfu = pd.DataFrame(columns=['person', 'KongFu', 'Similarity'])

for person in person_list:
    for kongfu in kongfu_list:
        skip_garm_sim_person_and_kongfu.loc[len(skip_garm_sim_person_and_kongfu.index)] = [
            person, kongfu, cal_similarity(person, kongfu, skip_gram_path)
        ]
skip_garm_sim_person_and_kongfu

Unnamed: 0,person,KongFu,Similarity
0,张无忌,凌波微步,0.068881
1,张无忌,北冥神功,0.175137
2,张无忌,六脉神剑,0.142454
3,张无忌,一阳指,0.112489
4,张无忌,降龙十八掌,0.068362
...,...,...,...
67,金花婆婆,降龙十八掌,0.215631
68,金花婆婆,打狗棒法,0.256235
69,金花婆婆,黯然销魂掌,0.355078
70,金花婆婆,寒冰绵掌,0.394853


### 基于cbow模型

In [48]:
cbow_sim_person_and_kongFu = pd.DataFrame(columns=['person', 'KongFu', 'Similarity'])

for person in person_list:
    for kongFu in kongfu_list:
        cbow_sim_person_and_kongFu.loc[len(cbow_sim_person_and_kongFu.index)] = [
            person, kongfu, cal_similarity(person, kongfu, cbow_path)
        ]
cbow_sim_person_and_kongFu

Unnamed: 0,person,KongFu,Similarity
0,张无忌,凌波微步,0.466145
1,张无忌,北冥神功,0.298969
2,张无忌,六脉神剑,0.269821
3,张无忌,一阳指,0.257400
4,张无忌,降龙十八掌,0.222122
...,...,...,...
67,金花婆婆,降龙十八掌,0.463278
68,金花婆婆,打狗棒法,0.466101
69,金花婆婆,黯然销魂掌,0.576117
70,金花婆婆,寒冰绵掌,0.609778


## 分析门派间关系

In [49]:
sects_list = ["雪山派", "青城派", "日月神教"]

### 基于skip_gram模型

In [52]:
skip_gram_sim_sects = pd.DataFrame(columns=['sect1', 'sect2', 'Similarity'])

for sect1 in sects_list:
    all_sim_df = pd.DataFrame(columns=['sect1', 'sect2', 'Similarity'])
    for sect2 in sects:
        all_sim_df.loc[len(all_sim_df.index)] = [
            sect1, sect2, cal_similarity(sect1, sect2, skip_gram_path)
        ]
    all_sim_df.sort_values(by="Similarity", inplace=True, ascending=False)
    skip_gram_sim_sects = skip_gram_sim_sects._append(all_sim_df.iloc[1:13, :], ignore_index = True)

skip_gram_sim_sects

Unnamed: 0,sect1,sect2,Similarity
0,雪山派,华山派,0.662393
1,雪山派,金乌派,0.635169
2,雪山派,长乐帮,0.609558
3,雪山派,五岳剑派,0.599231
4,雪山派,蓬莱派,0.588801
5,雪山派,青城派,0.588599
6,雪山派,仙都派,0.586743
7,雪山派,嵩山派,0.578344
8,雪山派,泰山派,0.565896
9,雪山派,星宿派,0.562297


### 基于cbow模型

In [54]:
cbow_sim_sects = pd.DataFrame(columns=['sect1', 'sect2', 'Similarity'])

for sect1 in sects_list:
    all_sim_df = pd.DataFrame(columns=['sect1', 'sect2', 'Similarity'])
    for sect2 in sects:
        all_sim_df.loc[len(all_sim_df.index)] = [
            sect1, sect2, cal_similarity(sect1, sect2, cbow_path)
        ]
    all_sim_df.sort_values(by="Similarity", inplace=True, ascending=False)
    cbow_sim_sects = cbow_sim_sects._append(all_sim_df.iloc[1:13, :], ignore_index = True)

cbow_sim_sects

Unnamed: 0,sect1,sect2,Similarity
0,雪山派,华山派,0.96939
1,雪山派,青城派,0.95435
2,雪山派,武当派,0.941336
3,雪山派,全真教,0.937639
4,雪山派,峨嵋派,0.93386
5,雪山派,嵩山派,0.921964
6,雪山派,恒山派,0.915982
7,雪山派,逍遥派,0.909161
8,雪山派,衡山派,0.907046
9,雪山派,少林派,0.904526
