-
Notifications
You must be signed in to change notification settings - Fork 0
/
knowledge_base_constructing.py
96 lines (79 loc) · 3.66 KB
/
knowledge_base_constructing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
from lxml import etree
from bs4 import BeautifulSoup as bs
def get_file(path):
    """Return the names of the entries directly inside ``path``.

    Args:
        path: Directory to list.

    Returns:
        A list of entry names (not full paths), in the order
        ``os.listdir`` reports them.
    """
    return os.listdir(path)
def write2triple(subject,predicate,object,kb_path='weaponKB.txt'):
    """Append one ``subject predicate object`` line to the knowledge-base file.

    Args:
        subject: Subject of the triple.
        predicate: Relation or attribute name.
        object: Value of the triple. The name shadows the builtin, but it is
            kept so existing keyword callers keep working.
        kb_path: File to append to. Defaults to ``weaponKB.txt`` in the
            current working directory, which is what the function always
            used before this parameter existed.
    """
    # Append mode: every call adds a line, so repeated runs accumulate triples.
    with open(kb_path, 'a', encoding='utf-8') as f:
        f.write(f"{subject} {predicate} {object}\n")
def _clean_triple_text(fragments):
    """Join text fragments and remove the spaces/newlines that would split a triple line."""
    return "".join(fragments).strip().replace(" ", "").replace("\n", "")


def _write_attribute_pairs(weapon, predicates, values):
    """Write and echo one triple per label/value pair, dropping colons from the label."""
    # NOTE(review): zip pairs labels and values purely by position; this assumes
    # each <li> yields exactly one label and one value text node - confirm
    # against the crawled pages.
    for predicate, value in zip(predicates, values):
        label = predicate.replace(":", "")
        write2triple(weapon, label, value)
        print(weapon, label, value)


def theFinalWeapon(father_dir,weapon):
    """Extract knowledge-base triples for one weapon from its saved HTML page.

    Every extracted field is passed to ``write2triple`` and echoed with
    ``print``. Fields that are absent from the page are skipped.

    Args:
        father_dir: Path of the weapon's HTML file (a file, despite the name).
        weapon: Weapon name, used as the subject of every triple.

    Returns:
        0 when the page is skipped (1000 bytes or smaller, or no parsable
        tree); otherwise None.
    """
    # Pages of 1000 bytes or less are skipped; check before opening the file.
    if os.path.getsize(father_dir) <= 1000:
        return 0
    with open(father_dir, 'r', encoding='utf-8') as f:
        content = f.read()
    root = etree.HTML(content)
    if root is None:
        # Guard against the parser producing no tree, which would otherwise
        # surface as an AttributeError on the first xpath call.
        return 0

    # Country of origin. Guarded so a page without it no longer raises
    # IndexError and aborts the whole crawl.
    country = root.xpath("//span[@class = 'country']/b/a/text()")
    if country:
        write2triple(weapon,'国籍',country[0])
        print(weapon,'国籍',country[0])

    # Description. The triple file is space-delimited, so the written text is
    # stripped of spaces and newlines exactly like the echoed text.
    description = _clean_triple_text(root.xpath(
        "//div[@class = 'intron']/div[@class = 'module']/text() | //div[@class = 'intron']/div[@class = 'module']/p/text()"))
    write2triple(weapon,'简介',description)
    print(weapon,'简介',description)

    # First "otherList" section (structure / usage): its heading is the
    # predicate and its paragraphs are the object.
    infobox_object = root.xpath("//div[@class = 'info']/div/div[@class = 'otherList'][1]/div[@class='textInfo']/p/text()")
    infobox_predicate = root.xpath("//div[@class = 'info']/div/div[@class = 'otherList'][1]/h3[@class='title_']/text()")
    if infobox_predicate:
        section_text = _clean_triple_text(infobox_object)
        write2triple(weapon,infobox_predicate[0],section_text)
        print(weapon,infobox_predicate[0],section_text)

    # Basic data: first <ul> under dataInfo.
    _write_attribute_pairs(
        weapon,
        root.xpath("//div[@class = 'dataInfo']/ul[1]/li/span/text()"),
        root.xpath("//div[@class = 'dataInfo']/ul[1]/li/text()"),
    )

    # Technical data: <ul class="dataList"> under dataInfo.
    _write_attribute_pairs(
        weapon,
        root.xpath("//div[@class = 'dataInfo']/ul[@class='dataList']/li/span/text()"),
        root.xpath("//div[@class = 'dataInfo']/ul[@class='dataList']/li/text() | //div[@class = 'dataInfo']/ul[@class='dataList']/li/b/text()"),
    )
def weaponlist(father_dir,father):
    """Record every weapon page in a category directory and extract its triples.

    For each entry in ``father_dir`` this writes a ``father 子类 <weapon>``
    triple and then hands the page to ``theFinalWeapon``.

    Args:
        father_dir: Directory holding the weapon HTML pages of one category.
        father: Category name, used as the subject of the subclass triples.
    """
    for weaponName in get_file(father_dir):
        # The weapon's name is its file name without the ".html" part.
        weapon = weaponName.replace(".html","")
        write2triple(father,'子类',weapon)
        # os.path.join instead of a hard-coded "\\" so this also runs off Windows.
        theFinalWeapon(os.path.join(father_dir, weaponName), weapon)
def weaponclass(father_dir,father):
    """Record every sub-category of a weapon class and descend into it.

    For each entry in ``father_dir`` this writes a ``father 子类 <entry>``
    triple and then processes that entry as a directory with ``weaponlist``.

    Args:
        father_dir: Directory of one weapon class.
        father: Class name, used as the subject of the subclass triples.
    """
    for weaponName in get_file(father_dir):
        write2triple(father,'子类',weaponName)
        # os.path.join instead of a hard-coded "\\" so this also runs off Windows.
        weaponlist(os.path.join(father_dir, weaponName), weaponName)
def start():
    """Build the knowledge base from the ``weapon`` directory under the CWD.

    Each entry of ``<cwd>/weapon`` is recorded as a ``武器 子类 <entry>``
    triple and then processed as a weapon-class directory with
    ``weaponclass``.
    """
    # Build the root once, with os.path.join instead of hard-coded "\\"
    # separators, so the crawl also runs off Windows.
    weapon_root = os.path.join(os.getcwd(), "weapon")
    for weaponName in get_file(weapon_root):
        write2triple('武器','子类',weaponName)
        weaponclass(os.path.join(weapon_root, weaponName), weaponName)
# Script entry point: build the knowledge base only when run directly.
if "__main__" == __name__:
    start()