-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataloader.py
54 lines (40 loc) · 1.7 KB
/
dataloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import os
from bs4 import BeautifulSoup
def get_data(path, target_categories=("earn", "acq", "grain", "crude", "money-fx")):
    """Load single-label documents from the ``.sgm`` files in *path*.

    Every file whose name ends with ``.sgm`` is read in sorted filename order
    and parsed with BeautifulSoup's ``html.parser``.  The text of each
    ``<body>`` tag is paired with the ``<d>`` entries of a ``<topics>`` tag.
    A document is kept only when exactly one of its topic entries belongs to
    *target_categories*; that entry becomes its label.

    Args:
        path: Directory containing the ``.sgm`` files.
        target_categories: Topic names that are accepted as labels.  Defaults
            to the five categories this loader was originally hard-coded to.

    Returns:
        A ``(documents, categories)`` tuple of two equal-length lists.
        ``documents[i]`` is the body text and ``categories[i]`` is a
        one-element list holding that document's target category.
    """
    targets = frozenset(target_categories)

    # Keyed by body text: if the same text appears again, the later topics
    # replace the earlier ones while the entry keeps its original position.
    topics_by_text = {}

    for file_name in sorted(os.listdir(path)):
        if not file_name.endswith('.sgm'):
            continue

        # Context manager so the handle is released even if parsing raises.
        file_path = os.path.join(path, file_name)
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as sgm_file:
            soup = BeautifulSoup(sgm_file.read(), 'html.parser')

        # NOTE(review): the i-th <body> is paired with the i-th <topics> of the
        # file.  If any document lacks one of the two tags, every later pair
        # in that file shifts -- TODO confirm against the corpus.
        body_tags = soup.find_all('body')
        topic_tags = soup.find_all('topics')
        for body_tag, topic_tag in zip(body_tags, topic_tags):
            topics = [entry.text.strip() for entry in topic_tag.find_all('d')]
            topics_by_text[body_tag.get_text()] = topics

    documents = []
    categories = []
    for text, topics in topics_by_text.items():
        # Unlabelled documents, documents with no target topic, and documents
        # with more than one target topic are all dropped by this one check.
        matched = [topic for topic in topics if topic in targets]
        if len(matched) == 1:
            documents.append(text)
            categories.append(matched)

    return documents, categories