-
Notifications
You must be signed in to change notification settings - Fork 1
/
sampling_domains.py
44 lines (34 loc) · 1.19 KB
/
sampling_domains.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# Sampling from dataset by article url domain
# Yifan Zhang (yzhang@hbku.edu.qa)
import sys
import json
import random
import fileinput
from collections import Counter, defaultdict
from urllib.parse import urlparse
def domain_of(url):
    """Return the hostname of *url*.

    Scheme-less inputs like "example.com/page" parse with no hostname,
    so we fall back to the path component in that case.
    """
    parsed = urlparse(url)
    if parsed.hostname:
        return parsed.hostname
    return parsed.path
# Maximum number of sample pages to keep per domain.
NSAMPLES = 20

domains = Counter()          # domain -> total page count (weighted by domain_count)
samples = defaultdict(list)  # domain -> reservoir of up to NSAMPLES sample pages
eligible = Counter()         # domain -> eligible (non-empty, Arabic) pages seen so far

# "-" reads JSON-lines from stdin; the trailing comma keeps `files` a tuple
# (the original `("-")` was just the string "-", which fileinput tolerates).
for line in fileinput.input(files=("-",), encoding="utf-8"):
    page = json.loads(line)
    # In order to process the result of our own script (merged results),
    # reuse 'domain' and 'domain_count' when the record already carries them.
    domain = page.get('domain', domain_of(page['url']))
    count = page.get('domain_count', 1)
    domains.update({domain: count})
    if page['text'] and \
       page['language'].startswith("ara"):
        # Uniform reservoir sampling (Algorithm R) per domain: every
        # eligible page ends up in the reservoir with equal probability.
        # (The previous scheme evicted the oldest entry with a fixed
        # 1/11 chance, which skewed samples toward recent pages.)
        eligible[domain] += 1
        bucket = samples[domain]
        if len(bucket) < NSAMPLES:
            bucket.append({'domain': domain, **page})
        else:
            slot = random.randrange(eligible[domain])
            if slot < NSAMPLES:
                bucket[slot] = {'domain': domain, **page}
# Emit the sampled pages as JSON lines, most frequent domains first,
# stamping each page with its domain's total observed count.
for dom, total in domains.most_common():
    for sample in samples.get(dom, []):
        sample['domain_count'] = total
        sys.stdout.write(json.dumps(sample, ensure_ascii=False) + "\n")