## txt_dl.py
## Written by Ye Kyaw Thu, LU Lab., Myanmar
## for text mining with a keyword
## last updated: 14 Nov 2023
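##
## Example usage (illustrative only; the URL, keyword, and filenames are placeholders):
##   python txt_dl.py --url https://example.com/page --keyword မြန်မာ --num_sentences 20
##   python txt_dl.py --url_file urls.txt --keyword မြန်မာ --output_filename sentences.txt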
import requests
from bs4 import BeautifulSoup
import re
import argparse


def download_sentences(url, keyword, num_sentences):
    # Send a GET request to the website
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract all text and split into sentences
    text = soup.get_text()
    sentences = re.split(r'[\u104A\u104B]+', text)  # Split on PotHtee and PotMa

    # Filter sentences that contain the keyword
    keyword_sentences = [sentence for sentence in sentences if keyword in sentence]

    # Limit the number of sentences if specified
    return keyword_sentences[:num_sentences]
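
# A minimal sketch of calling download_sentences() directly (the URL and keyword
# below are illustrative placeholders, not part of the original script):
#   sentences = download_sentences('https://example.com/article', 'မြန်မာ', 5)
#   # -> up to 5 sentences from the page that contain the keyword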


def process_urls(urls, keyword, num_sentences, output_filename):
    output_data = []
    for url in urls:
        sentences = download_sentences(url, keyword, num_sentences)
        output_data.extend(sentences)

    # Write to the output file if one was given, otherwise print to stdout
    if output_filename:
        with open(output_filename, 'w', encoding='utf-8') as file:
            for sentence in output_data:
                file.write(sentence + '\n')
    else:
        for sentence in output_data:
            print(sentence)


def main():
    parser = argparse.ArgumentParser(description='Download sentences from webpages containing a specific keyword.')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--url', type=str, help='URL of the webpage to scrape')
    group.add_argument('--url_file', type=str, help='File containing URLs to scrape, one per line')
    parser.add_argument('--keyword', type=str, required=True, help='Keyword to search in sentences')
    parser.add_argument('--num_sentences', type=int, default=10, help='Number of sentences to return')
    parser.add_argument('--output_filename', type=str, help='File to save the downloaded sentences (optional, prints to stdout if not specified)')
    args = parser.parse_args()

    # Build the list of URLs from either --url or --url_file
    if args.url:
        urls = [args.url]
    else:
        with open(args.url_file, 'r') as file:
            urls = [line.strip() for line in file.readlines()]

    process_urls(urls, args.keyword, args.num_sentences, args.output_filename)


if __name__ == '__main__':
    main()