Skip to content

Commit 406ef1d

Browse files
committed
edit link extractor tutorial
1 parent 272ee58 commit 406ef1d

File tree

2 files changed

+8
-2
lines changed

2 files changed

+8
-2
lines changed

web-scraping/link-extractor/link_extractor.py

+4 -1
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
GREEN = colorama.Fore.GREEN
1010
GRAY = colorama.Fore.LIGHTBLACK_EX
1111
RESET = colorama.Fore.RESET
12+
YELLOW = colorama.Fore.YELLOW
1213

1314
# initialize the set of links (unique links)
1415
internal_urls = set()
@@ -62,7 +63,7 @@ def get_all_website_links(url):
6263
return urls
6364

6465

65-
def crawl(url, max_urls=50):
66+
def crawl(url, max_urls=30):
6667
"""
6768
Crawls a web page and extracts all links.
6869
You'll find all links in `external_urls` and `internal_urls` global set variables.
@@ -71,6 +72,7 @@ def crawl(url, max_urls=50):
7172
"""
7273
global total_urls_visited
7374
total_urls_visited += 1
75+
print(f"{YELLOW}[*] Crawling: {url}{RESET}")
7476
links = get_all_website_links(url)
7577
for link in links:
7678
if total_urls_visited > max_urls:
@@ -93,6 +95,7 @@ def crawl(url, max_urls=50):
9395
print("[+] Total Internal links:", len(internal_urls))
9496
print("[+] Total External links:", len(external_urls))
9597
print("[+] Total URLs:", len(external_urls) + len(internal_urls))
98+
print("[+] Total crawled URLs:", max_urls)
9699

97100
domain_name = urlparse(url).netloc
98101

web-scraping/link-extractor/link_extractor_js.py

+4 -1
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@
99
GREEN = colorama.Fore.GREEN
1010
GRAY = colorama.Fore.LIGHTBLACK_EX
1111
RESET = colorama.Fore.RESET
12+
YELLOW = colorama.Fore.YELLOW
1213

1314
# initialize the set of links (unique links)
1415
internal_urls = set()
@@ -71,7 +72,7 @@ def get_all_website_links(url):
7172
return urls
7273

7374

74-
def crawl(url, max_urls=50):
75+
def crawl(url, max_urls=30):
7576
"""
7677
Crawls a web page and extracts all links.
7778
You'll find all links in `external_urls` and `internal_urls` global set variables.
@@ -80,6 +81,7 @@ def crawl(url, max_urls=50):
8081
"""
8182
global total_urls_visited
8283
total_urls_visited += 1
84+
print(f"{YELLOW}[*] Crawling: {url}{RESET}")
8385
links = get_all_website_links(url)
8486
for link in links:
8587
if total_urls_visited > max_urls:
@@ -102,6 +104,7 @@ def crawl(url, max_urls=50):
102104
print("[+] Total Internal links:", len(internal_urls))
103105
print("[+] Total External links:", len(external_urls))
104106
print("[+] Total URLs:", len(external_urls) + len(internal_urls))
107+
print("[+] Total crawled URLs:", max_urls)
105108

106109
domain_name = urlparse(url).netloc
107110

0 commit comments

Comments
 (0)