Added feature to remove redundant logging
z7r1k3 committed Mar 24, 2021
1 parent a5a1b9c commit e9f46f4
Showing 2 changed files with 54 additions and 21 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -1,5 +1,5 @@
-# Creeper v1.2.1
-A Web Crawler and Scraper, built in Python 3.8.
+# Creeper v1.3.0
+A Web Crawler and Scraper, built in Python 3.9.1

 Works with HTTP(S) and FTP(S) links.

71 changes: 52 additions & 19 deletions creeper.py
@@ -1,4 +1,4 @@
-# Version 1.2.1
+# Version 1.3.0
 from bs4 import BeautifulSoup
 import datetime
 import re
@@ -19,12 +19,11 @@
 ogUrl = ''
 ogUrlDomain = ''
 totalDepth = 0
-alreadyCrawled = []
+alreadyCrawled = {}
 urlList = {}
 emailList = []
 phoneList = []
 errorCount = 0
-errorLog = None


 class CrawlJob:
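
The list-to-dict change above is the backbone of this commit: the crawler now remembers not just that a URL was visited but how deep that visit went, so a later encounter can tell whether a recrawl could still discover anything. A minimal sketch of the idea (names and the URL are illustrative, not Creeper's API):

alreadyCrawledOld = []       # v1.2.x: remembers membership only
alreadyCrawledNew = {}       # v1.3.0: maps URL -> depth it was crawled at

alreadyCrawledOld.append('example.com/docs')
alreadyCrawledNew['example.com/docs'] = 2

print('example.com/docs' in alreadyCrawledNew)    # True: membership, as before
print(alreadyCrawledNew['example.com/docs'] < 3)  # True: a depth-3 visit would reach further
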
@@ -92,9 +91,9 @@ def crawl(depth, url):
             if (currentCrawlJob.depth > 1 and urlStrip(parsedUrl).startswith(ogUrlDomain) and urlStrip(parsedUrl) != urlStrip(ogUrl)):
                 crawl(currentCrawlJob.depth - 1, parsedUrl)

-        alreadyCrawled.append(currentCrawlJob.checkLink)
+        alreadyCrawled[currentCrawlJob.checkLink] = currentCrawlJob.depth

-    elif (currentCrawlJob.depth > 0): # If URL has already been crawled, use the previously stored URL's
+    elif (currentCrawlJob.depth > 0 and isQualifiedRelog(currentCrawlJob.depth, currentCrawlJob.checkLink)): # If URL has already been crawled, use the previously stored URL's if redundant logging is enabled or URL has higher depth.
         for url in urlList[currentCrawlJob.checkLink]:

             if (isQualifiedLink(url)):
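
To see the control flow this hunk changes in isolation, here is a toy, self-contained version: a first visit crawls and records its depth, while a repeat visit replays the stored links without refetching, and only when qualified. The link graph and names below are made up for illustration.

urlList = {'a': ['b', 'c'], 'b': ['c'], 'c': []}  # pretend page -> links found on it
alreadyCrawled = {}
relog = False

def crawl(depth, url):
    if url not in alreadyCrawled:
        for link in urlList[url]:
            print('    ' * (3 - depth) + link)    # log with depth-based indent
            if depth > 1:
                crawl(depth - 1, link)
        alreadyCrawled[url] = depth               # record how deep this visit went
    elif depth > 0 and (relog or depth > alreadyCrawled[url]):
        for link in urlList[url]:                 # replay stored links, no refetch
            print('    ' * (3 - depth) + link)

crawl(3, 'a')
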
@@ -196,6 +195,21 @@ def isQualifiedEmail(url): # Return boolean on whether the passed item is a vali
     return False


+def isQualifiedInput(depth, scrape, save, relog, displayLevel):
+    binaryInputChecklist = [scrape, save, relog]
+    isBinaryInput = True
+    isDisplayLevel = True
+
+    for u in binaryInputChecklist:
+        if (not u.lower().startswith('y') and not u.lower().startswith('n')):
+            isBinaryInput = False
+
+    if (not isinstance(displayLevel, int) or (displayLevel < 0 or displayLevel > 2)):
+        isDisplayLevel = False
+
+    return isinstance(depth, int) and isBinaryInput and isDisplayLevel
+
+
 def isQualifiedLink(url): # Return boolean on whether the passed item is crawlable or not (i.e. not a mailto: or .mp3 file)
     if (urlStrip(url).endswith('..')): return False # Back links

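Assuming the isQualifiedInput added above is in scope, its contract can be exercised with hypothetical values: depth and displayLevel must be ints, displayLevel must fall in the range 0-2, and each yes/no answer must start with 'y' or 'n' in either case.

print(isQualifiedInput(3, 'yes', 'N', 'y', 2))    # True
print(isQualifiedInput(None, 'y', 'n', 'y', 1))   # False: depth failed the int() conversion
print(isQualifiedInput(3, 'maybe', 'n', 'y', 1))  # False: not a y/n answer
print(isQualifiedInput(3, 'y', 'n', 'y', 5))      # False: displayLevel out of range
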
@@ -222,6 +236,10 @@ def isQualifiedPhone(url): # Return boolean on whether the passed item is a vali
     return False


+def isQualifiedRelog(depth, checkLink):
+    return relog or depth > alreadyCrawled[checkLink] # If depth is greater than when previously crawled, there is more to be discovered, hence the recrawl
+
+
 def isWebFile(url): # Return boolean on whether the passed URL ends with one of the extensions in fileEndings or not
     if (url.endswith('/')): url = url[:-1] # Remove last '/' if applicable

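isQualifiedRelog leans on the module-level relog flag and the depths stored in alreadyCrawled; a standalone rendering of the same decision, with illustrative values, behaves like this:

relog = False
alreadyCrawled = {'example.com/page': 2}   # previously crawled at depth 2

def isQualifiedRelog(depth, checkLink):
    # Recrawl if redundant logging was requested, or if the current depth
    # budget exceeds the depth recorded on the earlier visit.
    return relog or depth > alreadyCrawled[checkLink]

print(isQualifiedRelog(3, 'example.com/page'))   # True: deeper than before
print(isQualifiedRelog(2, 'example.com/page'))   # False: nothing new to find
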
@@ -236,11 +254,8 @@ def log(depth, entry): # entry can be either a string (URL, Phone, Email) or an
     indent = ''

     if (type(entry) is Error):
-        global errorLog
         errorMessage = 'ERROR ' + str(entry.count) + '.' + str(entry.code) + ': ' + entry.message + ' | ' + entry.url

-        if (errorLog == None): errorLog = open(logPath.error, 'w+')
-
         if (displayLevel > 0): print(errorMessage)

         errorLog.write(errorMessage)
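
These deletions pair with the unconditional open added near the end of the file: since errorLog is now created once at startup, log() no longer needs the global declaration or the lazy None check. Schematically (the file name here is illustrative):

errorLog = None                       # old pattern: open lazily on first error
def logErrorOld(message):
    global errorLog
    if errorLog is None:
        errorLog = open('error.log', 'w+')
    errorLog.write(message)

errorLog = open('error.log', 'w+')    # new pattern: open once at startup
def logErrorNew(message):
    errorLog.write(message)           # writers can assume the file exists
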
@@ -368,40 +383,57 @@ def urlStrip(url): # Returns the bare URL after removing http, https, www, etc.

 # Get user variables
 urlInputList = input('\nPlease enter the target URL(s), separated by spaces:\n').split(' ')
-totalDepth = int(input('\nPlease enter how many levels deep the crawler should go:\n'))
-scrape = input('''
+
+while True:
+    try:
+        totalDepth = int(input('\nPlease enter how many levels deep the crawler should go:\n'))
+    except Exception as e:
+        totalDepth = None
+
+    scrape = input('''
 Do you want to scrape for emails and phone numbers?
 y: yes
 n: no
 ''')

-scrape = scrape.lower() == 'y' or scrape.lower() == 'yes'
-
-save = input('''
+    save = input('''
 Would you like to save all data to files in the /logs folder?
 y: yes
 n: no
 ''')

-save = save.lower() == 'y' or save.lower() == 'yes'
-
-displayLevel = int(input('''
+    relog = input('''
+Would you like to log redundant URL's?
+y: yes (Preserves original tree structure)
+n: no (Reduces overall crawling duration)
+''')
+
+    displayLevel = int(input('''
 Please select a logging display option:
 0: Quiet
 1: Standard
 2: Verbose
 '''))

+    if (isQualifiedInput(totalDepth, scrape, save, relog, displayLevel)):
+        break
+    else:
+        print("\n***\nINVALID INPUT\n***\n")
+
+scrape = scrape.lower().startswith('y')
+save = save.lower().startswith('y')
+relog = relog.lower().startswith('y')
+
 logPath = LogPath()

+# Open log files if applicable
+errorLog = open(logPath.error, 'w+')
+
 if (save):
     global urlLog
     urlLog = open(logPath.url, 'w+')

 if (scrape):
     global emailLog
     emailLog = open(logPath.email, 'w+')
     global phoneLog
     phoneLog = open(logPath.phone, 'w+')

 # Begin crawling/scraping
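
The prompts now live in a loop that repeats until isQualifiedInput accepts every answer, and the y/n strings are converted to booleans only after validation. A condensed sketch of the same pattern, trimmed to two questions where the real loop asks five:

while True:
    try:
        totalDepth = int(input('\nHow many levels deep?\n'))
    except Exception:
        totalDepth = None                  # fails the isinstance check below

    relog = input("\nLog redundant URL's? (y/n)\n")

    if isinstance(totalDepth, int) and relog.lower()[:1] in ('y', 'n'):
        break
    print('\n***\nINVALID INPUT\n***\n')

relog = relog.lower().startswith('y')      # convert only after validation
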
@@ -410,6 +442,7 @@ def urlStrip(url): # Returns the bare URL after removing http, https, www, etc.
     ogUrlDomain = getDomain(ogUrl)

     print('\n\n\nCrawling ' + link + '\n')
+
     crawl(totalDepth, link)

 if (displayLevel > 0):
