Skip to content

Commit 01ba41c

Browse files
committed
convert log parser to python3
1 parent 23f9ad3 commit 01ba41c

1 file changed

Lines changed: 230 additions & 0 deletions

File tree

CPScripts/access-logparser.py

Lines changed: 230 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,230 @@
1+
#!/usr/bin/python
2+
# -*- coding: utf-8 -*-
3+
# Originally based on code from: https://leancrew.com/all-this/2013/07/parsing-my-apache-logs/
4+
5+
import os
6+
import re
7+
import sys
8+
from collections import Counter
9+
from datetime import datetime, date, timedelta
10+
11+
12+
# print('version is', sys.version)
13+
14+
def detectcontrolpanel():
    """Detect which hosting control panel is installed on this machine.

    Probes the filesystem for known control panel executables. The result is
    stored in the module-level ``controlpanel`` global and also returned.

    Returns:
        str: ``'cpanel'`` if ``/usr/local/cpanel/cpanel`` exists,
        ``'cyberpanel'`` if ``/usr/bin/cyberpanel`` exists (this one wins
        when both are present), otherwise ``'Control Panel not found'``.
    """
    global controlpanel

    # Bind the fallback first so the global always has a value; previously
    # it stayed undefined (NameError on return) when no marker file existed.
    controlpanel = 'Control Panel not found'

    # Probed in order; a later match overrides an earlier one.
    markers = (
        ('/usr/local/cpanel/cpanel', 'cpanel'),
        ('/usr/bin/cyberpanel', 'cyberpanel'),
    )
    for marker_path, panel_name in markers:
        if os.path.isfile(marker_path):
            controlpanel = panel_name

    return controlpanel
28+
29+
30+
def main():
    """Print a one-day traffic summary for an Apache access log.

    Command line (read from ``sys.argv``)::

        <script> <daysAgo> <access_log>

    ``daysAgo`` selects the day to report on, counted back from today; a
    non-integer value falls back to 1 (yesterday). ``access_log`` is the
    path of the log file, expected in the combined log format described
    next to the regex below.

    Four reports are printed to stdout for the selected day: top 10 pages,
    top 10 referrers, top 10 client IPs and a WordPress login/admin hit
    count.

    Raises:
        SystemExit: when fewer than two command-line arguments are given.
        OSError: when the log file cannot be opened.
    """
    script = sys.argv[0]
    if len(sys.argv) < 3:
        sys.exit('Usage: %s <daysAgo> <access_log>' % script)
    filename = sys.argv[2]

    # Define the day of interest in the Apache common log format.
    try:
        daysAgo = int(sys.argv[1])
    except ValueError:
        daysAgo = 1
    theDay = date.today() - timedelta(daysAgo)
    apacheDay = theDay.strftime('[%d/%b/%Y:')

    # Regex for the Apache combined log format.
    #   host        %h               client ip/hostname       172.68.142.138
    #   ident       %l (unused)      identd client identity   -
    #   user        %u               HTTP authenticated user  -
    #   time        %t               timestamp                [09/Mar/2019:00:38:03 -0600]
    #   request     "%r"             method, resource, proto  "POST /wp-login.php HTTP/1.1"
    #   status      %>s              status code              404
    #   size        %b (can be '-')  bytes, excluding headers 3767
    #   referrer    "%{Referer}i"    Referer header           "https://www.google.com/"
    #   user agent  "%{User-agent}i" User-Agent header        "Mozilla/5.0 (...)"
    parts = [
        r'(?P<host>\S+)',
        r'\S+',
        r'(?P<user>\S+)',
        r'\[(?P<time>.+)\]',
        r'"(?P<request>.*)"',
        r'(?P<status>[0-9]+)',
        r'(?P<size>\S+)',
        r'"(?P<referrer>.*)"',
        r'"(?P<agent>.*)"',
    ]
    pattern = re.compile(r'\s+'.join(parts) + r'\s*\Z')

    # Regex for a feed request.
    feed = re.compile(r'/all-this/(\d\d\d\d/\d\d/[^/]+/)?feed/(atom/)?')

    # Regexes for internal and Google search referrers.
    internal = re.compile(r'https?://(www\.)?example\.com.*')
    google = re.compile(r'https?://(www\.)?google\..*')

    # Regexes for uptime monitoring robots.
    uptimeroboturl = re.compile(r'https?://(www\.)?uptimerobot\..*')
    uptimerobot = re.compile(r'UptimeRobot')

    # WordPress CMS regexes (the cron/xmlrpc/ajax ones are not used by any
    # report yet).
    wordpresslogin = re.compile(r'wp-login\.php.*')
    wordpressadmin = re.compile(r'wp-admin')
    wordpresscron = re.compile(r'wp-cron\.php.*')
    wordpressxmlrpc = re.compile(r'xmlrpc\.php')
    wordpressajax = re.compile(r'admin-ajax\.php')

    def pythonized(d):
        """Convert the string fields of a parsed hit into Python types.

        Mutates and returns ``d``: ``request`` is reduced to the requested
        resource, ``'-'`` becomes ``None`` for user/referrer/agent and ``0``
        for size, and ``time`` becomes a naive ``datetime``.
        """
        # Keep only the resource part of "METHOD resource PROTOCOL".
        d['request'] = d['request'].split()[1]

        # Some dashes become None.
        for k in ('user', 'referrer', 'agent'):
            if d[k] == '-':
                d[k] = None

        # The size dash becomes 0.
        d['size'] = 0 if d['size'] == '-' else int(d['size'])

        # Convert the timestamp into a datetime object, dropping the zone
        # offset (i.e. accept the server's time zone).
        stamp, _zone = d['time'].split()
        d['time'] = datetime.strptime(stamp, '%d/%b/%Y:%H:%M:%S')

        return d

    def ispage(hit):
        """Return True when the hit is a successful, non-HEAD page view.

        Side effect: ``hit['status']`` is converted to ``int``.
        """
        # Failures and redirects.
        hit['status'] = int(hit['status'])
        if hit['status'] < 200 or hit['status'] >= 300:
            return False

        # Feed requests.
        if feed.search(hit['request']):
            return False

        # Malformed request lines (no resource field) cannot be pages.
        fields = hit['request'].split()
        if len(fields) < 2:
            return False
        method, resource = fields[0], fields[1]

        # Images, sounds, etc.: anything whose path does not end in '/'.
        if not resource.endswith('/'):
            return False

        # HEAD requests, typically uptime monitoring. The previous check
        # compared a 3-character slice to 'HEAD' and so never matched.
        if method == 'HEAD':
            return False

        # Must be a page.
        return True

    def goodref(hit):
        """Is the referrer interesting? Internal and Google ones are not."""
        if hit['referrer']:
            return not (google.search(hit['referrer'])
                        or internal.search(hit['referrer']))
        return False

    def goodagent(hit):
        """Is the user agent interesting? Uptime robots and hits with no
        user agent are not."""
        if hit['agent']:
            return not (uptimerobot.search(hit['agent'])
                        or uptimeroboturl.search(hit['agent']))
        return False

    def wordpressbrute(hit):
        """Is the request a WordPress login or admin URL? (truthy/falsy)"""
        if hit['request']:
            return (wordpresslogin.search(hit['request'])
                    or wordpressadmin.search(hit['request']))
        return False

    # Parse all the lines associated with the day of interest.
    pages = []
    # errors='replace' keeps stray non-decodable bytes in a log line from
    # aborting the whole run; 'with' closes the file even on error.
    with open(filename, errors='replace') as log:
        for line in log:
            if apacheDay not in line:
                continue
            m = pattern.match(line)
            if m is None:
                # Line mentions the day but is not in the expected format.
                continue
            hit = m.groupdict()
            if ispage(hit):
                pages.append(pythonized(hit))

    dayLabel = theDay.strftime('%b %d, %Y')

    # Show the top 10 pages and the total.
    print('Show top 10 pages %s' % dayLabel)
    pageviews = Counter(x['request'] for x in pages if goodagent(x))
    for request, count in pageviews.most_common(10):
        print(' %5d %s' % (count, request))
    print(' %5d total' % len(pages))

    # Show the top 10 referrers.
    print('\nShow top 10 referrers %s' % dayLabel)
    referrers = Counter(x['referrer'] for x in pages if goodref(x))
    for referrer, count in referrers.most_common(10):
        print(' %5d %s' % (count, referrer))
    print(' %5d total' % sum(referrers.values()))

    # Show the top 10 IPs.
    print('\nShow Top 10 IPs %s' % dayLabel)
    iphits = Counter(x['host'] for x in pages if goodagent(x))
    for host, count in iphits.most_common(10):
        print(' %5d %s' % (count, host))
    print(' %5d total hits' % sum(iphits.values()))

    # CMS checks: WordPress login bruteforcing.
    # NOTE(review): this counts over `pages`, which only holds 2xx requests
    # whose path ends in '/', so a bare /wp-login.php request is never
    # counted here -- confirm whether that is intended.
    print('\nWordpress Bruteforce Logins for wp-login.php %s' % dayLabel)
    wordpressloginhits = Counter(
        x['request'] for x in pages if wordpressbrute(x))
    print(' %5d total' % sum(wordpressloginhits.values()))
226+
227+
228+
# Script entry point: run the report only when executed directly, not when
# imported. Control panel detection is currently switched off.
if __name__ == '__main__':
    # detectcontrolpanel()
    main()

0 commit comments

Comments
 (0)