1+ #!/usr/bin/python
2+ # -*- coding: utf-8 -*-
3+ # Originally based on code from: https://leancrew.com/all-this/2013/07/parsing-my-apache-logs/
4+
5+ import os
6+ import re
7+ import sys
8+ from collections import Counter
9+ from datetime import datetime , date , timedelta
10+
11+
12+ # print('version is', sys.version)
13+
def detectcontrolpanel():
    """Report which hosting control panel is installed on this machine.

    Looks for the cPanel and CyberPanel executables at the two paths
    checked below. The result is also stored in the module-level global
    ``controlpanel``.

    Returns:
        'cpanel', 'cyberpanel', or 'Control Panel not found' when neither
        executable is present.
    """
    global controlpanel

    # Bind the default first so the name always exists; previously the
    # "not found" case left it unassigned and the return raised NameError.
    controlpanel = 'Control Panel not found'

    # os.path.isfile() returns False for a missing or unreadable path
    # instead of raising, so no try/except is needed around these checks.
    if os.path.isfile('/usr/local/cpanel/cpanel'):
        controlpanel = 'cpanel'

    # Checked last so CyberPanel takes precedence when both are present,
    # matching the original order of assignment.
    if os.path.isfile('/usr/bin/cyberpanel'):
        controlpanel = 'cyberpanel'

    return controlpanel
28+
29+
def main():
    """Print a one-day traffic report for an Apache access log.

    Command line: ``<script> DAYS_AGO LOGFILE``.

    DAYS_AGO selects the day to report on, counted back from today; when
    it is not an integer the report falls back to 1 (yesterday). LOGFILE
    is the path of an access log whose lines follow the nine-field layout
    described next to the regex below.

    The report prints the top 10 pages, referrers and client hosts for the
    day, followed by the number of requests that touched ``wp-login.php``
    or ``wp-admin``. Lines for the chosen day that do not parse are
    skipped.

    Raises:
        SystemExit: when LOGFILE is missing from the command line.
    """
    if len(sys.argv) < 3:
        prog = sys.argv[0] if sys.argv else 'script'
        sys.exit('usage: %s DAYS_AGO LOGFILE' % prog)
    filename = sys.argv[2]

    # Define the day of interest in the Apache log timestamp format.
    try:
        daysAgo = int(sys.argv[1])
    except ValueError:
        daysAgo = 1
    theDay = date.today() - timedelta(daysAgo)
    apacheDay = theDay.strftime('[%d/%b/%Y:')
    reportDay = theDay.strftime('%b %d, %Y')

    # Regex for the Apache log format, one entry per field:
    #   %h                host        client IP/hostname
    #   %l                (unused)    identity reported by identd
    #   %u                user        HTTP authenticated user ID
    #   %t                time        [09/Mar/2019:00:38:03 -0600]
    #   "%r"              request     "POST /wp-login.php HTTP/1.1"
    #   %>s               status      404
    #   %b                size        bytes excluding headers; may be '-'
    #   "%{Referer}i"     referrer    "https://www.google.com/"
    #   "%{User-agent}i"  agent       "Mozilla/5.0 (...)"
    parts = [
        r'(?P<host>\S+)',
        r'\S+',
        r'(?P<user>\S+)',
        r'\[(?P<time>.+)\]',
        r'"(?P<request>.*)"',
        r'(?P<status>[0-9]+)',
        r'(?P<size>\S+)',
        r'"(?P<referrer>.*)"',
        r'"(?P<agent>.*)"',
    ]
    pattern = re.compile(r'\s+'.join(parts) + r'\s*\Z')

    # Regex for a feed request.
    feed = re.compile(r'/all-this/(\d\d\d\d/\d\d/[^/]+/)?feed/(atom/)?')

    # Regexes for internal and Google search referrers.
    internal = re.compile(r'https?://(www\.)?example\.com.*')
    google = re.compile(r'https?://(www\.)?google\..*')

    # Regexes for uptime monitoring robots.
    uptimeroboturl = re.compile(r'https?://(www\.)?uptimerobot\..*')
    uptimerobot = re.compile(r'UptimeRobot')

    # WordPress login / admin regexes.
    wordpresslogin = re.compile(r'wp-login\.php.*')
    wordpressadmin = re.compile(r'wp-admin')

    def pythonized(d):
        """Convert the regex groups in *d* to Python types, in place.

        Splits the request line into ``d['method']`` and ``d['request']``
        (the requested resource), turns '-' placeholders into None (or 0
        for the size), and parses status, size and timestamp. Returns *d*,
        or None when the request line or a numeric/time field is malformed.
        """
        tokens = d['request'].split()
        if len(tokens) < 2:
            return None
        d['method'], d['request'] = tokens[0], tokens[1]

        # Some dashes become None.
        for k in ('user', 'referrer', 'agent'):
            if d[k] == '-':
                d[k] = None

        try:
            d['status'] = int(d['status'])
            # The size dash becomes 0.
            d['size'] = 0 if d['size'] == '-' else int(d['size'])
            # Keep the server's local time; the zone offset is dropped.
            stamp = d['time'].split()[0]
            d['time'] = datetime.strptime(stamp, '%d/%b/%Y:%H:%M:%S')
        except (ValueError, IndexError):
            return None
        return d

    def ispage(hit):
        """Return True when *hit* is a successful request for a page."""
        # Failures and redirects.
        if hit['status'] < 200 or hit['status'] >= 300:
            return False
        # Feed requests.
        if feed.search(hit['request']):
            return False
        # HEAD requests, e.g. uptime monitoring. The original compared a
        # three-character slice with 'HEAD', which could never match.
        if hit['method'] == 'HEAD':
            return False
        # Images, sounds, etc.: only paths ending in '/' count as pages.
        return hit['request'].endswith('/')

    def goodref(hit):
        """Is the referrer interesting? Internal and Google ones are not."""
        ref = hit['referrer']
        if not ref:
            return False
        return not (google.search(ref) or internal.search(ref))

    def goodagent(hit):
        """Is the user agent interesting? An uptime robot is not."""
        agent = hit['agent']
        if not agent:
            return False
        return not (uptimerobot.search(agent) or uptimeroboturl.search(agent))

    def wordpressbrute(hit):
        """Does the request target wp-login.php or wp-admin?"""
        path = hit['request']
        return bool(wordpresslogin.search(path) or wordpressadmin.search(path))

    def showtop10(counter):
        """Print the 10 most common entries of *counter* as count, value."""
        for entry in counter.most_common(10):
            print(' %5d %s' % entry[::-1])

    # Parse all the lines associated with the day of interest.
    pages = []
    wordpresshits = 0
    with open(filename) as log:
        for line in log:
            if apacheDay not in line:
                continue
            m = pattern.match(line)
            if m is None:
                # Not in the expected log format; skip instead of crashing.
                continue
            hit = pythonized(m.groupdict())
            if hit is None:
                continue
            # Counted over every request of the day, not just pages:
            # wp-login.php never ends in '/', so it is never a "page".
            if wordpressbrute(hit):
                wordpresshits += 1
            if ispage(hit):
                pages.append(hit)

    # Show the top 10 pages and the total.
    print('Show top 10 pages %s' % reportDay)
    showtop10(Counter(x['request'] for x in pages if goodagent(x)))
    print(' %5d total' % len(pages))

    # Show the top 10 referrers.
    print('\nShow top 10 referrers %s' % reportDay)
    referrers = Counter(x['referrer'] for x in pages if goodref(x))
    showtop10(referrers)
    print(' %5d total' % sum(referrers.values()))

    # Show the top 10 IPs.
    print('\nShow Top 10 IPs %s' % reportDay)
    iphits = Counter(x['host'] for x in pages if goodagent(x))
    showtop10(iphits)
    print(' %5d total hits' % sum(iphits.values()))

    # CMS checks: WordPress login brute-forcing (wp-login.php / wp-admin).
    print('\nWordpress Bruteforce Logins for wp-login.php %s' % reportDay)
    print(' %5d total' % wordpresshits)
226+
227+
# Script entry point: produce the report only when this file is executed
# directly, not when it is imported. detectcontrolpanel() is not called here.
if __name__ == "__main__":
    main()
0 commit comments