Added regex for watchlist.names. Added test case for verifying the wa…

…tchlist.names. Need to plug this search in, but will do this after I wrap up the other handful of regexes needed.
truedat101 · May 22, 2009 · acdf862 · acdf862
1 parent 1bafcf6
commit acdf862
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 2 deletions.
diff --git a/examples/luffaproject.conf b/examples/luffaproject.conf
@@ -6,6 +6,11 @@
 #
 # Project Setup
 #
+# fullname - the name of the project, used for reporting purposes
+# path.uri - the path to the base of the code.  Currently this supports only one path.
+# source.ext.whitelist - The list of extensions for files you want to scan.  Other files are ignored.  There is no way as of 
+#								 yet to handle search on binary files.  Not sure if this makes sense for our purposes.
+#								 You could potentially scan symbols in object files and strings in classfiles, but naaaah.
 project.fullname=Project Luffa
 project.path.uri=../../../../src
 project.source.ext.whitelist=.py,.doc,.txt
@@ -27,9 +32,19 @@ license.gpl.v3=foo
 # 
 # Watchlist should be stuff that we want to flag in a report, typically 
 # Format should be watchlist.KEY=REGEXP
-watchlist.names=REGEXP
+# Python raw string notation will be appended to whatever you use for the regex.  I had the raw string notation here 
+# originally, but had problems with the string escaping.  
+#
+# names - should be list of known names of team members, aliases, nicknames
+# companies - any company names possibly referenced.  This is a good idea if you've acquired a company, or you think your
+# 				  dev team might have lifted code from someone else inadvertently (yikes)
+# badwords - we all know what these are...don't use these words in your code unless you are creating apps for the adult industry.
+# 				 The concrete 5 project has a nice list of bad words.  
+# emailaddresses - a * indicates search for all email addresses.  Otherwise, only search for email addresses in the list
+watchlist.names=(David|Mike|Truedat)
 watchlist.companies=REGEXP
-watchlist.words=REGEXP
+watchlist.badwords=REGEXP
+watchlist.emailaddresses=*
 
 #
 # Reports

diff --git a/src/razortooth/luffa/tools/scan.py b/src/razortooth/luffa/tools/scan.py
@@ -36,6 +36,7 @@
 import os
 import unittest
 import string
+import re
 
 class scan:
     confFile = 0
@@ -83,6 +84,7 @@ def deepScan(self, currentPath):
                 # print "about to deep scan %s" % os.path.join(currentPath, f)
                 self.deepScan(os.path.join(currentPath, f))
         else:
+            # XXX TODO Make sure this section handles double byte character encodings
             extList = self.luffaProjectEnv.get("project.source.ext.whitelist")
             for ext in extList.split(","): # Convert this to a regex, more efficient
                 if (currentPath.endswith(ext)): # XXX TODO FIX this to handle upper case
@@ -108,6 +110,15 @@ def testInitEnv(self):
     def testDeepScan1(self):
         propsRead = self.aLuffa.initEnv("../../../../examples/luffaproject.conf")
         self.aLuffa.deepScan(str(self.aLuffa.luffaProjectEnv["project.path.uri"]).rstrip()) # Watch the newlines.  Why?
+    def testWatchlistNames(self):
+        propsRead = self.aLuffa.initEnv("../../../../examples/luffaproject.conf")
+        pattern = self.aLuffa.luffaWatchlistEnv.get('watchlist.names').rstrip()
+        print "loaded watchlist.names pattern = %s" % pattern
+        p = re.compile(r"" + pattern + "", re.IGNORECASE)
+        result1 = p.findall("Mike and David are cool")
+        self.assert_(result1 > 0)
+        print result1
+        self.assert_(len(result1) == 2)
     def tearDown(self):
         print "tearing down"
 if __name__ == '__main__':