
Commit

minor fix for tokenization
faraday committed Aug 21, 2010
1 parent d2f0a30 commit 31f0d92
Showing 1 changed file with 2 additions and 3 deletions.
5 changes: 2 additions & 3 deletions scanData.py
@@ -32,8 +32,7 @@
 
 # reToken = re.compile('[a-zA-Z\-]+')
 reToken = re.compile("[^ \t\n\r`~!@#$%^&*()_=+|\[;\]\{\},./?<>:’'\\\\\"]+")
-reAlpha = re.compile("[a-zA-Z\-_]")
-reNum = re.compile("[0-9]")
+reAlpha = re.compile("^[a-zA-Z\-_]+$")
 NONSTOP_THRES = 100
 
 STEMMER = Stemmer.Stemmer('porter')
@@ -248,7 +247,7 @@ def recordArticle(pageDict):
     wordCount = 0
     for m in reToken.finditer(cmerged):
         w = m.group()
-        if not w or len(w) <= 2 or not reAlpha.search(w) or reNum.search(w):
+        if not w or len(w) <= 2 or not reAlpha.match(w):
             continue
         lword = w.lower()
         if not lword in STOP_WORDS:
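For context, here is a minimal standalone sketch of what the change does: the old filter accepted any token that contained at least one alphabetic character and no digit, while the new anchored reAlpha pattern requires the whole token to consist of letters, hyphens, or underscores. The reToken and reAlpha patterns are copied from the diff above; the helper names (keep_old, keep_new) and the sample tokens are illustrative only and do not appear in scanData.py.

import re

# Patterns as they appear in the diff (the token splitter itself is unchanged).
reToken = re.compile("[^ \t\n\r`~!@#$%^&*()_=+|\[;\]\{\},./?<>:’'\\\\\"]+")
reAlpha = re.compile("^[a-zA-Z\-_]+$")   # post-commit: whole token must be alphabetic
reNum = re.compile("[0-9]")              # pre-commit helper, dropped by this commit

def keep_old(w):
    # Pre-commit filter: some alphabetic character anywhere, and no digit anywhere.
    return bool(w) and len(w) > 2 and bool(re.search("[a-zA-Z\-_]", w)) and not reNum.search(w)

def keep_new(w):
    # Post-commit filter: the entire token must match the anchored reAlpha pattern.
    return bool(w) and len(w) > 2 and bool(reAlpha.match(w))

# Illustrative input only.
for w in (m.group() for m in reToken.finditer("re-use naïve año2010 the foo_bar")):
    print(w, keep_old(w), keep_new(w))

On this input the two filters differ only on "naïve": the old check keeps it because it contains ASCII letters and no digits, while the new full-match check drops it because "ï" falls outside [a-zA-Z\-_]. The practical effect of the commit is that one anchored match replaces two separate regex scans, and any token containing a non-alphabetic character is now filtered out rather than passed through.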
