Skip to content

Commit

Permalink
Merge branch 'master' of git@github.com:faraday/wikiprep-esa
Browse files Browse the repository at this point in the history
  • Loading branch information
Çağatay Çallı committed Aug 18, 2010
2 parents a2b9a8e + d566443 commit 58e97e3
Show file tree
Hide file tree
Showing 7 changed files with 172,167 additions and 12 deletions.
11 changes: 11 additions & 0 deletions README
Expand Up @@ -66,7 +66,18 @@ Trove: http://trove4j.sourceforge.net/

USAGE

The following command creates the pagelinks table and records incoming and outgoing link counts:

python scanLinks.py <hgw.xml file from Wikiprep dump>

As stop categories, a list "wiki_stop_categories.txt" is provided.
If you also want to descend into these categories and include all of their subtrees, use:

python scanCatHier.py <hgw.xml file from Wikiprep dump> <cat_hier output path>


[The commands below are standard]

python scanData.py <hgw.xml file from Wikiprep dump>
python addAnchors.py <anchor_text file from Wikiprep dump> <a writeable folder>

Expand Down
228 changes: 228 additions & 0 deletions directScan.py
@@ -0,0 +1,228 @@
#!/usr/bin/python

'''
Copyright (C) 2010 Cagatay Calli <ccalli@gmail.com>
Run scanLinks.py first..
Scans using IDs from Gabrilovich log and XML output (gum.xml) from Wikiprep, creates 3 tables:
TABLE: article COLUMNS: id INT, title VARBINARY(255)
TABLE: text COLUMNS: old_id INT, old_text MEDIUMBLOB
TABLE: pagelinks COLUMNS: source_id INT, target_id INT
USAGE: directScan.py <hgw.xml file from Wikiprep>
IMPORTANT: If you use XML output from a recent version of Wikiprep
(e.g. Zemanta fork), then set FORMAT to 'Zemanta-legacy' or 'Zemanta-modern'.
'''

import sys
import re
import MySQLdb
import signal

import lxml.html as html

# formats: 1) Gabrilovich 2) Zemanta-legacy 3) Zemanta-modern
# Which Wikiprep dump flavour the hgw.xml input uses; selects the <page>
# regex and the document-start marker further below.
FORMAT = 'Gabrilovich'

# Number of times the article title is prepended to the article body
# before indexing, so title terms carry extra weight.
TITLE_WEIGHT = 4

idList = []
try:
f = open('selected.txt','r')
for line in f.readlines():
strId = line.split('\t')[0]
if strId:
idList.append(int(strId))
f.close()
except:
print '(Direct) Article list cannot be read! Please put "selected.txt" file containing stop categories in this folder.'
sys.exit(1)

ARTICLE_IDS = frozenset(idList)

# Open the shared MySQL connection used by the whole script.
# NOTE(review): hard-coded root/localhost credentials — presumably a
# single-user research setup; confirm before running anywhere shared.
try:
    conn = MySQLdb.connect(host='localhost',user='root',passwd='123456',db='wiki',charset = "utf8", use_unicode = True)
except MySQLdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
    sys.exit(1)

# (Re)create the two destination tables -- a rerun wipes previous results.
#   article: id -> plain-text title (prefix index on title for lookups)
#   text:    MediaWiki-style old_id/old_text schema holding the weighted
#            plain-text body of each selected article
try:
    cursor = conn.cursor()
    cursor.execute("DROP TABLE IF EXISTS article")
    cursor.execute("""
CREATE TABLE article
(
id INT(10),
title VARBINARY(255) NOT NULL,
PRIMARY KEY (id),
KEY title (title(32))
) DEFAULT CHARSET=binary
""")

    cursor.execute("DROP TABLE IF EXISTS text")
    cursor.execute("""
CREATE TABLE text
(
old_id INT(10) unsigned NOT NULL,
old_text MEDIUMBLOB NOT NULL,
PRIMARY KEY (old_id)
) DEFAULT CHARSET=binary MAX_ROWS=10000000 AVG_ROW_LENGTH=10240;
""")

except MySQLdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
    sys.exit (1)


## handler for SIGTERM ###
def signalHandler(signum, frame):
    """Release the shared DB handles, then abort with exit status 1."""
    global conn, cursor
    for resource in (cursor, conn):
        resource.close()
    sys.exit(1)

signal.signal(signal.SIGTERM, signalHandler)
#####

# <page> header regexes (raw strings now -- the old non-raw literals relied
# on Python passing unknown escapes like \d through, which is fragile and a
# DeprecationWarning on newer Pythons; match semantics are unchanged).
# DOTALL lets .+? span newlines inside the page body.
# Legacy formats (Gabrilovich / Zemanta-legacy) only carry a stub flag:
rePageLegacy = re.compile(r'<page id="(?P<id>\d+)".+?newlength="(?P<len>\d+)" stub="(?P<stub>\d)".+?>(?P<page>.+?)</page>',re.MULTILINE | re.DOTALL)

# Zemanta-modern additionally carries disambig/category/image flags:
rePageModern = re.compile(r'<page id="(?P<id>\d+)".+?newlength="(?P<len>\d+)" stub="(?P<stub>\d)" disambig="(?P<disambig>\d)" category="(?P<cat>\d)" image="(?P<img>\d)">(?P<page>.+?)</page>',re.MULTILINE | re.DOTALL)

# Pulls title / categories / links / text out of a matched page body.
reContent = re.compile(r'<title>(?P<title>.+?)</title>\n<categories>(?P<categories>.*?)</categories>\n<links>(?P<links>.*?)</links>.+?<text>(?P<text>.+?)</text>',re.MULTILINE | re.DOTALL)

rePage = rePageModern if FORMAT == 'Zemanta-modern' else rePageLegacy

RSIZE = 10000000 # read chunk size = 10 MB

###
# Insert buffers: (id, value) rows accumulate here and recordArticle()
# flushes them with executemany in batches of 200 (batch size used to be
# 100), which is far cheaper than row-at-a-time INSERTs.
articleBuffer = [] # pending (id, title) rows
aBuflen = 0        # rows currently buffered (same for both buffers)

textBuffer = [] # same as articleBuffer, stores (id, weighted text)
###

# pageContent - <page>..content..</page>
# pageDict - stores page attribute dict
def recordArticle(pageDict):
    """Buffer one parsed <page> for insertion into `article` and `text`.

    pageDict is the groupdict() of a rePage match; only 'id' and 'page'
    (the raw inner XML) are used here.  Pages whose id is not in
    ARTICLE_IDS (the Gabrilovich selection) are skipped.  Title and body
    are stripped of HTML markup via lxml, the title is prepended
    TITLE_WEIGHT times to the body, and one row is appended to each of
    the module-level buffers; every 200 articles both buffers are
    flushed to MySQL with executemany.
    """
    global articleBuffer, textBuffer, aBuflen

    # was named `id`, shadowing the builtin
    artId = int(pageDict['id'])
    if artId not in ARTICLE_IDS:
        return

    mContent = reContent.search(pageDict['page'])
    if not mContent:
        # malformed page body: skip it rather than crash on
        # None.groupdict() (same guard as scanCatHier.recordArticle)
        return
    contentDict = mContent.groupdict()

    # convert HTML to plain text
    ctitle = html.fromstring(contentDict['title'].decode("utf-8")).text_content()
    ctext = html.fromstring(contentDict['text'].decode("utf-8")).text_content()

    # weight the title by repeating it TITLE_WEIGHT times before the body
    cadd = (ctitle + ' \n ') * TITLE_WEIGHT + ctext

    # write article info (id,title,text)
    articleBuffer.append((artId, ctitle))
    textBuffer.append((artId, cadd))
    aBuflen += 1

    # flush a full batch
    if aBuflen >= 200:
        cursor.executemany("""
INSERT INTO article (id,title)
VALUES (%s,%s)
""", articleBuffer)
        cursor.executemany("""
INSERT INTO text (old_id,old_text)
VALUES (%s,%s)
""", textBuffer)
        articleBuffer = []
        textBuffer = []
        aBuflen = 0

    return


args = sys.argv[1:]
# directScan.py <hgw_file> [<RSIZE>]

if len(args) < 1:
    sys.exit()

if len(args) == 2:
    RSIZE = int(args[1])

f = open(args[0],'r')
prevText = ''

# Skip the file preamble: Gabrilovich dumps begin after </siteinfo>,
# Zemanta dumps after the opening <gum> tag.
# NOTE(review): if the marker is absent, find() returns -1 and the slice
# starts at len(marker)-1 -- the input is assumed well-formed here.
firstRead = f.read(10000)

if FORMAT == 'Gabrilovich':
    documentStart = firstRead.find('</siteinfo>') + len('</siteinfo>')
else:
    documentStart = firstRead.find('<gum>') + len('<gum>')

prevText = firstRead[documentStart:10000]

# Stream the dump in RSIZE chunks; complete <page> elements are handed to
# recordArticle and the unmatched tail is carried into the next chunk.
while True:

    newText = f.read(RSIZE)
    if not newText:
        break

    text = prevText + newText

    # BUG FIX: endIndex was initialised to -1, so a chunk containing no
    # complete <page> kept only its final byte (text[-1:]) and silently
    # dropped a partial page spanning the chunk boundary.  Starting at 0
    # carries the whole unmatched tail over instead.
    endIndex = 0

    for page in rePage.finditer(text):
        recordArticle(page.groupdict())
        endIndex = page.end()

    prevText = text[endIndex:]

f.close()

# flush the last partial batch (< 200 rows) still sitting in the buffers
if aBuflen > 0:
    cursor.executemany("""
INSERT INTO article (id,title)
VALUES (%s,%s)
""",articleBuffer)
    cursor.executemany("""
INSERT INTO text (old_id,old_text)
VALUES (%s,%s)
""",textBuffer)
    articleBuffer = []
    textBuffer = []

# remove links to articles that are filtered out
# (pagelinks was built by scanLinks.py; keep only edges whose source AND
# target both survived the selection -- done via rebuild+rename because
# DELETE with the same subqueries would be far slower)
cursor.execute("CREATE TABLE tmppagelinks LIKE pagelinks")
cursor.execute("INSERT tmppagelinks SELECT * FROM pagelinks WHERE EXISTS (SELECT id FROM article WHERE id = target_id) AND EXISTS (SELECT id FROM article WHERE id = source_id)")
cursor.execute("DROP TABLE pagelinks")
cursor.execute("RENAME TABLE tmppagelinks TO pagelinks")

cursor.execute("SELECT COUNT(id) FROM article")
r = cursor.fetchone()
print "Articles: ", r[0]

# release DB resources
# NOTE(review): no explicit conn.commit() anywhere in this script; fine
# for MyISAM-era defaults, but confirm if these tables are InnoDB.
cursor.close()
conn.close()

10 changes: 3 additions & 7 deletions scanCatHier.py
Expand Up @@ -42,14 +42,12 @@
RSIZE = 10000000 # read chunk size = 10 MB

catDict = {}
#linkDict = {}

catTitles = {}

# pageContent - <page>..content..</page>
# pageDict - stores page attribute dict
def recordArticle(pageDict):
global catDict,catList,catTitles
global catDict,catTitles

mContent = reContent.search(pageDict['page'])
if not mContent:
Expand All @@ -72,9 +70,9 @@ def recordArticle(pageDict):
for cat in cats.split():
c = int(cat)
if catDict.has_key(c):
catDict[c].append(curId)
catDict[c].add(curId)
else:
catDict[c] = [curId]
catDict[c] = set([curId])

return

Expand Down Expand Up @@ -124,8 +122,6 @@ def recordArticle(pageDict):
cats = set(STOP_CATS)
outcats = set(STOP_CATS)

#allCatSet = frozenset(catList)

while cats:
parent = cats.pop()

Expand Down
33 changes: 28 additions & 5 deletions scanData.py
Expand Up @@ -48,7 +48,8 @@
f = open('extended_stop_categories.txt','r')
for line in f.readlines():
strId = line.split('\t')[0]
catList.append(strId)
if strId:
catList.append(strId)
f.close()
except:
print 'Stop categories cannot be read! Please put "extended_stop_categories.txt" file containing stop categories in this folder.'
Expand Down Expand Up @@ -146,6 +147,11 @@ def signalHandler(signum, frame):
linkBuflen = 0
###


# for logging
# Filtered concept id=12 (hede hodo) [minIncomingLinks]
log = open('log.txt','w')

# pageContent - <page>..content..</page>
# pageDict - stores page attribute dict
def recordArticle(pageDict):
Expand Down Expand Up @@ -176,20 +182,27 @@ def recordArticle(pageDict):

# filter articles based on title
if piped_re.match(title):
log.write('Filtered concept id='+str(id)+' ('+ title +') [regex]\n')
return

text = contentDict['text']
cats = contentDict['categories']
cats = cats.split()
cs = contentDict['categories']
cs = cs.split()
cats = set()
for c in cs:
if c:
cats.add(c)
links = contentDict['links']
links = links.split()

# filter article with no category or belonging to stop categories
if not cats or STOP_CATS.intersection(set(cats)):
if not cats or STOP_CATS.intersection(cats):
log.write('Filtered concept id='+str(id)+' ('+ title +') [stop category]\n')
return

# filter articles with outlinks < 5
if len(links) < 5:
log.write('Filtered concept id='+str(id)+' ('+ title +') [minOutgoingLinks]\n')
return

# convert HTML to plain text
Expand All @@ -211,6 +224,7 @@ def recordArticle(pageDict):
break

if wordCount > 0:
log.write('Filtered concept id='+str(id)+' ('+ title +') [minNumFeaturesPerArticle]\n')
return

# write links
Expand Down Expand Up @@ -316,9 +330,16 @@ def recordArticle(pageDict):
cursor.execute("CREATE TABLE inlinks AS SELECT p.target_id, COUNT(p.source_id) AS inlink FROM pagelinks p GROUP BY p.target_id")
cursor.execute("CREATE INDEX idx_target_id ON inlinks (inlink)")

# list articles discarded because of minIncomingLinks
cursor.execute("SELECT a.* FROM article a, inlinks i WHERE a.id = i.target_id AND i.inlink < 5")
rows = cursor.fetchall()
for row in rows:
log.write('Filtered concept id='+str(row[0])+' ('+ row[1] +') [minIncomingLinks]\n')


# filter
cursor.execute("CREATE TABLE tmparticle LIKE article")
cursor.execute("INSERT tmparticle SELECT a.* FROM article a, inlinks i WHERE a.id = i.target_id AND i.inlink > 5")
cursor.execute("INSERT tmparticle SELECT a.* FROM article a, inlinks i WHERE a.id = i.target_id AND i.inlink >= 5")
cursor.execute("DROP TABLE article")
cursor.execute("RENAME TABLE tmparticle TO article")

Expand All @@ -338,3 +359,5 @@ def recordArticle(pageDict):
cursor.close()
conn.close()

log.close()

0 comments on commit 58e97e3

Please sign in to comment.