Skip to content

Commit

Permalink
Merge branch 'master' of git@github.com:faraday/wikiprep-esa
Browse files Browse the repository at this point in the history
  • Loading branch information
Çağatay Çallı committed Aug 18, 2010
2 parents a2b9a8e + d566443 commit 58e97e3
Show file tree
Hide file tree
Showing 7 changed files with 172,167 additions and 12 deletions.
11 changes: 11 additions & 0 deletions README
Expand Up @@ -66,7 +66,18 @@ Trove: http://trove4j.sourceforge.net/

USAGE

The following command creates the pagelinks table and records incoming and outgoing link counts:

python scanLinks.py <hgw.xml file from Wikiprep dump>

As stop categories, a list "wiki_stop_categories.txt" is provided.
If you also want to descend into these categories and include all of their subtrees, use:

python scanCatHier.py <hgw.xml file from Wikiprep dump> <cat_hier output path>


[The commands below are standard]

python scanData.py <hgw.xml file from Wikiprep dump>
python addAnchors.py <anchor_text file from Wikiprep dump> <a writeable folder>

Expand Down
228 changes: 228 additions & 0 deletions directScan.py
@@ -0,0 +1,228 @@
#!/usr/bin/python

'''
Copyright (C) 2010 Cagatay Calli <ccalli@gmail.com>
Run scanLinks.py first..
Scans using IDs from Gabrilovich log and XML output (gum.xml) from Wikiprep, creates 3 tables:
TABLE: article COLUMNS: id INT, title VARBINARY(255)
TABLE: text COLUMNS: old_id INT, old_text MEDIUMBLOB
TABLE: pagelinks COLUMNS: source_id INT, target_id INT
USAGE: directScan.py <hgw.xml file from Wikiprep>
IMPORTANT: If you use XML output from a recent version of Wikiprep
(e.g. Zemanta fork), then set FORMAT to 'Zemanta-legacy' or 'Zemanta-modern'.
'''

import sys
import re
import MySQLdb
import signal

import lxml.html as html

# formats: 1) Gabrilovich 2) Zemanta-legacy 3) Zemanta-modern
# Which Wikiprep dump flavour the hgw.xml input uses; selects the <page>
# regex and the document-start marker further below.
FORMAT = 'Gabrilovich'

# Number of times the article title is prepended to the article body
# before indexing, so title terms carry extra weight.
TITLE_WEIGHT = 4

idList = []
try:
f = open('selected.txt','r')
for line in f.readlines():
strId = line.split('\t')[0]
if strId:
idList.append(int(strId))
f.close()
except:
print '(Direct) Article list cannot be read! Please put "selected.txt" file containing stop categories in this folder.'
sys.exit(1)

ARTICLE_IDS = frozenset(idList)

# Open the shared MySQL connection used by the whole script.
# NOTE(review): hard-coded root/localhost credentials — presumably a
# single-user research setup; confirm before running anywhere shared.
try:
    conn = MySQLdb.connect(host='localhost',user='root',passwd='123456',db='wiki',charset = "utf8", use_unicode = True)
except MySQLdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
    sys.exit(1)

# (Re)create the two destination tables -- a rerun wipes previous results.
#   article: id -> plain-text title (prefix index on title for lookups)
#   text:    MediaWiki-style old_id/old_text schema holding the weighted
#            plain-text body of each selected article
try:
    cursor = conn.cursor()
    cursor.execute("DROP TABLE IF EXISTS article")
    cursor.execute("""
CREATE TABLE article
(
id INT(10),
title VARBINARY(255) NOT NULL,
PRIMARY KEY (id),
KEY title (title(32))
) DEFAULT CHARSET=binary
""")

    cursor.execute("DROP TABLE IF EXISTS text")
    cursor.execute("""
CREATE TABLE text
(
old_id INT(10) unsigned NOT NULL,
old_text MEDIUMBLOB NOT NULL,
PRIMARY KEY (old_id)
) DEFAULT CHARSET=binary MAX_ROWS=10000000 AVG_ROW_LENGTH=10240;
""")

except MySQLdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
    sys.exit (1)


## handler for SIGTERM ###
def signalHandler(signum, frame):
    """Release the shared DB handles, then abort with exit status 1."""
    global conn, cursor
    for resource in (cursor, conn):
        resource.close()
    sys.exit(1)

signal.signal(signal.SIGTERM, signalHandler)
#####

# <page> header regexes (raw strings now -- the old non-raw literals relied
# on Python passing unknown escapes like \d through, which is fragile and a
# DeprecationWarning on newer Pythons; match semantics are unchanged).
# DOTALL lets .+? span newlines inside the page body.
# Legacy formats (Gabrilovich / Zemanta-legacy) only carry a stub flag:
rePageLegacy = re.compile(r'<page id="(?P<id>\d+)".+?newlength="(?P<len>\d+)" stub="(?P<stub>\d)".+?>(?P<page>.+?)</page>',re.MULTILINE | re.DOTALL)

# Zemanta-modern additionally carries disambig/category/image flags:
rePageModern = re.compile(r'<page id="(?P<id>\d+)".+?newlength="(?P<len>\d+)" stub="(?P<stub>\d)" disambig="(?P<disambig>\d)" category="(?P<cat>\d)" image="(?P<img>\d)">(?P<page>.+?)</page>',re.MULTILINE | re.DOTALL)

# Pulls title / categories / links / text out of a matched page body.
reContent = re.compile(r'<title>(?P<title>.+?)</title>\n<categories>(?P<categories>.*?)</categories>\n<links>(?P<links>.*?)</links>.+?<text>(?P<text>.+?)</text>',re.MULTILINE | re.DOTALL)

rePage = rePageModern if FORMAT == 'Zemanta-modern' else rePageLegacy

RSIZE = 10000000 # read chunk size = 10 MB

###
# Insert buffers: (id, value) rows accumulate here and recordArticle()
# flushes them with executemany in batches of 200 (batch size used to be
# 100), which is far cheaper than row-at-a-time INSERTs.
articleBuffer = [] # pending (id, title) rows
aBuflen = 0        # rows currently buffered (same for both buffers)

textBuffer = [] # same as articleBuffer, stores (id, weighted text)
###

# pageContent - <page>..content..</page>
# pageDict - stores page attribute dict
def recordArticle(pageDict):
    """Buffer one parsed <page> for insertion into `article` and `text`.

    pageDict is the groupdict() of a rePage match; only 'id' and 'page'
    (the raw inner XML) are used here.  Pages whose id is not in
    ARTICLE_IDS (the Gabrilovich selection) are skipped.  Title and body
    are stripped of HTML markup via lxml, the title is prepended
    TITLE_WEIGHT times to the body, and one row is appended to each of
    the module-level buffers; every 200 articles both buffers are
    flushed to MySQL with executemany.
    """
    global articleBuffer, textBuffer, aBuflen

    # was named `id`, shadowing the builtin
    artId = int(pageDict['id'])
    if artId not in ARTICLE_IDS:
        return

    mContent = reContent.search(pageDict['page'])
    if not mContent:
        # malformed page body: skip it rather than crash on
        # None.groupdict() (same guard as scanCatHier.recordArticle)
        return
    contentDict = mContent.groupdict()

    # convert HTML to plain text
    ctitle = html.fromstring(contentDict['title'].decode("utf-8")).text_content()
    ctext = html.fromstring(contentDict['text'].decode("utf-8")).text_content()

    # weight the title by repeating it TITLE_WEIGHT times before the body
    cadd = (ctitle + ' \n ') * TITLE_WEIGHT + ctext

    # write article info (id,title,text)
    articleBuffer.append((artId, ctitle))
    textBuffer.append((artId, cadd))
    aBuflen += 1

    # flush a full batch
    if aBuflen >= 200:
        cursor.executemany("""
INSERT INTO article (id,title)
VALUES (%s,%s)
""", articleBuffer)
        cursor.executemany("""
INSERT INTO text (old_id,old_text)
VALUES (%s,%s)
""", textBuffer)
        articleBuffer = []
        textBuffer = []
        aBuflen = 0

    return


args = sys.argv[1:]
# directScan.py <hgw_file> [<RSIZE>]

if len(args) < 1:
    sys.exit()

if len(args) == 2:
    RSIZE = int(args[1])

f = open(args[0],'r')
prevText = ''

# Skip the file preamble: Gabrilovich dumps begin after </siteinfo>,
# Zemanta dumps after the opening <gum> tag.
# NOTE(review): if the marker is absent, find() returns -1 and the slice
# starts at len(marker)-1 -- the input is assumed well-formed here.
firstRead = f.read(10000)

if FORMAT == 'Gabrilovich':
    documentStart = firstRead.find('</siteinfo>') + len('</siteinfo>')
else:
    documentStart = firstRead.find('<gum>') + len('<gum>')

prevText = firstRead[documentStart:10000]

# Stream the dump in RSIZE chunks; complete <page> elements are handed to
# recordArticle and the unmatched tail is carried into the next chunk.
while True:

    newText = f.read(RSIZE)
    if not newText:
        break

    text = prevText + newText

    # BUG FIX: endIndex was initialised to -1, so a chunk containing no
    # complete <page> kept only its final byte (text[-1:]) and silently
    # dropped a partial page spanning the chunk boundary.  Starting at 0
    # carries the whole unmatched tail over instead.
    endIndex = 0

    for page in rePage.finditer(text):
        recordArticle(page.groupdict())
        endIndex = page.end()

    prevText = text[endIndex:]

f.close()

# flush the last partial batch (< 200 rows) still sitting in the buffers
if aBuflen > 0:
    cursor.executemany("""
INSERT INTO article (id,title)
VALUES (%s,%s)
""",articleBuffer)
    cursor.executemany("""
INSERT INTO text (old_id,old_text)
VALUES (%s,%s)
""",textBuffer)
    articleBuffer = []
    textBuffer = []

# remove links to articles that are filtered out
# (pagelinks was built by scanLinks.py; keep only edges whose source AND
# target both survived the selection -- done via rebuild+rename because
# DELETE with the same subqueries would be far slower)
cursor.execute("CREATE TABLE tmppagelinks LIKE pagelinks")
cursor.execute("INSERT tmppagelinks SELECT * FROM pagelinks WHERE EXISTS (SELECT id FROM article WHERE id = target_id) AND EXISTS (SELECT id FROM article WHERE id = source_id)")
cursor.execute("DROP TABLE pagelinks")
cursor.execute("RENAME TABLE tmppagelinks TO pagelinks")

cursor.execute("SELECT COUNT(id) FROM article")
r = cursor.fetchone()
print "Articles: ", r[0]

# release DB resources
# NOTE(review): no explicit conn.commit() anywhere in this script; fine
# for MyISAM-era defaults, but confirm if these tables are InnoDB.
cursor.close()
conn.close()

10 changes: 3 additions & 7 deletions scanCatHier.py
Expand Up @@ -42,14 +42,12 @@
RSIZE = 10000000 # read chunk size = 10 MB

catDict = {}
#linkDict = {}

catTitles = {}

# pageContent - <page>..content..</page>
# pageDict - stores page attribute dict
def recordArticle(pageDict):
global catDict,catList,catTitles
global catDict,catTitles

mContent = reContent.search(pageDict['page'])
if not mContent:
Expand All @@ -72,9 +70,9 @@ def recordArticle(pageDict):
for cat in cats.split():
c = int(cat)
if catDict.has_key(c):
catDict[c].append(curId)
catDict[c].add(curId)
else:
catDict[c] = [curId]
catDict[c] = set([curId])

return

Expand Down Expand Up @@ -124,8 +122,6 @@ def recordArticle(pageDict):
cats = set(STOP_CATS)
outcats = set(STOP_CATS)

#allCatSet = frozenset(catList)

while cats:
parent = cats.pop()

Expand Down
33 changes: 28 additions & 5 deletions scanData.py
Expand Up @@ -48,7 +48,8 @@
f = open('extended_stop_categories.txt','r')
for line in f.readlines():
strId = line.split('\t')[0]
catList.append(strId)
if strId:
catList.append(strId)
f.close()
except:
print 'Stop categories cannot be read! Please put "extended_stop_categories.txt" file containing stop categories in this folder.'
Expand Down Expand Up @@ -146,6 +147,11 @@ def signalHandler(signum, frame):
linkBuflen = 0
###


# for logging
# Filtered concept id=12 (hede hodo) [minIncomingLinks]
log = open('log.txt','w')

# pageContent - <page>..content..</page>
# pageDict - stores page attribute dict
def recordArticle(pageDict):
Expand Down Expand Up @@ -176,20 +182,27 @@ def recordArticle(pageDict):

# filter articles based on title
if piped_re.match(title):
log.write('Filtered concept id='+str(id)+' ('+ title +') [regex]\n')
return

text = contentDict['text']
cats = contentDict['categories']
cats = cats.split()
cs = contentDict['categories']
cs = cs.split()
cats = set()
for c in cs:
if c:
cats.add(c)
links = contentDict['links']
links = links.split()

# filter article with no category or belonging to stop categories
if not cats or STOP_CATS.intersection(set(cats)):
if not cats or STOP_CATS.intersection(cats):
log.write('Filtered concept id='+str(id)+' ('+ title +') [stop category]\n')
return

# filter articles with outlinks < 5
if len(links) < 5:
log.write('Filtered concept id='+str(id)+' ('+ title +') [minOutgoingLinks]\n')
return

# convert HTML to plain text
Expand All @@ -211,6 +224,7 @@ def recordArticle(pageDict):
break

if wordCount > 0:
log.write('Filtered concept id='+str(id)+' ('+ title +') [minNumFeaturesPerArticle]\n')
return

# write links
Expand Down Expand Up @@ -316,9 +330,16 @@ def recordArticle(pageDict):
cursor.execute("CREATE TABLE inlinks AS SELECT p.target_id, COUNT(p.source_id) AS inlink FROM pagelinks p GROUP BY p.target_id")
cursor.execute("CREATE INDEX idx_target_id ON inlinks (inlink)")

# list articles discarded because of minIncomingLinks
cursor.execute("SELECT a.* FROM article a, inlinks i WHERE a.id = i.target_id AND i.inlink < 5")
rows = cursor.fetchall()
for row in rows:
log.write('Filtered concept id='+str(row[0])+' ('+ row[1] +') [minIncomingLinks]\n')


# filter
cursor.execute("CREATE TABLE tmparticle LIKE article")
cursor.execute("INSERT tmparticle SELECT a.* FROM article a, inlinks i WHERE a.id = i.target_id AND i.inlink > 5")
cursor.execute("INSERT tmparticle SELECT a.* FROM article a, inlinks i WHERE a.id = i.target_id AND i.inlink >= 5")
cursor.execute("DROP TABLE article")
cursor.execute("RENAME TABLE tmparticle TO article")

Expand All @@ -338,3 +359,5 @@ def recordArticle(pageDict):
cursor.close()
conn.close()

log.close()

0 comments on commit 58e97e3

Please sign in to comment.