Skip to content

Commit

Permalink
Improved wiki parsing and cache pages
Browse files Browse the repository at this point in the history
  • Loading branch information
joto committed Jan 10, 2013
1 parent 7b92947 commit 44b6792
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 51 deletions.
16 changes: 16 additions & 0 deletions sources/wiki/cache.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
--
-- Taginfo source: Wiki
--
-- cache.sql
--

.bail ON

-- Cache of fetched wiki page bodies. get_wiki_data.rb looks a page up by
-- (title, timestamp); on a miss it deletes all rows for that title and
-- inserts the freshly fetched body under the new timestamp.
CREATE TABLE cache_pages (
-- wiki page title as stored by get_wiki_data.rb (page.title)
title TEXT,
-- value of page.timestamp at fetch time; presumably the wiki's last-change
-- time for the page -- TODO confirm against get_page_list.rb
timestamp TEXT,
-- response body returned by the wiki API for this page
body TEXT
);

-- Supports the exact (title, timestamp) cache lookup; the leading title
-- column also serves the delete-by-title statements.
CREATE INDEX cache_pages_title_timestamp ON cache_pages (title, timestamp);

8 changes: 6 additions & 2 deletions sources/wiki/get_image_info.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,17 @@

db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
db.results_as_hash = true
image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages").map{ |row| row['title'] }.select{ |title| !title.nil? && title.match(%r{^(file|image):}i) }
image_titles = db.execute("SELECT DISTINCT(image) AS title FROM wikipages WHERE image IS NOT NULL AND image != ''").
map{ |row| row['title'] }.
select{ |title| title.match(%r{^(file|image):}i) }

db.execute('BEGIN TRANSACTION');

puts "Found #{ image_titles.size } different image titles"

until image_titles.empty?
some_titles = image_titles.slice!(0, 10)
# puts some_titles.join(",") + "\n"
puts "Get image info for: #{ some_titles.join(' ') }"

begin
data = api.query(:prop => 'imageinfo', :iiprop => 'url|size|mime', :titles => some_titles.join('|'), :iiurlwidth => 10, :iiurlheight => 10)
Expand Down
126 changes: 80 additions & 46 deletions sources/wiki/get_wiki_data.rb
Original file line number Diff line number Diff line change
Expand Up @@ -227,14 +227,45 @@ def add_parameter(value)

#------------------------------------------------------------------------------

# Load the content of +page+ into page.content, going through the page cache.
#
# The cache table (cache.cache_pages) is queried for a row matching both the
# page title and its timestamp. On a hit the cached body is used and no
# request is made. On a miss any rows for this title are removed from the
# cache, the page is fetched through +api+, and the fetched body is stored
# under the current timestamp.
#
# db   - database handle with the cache database attached as "cache";
#        rows are accessed by column name
# api  - object whose #get(params) returns a response responding to #body
# page - object providing #title, #timestamp, #params and #content=
#
# Returns nil. Prints one line saying whether the page was found in the cache.
def get_page(db, api, page)
    cached_row = nil
    db.execute('SELECT * FROM cache.cache_pages WHERE title=? AND timestamp=?', page.title, page.timestamp) do |row|
        # Only the first matching row is of interest.
        cached_row = row
        break
    end

    unless cached_row.nil?
        page.content = cached_row['body']
        puts "Page #{ page.title } in cache (#{ page.timestamp })"
        return
    end

    # Cache miss: drop whatever version of this title is cached, then refetch.
    db.execute('DELETE FROM cache.cache_pages WHERE title=?', page.title)
    response = api.get(page.params)
    page.content = response.body
    db.execute('INSERT INTO cache.cache_pages (title, timestamp, body) VALUES (?, ?, ?)', page.title, page.timestamp, page.content)
    puts "Page #{ page.title } not in cache (#{ page.timestamp })"
end

# Remove pages from the cache that are no longer part of the current page set.
#
# Every title stored in cache.cache_pages is checked against
# +current_pagetitles+; titles that are not keys of that hash are stale and
# their rows are deleted from the cache.
#
# db                 - database handle with the cache database attached as
#                      "cache"; rows are accessed by column name
# current_pagetitles - Hash keyed by the titles of all pages seen in this run
#                      (values are not used here); it is not modified
#
# Returns the Array of titles that were deleted from the cache.
def cleanup_cache(db, current_pagetitles)
    # Collect first and delete afterwards so the table is not modified while
    # the SELECT is still being iterated.
    to_delete = []
    db.execute("SELECT title FROM cache.cache_pages") do |row|
        title = row['title']
        to_delete << title unless current_pagetitles.key?(title)
    end

    puts "Deleting pages from cache: #{ to_delete.join(' ') }"
    to_delete.each do |title|
        db.execute("DELETE FROM cache.cache_pages WHERE title=?", title)
    end
end

#------------------------------------------------------------------------------

dir = ARGV[0] || '.'

api = MediaWikiAPI::API.new('wiki.openstreetmap.org', 80, '/w/index.php?')
api.add_header('User-agent', 'taginfo/0.1 (jochen@remote.org)')

db = SQLite3::Database.new(dir + '/taginfo-wiki.db')
db.results_as_hash = true

db.execute("ATTACH DATABASE '#{dir}/wikicache.db' AS cache")

db.execute('BEGIN TRANSACTION')

db.execute('BEGIN TRANSACTION');
current_pagetitles = {}

File.open(dir + '/tagpages.list') do |wikipages|
wikipages.each do |line|
Expand All @@ -245,8 +276,9 @@ def add_parameter(value)

reason = page.check_title
if reason == :ok
res = api.get(page.params)
page.content = res.body
current_pagetitles[page.title] = page.timestamp

get_page(db, api, page)

page.parse_content do |template|
puts "Template: #{template.name} [#{template.parameters.join(',')}] #{template.named_parameters.inspect}"
Expand All @@ -259,54 +291,54 @@ def add_parameter(value)
end
if template.name =~ /(Key|Value)Description$/
page.has_templ = true
end
if template.named_parameters['description']
desc = []
template.named_parameters['description'].each do |i|
if i.class == Template
desc << ' ' << i.parameters.join('=') << ' '
if template.named_parameters['description']
desc = []
template.named_parameters['description'].each do |i|
if i.class == Template
desc << ' ' << i.parameters.join('=') << ' '
else
desc << i
end
page.description = desc.join('').strip
end
end
if template.named_parameters['image']
ititle = template.named_parameters['image'][0]
if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i)
page.image = "File:#{$2}"
else
desc << i
puts "invalid image: page='#{page.title}' image='#{ititle}'"
db.execute('INSERT INTO invalid_image_titles (page_title, image_title) VALUES (?, ?)', page.title, ititle)
page.image = ''
end
page.description = desc.join('').strip
end
end
if template.named_parameters['image']
ititle = template.named_parameters['image'][0]
if !ititle.nil? && ititle.match(%r{^(file|image):(.*)$}i)
page.image = "File:#{$2}"
else
puts "invalid image: page='#{page.title}' image='#{ititle}'"
db.execute('INSERT INTO invalid_image_titles (page_title, image_title) VALUES (?, ?)', page.title, ititle)
page.image = ''
if template.named_parameters['group']
page.group = template.named_parameters['group'][0]
end
end
if template.named_parameters['group']
page.group = template.named_parameters['group'][0]
end
if template.named_parameters['onNode'] == ['yes']
page.onNode = true
end
if template.named_parameters['onWay'] == ['yes']
page.onWay = true
end
if template.named_parameters['onArea'] == ['yes']
page.onArea = true
end
if template.named_parameters['onRelation'] == ['yes']
page.onRelation = true
end
if template.named_parameters['implies']
template.named_parameters['implies'].each do |i|
if i.class == Template
page.tags_implies << i.parameters.join('=')
if template.named_parameters['onNode'] == ['yes']
page.onNode = true
end
if template.named_parameters['onWay'] == ['yes']
page.onWay = true
end
if template.named_parameters['onArea'] == ['yes']
page.onArea = true
end
if template.named_parameters['onRelation'] == ['yes']
page.onRelation = true
end
if template.named_parameters['implies']
template.named_parameters['implies'].each do |i|
if i.class == Template
page.tags_implies << i.parameters.join('=')
end
end
end
end
if template.named_parameters['combination']
template.named_parameters['combination'].each do |i|
if i.class == Template
page.tags_combination << i.parameters.join('=')
if template.named_parameters['combination']
template.named_parameters['combination'].each do |i|
if i.class == Template
page.tags_combination << i.parameters.join('=')
end
end
end
end
Expand All @@ -319,7 +351,9 @@ def add_parameter(value)
end
end

db.execute('COMMIT');
cleanup_cache(db, current_pagetitles)

db.execute('COMMIT')


#-- THE END -------------------------------------------------------------------
12 changes: 9 additions & 3 deletions sources/wiki/update.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,19 @@ fi
echo "`$DATECMD` Start wiki..."

DATABASE=$DIR/taginfo-wiki.db
LOGFILE=$DIR/get_wiki_data.log
CACHEDB=$DIR/wikicache.db
LOGFILE_WIKI_DATA=$DIR/get_wiki_data.log
LOGFILE_IMAGE_INFO=$DIR/get_image_info.log

rm -f $DIR/allpages.list
rm -f $DIR/tagpages.list
rm -f $LOGFILE
rm -f $DATABASE

if [ ! -e $CACHEDB ]; then
sqlite3 $CACHEDB <cache.sql
fi

echo "`$DATECMD` Running init.sql..."
sqlite3 $DATABASE <../init.sql

Expand All @@ -36,10 +42,10 @@ echo "`$DATECMD` Getting page list..."
./get_page_list.rb $DIR

echo "`$DATECMD` Getting wiki data..."
./get_wiki_data.rb $DIR >$LOGFILE
./get_wiki_data.rb $DIR >$LOGFILE_WIKI_DATA

echo "`$DATECMD` Getting image info..."
./get_image_info.rb $DIR >$LOGFILE
./get_image_info.rb $DIR >$LOGFILE_IMAGE_INFO

echo "`$DATECMD` Extracting words..."
./extract_words.rb $DIR
Expand Down

0 comments on commit 44b6792

Please sign in to comment.