Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
tree: 5f724008f6
Fetching contributors…

Cannot retrieve contributors at this time

executable file 980 lines (831 sloc) 37.523 kb
#!/bin/env ruby
# vim scripts scraping monster
#
# Full Scrape:
# - to do a full scrape / download / upload cycle, pass the script ids:
# ./scraper `seq 1 3217` # where 3217 is the highest script id
#
# RSS Scrape:
# Now, run with no arguments to perform an rss scrape. It downloads
# all scripts present in the rss feed (stores its state in a file).
# ./scraper
# Do a full scrape before setting up cron to do the rss scrape.
#
# Debugging / manual invocation:
# - with a negative number, scrapes that script ID and stores the json
# ./scraper -987
# - with a .json file, downloads the script and converts to a git repo
# ./scraper scripts/0987*
# - with a bare git repo, pushes that repo up to github
# ./scraper repos/0987*
# When you run the above commands in that order, you do a full scrape
# on script 987.
require 'rubygems'
require 'hpricot' # hpricot gem
require 'open-uri'
require 'cgi'
require 'json' # json gem
require 'gitrb' # gitrb gem
require 'zip/zipfilesystem' # rubyzip gem
require 'archive/tar/minitar' # minitar gem
require 'zlib'
require 'bzip2' # bzip2-ruby gem
require 'mime/types'
require 'mimemagic' # mimemagic gem
require 'tmpdir'
require 'tempfile'
require 'find'
require 'octopussy' # octopussy gem
require 'hashie' # hashie gem
require 'htmlentities' # htmlentities gem
require 'feedzirra' # feedzirra gem
# thoughts:
# We do everything we can to make it so you can drop the repo in bundles and just have it work.
# We reset permissions on extracted files -- usually when they're different it's a mistake.
# Identity used for git commits and tags created by this scraper.
$vimscripts_name = "vim-scripts"
$vimscripts_email = 'vimscripts@rinspin.com'
# Working directories (created below if missing).
$repos_dir = 'repos'
$scripts_dir = 'scripts'
$packages_dir = 'packages'
$webcache_dir = 'webcache'
# Metadata file dropped inside each bare git repo.
$git_script_file = 'vim-script.json'
# Standard vim runtime subdirectories; used to recognize plugin layouts.
$vimdirs = %w{after autoload bin colors doc ftdetect ftplugin indent plugin syntax}
# Regex alternation matching extensions treated as plain-text payloads.
$textext = %w{bat c cpp csh diff h klip patch pl pm ps py rb set sh snip snippet snippets tcl txt xml vim vim.orig}.map { |x| "\\.#{x}$" }.join('|')
# Script ids vim.org reports as deleted; skipped during scraping.
$deleted_scripts = %w{364 548 549 550 762 1032 1129 1263 1280 1295 1301 1436 1452 1509 1562 1669 1789 1824 1949 2056 2172 2306 2309 2313 2316 2318 2323 2352 2456 2498 2664 2861 3076}
Dir.mkdir $scripts_dir unless test ?d, $scripts_dir
Dir.mkdir $repos_dir unless test ?d, $repos_dir
Dir.mkdir $packages_dir unless test ?d, $packages_dir
Dir.mkdir $webcache_dir unless test ?d, $webcache_dir
# http://github.com/hpricot/hpricot/issues#issue/25
# super ugly that hpricot manages to screw up charset encodings so badly.
# I'm patching hpricot directly as follows. should probably convert this to a monkeypatch.
#
# --- a/lib/hpricot/traverse.rb 2010-08-25 02:07:55.288095773 -0700
# +++ b/lib/hpricot/traverse.rb 2010-08-25 02:08:02.403095712 -0700
# @@ -157,7 +157,7 @@
# # HTML elements are removed.
# def inner_text
# if respond_to?(:children) and children
# - children.map { |x| x.inner_text }.join
# + children.map { |x| str = x.inner_text; str = str.dup.force_encoding('ISO-8859-1').encode('UTF-8') if str.encoding.to_s == 'ASCII-8BIT' || str.encoding.to_s == 'ISO-8859-1'; str }.join
# else
# ""
# end
#
# --- a/lib/hpricot/builder.rb 2010-08-25 09:55:37.894992548 -0700
# +++ b/lib/hpricot/builder.rb 2010-08-25 09:55:23.880992337 -0700
# @@ -6,8 +6,9 @@
# module Hpricot
# # XML unescape
# def self.uxs(str)
# - str.to_s.
# - gsub(/\&(\w+);/) { [NamedCharacters[$1] || ??].pack("U*") }.
# + str = str.to_s
# + str = str.dup.force_encoding('ISO-8859-1').encode('UTF-8') if str.encoding.to_s == 'ASCII-8BIT' || str.encoding.to_s == 'ISO-8859-1'
# + str.gsub(/\&(\w+);/) { [NamedCharacters[$1] || ??].pack("U*") }.
# gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
# end
#
# Run './scraper 5' to see a script with a bunch of embedded nbsps
# that breaks hpricot horribly.
#
# encoding resources:
# http://yokolet.blogspot.com/2009/07/design-and-implementation-of-ruby-m17n.html
# http://yehudakatz.com/2010/05/05/ruby-1-9-encodings-a-primer-and-the-solution-for-rails/
# Adds inner_content to Hpricot nodes: like inner_text, but re-encodes
# ASCII-8BIT text to UTF-8 (via ISO-8859-1, see the patch notes above)
# and decodes HTML entities, so all scraped text ends up as clean UTF-8.
module Hpricot
  module Traverse
    # Recursively joins the inner_content of all child nodes.
    def inner_content
      if respond_to?(:children) and children
        children.map { |x| x.inner_content }.join
      else
        ''
      end
    end
  end
  class Elements < Array
    # A node set's content is the join of each element's content.
    def inner_content
      map { |x| x.inner_content }.join
    end
  end
  class Text
    # Text nodes do the real work: force a usable encoding, turn &nbsp;
    # into plain spaces, then decode entities twice (named + numeric).
    def inner_content
      str = content.to_s
      str = str.force_encoding('ISO-8859-1').encode("UTF-8") if str.encoding.to_s == 'ASCII-8BIT'
      CGI.unescapeHTML(HTMLEntities.new.decode(str.gsub(/&nbsp;/, " ")))
    end
  end
  class CData
    # CDATA is taken verbatim; no entity decoding applies.
    alias_method :inner_content, :content
  end
end
# monkeypatch remove into mimemagic, hopefully this gets pulled upstream.
# http://github.com/bronson/mimemagic/commit/e1860b1fac0187c638569bc45833adc2ca521661
class MimeMagic
  # Unregisters a mime type: drops its extensions, its magic rules, and
  # the type itself from mimemagic's lookup tables.
  def self.remove(type)
    obj = new(type)
    obj.extensions.each { |ext| EXTENSIONS.delete(ext) }
    MAGIC.delete_if { |t, m| t == type }
    TYPES.delete(type)
  end
end
# NOTE(review): presumably this type's magic rule misfires on package
# payloads we care about — confirm before removing this line.
MimeMagic.remove('application/x-gmc-link')
# monkeypatch set_repo_info into octopussy. Dunno why they missed this call.
# http://github.com/pengwynn/octopussy/pull/2
module Octopussy
class Client
def set_repo_info(repo, options)
repo = Repo.new(repo)
# post body needs to be "values[has_wiki]=false"
response = self.class.post("/repos/show/#{repo.username}/#{repo.name}",
:body => options.keys.reduce({}) { |a,v| a["values[#{v}]"] = options[v]; a }.merge(auth_params))
Hashie::Mash.new(response).repository
end
end
end
class Array
  # Builds a membership-lookup hash from this array's elements:
  # [:a, :b] => {:a => 1, :b => 1}. The value is always 1; callers
  # only care about key presence.
  def to_hash_keys
    each_with_object({}) { |key, lookup| lookup[key] = 1 }
  end
end
# http://blog.codefront.net/2008/01/14/retrying-code-blocks-in-ruby-on-exceptions-whatever/
# Runs the block up to opts[:tries] times, sleeping opts[:sleep] seconds
# between attempts. The 0-based attempt number is passed to the block
# (0, 1, 2 for :tries => 3). :on may be one exception class or an array.
# The final attempt runs outside the rescue so its exception propagates.
def retryable(options = {}, &block)
  opts = { :tries => 1, :on => Exception, :sleep => 1 }.merge(options)
  return if opts[:tries] < 1
  # BUG FIX: callers pass :on => [A, B], and a bare `rescue some_array`
  # raises TypeError when the exception is matched. Normalize to a list
  # and splat it so both a single class and an array work.
  retry_exceptions = [*opts[:on]]
  tries = 0
  if opts[:tries] > 1
    begin
      return yield tries
    rescue *retry_exceptions
      sleep opts[:sleep]
      retry if (tries += 1) < opts[:tries] - 1
    end
  end
  # last try: any exception it raises goes to the caller.
  # BUG FIX: was `yield tries+1`, which reported attempt 3 for
  # :tries => 3 instead of the documented 2.
  yield tries
end
# Scrapes a vim.org user profile page and memoizes it in the $authors
# array, indexed by numeric user id. Returns a hash with 'user_id',
# 'user_name', 'first_name', 'last_name', 'email', and 'homepage'.
def scrape_author(user_id)
  $authors ||= []
  unless $authors[user_id.to_i]
    doc = cached_open("http://www.vim.org/account/profile.php?user_id=#{user_id}") { |f| Hpricot(f) }
    # NOTE(review): the next line's result is discarded and duplicates
    # the 'user_name' lookup below — looks like leftover debugging;
    # verify and remove.
    doc.at('td[text()="user name"]').next_sibling.inner_content
    u = {'user_id' => user_id }
    u['user_name'] = doc.at('td[text()="user name"]').next_sibling.inner_content
    u['first_name'] = doc.at('td[text()="first name"]').next_sibling.inner_content
    u['last_name'] = doc.at('td[text()="last name"]').next_sibling.inner_content
    u['email'] = doc.at('td[text()="email"]').next_sibling.inner_content
    u['homepage'] = doc.at('td[text()="homepage"]').next_sibling.inner_content
    $authors[user_id.to_i] = u
  end
  return $authors[user_id.to_i]
end
# Builds the vim.org detail-page URL for a script id.
def script_id_to_url(script_id)
  "http://www.vim.org/scripts/script.php?script_id=" + script_id.to_s
end
# Extracts the numeric script id (as a string) from a vim.org URL;
# raises when no script_id query parameter is present.
def script_id_from_url(url)
  id = url[/[?&;]script_id=(\d+)/, 1]
  raise "Could not parse a script id from <<#{url}>>" unless id
  id
end
# Opens a URL through an on-disk cache in $webcache_dir. Downloads (with
# retries) when the cache entry is missing/empty or $ignore_cache is set,
# then yields the open cache file and returns the block's result.
# NOTE(review): a download that exhausts all retries can leave a
# truncated cache file behind; the size check only catches empty files.
def cached_open(url)
  # todo: turn this off once we roll to production
  path = File.join($webcache_dir, filenameify(url))
  if $ignore_cache || !File.exist?(path) || File.size(path) <= 0
    File.open(path, 'w') do |f|
      retryable(:tries => 4, :on => [OpenURI::HTTPError, Timeout::Error], :sleep => 10) do |retries|
        puts " downloading #{url}#{retries > 0 ? " TRY #{retries}" : ""}"
        open(url) { |u| f.write(u.read) }
      end
    end
  end
  File.open(path) { |f| yield f }
end
# Scrapes one vim.org script detail page into a hash: name, summary,
# script type, description, rating/download stats, install details, and
# one entry per released version (with the uploading author's profile).
# Returns nil for deleted, nonexistent, or version-less scripts.
def scrape_script(script_id)
  # hard-code the scripts that claim to be deleted
  if $deleted_scripts.include? script_id.to_s
    puts "Skipped #{script_id} -- deleted."
    return nil
  end
  doc = cached_open(script_id_to_url(script_id)) { |f| Hpricot(f) }
  if doc.search('title').inner_text == "Error : vim online"
    puts "Skipped #{script_id} -- doesn't exist."
    return nil
  end
  s = {'script_id' => script_id}
  s['display_name'], s['summary'] = doc.search('.txth1').inner_content.split(" : ", 2)
  s['name'] = githubify(s['display_name'])
  s['script_type'] = doc.at('td[text()="script type"]').parent.next_sibling.children.first.inner_content
  s['description'] = doc.at('td[text()="description"]').parent.next_sibling.children.first.inner_content.gsub("\r", "\n")
  # this find is run only for its side effect: the regex match leaves
  # the karma numbers in $1..$3 for the next line.
  doc.search('td.lightbg~td').find { |e| e.inner_text =~ /Rating.*\s(-?\d+)\/(\d+),.*Downloaded[^\d]*(\d+)/m }
  s['rating_total'], s['rating_votes'], s['downloads'] = $1, $2, $3 # http://www.vim.org/karma.php
  s['install_details'] = doc.at('td[text()="install details"]').parent.next_sibling.children.first.inner_content.gsub("\r", "\n")
  # reject links with targets so download links in the description don't appear to be a version (script 1843)
  s['versions'] = doc.search('a[@href*="download_script.php?"]').select { |e| e.attributes['target'].empty? }.to_a.map do |a|
    v = {'url' => 'http://www.vim.org/scripts/' + a.attributes['href'],
         'filename' => a.inner_content}
    row = a.parent
    v['script_version'] = row.siblings_at(1).inner_content
    v['date'] = row.siblings_at(2).inner_content
    v['vim_version'] = row.siblings_at(3).inner_content
    v['author'] = scrape_author(row.siblings_at(4).at('a').attributes['href'].match(/\d+/)[0])
    v['release_notes'] = row.siblings_at(5).inner_content.gsub("\r", "\n")
    v
  end
  if s['versions'].empty?
    puts "Skipped #{script_id} -- empty."
    return nil
  end
  s
end
# Recursively rebuilds a hash/array/string structure, converting any
# ASCII-8BIT or ISO-8859-1 string to UTF-8. Needed because the Hpricot
# monkey patch above leaves values in assorted encodings. Non-container,
# non-string values pass through untouched.
def fix_encoding(h)
  case h
  when Hash
    h.each_with_object({}) do |(key, value), rebuilt|
      rebuilt[fix_encoding(key)] = fix_encoding(value)
    end
  when Array
    h.map { |element| fix_encoding(element) }
  when String
    encoding = h.encoding.to_s
    if encoding == 'ASCII-8BIT' || encoding == 'ISO-8859-1'
      h.dup.force_encoding('ISO-8859-1').encode('UTF-8')
    else
      h
    end
  else
    h
  end
end
# Debugging aid: recursively prints the encoding of every string found
# in a nested hash/array structure (keys, values, and elements), showing
# the first 51 characters of each string for context.
def check_encoding(h)
  case h
  when Hash
    h.each_pair do |key, value|
      check_encoding(key)
      check_encoding(value)
    end
  when Array
    h.each { |element| check_encoding(element) }
  when String
    puts "#{h.encoding}: #{h[0..50]}"
  else
    h
  end
end
# Makes a string safe to use as a filename: trims surrounding
# whitespace, collapses possessive apostrophes ("Michael's" becomes
# "Michaels" rather than "Michael-s"), then replaces any remaining
# unsafe path characters with dashes.
def filenameify(s)
  trimmed = s.gsub(/^\s*|\s*$/, '')
  no_apostrophes = trimmed.gsub(/'s/i, 's')
  no_apostrophes.gsub(/[^ A-Za-z0-9_\-!#\@\$^%&:;<?>+=(){|},.\[\]]/, '-')
end
def gittagify(s)
  # Replace characters git rejects in tag names with dashes, then repair
  # the edges: git doesn't like tags that begin or end with periods or
  # dashes, so leading "." becomes "0." and trailing "." becomes ".0",
  # dashes are stripped, and the period fixup is applied once more.
  tag = s.gsub(/^\s*|\s*$/, '')
  tag = tag.gsub(/[^A-Za-z0-9_\-!#\@\$%&;<>+=(){|},.\]]/, '-')
  tag = tag.gsub(/^\./, '0.').gsub(/\.$/, '.0')
  tag = tag.gsub(/^-*|-*$/, '')
  tag.gsub(/^\./, '0.').gsub(/\.$/, '.0')
end
# Makes a name acceptable to github (only A-Za-z0-9._- allowed) while
# keeping it readable: "C#"/"F#" become "Csharp"/"Fsharp", "++" becomes
# "pp", "::" becomes ".", "&" becomes "and", whitespace/:/+ become
# dashes, and anything still illegal is dropped.
def githubify(s)
  # BUG FIX: the C#/F# replacement used to be "#{$1}sharp", but string
  # interpolation evaluates $1 *before* gsub runs, so the captured
  # letter was lost. A single-quoted backreference defers it correctly.
  s.gsub(/^\s*|\s*$/, '').gsub(/\s+-|-\s+/, '-').gsub(/\+\+/, 'pp').gsub(/([CF])#/i, '\1sharp').gsub('::', '.').gsub('&', 'and').gsub(/\s+|:|\+/, '-').gsub(/[^A-Za-z0-9_.-]/, '')
end
# Returns the version label for a release, falling back to the release
# date when the author left the version field blank.
def script_version(version)
  label = version['script_version']
  label =~ /^\s*$/ ? version['date'] : label
end
# Builds the on-disk json path for a scraped script, e.g.
# "scripts/0987 - name.json".
# if you change this, also change the regex in list_existing_scripts
def script_filename(script)
  padded_id = '%04d' % script['script_id']
  File.join($scripts_dir, "#{padded_id} - #{filenameify(script['name'])}.json")
end
# Loads a previously scraped script hash from its json file (the
# filename is relative to $scripts_dir).
def script_contents(filename)
  JSON.parse(File.read(File.join($scripts_dir, filename)))
end
# Normalizes a repo name into a case-insensitive key for conflict
# matching against the set of already-scraped scripts.
def hashkeyify(name)
  name.downcase
end
# Scans $scripts_dir and returns a hash of key=script name (normalized
# via hashkeyify), value=json filename. Raises if a filename doesn't
# match the "NNNN - name.json" pattern produced by script_filename.
def list_existing_scripts
  # returns a hash of key=script name, value=filename
  Hash[Dir.entries($scripts_dir).reject { |e| %w{. .. .git}.include?(e) }.
    map { |e| e =~ /[0-9]* - (.*)\.json$/ or raise "can't match #{e}"; [hashkeyify($1), e]}]
end
# Returns [first_name, last_name] for the script's author, or nil when
# the versions were uploaded by different people. Identity is compared
# by name rather than account id because a number of authors have
# abandoned old accounts and created new ones. When both names are
# blank we fall back to comparing login names; when only the last name
# is blank, first and last are swapped so the surname slot is filled.
def author(script)
  versions = script['versions']
  info = versions.first['author']
  first_name = info['first_name']
  last_name = info['last_name']
  if first_name =~ /^\s*$/ && last_name =~ /^\s*$/
    # user declined to state first and last name, so compare by login
    last_name = info['user_name']
    consistent = versions.drop(1).all? { |v| v['author']['user_name'] == last_name }
  else
    if last_name =~ /^\s*$/
      # author stated a first name but no last name: swap them
      first_name, last_name = "", first_name
    end
    consistent = versions.drop(1).all? do |v|
      v['author']['first_name'] == first_name && v['author']['last_name'] == last_name
    end
  end
  consistent ? [first_name, last_name] : nil
end
# Returns the already-scraped script hash that holds this script's name,
# or nil when the name is free (or is held by this same script id).
def name_conflict_exists(all_scripts, script)
  existing_file = all_scripts[hashkeyify(script['name'])]
  return nil unless existing_file
  existing_script = script_contents(existing_file)
  existing_script['script_id'] == script['script_id'] ? nil : existing_script
end
# Mutates and returns the script so its name doesn't collide with any
# already-scraped script: first append the author's surname, then the
# script type, and finally a sequence letter (B, C, ...). Returns nil
# when given nil (i.e. the scrape was skipped).
def resolve_name_conflicts(script)
  return nil unless script
  all_scripts = list_existing_scripts
  existing_script = name_conflict_exists(all_scripts, script)
  return script unless existing_script
  # if the author is different, try that first
  script_author = author(script)
  if script_author && script_author != author(existing_script)
    script['display_name'] += ' -- ' + script_author[1]
    script['name'] += '--' + githubify(script_author[1])
  end
  existing_script = name_conflict_exists(all_scripts, script)
  return script unless existing_script
  # otherwise, see if we can differentiate by type
  if script['script_type'] != existing_script['script_type']
    script['display_name'] += ' ' + script['script_type']
    script['name'] += '-' + githubify(script['script_type'])
  end
  existing_script = name_conflict_exists(all_scripts, script)
  return script unless existing_script
  script['display_name'] += ' B'
  script['name'] += '-B'
  # otherwise, just tack a sequence letter on the end. didn't want to use a
  # number because "php.vim 2" looks like a newer release of "php.vim")
  while existing_script = name_conflict_exists(all_scripts, script)
    script['name'][-1] = (script['name'][-1].ord + 1).chr
    # BUG FIX: this used to add 1 to the *already incremented* name
    # character, so display_name drifted one letter ahead of name
    # (name "-C" but display " D"). Keep the two suffixes in sync.
    script['display_name'][-1] = script['name'][-1]
    raise "what the heck?" if script['name'][-1] > 'Z'
  end
  script
end
# Serializes a scraped script hash (encodings normalized to UTF-8) to
# its json file under $scripts_dir. Returns the filename written, or
# nil when script is nil (the scrape was skipped).
def write_script(script)
  return unless script
  filename = script_filename(script)
  # BUG FIX: this message previously read `"Scraped #(unknown)"` — a
  # mangled interpolation; report the file actually written.
  puts "Scraped #{filename}"
  File.open(filename, 'w') { |f|
    farg = fix_encoding(script)
    # check_encoding(farg)
    f.write(JSON.pretty_generate(farg)+"\n")
  }
  filename
end
# Downloads a URL to a local file, retrying up to 4 times on HTTP errors
# and timeouts with a 10 second pause between attempts.
def download_file(url, dest)
  retryable(:tries => 4, :on => [OpenURI::HTTPError, Timeout::Error], :sleep => 10) do |retries|
    puts "downloading #{url} to #{dest}#{retries > 0 ? " TRY #{retries}" : ""}"
    open(url, 'rb') do |u|
      File.open(dest, 'wb') { |f| f.write(u.read) }
    end
  end
end
# Stores file contents as a blob at the given path in the gitrb repo.
# An empty file is represented by the empty string; contents==nil
# indicates an internal error upstream, so fail loudly.
def copy_file(repo, filename, contents)
  # BUG FIX: the message previously contained a mangled "#(unknown)"
  # interpolation; report the actual filename.
  raise "no data in #{filename}: #{contents.inspect}" unless contents
  repo.root[filename] = Gitrb::Blob.new(:data => contents)
end
def cleanpath(path) # lifted from git-wiki
  # Normalizes a relative path: strips leading slashes/whitespace, drops
  # blank and "." components, and resolves ".." against the components
  # seen so far (a leading ".." is simply discarded), so an archive
  # entry can't escape the repo root.
  components = path.gsub(/^[\/\s]*/, '').split('/')
  resolved = []
  components.each do |component|
    next if component =~ /^\s*$/
    case component
    when '.'
      next
    when '..'
      resolved.pop
    else
      resolved << component
    end
  end
  resolved.join('/')
end
# Copies one extracted file into the repo, guessing the proper vim
# runtime location: bare .vim files are routed into a directory chosen
# by script type, bare .txt files go to doc/, and files already laid
# out as "<pkg>/<vimdir>/<file>" are hoisted so the vimdir sits at the
# repo root. Everything else is copied to its cleaned path unchanged.
def smart_copy_file(repo, script, filename, contents)
  filename = cleanpath(filename)
  if filename =~ /^[^\/]+\.vim/
    # a .vim file in the root directory
    case script['script_type']
    when 'color scheme' then copy_file(repo, "colors/" + filename, contents)
    when 'ftplugin' then copy_file(repo, "ftplugin/" + filename, contents)
    when 'game' then copy_file(repo, "plugin/" + filename, contents)
    when 'indent' then copy_file(repo, "indent/" + filename, contents)
    when 'syntax' then copy_file(repo, "syntax/" + filename, contents)
    when 'utility' then copy_file(repo, "plugin/" + filename, contents)
    when 'patch' then copy_file(repo, "plugin/" + filename, contents)
    else
      # if this fires, they must have added more script types?!
      # BUG FIX: the message previously contained a mangled
      # "#(unknown)" interpolation; report the actual filename.
      raise "Don't know where to put #{filename} for #{script['script_type']}"
    end
  elsif filename =~ /^[^\/]+\.txt/
    copy_file(repo, "doc/" + filename, contents)
  elsif filename =~ /^[^\/]+\/(#{$vimdirs.join('|')})\/([^\/]+)$/
    # developer put vimfiles in a subdir, i.e. fixit/plugin/fixit.vim.
    copy_file(repo, "#{$1}/#{$2}", contents)
  else
    copy_file(repo, filename, contents)
  end
end
# Walks a directory tree and smart-copies every regular file into the
# repo, using paths relative to dir. An optional block can veto files:
# it receives the relative path and a truthy return skips the file.
def copy_filesystem(repo, script, dir)
  Find.find(dir) do |path|
    next if test(?d, path)
    localpath = path.sub(dir, '').sub(/^\/+/, '')
    next if localpath =~ /^\s*$/
    next if block_given? && yield(localpath)
    smart_copy_file(repo, script, localpath, File.read(path))
  end
end
# Extracts a vimball by running vim itself in batch mode with the
# runtimepath pointed at a temp dir, then copies the extracted files
# into the repo (skipping vimball bookkeeping files).
def extract_vimball(repo, script, vimball)
  Dir.mktmpdir($vimscripts_name) do |tmpdir|
    cmd = ['/usr/bin/vim', vimball, '-X', '-n', '-c', ':set nomore', '-c', ":set runtimepath=#{tmpdir},$VIMRUNTIME", '-c', ':so %', '-c', ':q']
    raise "couldn't run #{cmd.join(' ')}: #{$?}" unless system(*cmd)
    copy_filesystem(repo, script, tmpdir) { |path| path == '.VimballRecord' || path == 'doc/tags' }
  end
end
# Prepares a vimball for extraction: normalizes line endings to unix
# (vim's vimball support dies badly on dos endings, and roughly 10% of
# uploads have them), writes the corrected copy to a tempfile, and
# hands it to extract_vimball.
def unvimball(repo, script, vimball)
  # vimball dies badly if called on a file with the wrong line endings
  # probably 10% of line endings are dos so we always perform the correction
  Tempfile.open($vimscripts_name) do |ttfile|
    ball = File.read(vimball)
    ball.force_encoding("ASCII-8BIT")
    ball.gsub!("\r\n", "\n")
    # get rid of a useless "hit return to continue" prompt. vimballs suck.
    ball.gsub!(/call input\(\"\<Hit Return to continue\>\"\)/, '') if script['script_id'].to_i == 293
    ttfile.write(ball)
    ttfile.close
    extract_vimball(repo, script, ttfile.path)
  end
end
# Extracts an archive by shelling out: runs cmd (an argv array, e.g.
# ['/usr/bin/unzip']) with the archive's absolute path appended, inside
# a temp dir, then copies the extracted tree into the repo.
def unshell(repo, script, localpath, cmd)
  Dir.mktmpdir($vimscripts_name) do |tmpdir|
    fullpath = File.expand_path(localpath)
    Dir.chdir(tmpdir) do
      cmd = [*cmd, fullpath]
      raise "couldn't run #{cmd.join(' ')}: #{$?}" unless system(*cmd)
    end
    copy_filesystem(repo, script, tmpdir)
  end
end
# Copies every regular file from a zip archive into the repo via
# smart_copy_file. Symlinks are skipped with a warning.
def copy_zipfile(repo, script, zip_path)
  Zip::ZipFile.open(zip_path) do |zipfile|
    zipfile.each do |entry|
      next unless entry.file?
      if entry.symlink?
        puts "WARNING: ignoring symlink in #{zip_path}: #{entry.name}"
      else
        smart_copy_file(repo, script, entry.name, entry.get_input_stream.read)
      end
    end
  end
end
# Copies every regular file from a tar archive (a path or IO) into the
# repo via smart_copy_file. entry.read returns nil for empty files, so
# substitute the empty string to satisfy copy_file's nil check.
def copy_tarfile(repo, script, tarfile)
  Archive::Tar::Minitar::Reader.open(tarfile) do |tf|
    tf.each_entry do |entry|
      next unless entry.file?
      smart_copy_file(repo, script, entry.full_name, entry.read || '')
    end
  end
end
# Yields a GzipReader stream for the given gzip file and returns the
# block's result (Zlib::GzipReader.open returns the block value).
def ungzip(file)
  Zlib::GzipReader.open(file) do |stream|
    yield stream
  end
end
# Yields a Bzip2 reader stream for the given bzip2 file and returns
# the block's result.
def unbzip2(file)
  Bzip2::Reader.open(file) { |bz| yield(bz) }
end
# Yields an IO streaming the decompressed contents of an xz file,
# produced by shelling out to /usr/bin/xz.
def unxz(file)
  IO.popen(['/usr/bin/xz', '-d', '--to-stdout', file]) do |contents|
    yield(contents)
  end
end
# Downloads one release's archive into $packages_dir, under a per-script
# directory and a "date version filename" name, skipping the download
# if the file is already cached. Returns the local path.
def download_package(version, script)
  pkgname = "#{version['date']} #{filenameify(script_version(version))} #{filenameify(version['filename'])}"
  pkgdir = File.join($packages_dir, "#{'%04d' % script['script_id']} - #{filenameify(script['name'])}")
  Dir.mkdir pkgdir unless test ?d, pkgdir
  pkgfile = File.join(pkgdir, pkgname)
  download_file(version['url'], pkgfile) unless test ?f, pkgfile
  return pkgfile
end
# Figures out the payload inside a gz/bz2 wrapper: a tarball is handed
# to copy_tarfile, a vimball (detected by the UseVimball marker near the
# start) goes through unvimball via a tempfile, and anything else is
# treated as a single file. Deliberately does not recurse into
# sense_file — that would be too complex.
def sense_zipped_file(repo, script, filename, infile)
  # sense the payload of a gz or bz2 file. don't want to recurse into sense_file: too complex.
  contents = infile.read
  contents.force_encoding('ASCII-8BIT')
  if MimeMagic.by_magic(contents) == 'application/x-tar'
    copy_tarfile(repo, script, StringIO.new(contents))
  elsif contents[0..512] =~ /^\bUseVimball\s*$/
    Tempfile.open($vimscripts_name) do |ttfile|
      ttfile.write(contents)
      ttfile.close
      unvimball(repo, script, ttfile.path)
    end
  else
    smart_copy_file(repo, script, filename, contents)
  end
end
# True when the given type collection (or string) mentions any of the
# compressed-archive mime types we handle.
def is_some_sort_of_zipfile(type)
  %w{application/zip application/x-gzip application/x-bzip}.any? { |archive| type.include?(archive) }
end
# Reconciles a download's claimed type (from its extension) with its
# magic-number type and returns the possibly-corrected name that
# add_version dispatches on. Returns nil when the file was already
# copied into the repo here (raw/unrecognized-but-known cases). Raises
# when extension and magic disagree in a way this function doesn't yet
# handle — the fix is always to extend this function.
def sense_file(repo, actual_name, pkgfile)
  # some files lie about their type (claim to be .zips but are just .vim files). fix em.
  extension_type = MIME::Types.type_for(actual_name)
  magic_type = File.open(pkgfile) { |f| MimeMagic.by_magic(f) }
  if ( extension_type.include?(magic_type) ||
       (extension_type.include?('application/x-bzip2') && magic_type == 'application/x-bzip') ||
       (extension_type.include?('application/x-rar-compressed') && magic_type == 'application/x-rar') )
    # we're good, extension and magic match up
  else
    # need to figure out what's going on and fix it.
    if (magic_type.nil? || magic_type.text?) && actual_name =~ /#{$textext}/
      # no problem, it's a textfile
    elsif actual_name =~ /\.vba$|\.vimball$/
      # arg, a vimball. let it through. see sense_zipped_file for vimball sensing.
    elsif actual_name =~ /^[^a-z]*\.VIM$/
      # i guess dos users might do everything in caps
      actual_name.downcase!
    elsif actual_name =~ /^(.*)\.VIM$/
      actual_name = "#{$1}.vim"
    elsif ( %w{application/x-awk application/x-perl application/x-ruby application/x-shellscript application/xml}.include?(magic_type.to_s) ||
            (magic_type == nil && %w{vimopen cleanswap cvsvimdiff vmake vim_menu_HTMLpol}.include?(actual_name)) ||
            %w{zshrc vimrc _vimrc .vimrc}.include?(actual_name) ||
            actual_name =~ /\.dict$/ || %w{pydiction xdebug2}.include?(actual_name) || # dictionaries
            actual_name =~ /\.applescript$/ ||
            magic_type == 'application/x-java' ||
            magic_type == 'application/x-ms-dos-executable' )
      # names and magic have failed us, copy these files over raw
      copy_file(repo, actual_name, File.read(pkgfile))
      actual_name = nil
    elsif magic_type == 'application/zip'
      actual_name += '.zip'
    elsif magic_type == 'application/x-gzip'
      actual_name += '.gz'
    elsif magic_type == 'application/x-bzip'
      actual_name += '.bz2'
    elsif magic_type == 'application/x-7z-compressed'
      actual_name += '.7z'
    elsif magic_type == 'application/x-xz'
      actual_name += '.xz'
    elsif is_some_sort_of_zipfile(extension_type)
      # extension claims zipfile but magic disagrees, this happens a lot
      actual_name.sub!(/\.zip$|\.tar\.gz$|\.tar\.bz2?$|\.tgz$|\.tbz2?$/i, '')
      if magic_type == 'text/x-python'
        actual_name += '.py'
      elsif magic_type.nil? || magic_type.text? # chances are it's a vimscript...?
        actual_name += '.vim' # http://www.vim.org/scripts/script.php?script_id=29
      elsif magic_type == 'application/x-tar'
        actual_name += '.tar'
      else
        # just need to hope that nobody makes this mistake anymore. if they do, fix this function.
        raise "unknown failed zip type for #{actual_name}: #{magic_type}"
      end
    elsif magic_type == 'application/x-macbinary'
      copy_file(repo, actual_name + '.macbinary', File.read(pkgfile))
      actual_name = nil
    elsif !magic_type && actual_name =~ /^exUtility-[0-9.]*.tar$/
      # odd that magic couldn't sense this valid tarfile. owell, process it as normal.
    elsif extension_type.include?('application/x-tar') && magic_type == nil
      # magic has a bug where tarfiles are missed: http://github.com/minad/mimemagic/issues/#issue/1
      # if this happens, we just blindly trust the extension.
    else
      # there's such a range of reasons why this will happen that there's nothing to do but
      # have a human improve this function to handle this case. :(
      raise "differing mime types for #{actual_name}, ext claims #{extension_type} but magic is #{magic_type.inspect}"
    end
  end
  return actual_name
end
# Downloads one release's archive and adds its files to the repo,
# dispatching on the (corrected) filename: text files are copied
# directly, archives are extracted by format, vimballs go through vim
# itself. Raises on filetypes nobody has taught it about yet.
def add_version(repo, version, script)
  # adds all the files in the package to the repo
  pkgfile = download_package(version, script)
  actual_name = sense_file(repo, version['filename'], pkgfile)
  tough_zipfiles = [
    # ruby's built-in unzipper can't handle these perfectly valid zipfiles:
    'packages/2441 - pyflakes.vim/2009-12-29 1.57 pyflakes-vim.zip',
    'packages/2441 - pyflakes.vim/2010-06-17 1.62 pyflakes-vim.zip',
    # plus a bunch of files from scripts 3114, 3123, 3148, 3150, and 3169.
    # to be safe we'll just shell out to unzip for everything # newer than 3110.
    # http://www.ruby-forum.com/topic/211146#936159
  ]
  case actual_name
  when nil then # do nothing -- sense_file already copied the data in
  when /#{$textext}/ then smart_copy_file(repo, script, actual_name, File.read(pkgfile))
  when /\.zip$/i then tough_zipfiles.include?(pkgfile) || script['script_id'].to_i > 3110 ? unshell(repo, script, pkgfile, ['/usr/bin/unzip']) : copy_zipfile(repo, script, pkgfile)
  when /\.tar$/ then File.open(pkgfile, 'r') { |file| copy_tarfile(repo, script, file) }
  when /\.t?gz$/i then ungzip(pkgfile) { |contents| sense_zipped_file(repo, script, version['filename'].sub(/\.t?gz$/, ''), contents) }
  when /\.t?bz2?$/i then unbzip2(pkgfile) { |contents| sense_zipped_file(repo, script, version['filename'].sub(/\.t?bz2?$/, ''), contents) }
  when /\.xz$/ then unxz(pkgfile) { |contents| sense_zipped_file(repo, script, version['filename'].sub(/\.xz$/, ''), contents) }
  when /\.7z$/ then unshell(repo, script, pkgfile, ['/usr/bin/7za', 'x'])
  when /\.rar$/ then unshell(repo, script, pkgfile, ['/usr/bin/unrar', 'x'])
  when /\.vba$|\.vimball$/ then unvimball(repo, script, pkgfile)
  else
    # probably need to add a new text file extension to $textext.
    # if not, it's probably a new compression format.
    raise "unknown filetype: #{actual_name}"
  end
end
# Creates an annotated git tag for a release, committing it as the
# scraper identity (set temporarily via the GIT_COMMITTER_* env vars
# because gitrb shells out to git for tagging).
def tag_version(repo, version)
  ENV['GIT_COMMITTER_NAME'] = $vimscripts_name
  ENV['GIT_COMMITTER_EMAIL'] = $vimscripts_email
  sver = script_version(version)
  repo.git_tag('-a', gittagify(sver), '-m', "tag #{sver}")
  ENV.delete('GIT_COMMITTER_NAME')
  ENV.delete('GIT_COMMITTER_EMAIL')
end
# Returns the tag name if this release has already been tagged in the
# repo, nil otherwise (used to skip versions already committed).
def find_version(repo, version)
  tagname = gittagify(script_version(version))
  # gitrb doesn't handle annotated tags so we call git directly
  tag = repo.git_tag('-l', tagname).chomp
  tag = nil if tag =~ /^\s*$/
  return tag
end
# Recursively re-adds every blob of a gitrb tree via smart_copy_file,
# prefixing paths with inpath (used to hoist a lone top-level directory
# into the repo root).
def copy_tree(repo, script, tree, inpath=[])
  tree.to_a.each do |name, value|
    path = inpath + [name]
    if value.type == :blob
      smart_copy_file(repo, script, path.join('/'), value.data)
    else
      copy_tree(repo, script, value, path)
    end
  end
end
# If the repo root contains exactly one directory (and it isn't already
# a standard vim runtime dir), hoist its contents into the root so the
# repo drops cleanly into a bundle manager.
def check_for_lone_dir(repo, script)
  return unless repo.root.count == 1
  loner = repo.root.first.first
  return if $vimdirs.include? loner
  if repo.root[loner].type == :tree
    # copy contents of the directory into the root
    copy_tree repo, script, repo.root.delete(loner)
  end
end
# Renames any README shipped by the package to README.orig so the
# scraper's own README (added by the caller) doesn't clobber it.
def check_for_readme_file(repo)
  # we drop a README file into each repo. don't want to conflict with one already there.
  repo.root.to_a.each do |name, value|
    if name =~ /^README$/i
      raise "already have a readme.orig!" if repo.root.to_a.find { |n,v| n =~ /^readme\.orig$/i }
      repo.root[name + '.orig'] = repo.root.delete(name)
    end
  end
end
# Replays every release of the script as a git commit (oldest first,
# hence the reverse), each commit replacing the full tree, then tags it
# with its version. Releases already tagged are skipped. Returns the
# number of new commits created.
def store_versions_in_repo(repo, script)
  committer = Gitrb::User.new($vimscripts_name, $vimscripts_email)
  puts "Processing script #{script['script_id']}: #{script['name']}"
  count = 0
  script['versions'].reverse.each do |version|
    # author timestamp: midnight of the release date.
    # NOTE(review): Time.new receives string args from the date split —
    # confirm the target ruby coerces them as intended.
    author = Gitrb::User.new("#{version['author']['first_name']} #{version['author']['last_name']}",
      version['author']['email'], Time.new(*version['date'].split('-'), 0, 0, 0, 0))
    if find_version(repo, version)
      puts " skipped #{version['filename']} #{version['date']} #{script_version(version)}"
    else
      puts " adding #{version['filename']} #{version['date']} #{script_version(version)}"
      repo.transaction(version['release_notes'], author, committer) do
        # delete all existing blobs since we replace everything with the new commit
        repo.root.to_a.map { |name,value| repo.root.delete(name) }
        add_version(repo, version, script)
        check_for_lone_dir(repo, script)
        check_for_readme_file(repo)
        copy_file(repo, 'README', "This is a mirror of #{script_id_to_url(script['script_id'])}\n\n" + script['description'] + "\n") unless repo.root['README']
      end
      tag_version(repo, version)
      count += 1
    end
  end
  count
end
# Pushes a local bare repo to github. Creates the github repo if it
# doesn't exist (disabling the wiki and issues tabs), verifies by
# homepage URL that an existing remote repo belongs to this script id,
# then force-free pushes master and all tags over ssh. Credentials come
# from ./creds.json. Sleeps to respect github's 60-calls/minute API
# limit (the push itself doesn't count toward it).
def perform_push(repo_name)
  creds = Hashie::Mash.new(JSON.parse(File.read('creds.json')))
  github = Octopussy::Client.new(creds)
  repo = Gitrb::Repository.new(:path => repo_name.dup, :bare => true)
  script = JSON.parse(File.read(File.join(repo_name, $git_script_file)))
  start = Time.now
  api_calls = 0
  remote = github.repo("vim-scripts/#{script['name']}") rescue nil
  api_calls += 1
  if remote
    # make sure this actually is the same repo
    remote.homepage =~ /script_id=(\d+)$/
    raise "bad url on github repo #{script['name']}" unless $1
    # NOTE(review): this compares script['script_id'] (string in the
    # scraped json) against the $1 capture string — confirm ids are
    # always stored as strings or this check misfires.
    raise "remote #{script['name']} exists but id is for #{$1}" if script['script_id'] != $1
  end
  unless remote
    retryable(:tries => 4, :on => Timeout::Error, :sleep => 10) do |retries|
      puts "Creating #{script['script_id']} - #{script['name']}#{retries > 0 ? " TRY #{retries}" : ""}"
      remote = github.create(
        :name => script['name'],
        :description => "#{script['summary']}",
        :homepage => script_id_to_url(script['script_id']),
        :public => true)
    end
    api_calls += 1
    # turn off the issues and wiki tabs (wish 'create' would do that)
    retryable(:tries => 4, :on => [Timeout::Error, Octopussy::Unauthorized], :sleep => 10) do |retries|
      puts " disabling wiki/issues for #{script['script_id']} - #{script['name']}#{retries > 0 ? " TRY #{retries}" : ""}"
      github.set_repo_info(
        # http://support.github.com/discussions/api/97-api-requires-that-dots-be-escaped
        'vim-scripts/' + script['name'].gsub('.', '%2E'),
        :has_issues => false, :has_wiki => false)
    end
    api_calls += 1
  end
  repo.git_remote('rm', 'origin') rescue nil
  # http://help.github.com/multiple-keys
  repo.git_remote('add', 'origin', "git@vim-scripts.github.com:vim-scripts/#{script['name']}")
  retryable(:tries => 6, :on => Gitrb::CommandError, :sleep => 15) do |retries|
    # Gitrb::CommandError is as close to a network timeout error as we're going to get
    puts "Pushing #{script['script_id']} - #{script['name']}#{retries > 0 ? " TRY #{retries}" : ""}"
    repo.git_push('--tags', 'origin', 'master')
  end
  # sleep to avoid bumping into github's 60-per-minute API limit
  # the push doesn't count toward the API limit
  stop = Time.now
  sleep api_calls-(stop-start) if stop-start < api_calls
  # we don't want any provision for forcing pushes. if you delete
  # and recreate a repo, you must manually delete and recreate it
  # on github. forcing is bad, avoid at all costs.
  # we should have a script that will compare the full list of
  # repos on github and here and print any differences. that is
  # not a part of this script's job.
  # Octopussy.list_repos('vim-scripts')
  # Octopussy.delete("vim-scripts/#{ghname}")
  # no need to reset the remote because presumably we created this
  # repo and the remote is already set correctly.
end
# Scrapes one script page, resolves any naming collision with already-
# scraped scripts, and writes the json file. Returns the json filename,
# or nil when the script was skipped (deleted/missing/empty).
def perform_scrape(script_id)
  script = scrape_script(script_id)
  write_script(resolve_name_conflicts(script))
end
# Converts a scraped json file into a bare git repo under $repos_dir,
# committing any versions not yet present. Returns [repo_path,
# new_commit_count], or [nil, -1] when script_file is nil (skipped).
def perform_download(script_file)
  return [nil,-1] unless script_file
  script = JSON.parse(File.read(script_file))
  repo_path = File.join($repos_dir, "#{'%04d' % script['script_id']} - #{filenameify(script['name'])}.git")
  # if gitrb is dying on the following line, you need to upgrade
  repo = Gitrb::Repository.new(:path => repo_path, :bare => true, :create => true)
  File.open(File.join(repo_path, $git_script_file), 'w') { |f| f.write(JSON.pretty_generate(script)+"\n") }
  count = store_versions_in_repo(repo, script)
  [repo_path, count]
end
# Full cycle for one script id: scrape, build/update the git repo, and
# push to github when new commits were made (or unconditionally when
# always_push is set).
def perform_all(script_id, always_push=false)
  repo_name, count = perform_download(perform_scrape(script_id))
  perform_push(repo_name) if always_push || count > 0
end
# Incremental scrape driven by the rss feed: fetches the feed, finds the
# entries newer than the last processed entry id (persisted in the
# ./last_rss_id file), and runs a full scrape/download/push for each,
# oldest first, checkpointing progress after every entry. Raises when
# the remembered entry id has fallen off the feed (a full scrape is
# needed to catch up).
def perform_rss
  url = 'http://feed43.com/vim-scripts.xml'
  feed = Feedzirra::Feed.fetch_and_parse(url)
  puts "feed contains #{feed.entries.map { |e| script_id_from_url(e.url) }.inspect}"
  last_rss_id_path = 'last_rss_id'
  last_rss_id = File.read(last_rss_id_path) rescue nil
  entries = if last_rss_id
    last_rss_id.gsub!(/[^A-Fa-f0-9]/, '')
    last_index = feed.entries.index { |e| e.entry_id == last_rss_id }
    if last_index
      last_index == 0 ? [] : feed.entries[0..(last_index - 1)]
    else
      # if this happens very often, the scraper could just initiate a full scrape now.
      # since it takes hours, however, we'd have to add locking so the cron job doesn't
      # stomp all over itself (and worry about stalled jobs or stale pids).
      # BUG FIX: removed the unreachable pull-everything fallback that
      # sat after this raise; it could never execute.
      raise "Lost feed data. Need to perform a full scrape!"
    end
  else
    puts "last_rss_id not found, pulling all #{feed.entries.count} items in feed."
    puts "WARNING: assuming you just did a full scrape. Bad news if you didn't!"
    feed.entries
  end
  puts " processing #{entries.map { |e| script_id_from_url(e.url) }.inspect}"
  entries.reverse.each do |entry|
    script_id = script_id_from_url(entry.url)
    puts "Fetching script #{script_id} for rss #{entry.entry_id}"
    # always_push=true because there's a chance the previous scrape failed due to
    # a transient network error. next time we run, we may find that there are
    # no versions to download, but the repo still needs to be pushed.
    perform_all(script_id, true)
    # checkpoint after each entry so a crash resumes where it left off
    File.open(last_rss_id_path, 'w') { |f| f.write(entry.entry_id) }
  end
end
# Command-line dispatch (see the usage comment at the top of the file):
#   no args        -> incremental rss scrape
#   "1234"         -> full scrape/download/push of that script id
#   "-1234"        -> scrape only, storing the json (debugging)
#   existing file  -> convert a stored .json into a git repo
#   existing dir   -> push a bare git repo to github
if ARGV.empty?
  $ignore_cache = 1
  perform_rss
else
  ARGV.each do |arg|
    if arg =~ /^\d+$/
      $ignore_cache = 1
      perform_all arg
    elsif arg =~ /^-(\d+)$/
      # BUG FIX: this previously passed the raw "-NNN" argument through,
      # producing a bogus "script_id=-NNN" url that vim.org rejects;
      # pass the captured positive id instead.
      perform_scrape $1
    elsif test ?f, arg
      perform_download arg
    elsif test ?d, arg
      perform_push arg
    else
      raise "Could not recognize argument #{arg}"
    end
  end
end
Jump to Line
Something went wrong with that request. Please try again.