/
scraper
executable file
·1664 lines (1397 loc) · 61 KB
/
scraper
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env ruby
# vim scripts scraping monster
# This code is Copyright (c) 2010 Scott Bronson
# Released under the MIT License.
#
# DEPENDENCIES
# This script requires Ruby 1.9.2.
# Make sure you have unzip, unrar, 7za, and xz installed.
# Ubuntu: sudo apt-get install unzip unrar p7zip-full xz-utils
# Macintosh: sudo port install unrar p7zip xz
# Also, for gems:
# Ubuntu: sudo apt-get install libxml2-dev libxslt1-dev zlib1g-dev libbz2-dev libcurl4-openssl-dev
# Fedora: sudo yum install bzip2-devel libxml2-devel libxslt-devel libcurl-devel
#
# FULL SCRAPE:
# To start a full scrape, do this:
# rm -f state.json
# FOREVER=1 ./scraper
# Now the scraper is warm and ready to perform continuous RSS scraping.
#
# AUTOMATIC RSS Scrape:
# Run the scraper with no args to perform an rss scrape. It downloads
# new scripts in the rss feed and remembers its position for next time.
# ./scraper
#
# MANUAL Debugging:
# - with a positive number, does a full scrape / compile / upload cycle
# ./scraper 987
# ./scraper $(seq 1001 2000)
# - with a negative number, scrapes but does not compile or upload
# ./scraper -987
# - with a .json file, compiles the git repo
# ./scraper scripts/0987*
# - with a bare git repo, pushes the repo up to github
# ./scraper repos/0987*
#
# TESTING:
# You may think that this script has no tests. Not true! The run-test
# script creates a repo for every script in the system, runs git log on
# it, and dumps the results. If anything has changed, you'll see it.
# JOBS=4 ./run-test stock
# cd result/stats-stock
# git status; git diff; etc.
# JOBS (optional): splits args into N groups and runs them in parallel.
#
# Because authors are always renaming scripts, changing email addrs and
# readmes, and deleting revisions, a from-scratch scrape will always
# be different from the repos in github. Therefore, you should do this
# after every few scrapes:
# ./scraper --dump
# In the scraper-stats repo, the master branch contains the ongoing repos
# and the test branch contains the recreate-from-scratch each time repos.
#
# RECOMPILING:
# Sometimes the scraper generates bad repos but you don't discover this
# until after they've been pushed to github. No problem! Create a
# text file containing the names of the repos that need fixing and then
# delete and recreate them:
# ./delete-repos $(cat BADNAMES)
# FOREVER=1 ./scraper $(cat BADNAMES)
# If there's a network or github error, just rerun the command until it
# succeeds.
#
# RESCUING:
# If you deleted a script repo and want to restore your local copy
# with the one on github:
# git clone --bare git://github.com/vim-scripts/SCRIPTNAME
# ./scraper SCRIPTID # to restore internal state
# Now the script is exactly the same as if it had been scraped
# locally from the start.
$:.unshift './lib'
require 'rubygems'
require 'bundler'
Bundler.require
require 'hpricot' # hpricot gem
require 'open-uri'
require 'cgi'
require 'json' # json gem
require 'zlib'
require 'bzip2' # bzip2-ruby gem
require 'mime/types'
require 'mimemagic' # mimemagic gem
require 'tmpdir'
require 'tempfile'
require 'find'
require 'octokit' # octokit gem
require 'hashie' # hashie gem
require 'htmlentities' # htmlentities gem
require 'feedzirra' # feedzirra gem
require 'erubis' # erubis gem
require 'mail' # mail gem
require 'fileutils'
require 'open3'
require 'retryable'
require 'github'
require 'gitrepo'
include Retryable
# Configure the retryable gem (included above).
# :on => [] means that we won't retry anything unless the caller
# specifies the exact exceptions that it wants to be retried.
retryable_options :detect_nesting => true, :tries => 4,
                  :sleep => lambda { |n| 4 ** n }, :on => []

# This is the name and email address of the git committer.
$vimscripts_name = "Able Scraper"
$vimscripts_email = 'scraper@vim-scripts.org'

# The feed polled for new/updated scripts during an rss scrape.
$rss_url = 'http://feed43.com/vim-scripts.xml'
$idle_count = 10 # number of scripts to scrape when we have nothing else to do
# NOTE(review): 14.minutes + 30.seconds relies on ActiveSupport-style duration
# helpers, presumably pulled in by Bundler.require above -- confirm it's in the Gemfile.
$max_run_time = 14.minutes + 30.seconds # script exits normally once this time has elapsed

# On-disk layout.
$repos_dir = ENV['REPOS_DIR'] || 'repos' # if a repo has been renamed there will be duplicate ids in here
$scripts_dir = 'scripts' # should never be any duplicate ids in here
$packages_dir = 'packages'
$webcache_dir = 'webcache'
$git_script_file = 'vim-script.json' # the file in the bare repo that stores the script that generated it
$state_file = 'state.json' # keeps track of what mode we're in (full vs. rss scrape)

$pushing = true # set this to false to prevent pushing (normally, if we find changes in a repo, we push)
$ignore_cache = false # we pull from the webcache if we can, setting to true forces pulling from the network

# The directory names vim treats as significant inside a runtime path.
$vimdirs = %w{after autoload bin compiler colors doc ftdetect ftplugin indent keymap plugin syntax}
# Regex alternation of file extensions that should be treated as plain text.
$textext = %w{au3 bat c cpp csh diff h klip patch pl pm ps py rb set sh snip snippet snippets tcl txt xml vim vim.orig}.map { |x| "\\.#{x}$" }.join('|')

# vim.org doesn't offer any way to delete a script so people have invented all sorts of ways of doing it.
# 1022 2668 2730, and 3080 have also been deleted but I give up. This is good enough.
# 3519 is simply an obsolete copy of jquery and really should be deleted
$deleted_scripts = %w{364 548 549 550 762 1032 1129 1263 1280 1295 1301 1430 1436 1452 1509 1562 1669 1789 1824 1949 2056 2172 2306 2309 2313 2316 2318 2323 2352 2456 2498 2664 2861 3076 3080 3519}

# not sure why an author would leave a corrupt package on vim scripts forever but oh well.
# also we can't trust the script version and date pair to be unique: script 2709 SudoEdit.vim
$skip_packages = {
  1609 => ["2006-10-06 3.8", "2006-10-13 3.9", "2006-11-07 4.1.0", "2006-11-08 4.2.0", "2006-11-18 4.3.0"],
  3075 => ["2010-07-28 0.12", "2010-07-28 0.11"],
}

# some scripts were created with the wrong type. this fixes the ones that matter.
# http://groups.google.com/group/vim_use/msg/6f9f82e8c6fb4faa
# note that you must re-scrape after adding a fix ("./scraper 1780").
# regenerating ("./scraper scripts/1780*") is not sufficient in this case.
$script_type_fixes = {
  93 => 'ftplugin',
  1780 => 'syntax',
}

# if the regex matches the path, it is run through gsub and the result
# used as the new path. If the replace string is nil, the file is
# suppressed. Doesn't work for gifs and a few others (easy to fix).
$file_location_fixes = {
  284 => { /([^\/]+\.vim)$/ => 'ftplugin/tex/\1' }, # all .vim files go in ftplugin/tex
  1095 => { /^.*\/([^\/]+\.vim)$/ => 'ftplugin/tex/\1' }, # all .vim files go in ftplugin/tex
  1771 => { /readme$/ => nil }, # has identical README and readme files.
  2651 => { /^(.*)Syntax(.*)$/ => '\1syntax\2' }, # breaks on case sensitive filesystems
  3027 => { /^root\/\.vim\/(.*)$/ => '\1' }, # grsecurity balled up his entire root dir
}

# at least one script is actually multiple scripts -- 790 has python.vim and python3.0.vim
# this forces package names matching the regex out to a different branch
$branch_versions = { 790 => { :branch => 'python3', :regex => /^python3\.0\.vim$/ } }

# Version nazi
raise "Must run under Ruby 1.9.2" unless RUBY_VERSION == "1.9.2"

# Create the working directories on first run.
Dir.mkdir $scripts_dir unless test ?d, $scripts_dir
Dir.mkdir $repos_dir unless test ?d, $repos_dir
Dir.mkdir $packages_dir unless test ?d, $packages_dir
Dir.mkdir $webcache_dir unless test ?d, $webcache_dir

# this mime magic is far too vague. it false-triggers all the time.
MimeMagic.remove 'application/x-gmc-link'
# Exception hierarchy for failures that the retry machinery may retry.
class ScrapeError < RuntimeError; end # retryable problem when scraping
class SourceForgeError < ScrapeError; end # sourceforge being stupid
class NoContentError < ScrapeError; end # page appears to be rendered incorrectly
# Turns out Ruby isn't very good about limiting the types of errors
# we need to handle... These are the ones that make sense to retry.
# Passed as the :on option to retryable() throughout this script.
def retryable_errors
  [
    ScrapeError,
    Errno::ECONNRESET,
    Timeout::Error,
    Errno::ETIMEDOUT, # Connection timed out - connect(2) (Errno::ETIMEDOUT)
    OpenURI::HTTPError,
    SocketError, # getaddrinfo: Name or service not known (SocketError)
    GitRepo::GitError,
  ]
end
# http://github.com/hpricot/hpricot/issues#issue/25
# super ugly that hpricot manages to screw up charset encodings so badly.
# hopefully the next version of hpricot allows us to get rid of these monkeypatches
module Hpricot
  module Traverse
    # Replacement inner_text that forces strings hpricot mislabelled as
    # binary or latin-1 over to UTF-8 while joining the child text nodes.
    def inner_text
      if respond_to?(:children) and children
        # children.map { |x| x.inner_text }.join
        children.map { |x| str = x.inner_text;
          str = str.dup.force_encoding('ISO-8859-1').encode('UTF-8') if str.encoding.to_s == 'ASCII-8BIT' || str.encoding.to_s == 'ISO-8859-1';
          str
        }.join
      else
        ""
      end
    end
  end

  # Entity-unescaping helper with the same UTF-8 forcing as above:
  # named entities (unknown ones become '?') then numeric entities.
  def self.uxs(str)
    str = str.to_s
    str = str.dup.force_encoding('ISO-8859-1').encode('UTF-8') if str.encoding.to_s == 'ASCII-8BIT' || str.encoding.to_s == 'ISO-8859-1'
    str.gsub(/\&(\w+);/) { [NamedCharacters[$1] || 63].pack("U*") }.
      gsub(/\&\#(\d+);/) { [$1.to_i].pack("U*") }
  end
end
# and hopefully a future version will allow us to remove these monkeypatches:
# inner_content is an entity-decoding, encoding-fixing variant of inner_text.
module Hpricot
  module Traverse
    # Joined inner_content of all children; '' for childless nodes.
    def inner_content
      if respond_to?(:children) and children
        children.map { |x| x.inner_content }.join
      else
        ''
      end
    end
  end

  class Elements < Array
    def inner_content
      map { |x| x.inner_content }.join
    end
  end

  class Text
    # Decode HTML entities after forcing mislabelled binary text to UTF-8.
    # NOTE(review): the gsub below appears to replace a space with a space;
    # it was most likely a non-breaking space (U+00A0) that got mangled in a
    # copy/paste at some point -- confirm against the upstream source.
    def inner_content
      str = content.to_s
      str = str.force_encoding('ISO-8859-1').encode("UTF-8") if str.encoding.to_s == 'ASCII-8BIT'
      CGI.unescapeHTML(HTMLEntities.new.decode(str.gsub(/ /, " ")))
    end
  end

  class CData
    alias_method :inner_content, :content
  end
end
# There's a bizarre bug when passing a Hashie::Mash to JSON.pretty_generate.
# See https://github.com/bronson/whose-bug for an attempt to track it down.
# Work around it by flattening a Mash to a plain Hash before generating.
def json_pretty arg
  plain = arg.kind_of?(Hashie::Mash) ? arg.to_hash : arg
  "#{JSON.pretty_generate(plain)}\n"
end
# When running normally, we can be in one of two states: rss and full.
# rss is when the rss is synced and we can update only the scripts that have changed.
# full is when we're just starting or we have lost the rss sync.
# Returns the saved state as a Hashie::Mash; empty on the first run or when
# $state_file is missing/unparseable (the rescue nil is deliberate).
def read_state
  json = JSON.parse(File.read($state_file)) rescue nil
  Hashie::Mash.new json
end
# Persist the scraper state back to $state_file as pretty-printed JSON.
def write_state state
  File.open($state_file, 'w') { |f| f.write json_pretty(state) }
end
# vim.org has added a new email obfuscation trick: replacing @ and . with images.
# Swap those images back to their characters, then return the element's text.
def unfuddle_email elem
  elem.search('img[@src*=emailat]' ).each { |e| e.swap("@") }
  elem.search('img[@src*=emaildot]').each { |e| e.swap(".") }
  elem.inner_text
end
# Open url through the $webcache_dir cache, yielding the open File.
# Downloads (via open-uri's Kernel#open) only when $ignore_cache is set
# or the cached copy is missing/empty; retries is only used for logging.
def cached_open url, retries
  path = File.join $webcache_dir, filenameify(url)
  # ignore_cache should be true if we're running a real scrape and should
  # never use stale data. Otherwise, leave it false so your testing and
  # debugging will just use the local file system.
  if $ignore_cache || !File.exist?(path) || File.size(path) <= 0
    File.open(path, 'w') do |f|
      puts "downloading #{url}#{retries > 0 ? " RETRY #{retries}" : ""}"
      open(url) { |u| f.write(u.read) }
    end
  end
  File.open(path) { |f| yield f }
end
# Fetch url (through the web cache) and parse it with Hpricot.
# Raises SourceForgeError (retryable) when the site serves its
# well-known "Unknown Site" garbage page instead of content.
def scrape_page url, retries
  doc = cached_open(url, retries) { |f| Hpricot(f) }
  # vim.org (SourceForge) seems to have a bug where every thousand requests or so
  # it returns a page indicating that it's lost its head. No big deal, immediately
  # retrying always seems to fix it.
  if doc.search('title').inner_text == "Unknown Site"
    puts "Sourceforge lost its head for #{url} on try #{retries}, trying again."
    raise SourceForgeError.new "Sourceforge blew #{url}"
  end
  return doc
end
# Scrape a vim.org user profile page into a hash of profile fields,
# memoized per-process in the $authors array (indexed by numeric user id).
# Raises NoContentError (retryable) when the page arrives without its fields.
def scrape_author user_id, retries
  $authors ||= []
  unless $authors[user_id.to_i]
    doc = scrape_page "http://www.vim.org/account/profile.php?user_id=#{user_id}", retries
    unless doc.at('td[text()="user name"]')
      puts " no content received #{retries}"
      raise NoContentError.new "bad page"
    end
    u = {'user_id' => user_id }
    u['user_name'] = doc.at('td[text()="user name"]').next_sibling.inner_content
    u['first_name'] = doc.at('td[text()="first name"]').next_sibling.inner_content
    u['last_name'] = doc.at('td[text()="last name"]').next_sibling.inner_content
    u['email'] = unfuddle_email doc.at('td[text()="email"]').next_sibling
    u['homepage'] = doc.at('td[text()="homepage"]').next_sibling.inner_content
    $authors[user_id.to_i] = u
  end
  return $authors[user_id.to_i]
end
def script_id_to_url(script_id)
  # Canonical vim.org page for the given script id.
  'http://www.vim.org/scripts/script.php?script_id=%s' % script_id
end
def script_id_from_url(url)
  # Pull the numeric script_id query parameter back out of a vim.org URL.
  match = url.match(/[?&;]script_id=(\d+)/)
  raise "Could not parse a script id from <<#{url}>>" unless match
  match[1]
end
# Scrape one script's vim.org page into a hash: name, summary, type,
# description, install details, and a 'versions' array with one entry per
# release (download url, filename, version, date, author profile, notes).
# Returns nil when the script is deleted, nonexistent, or has no versions.
def scrape_script(script_id)
  script_id = script_id.to_s
  if $deleted_scripts.include? script_id
    puts "Skipped #{script_id} -- deleted."
    return nil
  end
  s = {'script_id' => script_id}
  doc = nil
  retryable(:on => retryable_errors) do |retries|
    doc = scrape_page script_id_to_url(script_id), retries
    if doc.search('title').inner_text == "Error : vim online"
      puts "Skipped #{script_id} -- doesn't exist."
      return nil
    end
    # the page title is "name : summary"
    s['display_name'], s['summary'] = doc.search('.txth1').inner_content.split(" : ", 2)
    unless s['display_name']
      puts " no content received #{retries}"
      raise NoContentError.new "bad page"
    end
  end
  s['name'] = githubify(s['display_name'])
  # manual corrections in $script_type_fixes override the page's type
  s['script_type'] = $script_type_fixes[script_id.to_i] ||
    doc.at('td[text()="script type"]').parent.next_sibling.children.first.inner_content
  desc = doc.at('td[text()="description"]').parent.next_sibling.children.first
  desc.search('br').each do |br|
    # restore the newline to every element preceding a br.
    prev = br.previous;
    if prev && prev.text?
      prev.content = prev.content + "\r" unless prev.content.end_with?("\r")
    else
      br.before "\r"
    end
  end
  s['description'] = desc.inner_content.gsub("\r", "\n")
  if false # we don't use this info and it generates too much noise
    doc.search('td.lightbg~td').find { |e| e.inner_text =~ /Rating.*\s(-?\d+)\/(\d+),.*Downloaded[^\d]*(\d+)/m }
    s['rating_total'], s['rating_votes'], s['downloads'] = $1, $2, $3 # http://www.vim.org/karma.php
  end
  s['install_details'] = doc.at('td[text()="install details"]').parent.next_sibling.children.first.inner_content.gsub("\r", "\n")
  # reject links with targets so download links in the description don't appear to be a version (script 1843)
  s['versions'] = doc.search('a[@href*="download_script.php?"]').select { |e| e.attributes['target'].empty? }.to_a.map do |a|
    v = {'url' => 'http://www.vim.org/scripts/' + a.attributes['href'],
         'filename' => a.inner_content}
    row = a.parent
    v['script_version'] = row.siblings_at(1).inner_content
    v['date'] = row.siblings_at(2).inner_content
    v['vim_version'] = row.siblings_at(3).inner_content
    retryable(:on => retryable_errors) do |retries|
      v['author'] = scrape_author(row.siblings_at(4).at('a').attributes['href'].match(/\d+/)[0], retries)
    end
    v['release_notes'] = row.siblings_at(5).inner_content.gsub("\r", "\n")
    v
  end
  if s['versions'].empty?
    puts "Skipped #{script_id} -- empty."
    return nil
  end
  s
end
def fix_encoding(h)
  # see the Hpricot monkey patch above. It gave us random encodings;
  # recursively walk the structure and transcode every mislabelled
  # binary/latin-1 string to UTF-8 before serializing. ugly!!
  case h
  when Hash
    h.each_with_object({}) { |(k, v), out| out[fix_encoding(k)] = fix_encoding(v) }
  when Array
    h.map { |item| fix_encoding(item) }
  when String
    enc = h.encoding.to_s
    if enc == 'ASCII-8BIT' || enc == 'ISO-8859-1'
      h.dup.force_encoding('ISO-8859-1').encode('UTF-8')
    else
      h
    end
  else
    h
  end
end
def check_encoding(h)
  # Debugging aid: recursively prints the encoding of every string in h
  # (first 50 chars of each, so the output stays readable).
  case h
  when Hash
    h.each_pair do |k, v|
      check_encoding(k)
      check_encoding(v)
    end
  when Array
    h.each { |item| check_encoding(item) }
  when String
    puts "#{h.encoding}: #{h[0..50]}"
  else
    h
  end
end
# HTML-escape shorthand used by the erb templates.
def h(*args)
  CGI.escapeHTML(*args)
end
# Build the JSON body of scripts_recent.json: one compact record per
# script (name, type, summary, plus the most recent release's version,
# date, and sanitized author name/email).
def scripts_recent good_scripts
  good_scripts.map { |s|
    recent_author_name, recent_author_email = fix_email_address(s.versions.last.author)
    {
      :n => s.name,
      :t => s.script_type,
      :s => s.summary,
      :rv => s.versions.last.script_version,
      :rd => s.versions.last.date,
      :ra => recent_author_name,
      :re => recent_author_email
    }
  }.to_json
end
# Regenerate the static JSON api files under doc_dir/api from the on-disk
# metadata. Returns the list of paths written, or nil when no repos exist yet.
def generate_doc_files doc_dir
  puts " reading scripts"
  # every script we've ever generated (including ones abandoned by renames)
  repos = Dir.entries($repos_dir).reject { |e| %w{. .. .git}.include?(e) }
  all_scripts = repos.sort.map do |dir|
    Hashie::Mash.new(JSON.parse(File.read(File.join($repos_dir, dir, $git_script_file))))
  end
  return nil if all_scripts.empty?
  # just the official scripts -- no renames, no deletions
  script_files = Dir.entries($scripts_dir).reject { |e| %w{. .. .git}.include?(e) }
  good_scripts = script_files.sort.map do |file|
    Hashie::Mash.new(JSON.parse(File.read(File.join($scripts_dir, file))))
  end
  # output filename => lambda producing its contents
  files = {
    'scripts.json' => lambda { good_scripts.map { |s| s.name }.to_json },
    'script_ids.json' => lambda { good_scripts.reduce({}) { |a,v| a[v.script_id.to_i] = v.name; a }.to_json },
    'script_original_names.json' => lambda { all_scripts.reduce({}) { |a,v| a[v.display_name] = v.name; a }.to_json },
    'scripts_recent.json' => lambda { scripts_recent good_scripts }
  }
  files.each do |name,proc|
    puts " generating #{doc_dir}/api/#{name}"
    File.open("#{doc_dir}/api/#{name}", 'w') do |f|
      f.write proc.call
    end
  end
  return files.keys.map { |name| "#{doc_dir}/api/#{name}" }
end
# Regenerate and publish the vim-scraper.github.com site: clone it on first
# run, pull the latest, rewrite the api files, commit them, and push.
def generate_docs
  # wish we could use a bare repo to keep the docs but they don't support merging
  doc_dir = 'vim-scraper.github.com'
  puts "generating docs"
  site = nil
  unless test ?d, doc_dir
    site = GitRepo.new :root => doc_dir, :clone => "git@github.com:vim-scraper/vim-scraper.github.com.git"
    site.remote_add 'vim-scripts', "git@github.com:vim-scraper/vim-scripts.git"
  end
  site ||= GitRepo.new :root => doc_dir
  site.pull 'vim-scripts', 'master'
  updated_docs = generate_doc_files doc_dir
  if updated_docs
    author = { :name => $vimscripts_name, :email => $vimscripts_email }
    site.commit('new scrape', author) do |commit|
      updated_docs.each { |file| commit.add file, File.read(file) }
    end
  end
  site.push 'origin', 'master'
end
def filenameify(s)
  # Replace unsafe path chars with dashes. Possessives keep their s
  # ("Michael's" becomes "Michaels", not "Michael-s").
  trimmed = s.gsub(/^\s*|\s*$/, '')
  no_apostrophes = trimmed.gsub(/'s/i, 's')
  no_apostrophes.gsub(/[^ A-Za-z0-9_\-!#\@\$^%&:;<?>+=(){|},.\[\]]/, '-')
end
def gittagify(s)
  # git refuses tags containing certain characters (space, backslash, ^:?[ ...)
  # and tags that begin or end with a period or a dash, so sanitize in stages.
  tag = s.gsub(/^\s*|\s*$/, '')                               # trim whitespace
  tag = tag.gsub(/[^A-Za-z0-9_\-!#\@\$%&;<>+=(){|},.\]]/, '-') # dash out bad chars
  tag = tag.gsub(/^\./, '0.').gsub(/\.$/, '.0')                # pad leading/trailing dots
  tag = tag.gsub(/^-*|-*$/, '')                                # strip edge dashes
  tag = tag.gsub(/^\./, '0.').gsub(/\.$/, '.0')                # re-pad dots the strip exposed
  tag.gsub(/\.\./, '._')                                       # no ".." sequences
end
def githubify(s)
  # github repo names only allow A-Za-z0-9._- yet we need to try to keep the
  # name readable, so transliterate the common offenders before stripping.
  s.gsub(/^\s*|\s*$/, '').      # trim surrounding whitespace
    gsub(/\s+-|-\s+/, '-').     # "foo - bar" -> "foo-bar"
    gsub(/\+\+/, 'pp').         # "C++" -> "Cpp"
    gsub(/([CF])#/i, '\1sharp'). # "C#" -> "Csharp". BUGFIX: this was "#{$1}sharp",
                                 # which interpolates $1 *before* the match runs
                                 # (so it was nil) and dropped the C/F entirely.
    gsub('::', '.').
    gsub('&', 'and').
    gsub(/\s+|:|\+/, '-').
    gsub(/^-|-$/, '').
    gsub(/[^A-Za-z0-9_.-]/, '')
end
def script_version(version)
  # Some authors never assign version numbers; fall back to the
  # release date when the version string is blank.
  ver = version['script_version']
  ver =~ /^\s*$/ ? version['date'] : ver
end
def hashkeyify name
  # Repo names are compared case-insensitively, so the lookup key
  # is simply the lowercased name.
  name.downcase
end
def script_filename(script)
  # Canonical path of a script's metadata file: "NNNN - Name.json".
  # if you change the filename format, also change script_extract_*
  id_part = '%04d' % script['script_id']
  File.join($scripts_dir, "#{id_part} - #{filenameify(script['name'])}.json")
end
def script_extract_id script_name
  # Filenames look like "0042 - Name.json". Strip leading zeros here so a
  # later to_i can't mistake the value for octal.
  m = script_name.match(/^0*([0-9]+) - .+\.json$/)
  raise "can't match #{script_name}" unless m
  m[1]
end
def script_extract_name script_name
  # Pull the script's name back out of "NNNN - Name.json".
  m = script_name.match(/^[0-9]+ - (.+)\.json$/)
  raise "can't match #{script_name}" unless m
  m[1]
end
# Path of the bare git repo for a script.
# NOTE(review): script_id is accepted for symmetry with the other *_filename
# helpers but is unused here -- repos are keyed by name only (renamed scripts
# keep their old repo around under the old name). Confirm before removing it.
def repo_filename script_id, script_name
  File.join($repos_dir, "#{filenameify(script_name)}.git")
end
def repo_extract_name repo_name
  # Strip the trailing ".git" from a bare repo directory name.
  m = repo_name.match(/^(.+)\.git$/)
  raise "can't match #{repo_name}" unless m
  m[1]
end
# returns a hash of key=hashkeyified script name, value=repo directory name
# for every repo we've generated so far.
def list_existing_scripts
  Hash[Dir.entries($repos_dir).reject { |e| %w{. .. .git}.include?(e) }.
    map { |e| [hashkeyify(repo_extract_name(e)), e]}]
end
# The largest script id scraped so far (nil when $scripts_dir is empty).
def highest_script_id
  Dir.entries($scripts_dir).reject { |e| %w{. .. .git}.include?(e) }.
    map { |e| script_extract_id(e).to_i }.max
end
# if a script has been renamed, it will have multiple repos with
# the same id (the old ones will have a README pointing to the new one).
# Returns every filename in $scripts_dir whose encoded id matches script_id.
def find_scripts_by_id script_id
  Dir.entries($scripts_dir).reject { |e| %w{. .. .git}.include?(e) }.
    select { |e| script_extract_id(e).to_i == script_id.to_i }
end
# we don't try too hard to unobfuscate addresses but we definitely want them to be legal
# these rules were created by fiddling until most results looked plausible and all were legal.
# Returns [display_name, address] suitable for a git author/committer line.
def fix_email_address author
  email = author['email'].dup
  email = "unspecified@example.com" if email =~ /^\s*$/
  # undo the common " at " / " dot " obfuscations
  email.gsub!(/\s+[\[(]?at[)\]]?\s+/i, '@')
  email.gsub!(/\s+[\[(]?dot[)\]]?\s+/i, '.')
  # not sure how this next one will do with IDNs?
  # actually, without it we only fail on 9 of 1643 addresses
  # email.gsub!(/[^A-Za-z0-9!#\$%&'*+\/=?^`{|}~_@.-]/, '-')
  email.gsub!(/\s+|[:<>\[\]()"]/, '-') # some common evil chars
  email.gsub!(/^\-+|\-+$/, '') # no dashes at start or end
  email = "unspecified@example.com" if email =~ /^\s*$/
  email = "X#{email}" if email =~ /^@/ # fix "@gmail" with no local part
  email = "invalid@#{email}" unless email.include?('@')
  email = "#{email}.example.com" unless email =~ /[A-Za-z0-9]$/
  # if Mail still can't parse it, give up and use a placeholder
  addr = Mail::Address.new(email) rescue Mail::Address.new("unparseable@example.com")
  addr.display_name = [author['first_name'], author['last_name']].select { |s| s =~ /\S/ }.join(' ').gsub(/\s+/, ' ')
  [addr.display_name, addr.address]
end
def fix_release_notes version
  # Turn vim.org release notes into a git commit message. Short one-line
  # notes become "Version X: note"; anything longer gets a "Version X"
  # summary line with the notes in the body.
  msg = version['release_notes']
  msg.gsub!(/[ \t]+$/u, '') # strip trailing whitespace from every line
  prefix = "Version #{version['script_version']}"
  if msg.length > 70 || msg.include?("\n")
    "#{prefix}\n\n#{msg}\n"
  else
    "#{prefix}: #{msg}\n"
  end
end
def author script
  # Returns [first_name, last_name] when every version of this script was
  # released by the same person, or nil when authorship is mixed.
  # We can't just compare author ids because a number of authors abandoned
  # old accounts and kept releasing under new ones.
  versions = script['versions']
  blank = /^\s*$/
  first = versions[0]['author']['first_name']
  last = versions[0]['author']['last_name']
  if first =~ blank && last =~ blank
    # no real name given; fall back to comparing login names
    last = versions[0]['author']['user_name']
    versions[1..-1].each do |v|
      return nil unless v['author']['user_name'] == last
    end
  else
    if last =~ blank
      # first name given but no last name: treat the first name as the surname
      last = first
      first = ""
    end
    versions[1..-1].each do |v|
      return nil unless v['author']['first_name'] == first && v['author']['last_name'] == last
    end
  end
  [first, last]
end
# Check whether script's name collides with an already-generated repo.
# Returns nil when there's no conflict (or the colliding repo belongs to
# this same script); otherwise returns the conflicting script's metadata.
def name_conflict_exists all_scripts, script
  # if the script's name doesn't conflict with any script in all_scripts,
  repo_dir = all_scripts[hashkeyify(script['name'])]
  return nil unless repo_dir # no conflict
  # or if it's the same script, then that's OK.
  new_script = JSON.parse(File.read(File.join($repos_dir, repo_dir, $git_script_file)))
  return nil if new_script['script_id'].to_i == script['script_id'].to_i
  # there's a conflict. return the conflicting script.
  new_script
end
# can't have two scripts with the same name on github.
# try to figure out an intelligent name for the newer repo by appending,
# in order: the author's last name, the script type, then a sequence letter.
# Mutates and returns script (nil in, nil out).
def resolve_name_conflicts script
  return nil unless script
  all_scripts = list_existing_scripts
  existing_script = name_conflict_exists(all_scripts, script)
  return script unless existing_script
  # if the author is different, try that first
  script_author = author(script)
  if script_author && script_author != author(existing_script)
    script['display_name'] += ' -- ' + script_author[1]
    script['name'] += '--' + githubify(script_author[1])
  end
  existing_script = name_conflict_exists(all_scripts, script)
  return script unless existing_script
  # otherwise, see if we can differentiate by type
  if script['script_type'] != existing_script['script_type']
    script['display_name'] += ' ' + script['script_type']
    script['name'] += '-' + githubify(script['script_type'])
  end
  existing_script = name_conflict_exists(all_scripts, script)
  return script unless existing_script
  # otherwise, just tack a sequence letter on the end. didn't want to use a
  # number because "php.vim 2" looks like a newer release of "php.vim")
  script['display_name'] += ' B'
  script['name'] += '-B'
  while name_conflict_exists(all_scripts, script)
    # BUGFIX: display_name's letter used to be computed from script['name']
    # *after* it had already been incremented, so the two names drifted apart
    # ("-C" vs " D"). Advance each string's own final character instead.
    script['name'][-1] = (script['name'][-1].ord + 1).chr
    script['display_name'][-1] = (script['display_name'][-1].ord + 1).chr
    raise "what the heck?" if script['name'][-1] > 'Z'
  end
  script
end
# Open (creating on first use) the bare git repo for a script.
def open_repo script_id, script_name
  repo_path = repo_filename script_id, script_name
  GitRepo.new(:root => repo_path, :bare => true, :create => true)
end
# add a commit that deletes all files and creates a README pointing to the new repo.
# NOTE(review): repo_url is defined elsewhere in this file.
def mark_repo_as_duplicate dupe, new_script
  repo = open_repo(script_extract_id(dupe), script_extract_name(dupe))
  committer = { :name => $vimscripts_name, :email => $vimscripts_email }
  repo.commit("Renamed to #{new_script['display_name']}", committer, committer) do |commit|
    commit.empty_index
    commit.add 'README', "This script has been renamed to #{new_script['display_name']}.\n\n#{repo_url new_script}\n"
  end
end
# when a script gets renamed we copy the local repo so the git
# objects don't change and then install a README file in the old repo.
# Returns script unchanged (nil in, nil out).
def resolve_renamed_scripts script
  return nil unless script
  dupes = find_scripts_by_id(script['script_id']).reject do |x|
    # the new script isn't a duplicate
    script_extract_name(x) == script['name']
  end
  dupes.each do |dupe|
    puts "RENAMED: #{dupe} to #{script['display_name']}"
    new_repo = repo_filename(script['script_id'], script['name'])
    old_repo = repo_filename(script_extract_id(dupe), script_extract_name(dupe))
    # copy the existing repo so the objects don't change
    # (the new repo shouldn't exist but it's not worth dying if it does)
    FileUtils.cp_r old_repo, new_repo unless test ?d, new_repo
    # old repos have a bad timezone, use filter-branch to fix it before pushing
    # http://vim-scripts.org/news/2011/06/23/picky-about-timezones.html
    Dir.chdir(new_repo) do
      system "git filter-branch --env-filter '' --tag-name-filter cat HEAD"
      raise "git filter-branch failed: #{$?}" unless $?.success?
    end
    mark_repo_as_duplicate dupe, script
    puts " pushing obsolete repo"
    perform_push old_repo
    # delete the obsolete script file
    File.delete File.join($scripts_dir, dupe)
  end
  script
end
# Serialize a scraped script to its canonical JSON file under $scripts_dir.
# Returns the filename written, or nil when there's nothing to write.
def write_script script
  return unless script
  filename = script_filename(script)
  # BUGFIX: this message previously printed a literal placeholder instead of
  # interpolating the filename (the interpolation had been mangled).
  puts "Scraped #{filename}"
  File.open(filename, 'w') do |f|
    farg = fix_encoding(script)
    # check_encoding(farg)
    f.write json_pretty(farg)
  end
  filename
end
def compute_unique_tag tag, seen
  # Tags must be unique within one repo. If this one collides with an
  # already-seen tag, append an "@N" suffix (or bump an existing one)
  # until it no longer does, then record it in seen.
  while seen[tag]
    tag.sub!(/@(\d+)$/, '')        # peel off any existing @N ($1 captures N)
    bump = ($1 || 0).to_i + 1
    tag = "#{tag}@#{bump}"
  end
  seen[tag] = true
  tag
end
def dedup_script_versions script
  # some scripts have versions with identical version numbers. :(
  # Rewrite each script_version through compute_unique_tag so every
  # version ends up with a distinct tag.
  seen = {}
  script['versions'].reverse.each do |v|
    v['script_version'] = compute_unique_tag v['script_version'], seen
  end
end
# Download url (via open-uri) to the local path dest, retrying HTTP errors.
def download_file url, dest
  retryable(:on => OpenURI::HTTPError, :task => " downloading #{url} to #{dest}") do |retries|
    open(url, 'rb') do |u|
      File.open(dest, 'wb') { |f| f.write(u.read) }
    end
  end
end
def copy_file commit, filename, contents
  # Authors accidentally check in vim swapfiles, editor backups, and Apple
  # Finder droppings; silently drop those instead of committing them.
  junk = [
    /\.[^\/]+\.sw[n-p]$/,    # vim swapfiles (.name.swp etc)
    /~$/,                    # editor backup files
    /\.(?:_\.)?DS_Store$/,   # Finder metadata
    /(?:^|\/)\._/,           # AppleDouble resource-fork files
  ]
  commit.add filename, contents unless junk.any? { |re| filename =~ re }
end
def cleanpath path # lifted from git-wiki
  # Normalize a path from an archive: strip leading slashes/whitespace and
  # resolve "." and ".." components so entries can't escape the repo root
  # (".." at the top level is simply discarded).
  trimmed = path.gsub(/^[\/\s]*/, '')
  parts = trimmed.split('/').reject { |part| part =~ /^\s*$/ }
  stack = []
  parts.each do |part|
    case part
    when '.'  then next           # current dir: drop it
    when '..' then stack.pop      # parent dir: cancel the previous component
    else stack << part
    end
  end
  stack.join('/')
end
# wish the site had a compiler file type. as it is, we need to
# sniff the file contents to determine if it's a compiler plugin.
# afaict 'if exists("current_compiler")' on the first meaningful line
# is the convention; blank lines and vim comments (") are skipped.
def is_compiler_file contents
  contents.lines.each do |line|
    next if line =~ /^\s*("|$)/ # skip blank lines and comments
    return line =~ /^\s*if\s*\(?\s*exists\s*\(\s*["']current_compiler["']\s*\)/
  end
  false
end
# sniff file contents to determine if it's a keymap file
# thanks to http://github.com/vim-scripts/greek_polytonic.vim, we can't assume
# that we'll find keymap_name near the top of the file.
# also see check_for_keymap_helper
def is_keymap_file contents
  contents.lines.each do |line|
    # keymap files assign b:keymap_name somewhere in the body
    return line if line =~ /^\s*let\s+b:keymap_name\s*=/
  end
  nil
end
# returns the new path if a change was made, or nil if not.
# returns true if the file should just be suppressed.
def fix_file_location script, path
  rules = $file_location_fixes[script['script_id'].to_i]
  return nil unless rules
  rules.each do |pattern, replacement|
    if replacement.nil?
      # suppression rule: always ends the scan -- true if the file should
      # be dropped, nil if this rule simply doesn't match
      return path =~ pattern ? true : nil
    end
    # path substitution: first rule that actually changes the path wins
    relocated = path.gsub pattern, replacement
    return relocated if relocated != path
  end
  nil
end
# Copies a single archive entry into the repo, guessing the proper vim
# runtime-path location for loose files based on the filename and the
# script's declared type. Fix: the raise message had a garbled
# interpolation ("#(unknown)"); restored to report the filename.
def smart_copy_file repo, script, filename, contents
  filename = cleanpath(filename)
  if newpath = fix_file_location(script, filename)
    # explicit per-script fix: true means suppress the file, else relocate it
    copy_file repo, newpath, contents unless newpath == true
  elsif filename =~ /^[^\/]+\.vim$/
    # vimfile in the root directory
    encoded_contents = contents.dup # this encoding stuff is killing me
    encoded_contents.force_encoding "ASCII-8BIT"
    if filename =~ /_options\.vim$/
      # convention seems to be to put example options to copy into
      # your vimrc in a file in the root dir called plugin_options.name
      copy_file(repo, filename, contents)
    elsif is_compiler_file encoded_contents
      copy_file(repo, "compiler/" + filename, contents)
    elsif is_keymap_file encoded_contents
      copy_file(repo, "keymap/" + filename, contents)
    else
      case script['script_type']
      when 'color scheme' then copy_file(repo, "colors/" + filename, contents)
      when 'ftplugin' then copy_file(repo, "ftplugin/" + filename, contents)
      when 'game' then copy_file(repo, "plugin/" + filename, contents)
      when 'indent' then copy_file(repo, "indent/" + filename, contents)
      when 'syntax' then copy_file(repo, "syntax/" + filename, contents)
      when 'utility' then copy_file(repo, "plugin/" + filename, contents)
      when 'patch' then copy_file(repo, "plugin/" + filename, contents)
      else
        # if this fires, they must have added more script types?!
        raise "Don't know where to put #{filename} for #{script['script_type']}"
      end
    end
  elsif filename =~ /^[^\/]+\.txt$/
    # docfile in the root directory
    copy_file(repo, "doc/" + filename, contents)
  elsif filename =~ /(autoload|after)\/(#{$vimdirs.join('|')})\/([^\/]+)$/
    # vimdir in autoload or after: a/b/autoload/plugin/fixit.vim
    copy_file(repo, "#{$1}/#{$2}/#{$3}", contents)
  elsif filename =~ /^[^\/]+\/(#{$vimdirs.join('|')})\/([^\/]+)$/
    # developer put vimfiles in a subdir, i.e. fixit/plugin/fixit.vim.
    copy_file(repo, "#{$1}/#{$2}", contents)
  else
    copy_file(repo, filename, contents)
  end
end
# http://stackoverflow.com/questions/1916218/find-the-longest-common-starting-substring-in-a-set-of-strings
# Returns the longest common leading substring shared by every string in +set+.
def common_prefix set
  reference = set.first
  length = 0
  # advance while every string agrees with the reference at this index
  # (strings shorter than the reference yield nil here and stop the scan)
  length += 1 while length < reference.length &&
                    set.all? { |str| str[length] == reference[length] }
  reference[0, length]
end
# Walks +dir+ and copies every regular file into the repo. With
# opts[:smart], strips the longest common leading directory and routes
# files to conventional vim runtime locations via smart_copy_file;
# otherwise preserves the archive layout. An optional block may veto
# individual paths (truthy return = skip that file).
def copy_filesystem repo, script, dir, opts={}
  paths = []
  Find.find(dir) do |path|
    # skip symlinks and VCS metadata dirs. Fix: '.hg' previously had an
    # unescaped dot and matched any 'Xhg' name; escape it like the others.
    if test(?l, path) or path =~ /(?:^|\/)(\.git|\.hg|\.bzr|\.svn)$/i
      Find.prune
    else
      # make sure all subdirs are readable
      File.chmod 0700, path if test ?d, path
      # only work on files, and ignore anything that the caller says should be skipped
      paths << path if test(?f, path) && (block_given? ? !yield(path) : true)
    end
  end
  return if paths.empty? # script 1433 is all directories, no files!
  prefix = paths.count == 1 ? File.dirname(paths.first) : common_prefix(paths)
  prefix.sub! /\/+[^\/]*$/, '' unless test ?d, prefix # trim any partial filenames
  prefix.sub! /(?:^|\/)#{Regexp.union $vimdirs}(?:\/.*|$)/, '' # don't trim any vim dirs
  paths.each do |path|
    # make all files rw and preserve the executable bit
    mode = File::Stat.new(path).mode
    File.chmod 0600 | (mode & 0700), path
    if opts[:smart]
      # trim as much as we can off the front of the path
      localpath = path.sub /^#{Regexp.escape prefix}\/*/, ''
    else
      # use the archive location as the path
      localpath = path.sub /^#{Regexp.escape dir}\/*/, ''
    end
    if opts[:smart]
      smart_copy_file repo, script, localpath, File.read(path)
    else
      copy_file repo, localpath, File.read(path)
    end
  end
end
# Reports a corrupt vimball archive and aborts the surrounding scrape
# by throwing :corrupt (caught further up the call stack).
def corrupt_vimball where
  warning = " corrupt vimball at #{where}"
  puts warning
  throw :corrupt
end