-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
32 lines (25 loc) · 976 Bytes
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import os
import subprocess
import sys
import urllib.request
from collections import OrderedDict
from pprint import pprint

import git
from bs4 import BeautifulSoup
git_clone = "git clone %s"
def scrape(url, output_dir):
    """Clone every repository listed on a GitHub organization page.

    Fetches *url*, locates the ``org-repositories`` container, and for each
    repository link creates a subdirectory of *output_dir* named after the
    repo and clones ``https://github.com<repo-path>.git`` into it.

    Args:
        url: GitHub organization page URL listing repositories.
        output_dir: Directory under which one subdirectory per repo is made.

    Raises:
        ValueError: if the page contains no ``org-repositories`` element
            (e.g. the URL is not an org page or GitHub changed its markup).
    """
    response = urllib.request.urlopen(url)
    html = response.read()
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find("div", {"id": "org-repositories"})
    if container is None:
        # Fail loudly instead of the opaque AttributeError the unchecked
        # chained call would raise.
        raise ValueError("no org-repositories element found at %s" % url)
    for link in container.find_all("a", {"itemprop": "name codeRepository"}):
        base_link = link.get('href').strip()
        git_link = "https://github.com" + base_link + ".git"
        out_dir = os.path.join(output_dir, link.getText().strip())
        # os.makedirs is portable and raises on real failures, unlike the
        # previous `mkdir` subprocess whose exit status was discarded.
        os.makedirs(out_dir, exist_ok=True)
        print("cloning [%s] to [%s]" % (git_link, out_dir))
        git.Git(out_dir).clone(git_link)
if __name__ == "__main__":
    # Guard against missing arguments: the bare sys.argv[1]/[2] access
    # previously died with an uninformative IndexError.
    if len(sys.argv) != 3:
        sys.exit("usage: %s <github-org-url> <output-dir>" % sys.argv[0])
    giturl = sys.argv[1]
    print(giturl)
    output_dir = sys.argv[2]
    scrape(giturl, output_dir)