In [1]:
import requests
import json
from functools import wraps
import time
import logging
import pandas as pd

HEADERS = {"User-Agent" : "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
  "Accept" : "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" : "en-us",
  "Connection" : "keep-alive",
  "Accept-Charset" : "GB2312,utf-8;q=0.7,*;q=0.7"}


def make_interval(interval_second=1):
    """装饰器，每次执行之前停顿interval_second秒"""
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            logging.info("interval waiting: {}s".format(interval_second))
            time.sleep(interval_second)
            return func(*args, **kwargs)
        return wrapper
    return decorator


class GithubCommentCrawer(object):
    def __init__(self, user_name, repo_name):
        self.url_temp = "https://api.github.com/repos/{}/{}/commits"
        self.user_name = user_name
        self.repo_name = repo_name
        self.headers = HEADERS
        self.per_page = 1000
        self.shas = set()
        self.files = []

    @make_interval(interval_second=6)
    def get_html_content(self, url, params):
        """直接获取url的内容，Exception由外部处理"""
        r = requests.get(url, params, timeout=5)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text

    def init_shas(self):
        """爬指定repo的全部commit链接以供之后使用"""
        url = self.url_temp.format(self.user_name, self.repo_name)
        page = 1
        try:
            while True:
                html_content = self.get_html_content(url, params={'headers': self.headers,
                                                                  'per_pages': self.per_page,
                                                                  'page': page})
                json_contents = json.loads(html_content)
                for content in json_contents:
                    self.shas.add(content.get('sha'))
                page += 1

        except Exception as e:
            logging.info("Fail!", type(e), str(e))
        finally:
            print("End with page {}".format(page - 1))

    def init_files(self):
        """遍历链接池，获取commit的内容paste字段"""
        for sha in self.shas:
            url = self.url_temp.format(self.user_name, self.repo_name) + '/' + sha
            try:
                html_content = self.get_html_content(url, params={'headers': self.headers})
                json_contents = json.loads(html_content)
                for files in json_contents.get('files'):
                    if files.get('filename').endswith('.py'):
                        self.files.append({'patch':files.get('patch'),
                                           'sha':sha,
                                           'parents_sha':json_contents.get('parents').get('sha')})
            except Exception as e:
                logging.info("Fail!", type(e), str(e))
            finally:
                print("End with length {}".format(len(self.files)))

    def save_shas(self):
        with open("{}-{}-shas.txt".format(self.user_name, self.repo_name), 'w') as f:
            for sha in self.shas:
                f.write(sha + '\n')

    def load_shas(self, shas_file_path, limit=100):
        logging.info("current shas number: {}".format(len(self.shas)))
        shas_temp = []
        with open(shas_file_path, 'r') as f:
            for line in f:
                shas_temp.append(line.strip(' \n'))
        self.shas = set(shas_temp[:limit])
        logging.info("after loaded shas number: {}".format(len(self.shas)))

    def save_file(self, text, sha):
        with open("{}-{}-files-{}.txt".format(self.user_name, self.repo_name, sha), 'w') as f:
            f.write(text)

    def save_files(self, filepath):
        df = pd.DataFrame(data=self.files, columns=['patch', 'sha', 'parents_sha'])
        print(df)
        df.to_pickle(filepath)


In [2]:
def test_github_crawler():
    gc = GithubCommentCrawer(user_name='tensorflow', repo_name='tensorflow')
    # gc.init_shas()
    # gc.save_shas()

    gc.load_shas("tensorflow-tensorflow-shas.txt")
    gc.init_files()
    gc.save_files("tensorflow-tensorflow-shas.tar.bz2")


In [3]:
logging.basicConfig(
    level=logging.INFO
)

In [4]:
test_github_crawler()

INFO:root:current shas number: 0
INFO:root:after loaded shas number: 100
INFO:root:interval waiting: 6s
--- Logging error ---
Traceback (most recent call last):
  File "<ipython-input-1-8cb4e4bfffde>", line 75, in init_files
    'parents_sha':json_contents.get('parents').get('sha')})
AttributeError: 'list' object has no attribute 'get'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Program Files\Python37\lib\logging\__init__.py", line 1034, in emit
    msg = self.format(record)
  File "C:\Program Files\Python37\lib\logging\__init__.py", line 880, in format
    return fmt.format(record)
  File "C:\Program Files\Python37\lib\logging\__init__.py", line 619, in format
    record.message = record.getMessage()
  File "C:\Program Files\Python37\lib\logging\__init__.py", line 380, in getMessage
    msg = msg % self.args
TypeError: not all arguments converted during string formatting
Call stack:
  File "C:\Program Files\Python

End with length 0
End with length 0
End with length 0


KeyboardInterrupt: 

In [9]:
url = "https://api.github.com/repos/tensorflow/tensorflow/commits"
text = getHTMLContent(url)
text[:1000]

'[\n  {\n    "sha": "fd471575c744cf36621c5484e433d09f0f90e22e",\n    "node_id": "MDY6Q29tbWl0NDU3MTcyNTA6ZmQ0NzE1NzVjNzQ0Y2YzNjYyMWM1NDg0ZTQzM2QwOWYwZjkwZTIyZQ==",\n    "commit": {\n      "author": {\n        "name": "A. Unique TensorFlower",\n        "email": "gardener@tensorflow.org",\n        "date": "2021-01-12T02:00:09Z"\n      },\n      "committer": {\n        "name": "TensorFlower Gardener",\n        "email": "gardener@tensorflow.org",\n        "date": "2021-01-12T02:13:07Z"\n      },\n      "message": "Internal change\\n\\nPiperOrigin-RevId: 351274556\\nChange-Id: I8b7286bcfb13292e2b186533f5f0e2702cb7ea44",\n      "tree": {\n        "sha": "cbaea6d54b8634bfc69adf8343ad3a045ef3cd88",\n        "url": "https://api.github.com/repos/tensorflow/tensorflow/git/trees/cbaea6d54b8634bfc69adf8343ad3a045ef3cd88"\n      },\n      "url": "https://api.github.com/repos/tensorflow/tensorflow/git/commits/fd471575c744cf36621c5484e433d09f0f90e22e",\n      "comment_count": 0,\n      "verification":

In [7]:
len(text)

125963

In [11]:
import json
json_file = json.loads(text)
json_file

[{'sha': 'fd471575c744cf36621c5484e433d09f0f90e22e',
  'node_id': 'MDY6Q29tbWl0NDU3MTcyNTA6ZmQ0NzE1NzVjNzQ0Y2YzNjYyMWM1NDg0ZTQzM2QwOWYwZjkwZTIyZQ==',
  'commit': {'author': {'name': 'A. Unique TensorFlower',
    'email': 'gardener@tensorflow.org',
    'date': '2021-01-12T02:00:09Z'},
   'committer': {'name': 'TensorFlower Gardener',
    'email': 'gardener@tensorflow.org',
    'date': '2021-01-12T02:13:07Z'},
   'message': 'Internal change\n\nPiperOrigin-RevId: 351274556\nChange-Id: I8b7286bcfb13292e2b186533f5f0e2702cb7ea44',
   'tree': {'sha': 'cbaea6d54b8634bfc69adf8343ad3a045ef3cd88',
    'url': 'https://api.github.com/repos/tensorflow/tensorflow/git/trees/cbaea6d54b8634bfc69adf8343ad3a045ef3cd88'},
   'url': 'https://api.github.com/repos/tensorflow/tensorflow/git/commits/fd471575c744cf36621c5484e433d09f0f90e22e',
   'comment_count': 0,
   'verification': {'verified': False,
    'reason': 'unsigned',
    'signature': None,
    'payload': None}},
  'url': 'https://api.github.com/repos

In [12]:
len(json_file)

30

In [14]:
sha = json_file[0]['sha']
single_commit = getHTMLContent(url+'/'+sha)
single_commit

'{\n  "sha": "fd471575c744cf36621c5484e433d09f0f90e22e",\n  "node_id": "MDY6Q29tbWl0NDU3MTcyNTA6ZmQ0NzE1NzVjNzQ0Y2YzNjYyMWM1NDg0ZTQzM2QwOWYwZjkwZTIyZQ==",\n  "commit": {\n    "author": {\n      "name": "A. Unique TensorFlower",\n      "email": "gardener@tensorflow.org",\n      "date": "2021-01-12T02:00:09Z"\n    },\n    "committer": {\n      "name": "TensorFlower Gardener",\n      "email": "gardener@tensorflow.org",\n      "date": "2021-01-12T02:13:07Z"\n    },\n    "message": "Internal change\\n\\nPiperOrigin-RevId: 351274556\\nChange-Id: I8b7286bcfb13292e2b186533f5f0e2702cb7ea44",\n    "tree": {\n      "sha": "cbaea6d54b8634bfc69adf8343ad3a045ef3cd88",\n      "url": "https://api.github.com/repos/tensorflow/tensorflow/git/trees/cbaea6d54b8634bfc69adf8343ad3a045ef3cd88"\n    },\n    "url": "https://api.github.com/repos/tensorflow/tensorflow/git/commits/fd471575c744cf36621c5484e433d09f0f90e22e",\n    "comment_count": 0,\n    "verification": {\n      "verified": false,\n      "reason": "

In [16]:
json_commit = json.loads(single_commit)
json_commit

{'sha': 'fd471575c744cf36621c5484e433d09f0f90e22e',
 'node_id': 'MDY6Q29tbWl0NDU3MTcyNTA6ZmQ0NzE1NzVjNzQ0Y2YzNjYyMWM1NDg0ZTQzM2QwOWYwZjkwZTIyZQ==',
 'commit': {'author': {'name': 'A. Unique TensorFlower',
   'email': 'gardener@tensorflow.org',
   'date': '2021-01-12T02:00:09Z'},
  'committer': {'name': 'TensorFlower Gardener',
   'email': 'gardener@tensorflow.org',
   'date': '2021-01-12T02:13:07Z'},
  'message': 'Internal change\n\nPiperOrigin-RevId: 351274556\nChange-Id: I8b7286bcfb13292e2b186533f5f0e2702cb7ea44',
  'tree': {'sha': 'cbaea6d54b8634bfc69adf8343ad3a045ef3cd88',
   'url': 'https://api.github.com/repos/tensorflow/tensorflow/git/trees/cbaea6d54b8634bfc69adf8343ad3a045ef3cd88'},
  'url': 'https://api.github.com/repos/tensorflow/tensorflow/git/commits/fd471575c744cf36621c5484e433d09f0f90e22e',
  'comment_count': 0,
  'verification': {'verified': False,
   'reason': 'unsigned',
   'signature': None,
   'payload': None}},
 'url': 'https://api.github.com/repos/tensorflow/tensor

In [20]:
print(json_commit['files'][0]['patch'])

@@ -1785,10 +1785,6 @@ def on_epoch_end(self, epoch, logs=None):
   def on_train_end(self, logs=None):
     if self.stopped_epoch > 0 and self.verbose > 0:
       print('Epoch %05d: early stopping' % (self.stopped_epoch + 1))
-    if self.restore_best_weights:
-      if self.verbose > 0:
-        print('Restoring model weights from the end of the best epoch.')
-      self.model.set_weights(self.best_weights)
 
   def get_monitor_value(self, logs):
     logs = logs or {}


In [8]:
url = "https://api.github.com/repos/superlova/codeclf/commits"
text = getHTMLContent(url)
text[:1000]

fail 404 Client Error: Not Found for url: https://api.github.com/repos/superlova/codeclf/commits


TypeError: 'NoneType' object is not subscriptable

In [21]:
def get_json_content(json_string):
    json_content = json.loads(json_string)
    return json_content

In [24]:
url_temp = "https://api.github.com/repos/{}/{}/commits"
user_name = "tensorflow"
repo_name = "tensorflow"
url = url_temp.format(user_name, repo_name)
json_contents = get_json_content(getHTMLContent(url, params={'per_page':100}))
shas = []
for content in json_contents:
    shas.append(content['sha'])
len(shas)

100

In [25]:
sha_example = shas[-1]
sha_example

'133a9711dd63a2bdaf3930f837247c399743fecc'

In [30]:
text = getHTMLContent(url+'/'+sha_example, params=params)
json_content = get_json_content(text)
json_content

{'sha': '133a9711dd63a2bdaf3930f837247c399743fecc',
 'node_id': 'MDY6Q29tbWl0NDU3MTcyNTA6MTMzYTk3MTFkZDYzYTJiZGFmMzkzMGY4MzcyNDdjMzk5NzQzZmVjYw==',
 'commit': {'author': {'name': 'Jacques Pienaar',
   'email': 'jpienaar@google.com',
   'date': '2021-01-08T22:04:58Z'},
  'committer': {'name': 'TensorFlower Gardener',
   'email': 'gardener@tensorflow.org',
   'date': '2021-01-08T22:10:33Z'},
  'message': 'Use flib of attached context.\n\nStacks not part of proto. Moved to TF2 only test and run with TF2_BEHAVIOR env set.\n\nPiperOrigin-RevId: 350832704\nChange-Id: I342bdf724a9842e9f1b3c49095d8db1ec0c56076',
  'tree': {'sha': '287e514d3884bac49f909e93df5c6610ae21cb98',
   'url': 'https://api.github.com/repos/tensorflow/tensorflow/git/trees/287e514d3884bac49f909e93df5c6610ae21cb98'},
  'url': 'https://api.github.com/repos/tensorflow/tensorflow/git/commits/133a9711dd63a2bdaf3930f837247c399743fecc',
  'comment_count': 0,
  'verification': {'verified': False,
   'reason': 'unsigned',
   'signa

In [31]:
with open('json_content.txt', 'w+') as f:
    json.dump(json_content, f)

In [28]:
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:11.0) Gecko/20100101 Firefox/11.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip,deflate,sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}

params = {'headers':HEADERS}

In [None]:
get