In [None]:
import requests
import datetime
import json
import os
import argparse
import glob
import logging
import tqdm
import time
import pymongo
# Module-wide logger shared by the crawler; config_logging() attaches a file
# handler to it and sets its level.
log = logging.getLogger('gerry')

In [None]:
def config_logging(data_dir):
    """Send the 'gerry' logger's output to ``<data_dir>/gerry-crawl.log``.

    The logger level is set to DEBUG and a file handler with the format
    ``<asctime> <levelname> <message>`` is attached. Calling the function
    again with the same directory (for example by re-running a notebook
    cell) does not attach a second handler, so log lines are not duplicated.

    Args:
        data_dir: Existing directory in which the log file is created.

    Returns:
        The configured ``logging.Logger`` named 'gerry'.
    """
    # logging.getLogger returns the same object for the same name, so this is
    # the logger the rest of the module logs through.
    logger = logging.getLogger('gerry')
    logger.setLevel(logging.DEBUG)

    # FileHandler stores its target as an absolute path in ``baseFilename``;
    # normalise the same way so the duplicate check compares like with like.
    log_name = os.path.abspath(os.path.join(data_dir, 'gerry-crawl.log'))
    already_attached = any(
        isinstance(handler, logging.FileHandler)
        and handler.baseFilename == log_name
        for handler in logger.handlers)

    if not already_attached:
        file_handler = logging.FileHandler(log_name)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s %(levelname)-8s %(message)s'))
        logger.addHandler(file_handler)
    return logger


def create_time_frames(from_datetime, to_datetime, frame_size):
    """Split ``[from_datetime, to_datetime[`` into consecutive time frames.

    Each frame is a ``(start, end)`` tuple where ``end`` is
    ``start + frame_size`` minus one millisecond, so neighbouring frames do
    not share a timestamp. Only frames whose ``end`` is not later than
    ``to_datetime`` are returned; a trailing partial frame is dropped.

    Args:
        from_datetime: Start of the first frame (inclusive).
        to_datetime: Upper bound for the end of the last frame.
        frame_size: Positive ``datetime.timedelta`` giving the frame length.

    Returns:
        A list of ``(start, end)`` datetime tuples, possibly empty.

    Raises:
        ValueError: If ``frame_size`` is zero or negative. Such a value
            would never advance past ``to_datetime`` and loop forever.
    """
    if frame_size <= datetime.timedelta(0):
        raise ValueError('frame_size must be a positive timedelta')

    one_millisecond = datetime.timedelta(milliseconds=1)
    result = []
    time_frame_start = from_datetime
    time_frame_end = from_datetime + frame_size - one_millisecond
    while time_frame_end <= to_datetime:
        result.append((time_frame_start, time_frame_end))
        time_frame_start += frame_size
        time_frame_end += frame_size
    return result


def datetime_to_string(date):
    """Render ``date`` as ``YYYY-MM-DD HH:MM:SS.mmm`` (millisecond precision)."""
    with_microseconds = date.strftime('%Y-%m-%d %H:%M:%S.%f')
    # %f produces six fractional digits; cutting the last three leaves the
    # milliseconds (truncated, not rounded).
    return with_microseconds[:-3]

class Gerry(object):
    """Crawler that downloads code-review data from a Gerrit-style REST API.

    For every day between ``start_date`` and ``end_date`` the crawler lists
    the changes returned by the server for that day, then fetches each
    change's details and per-revision inline comments. The results are handed
    to three caller-supplied collections via ``insert_many`` (see ``run``).

    Attributes:
        name: Project name; also the sub-directory name below ``directory``.
        url: Base URL of the review server (no trailing slash expected, the
            request paths are appended as ``/changes/...``).
        directory: ``<directory>/<name>``; created on construction.
        start_date: First day to crawl (``datetime.datetime``).
        end_date: Upper bound of the crawl (``datetime.datetime``).
    """

    # Seconds to pause after a response with the given HTTP status code.
    # https://cloud.google.com/service-control/troubleshooting#how_do_i_perform_a_retry_on_api_errors
    GOOGLE_SERVER_WAITING_TIME = {429: 31, 500: 1, 503: 1}

    # Number of leading characters dropped from every response body before it
    # is parsed as JSON.
    # NOTE(review): Gerrit documents a ")]}'" line in front of JSON bodies as
    # XSSI protection; 5 = those four characters plus the newline - confirm.
    _JSON_PREFIX_LENGTH = 5

    def __init__(self, name, url, start_date, end_date,
                 directory='./gerry_data/'):
        """Store the crawl configuration and create the data directory.

        Args:
            name: Project name, used as sub-directory of ``directory``.
            url: Base URL of the review server.
            start_date: First day to crawl.
            end_date: Upper bound of the crawl.
            directory: Parent directory for this project's data.
        """
        self.name = name
        self.url = url
        self.directory = os.path.join(directory, name)
        self.start_date = start_date
        self.end_date = end_date
        os.makedirs(self.directory, exist_ok=True)

    @staticmethod
    def wait_for_server(status_code):
        """Sleep if ``status_code`` is one the server wants clients to back off on.

        Status codes not listed in ``GOOGLE_SERVER_WAITING_TIME`` return
        immediately.
        """
        delay = Gerry.GOOGLE_SERVER_WAITING_TIME.get(status_code)
        if delay is not None:
            time.sleep(delay)

    @staticmethod
    def handle_exception(exception, change_type):
        """Log a crawl failure and back off when the server asked for it.

        Args:
            exception: The exception caught by the caller.
            change_type: Human-readable description of what was being
                fetched (e.g. ``'change 1234'``); used in the log message.
        """
        if isinstance(exception, requests.exceptions.RequestException):
            if exception.response is not None:
                status_code = exception.response.status_code
                log.error('GET %s failed with http status %i',
                          change_type, status_code)
                Gerry.wait_for_server(status_code)
            else:
                # No HTTP response at all (connection error, timeout, ...).
                log.error('GET %s failed with error: %s',
                          change_type, exception)
        elif isinstance(exception, json.JSONDecodeError):
            log.error('Reading JSON for %s failed', change_type)
        elif isinstance(exception, Exception):
            log.error('Unknown error occurred for %s: %s',
                      change_type, exception)

    @staticmethod
    def _get_json(url):
        """GET ``url`` and return the decoded JSON body.

        Raises:
            requests.exceptions.RequestException: On transport errors or a
                non-success HTTP status (via ``raise_for_status``).
            json.JSONDecodeError: If the body (after the prefix is removed)
                is not valid JSON.
        """
        response = requests.get(url)
        response.raise_for_status()
        return json.loads(response.text[Gerry._JSON_PREFIX_LENGTH:])

    def get_changes(self, day):
        """Return all changes the server reports for the 24 hours from ``day``.

        The query window is ``[day, day + 24h - 1ms]``. Results are fetched
        page by page: the ``S`` parameter is advanced by the number of
        changes received so far, and paging stops when a page is empty or its
        last entry carries no ``_more_changes`` key.

        NOTE(review): Gerrit's ``after:``/``before:`` search operators are
        documented as filtering on modification time, not creation time -
        confirm this is the intended grouping.

        Args:
            day: ``datetime.datetime`` marking the start of the window.

        Returns:
            A list of change dicts as decoded from the server's JSON.

        Raises:
            requests.exceptions.RequestException, json.JSONDecodeError: See
                ``_get_json``.
        """
        from_datetime = day
        to_datetime = (from_datetime + datetime.timedelta(hours=24)
                       - datetime.timedelta(milliseconds=1))
        changes = []
        offset = 0

        while True:
            url = '%s/changes/?q=after:{%s} AND before:{%s}&o=ALL_COMMITS&o=ALL_REVISIONS&S=%i' % (
                self.url, datetime_to_string(from_datetime), datetime_to_string(to_datetime), offset)
            changes_subset = self._get_json(url)
            if not changes_subset:
                break
            changes.extend(changes_subset)
            offset += len(changes_subset)
            if '_more_changes' not in changes_subset[-1]:
                break
        return changes

    def get_change(self, change_number, folder):
        """Fetch the summary, messages and inline comments of one change.

        Args:
            change_number: Numeric id of the change on the server.
            folder: Unused; kept so existing callers keep working.

        Returns:
            A dict with two keys:

            * ``'comments'``: one dict with selected fields of the change
              detail (``_number``, ``change_id``, ``project``, ``created``,
              ``updated``, ``owner``, ``revisionNums``, ``insertions``,
              ``deletions``, ``messages``).
            * ``'inlines'``: a list with one dict per revision number from 1
              up to the current revision's number. Each maps file path
              (with ``'.'`` replaced by ``'_'``) to that file's comments and
              also carries ``_number``, ``change_id`` and ``project``.

        Raises:
            requests.exceptions.RequestException, json.JSONDecodeError: See
                ``_get_json``.
            KeyError: If the detail response lacks one of the fields read
                here.
        """
        url = '%s/changes/%s/detail/?o=DETAILED_LABELS&o=MESSAGES&o=DETAILED_ACCOUNTS&o=REVIEWED&o=ALL_FILES&o=ALL_COMMITS&o=ALL_REVISIONS' % (
            self.url, change_number)
        if self.name != 'libreoffice':
            url += '&o=REVIEWER_UPDATES'
        details = self._get_json(url)

        current_revision_number = (
            details['revisions'][details['current_revision']]['_number'])
        comments = {
            '_number': details['_number'],
            'change_id': details['change_id'],
            'project': details['project'],
            'created': details['created'],
            'updated': details['updated'],
            'owner': details['owner'],
            'revisionNums': current_revision_number,
            'insertions': details['insertions'],
            'deletions': details['deletions'],
            'messages': details['messages'],
        }

        inlines = []
        for revision_number in range(1, int(current_revision_number) + 1):
            url = '%s/changes/%s/revisions/%s/comments' % (
                self.url, change_number, revision_number)
            inline = self._get_json(url)
            # File paths become dict keys of the stored document; dots are
            # replaced - presumably because MongoDB field names may not
            # contain '.' (confirm).
            inline_replaced = {
                file_key.replace('.', '_'): file_comments
                for file_key, file_comments in inline.items()}
            inline_replaced['_number'] = details['_number']
            inline_replaced['change_id'] = details['change_id']
            inline_replaced['project'] = details['project']
            inlines.append(inline_replaced)

        return {'comments': comments, 'inlines': inlines}

    def run(self, changes_collection, comments_collection, inlines_collection):
        """Crawl every day in the configured range and store the results.

        For each day the changes are fetched and inserted into
        ``changes_collection``; then each change's summary goes to
        ``comments_collection`` and its per-revision inline comments to
        ``inlines_collection``. A failure for one day or one change is
        logged through ``handle_exception`` and the crawl continues.

        Args:
            changes_collection: Object with an ``insert_many(list)`` method
                receiving the raw change dicts.
            comments_collection: Object with an ``insert_many(list)`` method
                receiving one change summary at a time.
            inlines_collection: Object with an ``insert_many(list)`` method
                receiving a change's per-revision inline comment dicts.
        """
        all_day_paths = [
            time_frame[0].strftime('%Y-%m-%d')
            for time_frame in create_time_frames(
                self.start_date, self.end_date, datetime.timedelta(hours=24))]
        print(all_day_paths)

        log.info('Started new crawl iteration to crawl %i pending days',
                 len(all_day_paths))

        for day_path in tqdm.tqdm(all_day_paths):
            changes = []
            log.info('Crawling review data on %s ', day_path)
            # Parsing the date-only string back yields midnight of that day.
            day = datetime.datetime.strptime(day_path, '%Y-%m-%d')
            try:
                # Extract basic review info
                changes = self.get_changes(day)
                if changes:
                    changes_collection.insert_many(changes)
                else:
                    log.info('There is no change on %s', day)
            except Exception as exception:
                Gerry.handle_exception(exception, 'changes on ' + str(day))

            change_numbers = [change['_number'] for change in changes]

            for change_number in change_numbers:
                try:
                    comments_inline = self.get_change(change_number, day_path)
                    if comments_inline['comments']:
                        comments_collection.insert_many(
                            [comments_inline['comments']])
                    if comments_inline['inlines']:
                        inlines_collection.insert_many(
                            comments_inline['inlines'])
                except Exception as exception:
                    Gerry.handle_exception(
                        exception, 'change ' + str(change_number))

In [None]:
if __name__ == '__main__':
    # Crawl target: the project name doubles as the MongoDB database name.
    db_name = 'android'
    project_url = 'https://android-review.googlesource.com'

    # MongoDB client with default connection settings; one collection per
    # kind of crawled document.
    client = pymongo.MongoClient()
    db = client[db_name]
    changes_collection, comments_collection, inlines_collection = (
        db[collection_name]
        for collection_name in ('reviews', 'comments', 'inlines'))

    gerry = Gerry(
        name=db_name,
        url=project_url,
        start_date=datetime.datetime(2008, 7, 1),
        end_date=datetime.datetime(2018, 8, 1),
        directory='./gerry_data/')
    # The log file lives next to the project's data directory contents.
    config_logging(gerry.directory)

    gerry.run(changes_collection, comments_collection, inlines_collection)