In [None]:
import concurrent.futures
import datetime
import getpass
import json
import os

import pandas as pd
import requests
import requests_futures.sessions
from requests.auth import HTTPBasicAuth
from tqdm import tqdm

#The url of the Gerrit repo we want to mine.
#To mine a different repo, add the url of its /changes/ endpoint here.
libreoffice_url = 'https://gerrit.libreoffice.org/changes/'
example_url = 'https://gerrit.XX.com/a/changes/'

#adjust this variable to point to the url that the rest of the code will use
current_project_name = libreoffice_url

#Earliest date to mine from the repo.
#The end date is always the current date.
start_year = 2011
start_month = 11
start_day = 14

#Gerrit credentials (case sensitive).
#They are read from the GERRIT_USERNAME / GERRIT_HTTP_PASSWORD environment
#variables; if a variable is missing or empty the user is prompted instead.
#Secrets must never be stored in the notebook itself.
#NOTE(review): an HTTP password used to be hardcoded here in plain text;
#it should be regenerated in the Gerrit account settings.
username = os.environ.get("GERRIT_USERNAME") or input("Gerrit username: ")
httpassword = os.environ.get("GERRIT_HTTP_PASSWORD") or getpass.getpass("Gerrit HTTP password: ")


#HTTP adapter initialisation: failed requests are retried with a backoff
#when the server answers with one of the listed status codes.
#NOTE(review): 401 is in the retry list, so wrong credentials are retried
#as well before the error surfaces - confirm this is intended.
retries = requests.packages.urllib3.util.retry.Retry(
    total=10,
    backoff_factor=2,
    status_forcelist=[401, 429, 500, 502, 503, 504]
)
http_adapter = requests.adapters.HTTPAdapter(max_retries=retries)
#per-request timeout in seconds (10 minutes)
timeout = 10*60

#date slices function
def date_slices(from_date, to_date):
    """Yield one (start, end) pair of timestamp strings per calendar day.

    Days run from from_date (inclusive) up to to_date (exclusive).
    start is midnight of the day formatted with str(); end is one
    millisecond before the next midnight, printed with millisecond
    precision (the last three microsecond digits are cut off).
    """
    almost_one_day = datetime.timedelta(days=1, milliseconds=-1)
    day_number = from_date.toordinal()
    stop_number = to_date.toordinal()
    while day_number < stop_number:
        day_start = datetime.datetime.fromordinal(day_number)
        day_end = day_start + almost_one_day
        yield (str(day_start), str(day_end)[:-3])
        day_number += 1

#Note: the earliest date to mine is taken from start_year/start_month/start_day;
#adjust those to the first date you are interested in (or that exists in your repo).
first_change = datetime.datetime(start_year, start_month, start_day)
#one (start, end) string pair per day, from first_change up to (not including) today
days = [day_range for day_range in date_slices(first_change, datetime.datetime.today())]

#debugging helper: pretty-print the contents of a JSON file
def print_file_data(filename):
    """Read the JSON document stored in filename and print it indented by 4 spaces."""
    with open(filename, 'r') as handle:
        parsed = json.loads(handle.read())
    pretty = json.dumps(parsed, indent=4)
    print(pretty)
    
    

In [None]:
#-------extracting all changes from current_project_name between dates stated above
#Gerrit puts this line in front of every JSON response body (XSSI protection);
#it has to be removed before the body can be parsed.
XSSI_PREFIX = ")]}'\n"

with requests_futures.sessions.FuturesSession(max_workers=4) as session:
    session.mount("https://", http_adapter)
    # Set up basic authentication
    session.auth = HTTPBasicAuth(username, httpassword)

    # one query per day slice; the timeout keeps a stalled connection from hanging the run
    changes_futures = [
        session.get(
            current_project_name,
            params={'q': 'after:"{}" AND until:"{}"'.format(start, end)},
            timeout=timeout,
        )
        for (start, end) in days
    ]

    # as_completed is only iterated to drive the progress bar
    for future in tqdm(concurrent.futures.as_completed(changes_futures), total=len(changes_futures)):
        pass

changes = []
for f in changes_futures:
    response = f.result()
    # fail with a clear HTTP error instead of a confusing JSON decode error
    response.raise_for_status()
    body = response.text
    if body.startswith(XSSI_PREFIX):
        body = body[len(XSSI_PREFIX):]
    r = json.loads(body)
    assert isinstance(r, list), 'Parsed response is not a list'
    # Gerrit marks the last entry with _more_changes when a result page was truncated
    if r and r[-1].get('_more_changes'):
        print("warning: truncated result for query {}; some changes are missing".format(response.url))
    changes += r

#-------saving the changes after filtered by project ------------
changes_list = pd.DataFrame(changes)
changes_list.to_csv("new_changes.csv")
print("all new changes written to file")

#Uncomment if you want to print the changes
#for change in changes:
#    print(change["_number"],change["total_comment_count"],change["created"])

In [None]:
#-----------------comments info extraction
#Gerrit puts this line in front of every JSON response body (XSSI protection);
#it has to be removed before the body can be parsed.
XSSI_PREFIX = ")]}'\n"

with requests_futures.sessions.FuturesSession(max_workers=8) as session:
    session.mount("https://", http_adapter)

    # authentication code
    session.auth = HTTPBasicAuth(username, httpassword)

    # one request per change, in the same order as `changes`
    comments_futures = [
        session.get(current_project_name + '{}/comments'.format(change['_number']), timeout=timeout)
        for change in changes
    ]

    # as_completed is only iterated to drive the progress bar
    for future in tqdm(concurrent.futures.as_completed(comments_futures), total=len(comments_futures)):
        pass

comments = {}
# The futures are paired with the change they were created for, so the key does
# not depend on the position of the change number inside the response URL.
for change, f in zip(changes, comments_futures):
    response = f.result()
    try:
        r = json.loads(response.text[len(XSSI_PREFIX):])
    except ValueError:
        # Unparseable body: show it and skip this change, so that the result of
        # a previous change is never stored under the wrong key.
        print(response.text)
        continue
    comments[str(change['_number'])] = r

with open('rawLibreOfficeData.json', 'w') as json_file:
    json.dump(comments, json_file, indent=4)


In [None]:
import requests
import json
import pandas as pd

#This is the same file we saved in the last section
filename = 'rawLibreOfficeData.json'

# Load the JSON file into a dictionary; the with-block closes the file again
with open(filename) as f:
    data = json.load(f)

# Flatten the nested structure (change -> file -> list of comments)
# into one row per comment
single_comment_list = []
comments_for_proj = []

for rec,value in data.items():
    for key,comment_values in value.items():
        for list_comment_data in comment_values:
            # Gerrit documents message, author, unresolved, patch_set and commit_id
            # as optional CommentInfo fields, so they are read with .get() and end
            # up as None (or an empty comment) when absent instead of raising KeyError
            author = list_comment_data.get('author') or {}
            comment = str(list_comment_data.get('message', ''))
            #remove newlines, carriage returns and tabs from the comments
            comment = comment.replace('\n','')
            comment = comment.replace('\r','')
            comment = comment.replace('\t','')
            single_comment_list = [
                rec,
                key,
                author.get('name'),
                list_comment_data.get('unresolved'),
                list_comment_data.get('patch_set'),
                list_comment_data['updated'],
                comment,
                list_comment_data.get('commit_id'),
                author.get('_account_id'),
            ]
            comments_for_proj.append(single_comment_list)

tobe_saved_comments = pd.DataFrame(comments_for_proj, columns=('ChangeID', 'FileName', 'AuthorName', 'Unresolved','PatchSet','UpdateDate','comment','CommitID','AuthorID'))
#The csv will be the input to the data preprocessing code in 'PreprocessingTM_Gerrit.ipynb'
tobe_saved_comments.to_csv("review_comments.csv")

#remove illegal characters before saving the file as xlsx
#DataFrame.applymap is deprecated in favour of DataFrame.map in newer pandas;
#use whichever one the installed version provides
elementwise = tobe_saved_comments.map if hasattr(pd.DataFrame, 'map') else tobe_saved_comments.applymap
tobe_saved_comments = elementwise(lambda x: x.encode('unicode_escape').decode('utf-8') if isinstance(x, str) else x)
tobe_saved_comments.to_excel("review_comments00.xlsx")


In [None]:
# import requests
# import json
# import datetime
# from tqdm import tqdm
# import pandas as pd

# #helper function to print dataframe as json needed during debugging
# def print_file_data(filename):
#     # Open the JSON file for reading
#     with open(filename, 'r') as file:
#         # Load the JSON data
#         data = json.load(file)
#     # Print the JSON data with proper indentation
#     print(json.dumps(data, indent=4))

# print_file_data("rawLibreOfficeData.json")   