In [1]:
import datetime
import json
import os
import time

import regex as re
import requests
from bs4 import BeautifulSoup

######### Part 1 #########

# Fetch details of the top 100 contributors of the apache/hadoop GitHub repo.

url_hadoop = "https://api.github.com/repos/apache/hadoop/contributors"

# Ask for 100 results per page; the API defaults to the first page.
parameters = {"per_page": "100"}

# Read the personal access token from the GITHUB_TOKEN environment variable
# instead of pasting it into the notebook, so the secret is never saved or
# committed along with the source.
token = os.environ.get("GITHUB_TOKEN", "")

# Only send an Authorization header when a token is actually available.
# NOTE(review): an empty "token " credential is expected to be rejected by the
# API, whereas unauthenticated requests are accepted at a lower rate limit.
headers = {'Authorization': 'token ' + token} if token else {}

# Call the API. Fail fast on HTTP errors (bad credentials, rate limiting, ...)
# so an error payload is never mistaken for the contributor list below.
hadoop_response = requests.get(
    url=url_hadoop, params=parameters, headers=headers, timeout=30
)
hadoop_response.raise_for_status()
hadoop = hadoop_response.text

# Parse the JSON body into a list of contributor dicts.
hadoop_contributors = json.loads(hadoop)

# Show only the first contributor's details.
print(hadoop_contributors[0])


{'login': 'szetszwo', 'id': 907380, 'node_id': 'MDQ6VXNlcjkwNzM4MA==', 'avatar_url': 'https://avatars.githubusercontent.com/u/907380?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/szetszwo', 'html_url': 'https://github.com/szetszwo', 'followers_url': 'https://api.github.com/users/szetszwo/followers', 'following_url': 'https://api.github.com/users/szetszwo/following{/other_user}', 'gists_url': 'https://api.github.com/users/szetszwo/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/szetszwo/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/szetszwo/subscriptions', 'organizations_url': 'https://api.github.com/users/szetszwo/orgs', 'repos_url': 'https://api.github.com/users/szetszwo/repos', 'events_url': 'https://api.github.com/users/szetszwo/events{/privacy}', 'received_events_url': 'https://api.github.com/users/szetszwo/received_events', 'type': 'User', 'site_admin': False, 'contributions': 911}


In [2]:
######### Part 2 #########

# Report the public repository count and the total number of contributions
# for each of the top contributors fetched in Part 1.

# How many of the top contributors to report on.
TOP_N_USERS = 10

# Seconds to wait between users, to be gentle on GitHub.
PAUSE_BETWEEN_USERS_SECONDS = 5


def _parse_contribution_count(heading_text):
    """Return the contribution count contained in a profile heading.

    Parameters:
        heading_text: text of the yearly contributions heading; thousands
            separators (",") are stripped before parsing.

    Returns:
        The first run of digits in the heading, as an int.

    Raises:
        ValueError: if the heading contains no digits at all.
    """
    # Take the first digit run instead of splitting on an exact number of
    # spaces, so the parse does not depend on the page's whitespace layout.
    match = re.search(r"\d+", heading_text.replace(",", ""))
    if match is None:
        raise ValueError(f"No contribution count found in heading: {heading_text!r}")
    return int(match.group())


# Count contributions up to and including the current year rather than
# stopping at a hard-coded year.
current_year = datetime.date.today().year

# Slicing (instead of range(10)) stays safe if fewer contributors came back.
for i, contributor in enumerate(hadoop_contributors[:TOP_N_USERS]):

    user = contributor['login']
    user_url = contributor['url']
    user_html_url = contributor['html_url']

    # Query the user's API endpoint for profile details.
    user_response = requests.get(url=user_url, headers=headers, timeout=30)
    user_response.raise_for_status()
    user_json = user_response.json()

    # Extract the user's public repo count.
    user_repo_count = user_json['public_repos']

    # Contributions are parsed from the user's HTML profile pages,
    # one page per year since the account was created.

    # 'created_at' starts with the four-digit year the profile was created.
    user_year_created = int(user_json['created_at'][0:4])

    contri_count = 0

    # Sum the contributions year by year.
    for year in range(user_year_created, current_year + 1):

        # Profile overview restricted to a single calendar year.
        year_user_url = f"https://github.com/{user}?tab=overview&from={year}-01-01&to={year}-12-31"

        # NOTE(review): the API Authorization header is also sent with this
        # HTML page request, as before — confirm it is actually needed here.
        year_user_url_content = requests.get(url=year_user_url, headers=headers, timeout=30)
        year_user_url_content.raise_for_status()

        # Parse the page and locate the heading that carries the yearly total.
        year_user_bs = BeautifulSoup(year_user_url_content.text, 'lxml')
        contri_heading = year_user_bs.find("h2", class_="f4 text-normal mb-2")
        if contri_heading is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(
                f"Contribution heading not found for user {user!r} in {year}; "
                "the page layout may have changed or the request was throttled."
            )

        contri_count = contri_count + _parse_contribution_count(contri_heading.text)

    print(f"User {i+1}: {user}  ( {user_html_url} ) has {user_repo_count} repositories and {contri_count} contributions.")

    time.sleep(PAUSE_BETWEEN_USERS_SECONDS)



User 1: szetszwo  ( https://github.com/szetszwo ) has 5 repositories and 3744 contributions.
User 2: aajisaka  ( https://github.com/aajisaka ) has 29 repositories and 2113 contributions.
User 3: vinoduec  ( https://github.com/vinoduec ) has 1 repositories and 3 contributions.
User 4: arp7  ( https://github.com/arp7 ) has 12 repositories and 3127 contributions.
User 5: cnauroth  ( https://github.com/cnauroth ) has 36 repositories and 1206 contributions.
User 6: jlowe  ( https://github.com/jlowe ) has 8 repositories and 4185 contributions.
User 7: umbrant  ( https://github.com/umbrant ) has 33 repositories and 618 contributions.
User 8: toddlipcon  ( https://github.com/toddlipcon ) has 103 repositories and 6698 contributions.
User 9: steveloughran  ( https://github.com/steveloughran ) has 49 repositories and 7731 contributions.
User 10: anuengineer  ( https://github.com/anuengineer ) has 18 repositories and 2622 contributions.


In [5]:
######### Part 3 #########

# Print the difference between the timestamp of the latest commit and the
# commit that is 100 commits before it on the repo.

url_commits_api = "https://api.github.com/repos/apache/hadoop/commits"

# 100 commits per page: the first entry of page 1 is the latest commit and the
# first entry of page 2 is the commit 100 positions before it.
# Page 1 parameters are defined here so this cell does not rely on a
# `parameters` variable left over from another cell.
parameters_page1 = {"per_page": "100", "page": "1"}
parameters_page2 = {"per_page": "100", "page": "2"}

# Fetch both pages, failing fast on HTTP errors so an error payload is never
# indexed as if it were a list of commits.
commits_response_page1 = requests.get(
    url=url_commits_api, params=parameters_page1, headers=headers, timeout=30
)
commits_response_page1.raise_for_status()
commits_hadoop_page1 = commits_response_page1.text

commits_response_page2 = requests.get(
    url=url_commits_api, params=parameters_page2, headers=headers, timeout=30
)
commits_response_page2.raise_for_status()
commits_hadoop_page2 = commits_response_page2.text

# Parse the JSON bodies.
commits_hadoop_json_page1 = json.loads(commits_hadoop_page1)
commits_hadoop_json_page2 = json.loads(commits_hadoop_page2)

# An empty page means there are not enough commits to compare.
if not commits_hadoop_json_page1 or not commits_hadoop_json_page2:
    raise ValueError("Fewer than 101 commits were returned; cannot compute the difference.")

# Format of the committer date strings that are parsed below.
COMMIT_DATE_FORMAT = "%Y-%m-%dT%H:%M:%SZ"

# Convert the committer date strings to datetime objects.
latest_commit_timestamp = commits_hadoop_json_page1[0]['commit']['committer']['date']
latest_commit_timestamp = datetime.datetime.strptime(latest_commit_timestamp, COMMIT_DATE_FORMAT)

hundredth_commit_time_stamp = commits_hadoop_json_page2[0]['commit']['committer']['date']
hundredth_commit_time_stamp = datetime.datetime.strptime(hundredth_commit_time_stamp, COMMIT_DATE_FORMAT)

print(f'The latest commit was on {latest_commit_timestamp}')
print(f'The commit which was 100 commits before this was on {hundredth_commit_time_stamp}')

# Print the time difference.
print(f'The difference in these two timestamps is {latest_commit_timestamp-hundredth_commit_time_stamp}')



The latest commit was on 2022-02-21 17:08:56
The commit which was 100 commits before this was on 2021-12-23 09:13:18
The difference in these two timestamps is 60 days, 7:55:38
