# Get Repository Information via Web-Scraping

---

## Do the imports!

In [None]:
import requests
from bs4 import BeautifulSoup

---

## Assess the problem

We are currently in the `NSS-Data-Analytics-Cohort-2` cohort. 
In GitHub, the repositories for this cohort can be found at https://github.com/NSS-Data-Analytics-Cohort-2.

## The Goal
1. Access every repository URL associated with the `NSS-Data-Analytics-Cohort-2` organization.
2. For every repository, collect:
    * Number of Commits (for the master branch)
    * Number of branches

---

## Goal 1
Accessing the repository URLs. 
To do this, we need to go through every page of the organization's repository listing and grab the repositories shown on each one.

The first question is.. 
How many pages should we traverse?
In order to answer this, we need to locate the pagination element on the main page, and see how many options there are to select. 
Specifically, we need to figure out what the last number is!

In [None]:
# Base addresses used throughout: the GitHub root and the cohort's organization page.
GITHUB_URL = 'https://github.com'
ORG_URL = GITHUB_URL + '/NSS-Data-Analytics-Cohort-2'

In [None]:
# pull data from the page
# (the timeout keeps a stalled connection from hanging the notebook forever)
resp = requests.get(ORG_URL, timeout=30)

# We want the printed status code to be 200
print(resp.status_code)

In [None]:
# Establish the "soup" object so we can traverse the website content
soup = BeautifulSoup(resp.text, 'html.parser')

---

### Goal 1.A
Figure out what the last page is!
To do this, we need to access the pagination elements.

In [None]:
# Now that we've created the soup.. Let's find the pagination elements.
# Check the repositories page to find out what to look for!
# (`find_all` is the current name of the deprecated `findAll` alias.)
len(soup.find_all('div', {'class': 'pagination'}))

In [None]:
# since `soup.find_all` returns a list of elements.. We need to extract the first result.
# (`find_all` is the current name of the deprecated `findAll` alias.)
pagination_elems = soup.find_all('div', {'class': 'pagination'})
pagination_elem = pagination_elems[0]

print(pagination_elem.prettify())

In [None]:
# now, let's get all of the links (`a` tags) from the pagination element.
# (`find_all` is the current name of the deprecated `findAll` alias.)
links = pagination_elem.find_all('a')
links

In [None]:
# In our case.. We are interested in the link that represents the final page.
# So, 2 from the back.
# NOTE(review): this is purely positional — it presumes the very last link is a
# non-numeric control (e.g. a "Next" link) and that at least two links exist.
# Confirm against the `links` output displayed above.
final_page = links[-2]
final_page

In [None]:
# Now, we can get the text, and convert to an integer.
final_page_number = int(final_page.text)
final_page_number

---

### Goal 1.B
Now that we have the reference to the last page, we can use it to grab the links for all the pages.
Speaking of which.. 
Notice the structure of the links from the pagination.

Example: 
```
/NSS-Data-Analytics-Cohort-2?page=3
```

It looks like if we want to be able to go through all the pages.. 
We would need to structure our links like the ones from the pagination links.

So, our links should look like: 
```
https://github.com/NSS-Data-Analytics-Cohort-2?page=1
https://github.com/NSS-Data-Analytics-Cohort-2?page=2
...
https://github.com/NSS-Data-Analytics-Cohort-2?page=10
```

In [None]:
# Build one listing URL per page, counting pages from 1 up to and including the final page.
repo_pages = [f'{ORG_URL}?page={page}' for page in range(1, final_page_number + 1)]
repo_pages

---

### Goal 1.C

Perfect. 
Next, we need to figure out a pattern to grab all of the repository links..

Looking at the main page (`https://github.com/NSS-Data-Analytics-Cohort-2`) we can see that all of the repository elements are included in a `div` element with the `id` equal to `org-repositories`. 
Within that `div`, there is an unordered list (`ul`) with separate list elements (`li`) containing our info. 

Let's try to access those for the first pass.

In [None]:
# Drill down: element with id `org-repositories` -> its first `ul` -> every `li` inside it.
# (`find_all` is the current name of the deprecated `findAll` alias.)
org_repositories = (
    soup.find(id='org-repositories')
    .find('ul')
    .find_all('li')
)


print(f'Total repositories on page: {len(org_repositories)}')
print(org_repositories[0].prettify())

In [None]:
# Sweet. Now, let's grab the link for the first one.
# If we can grab that one.. We can grab the rest.
first_repo = org_repositories[0]
first_repo_a_elem = first_repo.find('a')
print(first_repo_a_elem.prettify())

In [None]:
# Now.. The link!
first_repo_a_elem.get('href')

---

### Goal 1.D

Ok. 
Now.. 
We have the ability to get all the links!
We just need to stitch all of our code together.

In [None]:
def get_page_soup(url, timeout=30):
    """Download `url` and return its HTML parsed into a BeautifulSoup object.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests.get` can block indefinitely.

    Returns:
        The response body parsed with the 'html.parser' parser.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within `timeout`.
    """
    print(f'Fetching website data for: {url}')
    resp = requests.get(url, timeout=timeout)
    # Fail here, with the real HTTP status, instead of parsing an error page
    # and hitting an obscure AttributeError further down the pipeline.
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')


def get_org_repositories(soup):
    """Return the `li` elements that list the repositories on a parsed page.

    Args:
        soup: Parsed page exposing BeautifulSoup's `find` API.

    Returns:
        The `li` elements found inside the first `ul` of the element whose id
        is `org-repositories`. An empty list is returned when that element or
        its `ul` is not present on the page.
    """
    print('\tGetting org repositories')
    container = soup.find(id='org-repositories')
    repo_list = container.find('ul') if container is not None else None
    if repo_list is None:
        # Nothing to list on this page; report "no repositories" rather than
        # crashing on `None.find(...)`.
        return []
    # `find_all` is the current name of the deprecated `findAll` alias.
    return repo_list.find_all('li')


def extract_org_repository_links(org_repositories):
    """Collect the `href` of the first anchor in each repository element.

    Args:
        org_repositories: Iterable of elements exposing BeautifulSoup's
            `find` API, one per repository.

    Returns:
        A list of `href` strings in input order. Elements that contain no
        anchor, or whose anchor has no `href`, are skipped so the result
        never contains `None`.
    """
    print('\tGetting links from repositories')
    links = []
    for repo in org_repositories:
        anchor = repo.find('a')
        href = anchor.get('href') if anchor is not None else None
        # Later steps concatenate and search these values as strings, so a
        # missing link must not leak through as `None`.
        if href:
            links.append(href)
    return links

In [None]:
# Visit each listing page in turn and pool every repository link into one flat list.
all_links = []

for url in repo_pages:
    soup = get_page_soup(url)
    org_repositories = get_org_repositories(soup)
    links = extract_org_repository_links(org_repositories)
    all_links += links

In [None]:
# insert mic-drop here
all_links

In [None]:
print(f'Total number of links: {len(all_links)}')

---

### Goal 2.A

Next, we need to pull some information per repository.

> For every repository..
* Number of Commits (for the master branch)
* Number of branches

Like before, let's start with a single repository.

In [None]:
repo_link = all_links[0]
repo_link

In [None]:
# Just like before!
# (the timeout keeps a stalled connection from hanging the notebook forever)
resp = requests.get(f'{GITHUB_URL}{repo_link}', timeout=30)
print(f'Status code is: {resp.status_code}')

soup = BeautifulSoup(resp.text, 'html.parser')

---

### Goal 2.B

Now that we have the soup.. 
Let's focus on locating the appropriate elements. 

Looking at the website we notice that the elements of interest are inside list elements (`li`) within an unordered list (`ul`) having the class `numbers-summary`.
Within those list elements are `span` elements with a `class` equal to `num`. 
Those are what we want.

In [None]:
numbers_summary = soup.find('ul', {'class': 'numbers-summary'})
print(numbers_summary.prettify())

In [None]:
metrics = []

# `find_all` is the current name of the deprecated `findAll` alias.
for li in numbers_summary.find_all('li'):

    # grabbing the text, and doing a little cleanup
    # (thousands separators such as "1,234" are dropped so `int` can parse the value)
    metric = (
        li.find('span', {'class': 'num'}).text
        .replace('\n', '')
        .replace(',', '')
        .strip()
    )

    # going ahead and casting it to an integer if something was found!
    metric = int(metric) if metric else None

    metrics.append(metric)

In [None]:
metrics

In [None]:
commits, branches, *_ = metrics
commits, branches

---

### Goal 2.C

Now, we put it all together! 
Just like before. 
This time, let's keep track of the repo link.

In [None]:
def get_numbers_summary(soup):
    """Look up the `ul` element carrying the `numbers-summary` class.

    Returns whatever `soup.find` yields for that query.
    """
    wanted_attrs = {'class': 'numbers-summary'}
    summary = soup.find('ul', wanted_attrs)
    return summary


def clean_metric(metric):
    """Convert the raw text of a scraped metric into an integer.

    Args:
        metric: Text taken from the page. It may be padded with whitespace
            or newlines and may contain thousands separators ("1,234").

    Returns:
        The value as an `int`, or `None` when nothing is left after cleanup.

    Raises:
        ValueError: If the cleaned text is not a valid integer.
    """
    # Drop newlines and thousands separators first; `int('1,234')` would raise.
    digits = metric.replace('\n', '').replace(',', '').strip()
    return int(digits) if digits else None


def get_metrics(numbers_summary):
    """Extract one cleaned metric per `li` of the numbers-summary element.

    Args:
        numbers_summary: Element exposing BeautifulSoup's `find_all` API
            whose `li` children each may hold a `span` with class `num`.

    Returns:
        A list with one entry per `li`, in page order: the value produced by
        `clean_metric` for the span's text, or `None` when the `li` has no
        such span.
    """
    metrics = []

    # `find_all` is the current name of the deprecated `findAll` alias.
    for li in numbers_summary.find_all('li'):
        num_span = li.find('span', {'class': 'num'})

        if num_span is None:
            # Keep a placeholder so positions still line up with the page
            # when callers unpack the list by index.
            metrics.append(None)
            continue

        # grabbing the text, and doing a little cleanup
        metrics.append(clean_metric(num_span.text))

    return metrics

In [None]:
# Check to make sure our functions work!
numbers_summary = get_numbers_summary(soup)
commits, branches, *_ = get_metrics(numbers_summary)
commits, branches

In [None]:
# Narrow the full list down to our recent project.. HCBB

hcbb_links = [link for link in all_links if 'healthcare-bluebook' in link]

In [None]:
# Ok.. Functions work for one..
# Time to try a few of them!!
results = []

for repo_ref in hcbb_links:

    url = f'{GITHUB_URL}{repo_ref}'
    soup = get_page_soup(url)

    numbers_summary = get_numbers_summary(soup)
    # First two entries are commits and branches; the last entry is taken as contributors.
    commits, branches, *_, contributors = get_metrics(numbers_summary)

    # One tuple per repository: (link, commits, branches, contributors).
    results.append((repo_ref, commits, branches, contributors))

In [None]:
results

---

### Goal 3!!!

Now, of course, pandas!

In [None]:
import pandas as pd

# One row per scraped repository: each (repo_ref, commits, branches, contributors)
# tuple collected in `results` is spread across the four named columns.
df = pd.DataFrame(results, columns=['url', 'commits', 'branches', 'contributors'])
df

In [None]:
# The rest is for you to explore on your own time!