# Repository Scraper
Scrapes all the topics from GitHub and stores each topic's title, description and URL in a CSV file.
For each of those topics, the scraper then collects the top 30 repositories by star count and stores each one's username, repo_name, repo_url, stars and description (if provided).

In [None]:
!pip install pandas
!pip install bs4

In [None]:
import os
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

def convertStarsToInt(string):
    """Convert a displayed star count such as ``"1.2k"`` into an integer.

    Args:
        string: The count as shown on the page. Surrounding whitespace and
            thousands separators (``"1,234"``) are ignored. A trailing
            ``k`` or ``m`` (either case) multiplies the number by 1,000 or
            1,000,000 respectively.

    Returns:
        The star count as an ``int``.

    Raises:
        ValueError: If ``string`` is empty or does not contain a number.
    """
    text = string.strip().replace(",", "")
    if not text:
        raise ValueError("empty star count")

    suffix_multipliers = {"k": 1000, "m": 1000000}
    multiplier = suffix_multipliers.get(text[-1].lower())
    if multiplier is None:
        return int(text)

    # round() instead of int(): binary floats make e.g. 1.005 * 1000 evaluate
    # to 1004.999..., which plain truncation would report as 1004.
    return round(float(text[:-1]) * multiplier)

def getRepoInfo(h3_tag, star_tag, description):
    """Extract one repository's details from its tags on a topic page.

    Args:
        h3_tag: Heading tag whose first two ``<a>`` links are, in order, the
            owner link and the repository link.
        star_tag: Tag whose text is the displayed star count.
        description: The tag that is searched for the repository's
            description text.

    Returns:
        A ``(repo_url, username, repo_name, stars, description)`` tuple.
        The description is ``""`` when any part of the expected description
        markup is missing.

    Raises:
        IndexError: If ``h3_tag`` contains fewer than two links.
        ValueError: If the star count text cannot be parsed.
    """
    a_tags = h3_tag.find_all("a")
    owner_link, repo_link = a_tags[0], a_tags[1]

    repo_url = "https://github.com" + repo_link["href"]
    username = owner_link.text.strip()
    repo_name = repo_link.text.strip()
    stars = convertStarsToInt(star_tag.text.strip())

    # Walk down the nested description containers one level at a time.
    # find() returns None when a level is absent (e.g. the owner gave no
    # description), so stop descending as soon as that happens.
    desc = description.find("div", {"class": "color-bg-primary"})
    if desc is not None:
        desc = desc.find("div", {"class": "px-3 pt-3"})
    if desc is not None:
        desc = desc.find("div")

    desc_text = desc.text.strip() if desc is not None else ""
    return repo_url, username, repo_name, stars, desc_text

def getTopicRepos(topic_url, max_retries=5, timeout=30):
    """Download a topic page and return its listed repositories.

    Args:
        topic_url: URL of the topic page to scrape.
        max_retries: Maximum number of download attempts (at least one is
            always made).
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``username``, ``repo_name``,
        ``repo_url``, ``stars`` and ``description``, one row per repository.

    Raises:
        RuntimeError: If the page could not be downloaded with status 200
            within ``max_retries`` attempts.
    """
    attempts = max(1, max_retries)
    response = None
    last_error = None
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(topic_url, timeout=timeout)
        except requests.RequestException as error:
            # Network-level failure: remember it and retry like a bad status.
            response = None
            last_error = error
        if response is not None and response.status_code == 200:
            break
        if attempt == attempts:
            raise RuntimeError("Failed to load page " + topic_url) from last_error
        # Back off before retrying so a rate-limited or failing server is
        # not hammered in a tight loop.
        time.sleep(min(2 ** attempt, 60))

    # Parse using BeautifulSoup
    topic_doc = BeautifulSoup(response.text, "html.parser")
    h3_tags = topic_doc.find_all(
        "h3",
        {"class": "f3 color-text-secondary text-normal lh-condensed"},
    )
    star_tags = topic_doc.find_all("a", "social-count float-none")
    articles = topic_doc.find_all(
        "article",
        {"class": "border rounded color-shadow-small color-bg-secondary my-4"},
    )

    # The three lists are collected independently; zip() stops at the
    # shortest one, so a page with a missing element yields fewer rows
    # instead of an IndexError.
    rows = [
        getRepoInfo(h3_tag, star_tag, article)
        for h3_tag, star_tag, article in zip(h3_tags, star_tags, articles)
    ]

    # getRepoInfo returns (repo_url, username, repo_name, stars, description).
    topic_repos_dict = {
        "username": [row[1] for row in rows],
        "repo_name": [row[2] for row in rows],
        "repo_url": [row[0] for row in rows],
        "stars": [row[3] for row in rows],
        "description": [row[4] for row in rows],
    }
    return pd.DataFrame(topic_repos_dict)

def scrapeTopicsRepos(max_pages=7, output_dir="Repository-Scraper",
                      max_retries=5, timeout=30):
    """Scrape every GitHub topic and the repositories listed for each one.

    Writes ``<output_dir>/topics.csv`` (columns ``title``, ``description``,
    ``url``) and one ``<output_dir>/topics/<title>.csv`` per topic, produced
    by ``getTopicRepos``.

    Args:
        max_pages: Number of ``https://github.com/topics?page=N`` pages to
            read, starting at page 1.
        output_dir: Directory that receives the CSV files; created if it
            does not exist yet.
        max_retries: Maximum download attempts per topics listing page (at
            least one is always made).
        timeout: Per-request timeout in seconds for the listing pages.

    Raises:
        RuntimeError: If a topics listing page could not be downloaded with
            status 200 within ``max_retries`` attempts.
    """
    topics_url = "https://github.com/topics"
    attempts = max(1, max_retries)

    topic_titles = []
    topic_descriptions = []
    topic_urls = []

    print("Fetching Topics...")
    # The "Load more" button on the topics page just requests ?page=2,
    # ?page=3, ...; requesting those URLs directly reaches further topics.
    for page_number in range(1, max_pages + 1):
        page_url = topics_url + "?page=" + str(page_number)

        response = None
        last_error = None
        for attempt in range(1, attempts + 1):
            try:
                response = requests.get(page_url, timeout=timeout)
            except requests.RequestException as error:
                response = None
                last_error = error
            if response is not None and response.status_code == 200:
                break
            if attempt == attempts:
                raise RuntimeError("Failed to load page " + page_url) from last_error
            # Back off so a rate-limited or failing server is not hammered.
            time.sleep(min(2 ** attempt, 60))

        parsed_doc = BeautifulSoup(response.text, "html.parser")
        title_tags = parsed_doc.find_all(
            "p",
            {"class": "f3 lh-condensed mb-0 mt-1 Link--primary"},
        )
        description_tags = parsed_doc.find_all(
            "p",
            {"class": "f5 color-text-secondary mb-0 mt-1"},
        )
        topic_divs = parsed_doc.find_all(
            "div",
            {"class": "py-4 border-bottom"},
        )

        # Only the links that are direct children of each topic div.
        link_tags = []
        for div in topic_divs:
            link_tags.extend(div.find_all("a", recursive=False))

        # zip() keeps the three columns the same length even if the page is
        # missing an element; unequal columns would make pd.DataFrame raise
        # and would break the title/url pairing used further down.
        for title_tag, description_tag, link_tag in zip(
                title_tags, description_tags, link_tags):
            topic_titles.append(title_tag.text.strip())
            topic_descriptions.append(description_tag.text.strip())
            topic_urls.append("https://github.com" + link_tag["href"])

    topics_df = pd.DataFrame({
        "title": topic_titles,
        "description": topic_descriptions,
        "url": topic_urls,
    })

    # exist_ok=True lets the scraper be re-run without failing on the
    # directories left behind by a previous run.
    topics_dir = os.path.join(output_dir, "topics")
    os.makedirs(topics_dir, exist_ok=True)
    topics_df.to_csv(os.path.join(output_dir, "topics.csv"), index=False)

    # Characters that cannot appear in a file name on common file systems.
    unsafe_chars = set('\\/:*?"<>|')
    for title, url in zip(topic_titles, topic_urls):
        print("Fetching top repositories for the topic " + title)
        try:
            repos_df = getTopicRepos(url)
        except Exception as error:
            # Best effort per topic: report the failure and carry on so one
            # bad page does not discard the remaining topics.
            print("Skipping topic " + title + ": " + str(error))
            continue
        safe_name = "".join("_" if ch in unsafe_chars else ch for ch in title)
        repos_df.to_csv(os.path.join(topics_dir, safe_name + ".csv"), index=False)

# Run the full scrape: writes Repository-Scraper/topics.csv and one CSV per
# topic under Repository-Scraper/topics/.
scrapeTopicsRepos()   

# Future work to be done:
Also collect all the other tags listed on each repository.