# ~ Objective : Scraping Top Repositories for Topics on GitHub


Here are the steps we'll follow:

- We're going to scrape https://github.com/topics
- We'll get a list of topics. For each topic, we'll get topic title, topic page URL and topic description
- For each topic, we'll get the top 25 repositories in the topic from the topic page
- For each repository, we'll grab the repo name, username, stars and repo URL
- For each topic we'll create a CSV file in the following format:
    - Repo Name,Username,Stars,Repo URL
    - three.js,mrdoob,69700,https://github.com/mrdoob/three.js
    - libgdx,libgdx,18300,https://github.com/libgdx/libgdx

# ~ Implementation

In [1]:
# Importing necessary modules
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import os

## Scrape the list of topics from Github
- Explain how you'll do it.

- use requests to download the page
- use BS4 to parse and extract information
- convert to a Pandas dataframe

Let's write a function to extract the topic details from the downloaded page.

In [2]:
# This function will fetch trending topics from github
# Function returns pandas dataframe

def fetch_topics_detail(topics_soup):
    """Extract title, description and URL of every topic on a topics page.

    Args:
        topics_soup: parsed topics page; it is queried with ``find_all``.

    Returns:
        A pandas DataFrame with the columns 't_name', 't_desc' and 't_link'.
    """
    # CSS class strings identifying each piece of a topic card
    title_css = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    desc_css = 'f5 color-text-secondary mb-0 mt-1'
    link_css = 'd-flex no-underline'

    # topic titles (taken verbatim from the <p> tag text)
    title_tags = topics_soup.find_all('p', {'class': title_css})
    names = [tag.text for tag in title_tags]

    # topic descriptions (surrounding whitespace removed)
    desc_tags = topics_soup.find_all('p', {'class': desc_css})
    descriptions = [tag.text.strip() for tag in desc_tags]

    # topic links: the href is joined onto the module-level base_link
    link_tags = topics_soup.find_all('a', {'class': link_css})
    links = [base_link + tag['href'] for tag in link_tags]

    columns = {'t_name': names, 't_desc': descriptions, 't_link': links}
    return pd.DataFrame(columns)

## Get the top repositories from a topic page

In [3]:
# This function will fetch repository details.

def _parse_star_count(text):
    """Convert a star label such as '69.7k' or '812' to an integer."""
    label = text.strip()
    if label.endswith('k'):
        # Scale BEFORE converting to int so the fractional thousands survive
        # ('69.7k' -> 69700, not 69000); round() absorbs float noise.
        return int(round(float(label[:-1]) * 1000))
    return int(label)


def fetch_repo_details(lin):
    """Scrape the repositories listed on a topic page.

    Args:
        lin: URL of the topic page to download.

    Returns:
        A pandas DataFrame with the columns 'repo user', 'repo title',
        'repo stars' and 'repo link' (one row per repository found), or
        None when the page does not answer with HTTP status 200.
    """
    repo_selection_class = 'f3 color-text-secondary text-normal lh-condensed'
    star_class = 'social-count float-none'

    resp = requests.get(lin)
    if resp.status_code != 200:
        # Callers must check for None before using the result.
        return None
    raw_topic = BeautifulSoup(resp.text, "html.parser")

    # Each repo heading holds two anchors: the owner, then the repository.
    user_name = []
    repo_name = []
    repo_link = []
    for heading in raw_topic.find_all('h1', {'class': repo_selection_class}):
        anchors = heading.find_all('a')
        owner_anchor, repo_anchor = anchors[0], anchors[1]
        user_name.append(owner_anchor.text.strip())
        repo_name.append(repo_anchor.text.strip())
        # The repository URL comes from the second anchor; the first one
        # points at the owner's profile.
        repo_link.append(base_link + repo_anchor['href'])

    # Star counters appear as separate anchors on the page.
    star = [_parse_star_count(tag.text)
            for tag in raw_topic.find_all('a', {'class': star_class})]

    final_dict = {'repo user': user_name,
                  'repo title': repo_name,
                  'repo stars': star,
                  'repo link': repo_link}

    return pd.DataFrame(final_dict)

## Putting it all together
- We have a function to get the list of topics.
- We have a function to get data of repos from a topics page.
- Let's create a function to put them together & create and save as csv file.

In [4]:
def create_csv(t_df):
    """Write one CSV of repositories per topic into the 'data' folder.

    Args:
        t_df: pandas DataFrame of topics; the first column is used as the
            topic name (and CSV file name) and the third column as the
            topic page URL.

    For every row the topic page is scraped with fetch_repo_details and the
    result is saved as 'data/<topic name>.csv'. Topics whose page could not
    be loaded (fetch_repo_details returned None) are skipped.
    """
    out_dir = 'data'
    # Create the output folder if needed. Files are written through an
    # explicit path instead of os.chdir, so the process working directory
    # is left untouched and repeated calls do not nest 'data/data'.
    os.makedirs(out_dir, exist_ok=True)

    for _, row in t_df.iterrows():
        # Positional access: column 0 is the name, column 2 the link.
        topic_name = row.iloc[0]
        topic_link = row.iloc[2]

        repo_df = fetch_repo_details(topic_link)
        if repo_df is None:
            print('Failed to load page {}. Skipping {}...'.format(topic_link, topic_name))
            continue

        csv_path = os.path.join(out_dir, topic_name + '.csv')
        repo_df.to_csv(csv_path, index=False)
        print('csv created for {}'.format(topic_name))

In [5]:
# Root URL that relative hrefs scraped from the pages are joined onto;
# fetch_topics_detail and fetch_repo_details read it as a global.
base_link = 'https://github.com'

# Download the topics listing page.
topics_result = requests.get("https://github.com/topics")
# Fail loudly on an HTTP error instead of parsing an error page, which
# would silently yield an empty topic list.
topics_result.raise_for_status()
topics_soup = BeautifulSoup(topics_result.text, "html.parser")

# Extract the topics and write one CSV per topic.
create_csv(fetch_topics_detail(topics_soup))

csv created for 3D
csv created for Ajax
csv created for Algorithm
csv created for Amp
csv created for Android
csv created for Angular
csv created for Ansible
csv created for API
csv created for Arduino
csv created for ASP.NET
csv created for Atom
csv created for Awesome Lists
csv created for Amazon Web Services
csv created for Azure
csv created for Babel
csv created for Bash
csv created for Bitcoin
csv created for Bootstrap
csv created for Bot
csv created for C
csv created for Chrome
csv created for Chrome extension
csv created for Command line interface
csv created for Clojure
csv created for Code quality
csv created for Code review
csv created for Compiler
csv created for Continuous integration
csv created for COVID-19
csv created for C++
