# README Downloader

**Author**: Bhavyai Gupta

Downloads README files from GitHub using the GitHub API.

+ Downloaded files are randomly chosen but unique
+ Downloaded files belong only to Software Development repositories (repositories for which GitHub reports a programming language)
+ Downloaded files have a size of at least 2 KB

### Necessary imports

In [0]:
import json
import pandas as pd
import random
import requests

### Necessary inputs

In [0]:
# number of readme files we want to download
files_download = 10

# github personal access token for using github api
# (stripped so that surrounding whitespace, e.g. a trailing newline in the
# file, cannot end up in the 'authorization' request header)
github_personal_access_token = dbutils.fs.head("/FileStore/project/GitHub_PAT.txt").strip()

# directory on the driver's local file system to store downloaded readme files
dbutils.fs.mkdirs('file:/databricks/driver/readme_files')

Out[2]: True

### Reading CSVs

In [0]:
def read_CSV_to_DF(filepath):
  """
  Reads a csv file into a spark dataframe
  and returns Pandas dataframe

  The file is read with a header row, with multi-line records enabled,
  and with '"' as both the quote and the escape character.

  Args:
    filepath: location of the csv file, passed unchanged to spark.read.csv

  Returns:
    the content of the csv file as a Pandas dataframe
  """
  df = (spark.read
        .option("multiline", "true")
        .option("quote", '"')
        .option("header", "true")
        # the escape option used to be set twice ("\\" and then '"'); a
        # repeated option keeps only its last value, so only the effective
        # setting is kept here
        .option("escape", '"')
        .csv(filepath)
        )

  return df.toPandas()

### Main code

In [0]:
class Readme:
  '''
  Downloads README files of randomly chosen GitHub repositories

  Repositories are listed through the GitHub API starting at a random
  'since' offset. For every repository that reports a language and has a
  README of at least 2048 bytes, the README is stored in the local
  directory readme_files and recorded in the dataframe _new_repos_df.
  '''

  # number of consecutive failed repository-listing requests after which
  # download() gives up instead of retrying forever
  _MAX_LISTING_FAILURES = 5

  def __init__(self):
    '''
    Attributes:
      _readme_count (int)       README files still to download
      _request_countdown (int)  remaining API calls as reported by GitHub

      _api_repository_url (str)
      _api_repo_base_url (str)
      _api_raw_url (str)
      _api_header (dict)

      _new_repos_df (dataframe) one row per saved README

      _default_folders (list)
      _markdown_name (list)
      _markdown_exts (list)
    '''

    # how many README files we want to download (module-level input)
    self._readme_count = files_download

    # dummy value that gets updated from the API responses as the program runs
    self._request_countdown = 1

    # api urls
    self._api_repository_url = 'https://api.github.com/repositories'
    self._api_repo_base_url = 'https://api.github.com/repos'
    self._api_raw_url = 'https://raw.githubusercontent.com'

    # api header (uses the module-level personal access token)
    self._api_header = {'accept': 'application/vnd.github.v3+json', 'authorization': 'token ' + github_personal_access_token}

    # seed the module-level random generator used for 'since' in
    # get_api_options(), which makes the sequence of offsets reproducible
    random.seed(10)

    # pd dataframe to track the new dataset
    self._new_repos_df = pd.DataFrame(columns=['url', 'readme_location', 'saved_as'])

    # supported folder, name, and extensions of README files
    # choosing only what research paper has done
    self._default_folders = [None]
    self._markdown_name = ['README']
    self._markdown_exts = ['md']


  def get_api_options(self):
    '''
    Returns updated api_options with a random number within the limit
    according to the research paper
    '''

    return {'since': random.randrange(0, 100000000, 1)}


  def fetch_request(self, api_url, api_headers=None, api_options=None):
    '''
    Fetches the response of the request of api_url using options
    api_headers and api_options
    '''

    return requests.get(api_url, headers=api_headers, params=api_options)


  def fetch_json(self, api_url, api_headers=None, api_options=None):
    '''
    Fetches the JSON response using the api_url, api_headers,
    and api_options

    Returns a tuple (status_code, headers, body). headers is a plain dict
    and body is the decoded JSON; both are None when the status is not OK.
    '''

    r = self.fetch_request(api_url, api_headers, api_options)

    if r.status_code != requests.codes.ok:
      return (r.status_code, None, None)

    return (r.status_code, dict(r.headers), json.loads(r.text))


  def _update_request_countdown(self, headers):
    '''
    Updates _request_countdown from the rate-limit header in headers

    fetch_json() copies the response headers into a plain dict, whose keys
    are case-sensitive, so the header name is matched case-insensitively
    here. If the header is missing, the countdown is left unchanged.
    '''

    for name, value in headers.items():
      if name.lower() == 'x-ratelimit-remaining':
        self._request_countdown = int(value)
        return


  def check_new(self, repo_url):
    '''
    Checks if repo_url is already recorded in the column 'url' of
    _new_repos_df. Returns True if it is, False otherwise
    '''

    return repo_url in self._new_repos_df['url'].values


  def save_file(self, qualified_name, html_url, request):
    '''
    Saves the contents of the request in the directory readme_files,
    under a name derived from qualified_name, and records the file in
    _new_repos_df

    qualified_name is the list
    [repo_name, default_branch, readme_folder, readme_name, readme_extension]
    '''
    import os

    repo_name, _, readme_folder, readme_name, readme_ext = qualified_name

    # 'owner/repo' + 'md' -> 'owner.repo.md'
    filename = ".".join(repo_name.split('/') + [readme_ext])
    filelocation = "readme_files/" + filename

    if readme_folder is None:
      readme_location = readme_name + '.' + readme_ext
    else:
      readme_location = readme_folder + '/' + readme_name + '.' + readme_ext

    # make sure the target directory exists before writing into it
    os.makedirs("readme_files", exist_ok=True)

    with open(filelocation, 'wb') as fd:
      for chunk in request.iter_content(chunk_size=128):
        fd.write(chunk)

    # DataFrame.append was removed in pandas 2.0; add the row by setting a
    # new label instead (values are given in the order of the columns
    # 'url', 'readme_location', 'saved_as')
    self._new_repos_df.loc[len(self._new_repos_df)] = [html_url, readme_location, filename]


  def _folder_may_exist(self, repo_name, folder):
    '''
    Returns False only if the contents listing of repo_name was fetched
    successfully and contains no entry named folder
    '''

    api_contents_url = self._api_repo_base_url + '/' + repo_name + '/contents'
    status, headers, entries = self.fetch_json(api_contents_url, self._api_header)

    # the listing could not be fetched, so the folder cannot be ruled out
    if status != requests.codes.ok:
      return True

    self._update_request_countdown(headers)
    return any(entry['name'] == folder for entry in entries)


  def find_readme(self, repo_name, default_branch):
    '''
    Finds and returns the downloadable url of a readme of
    repo repo_name, if the README exists

    Returns a tuple (url, qualified_name) where qualified_name is
    [repo_name, default_branch, folder, name, extension], or (None, None)
    when no candidate url answers with an OK status.
    '''

    repo_raw_url = self._api_raw_url + '/' + repo_name

    for folder in self._default_folders:

      # optimization - don't query inside a subfolder that is known to be missing
      if folder is not None and not self._folder_may_exist(repo_name, folder):
        continue

      for name in self._markdown_name:
        for ext in self._markdown_exts:
          if folder is None:
            test_url = repo_raw_url + '/' + default_branch + '/' + name + '.' + ext
          else:
            test_url = repo_raw_url + '/' + default_branch + '/' + folder + '/' + name + '.' + ext

          if self.fetch_request(test_url).status_code == requests.codes.ok:
            return test_url, [repo_name, default_branch, folder, name, ext]

    return None, None


  def download(self):
    '''
    Downloads random README files

    Runs until _readme_count files have been saved, the remaining API
    calls reach zero, or the repository listing fails
    _MAX_LISTING_FAILURES times in a row.
    '''

    failed_listings = 0

    while self._request_countdown > 0 and self._readme_count > 0:
      # fetch the list of repositories
      response_code, repository_header, repository_array = self.fetch_json(
        self._api_repository_url, api_headers=self._api_header, api_options=self.get_api_options())

      if response_code != requests.codes.ok:
        # a failed listing changes nothing in the loop condition, so
        # retrying without a bound would never terminate
        failed_listings += 1

        if failed_listings >= self._MAX_LISTING_FAILURES:
          print(f'Stopping: repository listing failed {failed_listings} times in a row (last status {response_code})')
          break

        continue

      failed_listings = 0

      # update the value of remaining API calls
      self._update_request_countdown(repository_header)

      # loop through all items in the repository_array
      while self._request_countdown > 0 and self._readme_count > 0 and repository_array:
        # fetch one item from the JSON array
        repository = repository_array.pop()

        # if repo is already downloaded, skip it
        if self.check_new(repository['html_url']):
          continue

        # fetch details for the repo
        api_repo_url = self._api_repo_base_url + "/" + repository['full_name']
        response_code, repo_header, repo_response = self.fetch_json(
          api_repo_url, api_headers=self._api_header)

        if response_code != requests.codes.ok:
          continue

        # update the value of remaining API calls
        self._update_request_countdown(repo_header)

        # if the repo is not a Software Development repo, skip it
        if repo_response['language'] is None:
          continue

        readme_url, qualified_name = self.find_readme(
          repo_response['full_name'], repo_response['default_branch'])

        # if the repo doesn't have any readme, skip it
        if readme_url is None:
          continue

        sr = self.fetch_request(readme_url, api_headers=self._api_header)

        # ignore failed downloads and README files smaller than 2KB
        if sr.status_code != requests.codes.ok or len(sr.content) < 2048:
          continue

        self.save_file(qualified_name, repo_response['html_url'], sr)
        self._readme_count -= 1
        print(f'Remaining files to download {self._readme_count}')

### Run code

In [0]:
# build a downloader and fetch README files; each saved file prints how many
# are still left to download
Readme().download()

Remaining files to download 9
Remaining files to download 8
Remaining files to download 7
Remaining files to download 6
Remaining files to download 5
Remaining files to download 4
Remaining files to download 3
Remaining files to download 2
Remaining files to download 1
Remaining files to download 0


### Move files

Move the downloaded README files from the driver's local file system to DBFS

In [0]:
%fs mv -r file:/databricks/driver/readme_files dbfs:/FileStore/project/

In [0]:
%fs ls /FileStore/project/

path,name,size
dbfs:/FileStore/project/GitHub_PAT.txt,GitHub_PAT.txt,40
dbfs:/FileStore/project/OR13.OpenCyc.md,OR13.OpenCyc.md,2068
dbfs:/FileStore/project/ddikman.enable-media-replace.md,ddikman.enable-media-replace.md,2124
dbfs:/FileStore/project/franklinted.k-vim.md,franklinted.k-vim.md,9645
dbfs:/FileStore/project/jasonmit.flexi.md,jasonmit.flexi.md,12620
dbfs:/FileStore/project/octobox.octobox.md,octobox.octobox.md,9780
dbfs:/FileStore/project/pescarcena.ui-date.md,pescarcena.ui-date.md,5618
dbfs:/FileStore/project/raphaelmun.mmbot.md,raphaelmun.mmbot.md,4304
dbfs:/FileStore/project/shyiko.opentype.js.md,shyiko.opentype.js.md,12814
dbfs:/FileStore/project/skitterm.map-tour-storytelling-template-js.md,skitterm.map-tour-storytelling-template-js.md,38938
