In [None]:
!pip install PyGithub
!pip install nbformat

# Get all repos

Replace `GH_ACCESS_TOKEN` with your GitHub access token
- You may want to go ahead without `GH_ACCESS_TOKEN` set. In this case, 60 requests / hour is the rate limit. 
- With `GH_ACCESS_TOKEN` set, the rate limit is 5000 requests / hour.
- [Reference](https://docs.github.com/en/rest/overview/resources-in-the-rest-api?apiVersion=2022-11-28#rate-limits-for-requests-from-personal-accounts)

In [2]:
from github import Github

def get_repos(username, access_token=None, include_fork=False):
  g = Github(access_token)
  user = g.get_user(username)

  results = []
  for repo in user.get_repos():
      if repo.fork is False:
        results.append(repo)
      else:
        if include_fork is True:
          results.append(repo)

  return results

In [3]:
repos = get_repos("deep-diver", "GH_ACCESS_TOKEN")

In [4]:
from pprint import pprint

print(len(repos))
pprint(repos)

108
[Repository(full_name="deep-diver/-"),
 Repository(full_name="deep-diver/AlexNet"),
 Repository(full_name="deep-diver/Baseball_Data_Analysis"),
 Repository(full_name="deep-diver/book-tracking-react"),
 Repository(full_name="deep-diver/calculator"),
 Repository(full_name="deep-diver/CIFAR10-img-classification-tensorflow"),
 Repository(full_name="deep-diver/CIFAR10-VGG19-Tensorflow"),
 Repository(full_name="deep-diver/complete-mlops-system-workflow"),
 Repository(full_name="deep-diver/conn"),
 Repository(full_name="deep-diver/Continuous-Adaptation-for-Machine-Learning-System-to-Data-Changes"),
 Repository(full_name="deep-diver/Continuous-Adaptation-with-VertexAI-AutoML-Pipeline"),
 Repository(full_name="deep-diver/Data-Analysis-on-RedWine"),
 Repository(full_name="deep-diver/Data-Analysis-on-Titanic"),
 Repository(full_name="deep-diver/Data-Wrangling-on-OpenStreeMap"),
 Repository(full_name="deep-diver/deep-diver"),
 Repository(full_name="deep-diver/deeplearning-in-3-steps-book"),
 R

# Extract source codes and save in CSV

The following code snippet works in the following manner:

1. Get list of files (*.py and *.ipynb) in the target repositories (`target_repos`)
2. Grasp the content of each file and decode it with `base64`
  - for `*.py`, plain text will be extracted
  - for `*.ipynb`, the contents of all the code cells will be extracted and merged as a single string
3. Create a `pd.DataFrame` of `["reponame", "filepath", "content"]` column to store repository, filepath, and the extracted content
4. Iterate 1 ~ 3 steps for all target repositories, and append DataFrame to `df` which contains all records

In [5]:
target_repos = [
  "Continuous-Adaptation-for-Machine-Learning-System-to-Data-Changes",
  "semantic-segmentation-ml-pipeline",
  "mlops-hf-tf-vision-models",
]

In [6]:
import base64
import pandas as pd
from nbformat import reads, NO_CONVERT

df = pd.DataFrame(columns=["reponame", "filepath", "content"])

from github import GithubException

def get_py_files(repo, file_list, path="."):
  contents = repo.get_contents("")
  while contents:
    file_content = contents.pop(0)
    if file_content.type == "dir":
      contents.extend(repo.get_contents(file_content.path))
    else:
      if file_content.name[-2:] == "py":
        file_list.append(file_content)
      elif file_content.name[-5:] == "ipynb":
        file_list.append(file_content)

for repo in repos:
  if repo.name in target_repos:
    file_list = []
    get_py_files(repo, file_list)

    if len(file_list) != 0:
      for file in file_list:
          if file.name.endswith("py"):
            content = file.content
            content_str = base64.b64decode(content).decode('utf-8')

            if content != '':
              df = pd.concat(
                  [
                      df, 
                      pd.DataFrame.from_dict([{
                        "reponame": repo.name,
                        "filepath": file.path,
                        "content": content_str
                      }])
                  ])

In [7]:
df

Unnamed: 0,reponame,filepath,content
0,Continuous-Adaptation-for-Machine-Learning-Sys...,custom_components/batch_pred_evaluator.py,"""""""\nThis component evaluates the performance ..."
0,Continuous-Adaptation-for-Machine-Learning-Sys...,custom_components/batch_prediction_vertex.py,"""""""\nThis component launches a Batch Predictio..."
0,Continuous-Adaptation-for-Machine-Learning-Sys...,custom_components/file_list_gen.py,"""""""\nGenerate a txt file formatted required by..."
0,Continuous-Adaptation-for-Machine-Learning-Sys...,custom_components/span_preparator.py,"""""""\nThis component is responsible for separat..."
0,Continuous-Adaptation-for-Machine-Learning-Sys...,custom_components/training_pipeline_trigger.py,"""""""\nComponent responsible for triggering a tr..."
...,...,...,...
0,semantic-segmentation-ml-pipeline,training_pipeline/pipeline/components/HFPusher...,# Copyright 2022 The TensorFlow Authors. All R...
0,semantic-segmentation-ml-pipeline,training_pipeline/pipeline/components/HFPusher...,# Copyright 2022 The TensorFlow Authors. All R...
0,semantic-segmentation-ml-pipeline,training_pipeline/pipeline/components/HFPusher...,# Copyright 2022 The TensorFlow Authors. All R...
0,semantic-segmentation-ml-pipeline,training_pipeline/pipeline/components/HFPusher...,"from huggingface_hub import ModelCard, ModelCa..."


### Save the resuling `DataFrame` to CSV

In [8]:
df.to_csv("chansung.csv")

# Include Ipynb content (optional)

In [None]:
a = None

for file in file_list:
  if file.name[-5:] == "ipynb":
    a = file
    break

In [None]:
import base64
from nbformat import reads, NO_CONVERT

content = a.content
content = base64.b64decode(content).decode('utf-8')

notebook = reads(content, NO_CONVERT)

cells = notebook['cells']
code_cells = [c for c in cells if c['cell_type'] == 'code']
for cell in code_cells:
    print(cell['source'])

!gcloud init
from google.colab import auth

auth.authenticate_user()
TARGET_ROOT_DIR = "cifar10"
TARGET_TRAIN_DIR = TARGET_ROOT_DIR + "/span-1/train"
TARGET_TEST_DIR = TARGET_ROOT_DIR + "/span-1/test"

!mkdir -p {TARGET_TRAIN_DIR}
!mkdir -p {TARGET_TEST_DIR}
import tensorflow_datasets as tfds

# Generate TFRecords with TFDS
builder = tfds.builder("cifar10")
builder.download_and_prepare()
!cp {builder.data_dir}/cifar10-train.tfrecord-00000-of-00001 {TARGET_TRAIN_DIR}/cifar10-train.tfrecord
!cp {builder.data_dir}/cifar10-test.tfrecord-00000-of-00001 {TARGET_TEST_DIR}/cifar10-test.tfrecord
!ls -R {TARGET_ROOT_DIR}
#@title GCS
#@markdown You should change these values as per your preferences. The copy operation can take ~5 minutes. 
BUCKET_PATH = "gs://cifar10-csp-public2" #@param {type:"string"}
REGION = "us-central1" #@param {type:"string"}

!gsutil mb -l {REGION} {BUCKET_PATH}
!gsutil -m cp -r {TARGET_ROOT_DIR}/* {BUCKET_PATH}
!gsutil ls -R {BUCKET_PATH}/
!pip install tfx==1.2.0
from tf

In [None]:
import base64
import pandas as pd
from nbformat import reads, NO_CONVERT

df = pd.DataFrame(columns=["reponame", "filepath", "content"])

from github import GithubException

def get_py_files(repo, file_list, path="."):
  contents = repo.get_contents("")
  while contents:
    file_content = contents.pop(0)
    if file_content.type == "dir":
      contents.extend(repo.get_contents(file_content.path))
    else:
      if file_content.name[-2:] == "py":
        file_list.append(file_content)
      elif file_content.name[-5:] == "ipynb":
        file_list.append(file_content)

for repo in repos:
  if repo.name in target_repos:
    file_list = []
    get_py_files(repo, file_list)

    if len(file_list) != 0:
      for file in file_list:
          if file.name.endswith("py"):
            content = file.content
            content_str = base64.b64decode(content).decode('utf-8')

            if content != '':
              df = pd.concat(
                  [
                      df, 
                      pd.DataFrame.from_dict([{
                        "reponame": repo.name,
                        "filepath": file.path,
                        "content": content_str
                      }])
                  ])
          elif file.name.endswith("ipynb"):
            content = file.content
            content_str = base64.b64decode(content).decode('utf-8')
                        
            code_cell_str = ""
            notebook = reads(content_str, NO_CONVERT)

            code_cells = [
              c for c in notebook['cells'] 
              if c['cell_type'] == 'code'
            ]
            
            for cell in code_cells:
              code_cell_str += cell['source']

            df = pd.concat(
                [
                    df, 
                    pd.DataFrame.from_dict([{
                      "reponame": repo.name,
                      "filepath": file.path,
                      "content": code_cell_str
                    }])
                ])