In [1]:
import re
import json
import base64
import requests

# OAuth2

In [30]:
with open('../git_secrets.json', 'r') as file:
    secrets = json.loads(file.read())
    file.close()

client_id = secrets['client_id']
client_secret = secrets['client_secret']
pat = secrets['pat']

In [32]:
auth_params = {
    'username': client_id,
    'token': pat,
    'client_id': client_id,
    'client_secret': client_secret,
    'redirect_uri': 'http://localhost:1234/path'
}

response = requests.post('https://api.github.com/login/oauth/authorize', headers=auth_params)
print(response.content.decode())

{"message":"Not Found","documentation_url":"https://developer.github.com/v3"}


In [33]:
curl_string = f'{client_id}:{client_secret}'

In [34]:
curl_string

'120fd2f1da4a2f1d5351:5e876ff7d1f1561b94b0d57adc005c4667a10556'

In [2]:
response = requests.get('https://api.github.com/')
content = response.content.decode()

In [3]:
json.loads(content)

{'message': "API rate limit exceeded for 98.224.131.229. (But here's the good news: Authenticated requests get a higher rate limit. Check out the documentation for more details.)",
 'documentation_url': 'https://developer.github.com/v3/#rate-limiting'}

# New Goal

The new goal is to build a function that given an owner and a repository, returns all of the python files within 2 subdirectories of that repository

`github v3 api: /repos/:owner/:repo/contents`

In [28]:
response = requests.get('https://api.github.com/repos/zbloss/TransformerModel/contents')
content = json.loads(response.content.decode())
print(type(content))

<class 'list'>


In [134]:
def get_python_files(owner: str, repo: str):
    """
    Given a Github owner and repository, constructs a valid Github V3 api call and 
    returns a list of file objects in the provided directory.
    
    :param owner: (str) The username associated with the repository.
    :param repo: (str) The repository to pull from Github.
    :returns: list of dictionaries containing urls to the python files
    """
    dirs = []
    files = []
    python_files_to_ignore = ['__init__.py', 'setup.py']
    
    try:
        response = requests.get(f'https://api.github.com/repos/{owner}/{repo}/contents')
        assert response.status_code == 200
        
    except Exception as e:
        print(f'Requests Exception: {e}')
    
    content = response.content
    try:
        content = json.loads(content)
    except Exception as e:
        print(f'JSON Loads Exception: {e}')
        
    for item in content:
        # adding python files to the list of files
        if item['type'] == 'file' and \
           item['path'][-3:] == '.py' and \
           item['path'] not in python_files_to_ignore:
            item.pop('name')
            files.append(item)
            
        # searching the directory trees for files 
        # and appending them to files.
        elif item['type'] == 'dir':
            
            if 'git_url' in list(item.keys()):
                tree = item['git_url']
                response  = requests.get(tree)
                content = json.loads(response.content.decode())

                if type(content['tree']) == type([]):

                    for file in content['tree']:
                        # This will check the last 3 characters of the filename to see if they are '.py'
                        # indicating a python file.
                        if file['path'][-3:] == '.py' and file['path'] not in python_files_to_ignore:
                            files.append(file)
                                                 
    return files

def get_python_data(py_files: list):
    """
    Given a list of valid Github V3 api called python files, returns a dictionary
    containing the filename as the key and code as the value.
    
    :param py_files: (list) list of dictionaries containing a `url` key.
    :returns: dictionary containing filename and code.
    """
    py_data = {}
    
    for item in py_files:
        if 'url' in list(item.keys()):
            
            try:
                response = requests.get(item['url'])
                content = response.content.decode()
            except Exception as e:
                print(f'Requests Exception: {e}')
                pass
            
            # Grabbing the Base64 encoded python file
            try:
                content = json.loads(content)
                content = content['content']
            except Exception as e:
                print(f'JSON Loads Exception: {e}')
                pass
            
            py = base64.b64decode(content)
            
            py_data[item['path']] = py.decode()
            
    return py_data

def python_code_cleaner(code: str):
    """
    Given the python code as a string, pads around the '\n' characters,
    replaces '    ' with <TAB>, and replaces \"\"\" with [DOCSTRING].
    
    :param code: (str) python code as a string.
    :returns: python code as a string
    """
    
    pad_n = re.sub('\n', ' \n ', code)
    pad_tab = re.sub(' '*4, '[TAB]', pad_n)
    pad_docstring = re.sub('\"\"\"', '[DOCSTRING]', pad_tab)
    
    return pad_docstring

def docstring_extractor(code: str):
    """
    Given cleaned python function as a string, extracts the docstring and returns a tuple
    containing the docstring and the function with the docstring removed.
    
    :param code: (str) python function as a string.
    :returns: (tuple) of (docstring, python function)
    """
    
    

In [135]:
transformer_files = get_python_files(owner='zbloss', repo='TransformerModel')
print(f'Retreived {len(transformer_files)} Python Files')

Requests Exception: 


TypeError: string indices must be integers

In [133]:
python_files.keys()

dict_keys(['example.py', 'attention.py', 'custom_scheduler.py', 'data_process.py', 'decoder.py', 'decoder_layer.py', 'encoder.py', 'encoder_layer.py', 'evaluate.py', 'masker.py', 'positional_encoder.py', 'training.py', 'transformer.py', 'transformer_xl.py'])

In [130]:
python_files = get_python_data(transformer_files)
print(f'Retrieved code from {len(python_files)} files')

JSON Loads Exception: 'content'


TypeError: argument should be a bytes-like object or ASCII string, not 'dict'

In [None]:
python_files.keys()

In [124]:
clean_code = python_code_cleaner(python_files['attention.py'])
print(clean_code)

import tensorflow as tf 
  
  
 class MultiHeadAttention(tf.keras.layers.Layer): 
  
[TAB] def __init__(self, d_model, num_heads): 
[TAB][TAB] """ 
[TAB][TAB] :param d_model: The Dimensionality of the Attention mechanism 
[TAB][TAB] :param num_heads: The number of heads to use in the attention mechanism 
[TAB][TAB] """ 
[TAB][TAB] super(MultiHeadAttention, self).__init__() 
[TAB][TAB] self.num_heads = num_heads 
[TAB][TAB] self.d_model = d_model 
  
[TAB][TAB] assert d_model % self.num_heads == 0 
  
[TAB][TAB] self.depth = d_model // self.num_heads 
  
[TAB][TAB] self.wq = tf.keras.layers.Dense(d_model) 
[TAB][TAB] self.wk = tf.keras.layers.Dense(d_model) 
[TAB][TAB] self.wv = tf.keras.layers.Dense(d_model) 
  
[TAB][TAB] self.dense = tf.keras.layers.Dense(d_model) 
  
[TAB] @staticmethod 
[TAB] def scaled_dot_product_attention(q, k, v, mask): 
[TAB][TAB] """ 
[TAB][TAB] :param q: The query vector 
[TAB][TAB] :param k: The key vector 
[TAB][TAB] :param v: The value vector 
[TAB][TAB] 

In [127]:
print(re.sub('\"\"\"', '[DOCSTRING]', clean_code))

import tensorflow as tf 
  
  
 class MultiHeadAttention(tf.keras.layers.Layer): 
  
[TAB] def __init__(self, d_model, num_heads): 
[TAB][TAB] [DOCSTRING] 
[TAB][TAB] :param d_model: The Dimensionality of the Attention mechanism 
[TAB][TAB] :param num_heads: The number of heads to use in the attention mechanism 
[TAB][TAB] [DOCSTRING] 
[TAB][TAB] super(MultiHeadAttention, self).__init__() 
[TAB][TAB] self.num_heads = num_heads 
[TAB][TAB] self.d_model = d_model 
  
[TAB][TAB] assert d_model % self.num_heads == 0 
  
[TAB][TAB] self.depth = d_model // self.num_heads 
  
[TAB][TAB] self.wq = tf.keras.layers.Dense(d_model) 
[TAB][TAB] self.wk = tf.keras.layers.Dense(d_model) 
[TAB][TAB] self.wv = tf.keras.layers.Dense(d_model) 
  
[TAB][TAB] self.dense = tf.keras.layers.Dense(d_model) 
  
[TAB] @staticmethod 
[TAB] def scaled_dot_product_attention(q, k, v, mask): 
[TAB][TAB] [DOCSTRING] 
[TAB][TAB] :param q: The query vector 
[TAB][TAB] :param k: The key vector 
[TAB][TAB] :param v: The v