# Objective 1

Get a list of the last 1M repos from GitHub, load the data into a warehouse, and run an exploratory data analysis.

In [1]:
import requests, logging, json, os
import pandas as pd
from google.oauth2 import service_account

# Send every log record (DEBUG and above, including the HTTP library's own
# debug output) to a local log file.
logging.basicConfig(
    level=logging.DEBUG,
    filename='github_api_logs.log',
)

def get_repo_data(url, headers, timeout=30):
    """Fetch and decode a JSON document from the GitHub API.

    Only a single request is made: pagination is deliberately not followed,
    because GitHub rate-limits API calls and the total volume of data is not
    known in advance, so only the first page of results is used.

    Args:
        url: Endpoint to request.
        headers: HTTP headers sent with the request (e.g. ``Accept`` and
            ``Authorization``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body, or ``None`` if the request failed, the server
        answered with an error status, or the body was not valid JSON.
    """
    logging.info("Making an API call")
    try:
        # A timeout keeps the script from hanging forever on a stalled
        # connection (requests waits indefinitely by default).
        response = requests.get(url, headers=headers, timeout=timeout)

        # requests does not raise on 4xx/5xx by itself; without this call the
        # HTTPError handler below could never fire and an error payload would
        # be returned as if it were repository data.
        response.raise_for_status()

        # CONVERT JSON RESPONSE INTO PYTHON OBJECTS
        data = response.json()
    except requests.exceptions.HTTPError as err_http:
        logging.error(f"HTTP Error: {err_http}")
        return None
    except requests.exceptions.ConnectionError as err_conn:
        logging.error(f"Error Connecting: {err_conn}")
        return None
    except requests.exceptions.Timeout as err_timeout:
        logging.error(f"Timeout Error: {err_timeout}")
        return None
    except requests.exceptions.RequestException as err:
        logging.error(f"Something went wrong: {err}")
        return None
    except ValueError as err_json:
        # Older requests versions raise a plain ValueError for a non-JSON body.
        logging.error(f"Invalid JSON in response: {err_json}")
        return None

    logging.info("API call successful")
    return data

# EXTRACTING REPOSITORIES URL FROM RESPONSE AND APPENDING THEM TO THE LIST
def get_repo_url(data):
    """Collect the ``'url'`` value of every record in *data*.

    Args:
        data: Iterable of mappings, each expected to have a ``'url'`` key
            (the decoded repository listing).

    Returns:
        A list with one URL per record, in input order. An empty list is
        returned when *data* is ``None``, is not a sequence of mappings, or
        any record lacks a ``'url'`` key, so callers can always iterate the
        result safely.
    """
    logging.info("Empty List Created for URL")
    try:
        url_lst = [record['url'] for record in data]
    except (TypeError, KeyError) as err:
        # TypeError: data is None / not iterable / records are not mappings.
        # KeyError: a record has no 'url' field.
        logging.error(f"Can't get data. Error: {err}")
        return []
    logging.info("URLs appended")
    return url_lst

# REQUESTING DATA FROM REPO URLs        
def get_data(urls, headers=None, timeout=30):
    """Download the JSON document behind each URL and return them as one JSON string.

    URLs that cannot be fetched (network error, timeout, HTTP error status
    such as 403 when rate-limited, or a non-JSON body) are logged and
    skipped, so error payloads are never mixed in with repository records.

    Args:
        urls: Iterable of URLs to request.
        headers: Optional HTTP headers for every request. Pass the same
            authenticated headers used for the listing call; unauthenticated
            requests are much more likely to be rejected by GitHub.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A JSON document (``str``) encoding the list of successfully fetched
        records, in input order.
    """
    repo_data = []
    logging.info("Empty List Created for REPO Data")

    for url in urls:
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            # Turn 4xx/5xx answers into exceptions instead of storing the
            # error body as if it were a repository.
            response.raise_for_status()
            repo_data.append(response.json())
        except (requests.exceptions.RequestException, ValueError) as err:
            # Best effort: one bad URL must not abort the whole batch.
            logging.error(f"Skipping {url}: {err}")
    logging.info("Repo Data Appended")

    # CONVERTING PYTHON LIST TO JSON DOCUMENT
    return json.dumps(repo_data)

def json_to_df(repo_data):
    """Build a DataFrame holding only the repository fields of interest.

    Args:
        repo_data: Decoded JSON records (a list of dicts, one per repository).

    Returns:
        A DataFrame with exactly the columns listed in ``keys``, in that
        order. A field that is absent from every record (including the case
        of an empty input) yields a column of nulls instead of raising
        ``KeyError``.
    """
    # KEYS LIST TO EXTRACT THEIR DATA FROM JSON WHILE CONVERTING INTO DATAFRAME
    keys = ['name', 'html_url', 'created_at', 'updated_at', 'language',
            'watchers', 'forks', 'open_issues', 'subscribers_count']
    df = pd.DataFrame(repo_data)
    # reindex (rather than df[keys]) tolerates missing columns: they are
    # created and filled with NaN, and unrelated columns are dropped.
    df = df.reindex(columns=keys)
    logging.info("Converted into Pandas Dataframe")
    return df

def remove_null(df):
    """Drop the rows in which every column is null.

    Rows that have at least one non-null value are kept; columns are never
    removed. The input frame is not modified.

    Args:
        df: DataFrame of repository records.

    Returns:
        A new DataFrame without the fully-null rows (original index labels
        are preserved).
    """
    # Fully-null rows were observed when the GitHub server refused the
    # per-repository request, e.g.:
    #   "GET /repos/sr/tasks HTTP/1.1" 403 279
    # Such responses carry none of the selected fields.
    df = df.dropna(how='all')
    logging.info('Dropped rows where every column is null')
    return df

# SENDING THE DATAFRAME TO GOOGLE BIGQUERY
def df_to_gbq(df, table='github.repos', project_id='spiritual-slate-374706',
              key_path='spiritual-slate-374706-a81759778bb5.json'):
    """Append a DataFrame to a BigQuery table using a service-account key.

    Args:
        df: DataFrame to upload.
        table: Destination in ``dataset.table`` form.
        project_id: Google Cloud project that owns the destination table.
        key_path: Path to the service-account JSON key file.

    Raises:
        Exception: Whatever the credential loader raises (e.g. missing or
            malformed key file) is logged and re-raised, so the failure is
            reported at its real cause instead of surfacing later as an
            unbound ``credentials`` variable. Upload errors from ``to_gbq``
            propagate unchanged.
    """
    try:
        # SERVICE ACCOUNT KEY
        credentials = service_account.Credentials.from_service_account_file(
            key_path,
            scopes=["https://www.googleapis.com/auth/cloud-platform"],
        )
    except Exception as e:
        logging.error(f"Couldn't establish connection with BigQuery. Error: {e}")
        # Without credentials the upload below cannot work; stop here.
        raise
    logging.info('Connection Successful')

    # UPLOADING DATAFRAME TO BIGQUERY (rows are appended to an existing table)
    df.to_gbq(table, project_id=project_id, if_exists='append', credentials=credentials)

def main():
    """Run the pipeline: list repositories, fetch their details, clean, upload.

    Each stage is checked before the next one runs, so a failed API call
    ends the run with a logged error instead of an unrelated exception
    further down the pipeline.
    """
    # The token is read from the GITHUB_TOKEN environment variable so it does
    # not have to be written into the source; the placeholder is kept as a
    # fallback for anyone who prefers to edit it in place.
    token = os.environ.get("GITHUB_TOKEN", "<Token>")
    headers = {
        "Accept": "application/vnd.github+json",
        "Authorization": f"Bearer {token}",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    url = 'https://api.github.com/repositories'

    data = get_repo_data(url, headers)
    if data is None:
        logging.error("No repository listing received; stopping.")
        return

    url_list = get_repo_url(data)
    if not url_list:
        logging.error("No repository URLs found in the listing; stopping.")
        return

    repo_data = get_data(url_list)
    df = json_to_df(json.loads(repo_data))
    df = remove_null(df)
    df_to_gbq(df)


if __name__ == "__main__":
    main()