# Objective 1

Get a list of the last 1M repos from Github, load data into a warehouse, and run an exploratory data analysis

In [1]:
import requests, logging, json, os
import pandas as pd
from google.oauth2 import service_account

# CONFIGURING LOGGING
logging.basicConfig(filename= 'github_api_logs.log', level= logging.DEBUG)

def get_repo_data(url, headers):
    try:
        # MAKING A GET REQUEST TO THE API
        response= requests.get(url, headers= headers)
        logging.info("Making an API call")
        
        # WE CAN USE PAGINATION TO GET MORE REPO DATA. I DIDN't USE PAGINATION HERE SINCE GITHUB PLACES A LIMIT ON THE NUMBER OF API CALLS YOU CAN MAKE, 
        # AND I DIDN'T KNOW HOW MUCH DATA THERE WOULD BE, SO I RESTRICTED THE RESULTS TO ONLY 100 REPOSITORIES.

        # CONVERT JSON RESPONSE INTO PYTHON DICTIONARY
        data= response.json()
        logging.info(f"API call successful")
        return data
    
    # IF ERROR
    except requests.exceptions.HTTPError as err_http:
        logging.error(f"HTTP Error: {err_http}")
        return None
    except requests.exceptions.ConnectionError as err_conn:
        logging.error(f"Error Connecting: {err_conn}")
        return None
    except requests.exceptions.Timeout as err_timeout:
        logging.error(f"Timeout Error: {err_timeout}")
        return None
    except requests.exceptions.RequestException as err:
        logging.error(f"Something went wrong: {err}")
        return None

# EXTRACTING REPOSITORIES URL FROM RESPONSE AND APPENDING THEM TO THE LIST
def get_repo_url(data):
    url_lst= []
    logging.info("Empty List Created for URL")
    try:
        for num in range(len(data)):
            url_lst.append(data[num]['url'])
        logging.info("URLs appended")
        return url_lst
    except Exception as err:
        logging.error(f"Can't get data. Error: {err}")

# REQUESTING DATA FROM REPO URLs        
def get_data(urls):
    repo_data= []
    logging.info("Empty List Created for REPO Data")
    
    for url in urls:
        response= requests.get(url)
        data = response.json()
        repo_data.append(data)
    logging.info("Repo Data Appended")
    # CONVERTING PYTHON LIST TO JSON DOCUMENT
    repo_data= json.dumps(repo_data)
    return repo_data

def json_to_df(repo_data):
    
    # KEYS LIST TO EXTRACT THEIR DATA FROM JSON WHILE CONVERTING INTO DATAFRAME
    keys = ['name', 'html_url', 'created_at', 'updated_at', 'language', 'watchers', 'forks', 'open_issues', 'subscribers_count']
    df = pd.DataFrame.from_dict(repo_data)
    df = df[keys]
    logging.info("Converted into Pandas Dataframe")
    return df

def remove_null(df):
    # REMOVING ALL RECORDS WITH NULL VALUES. HALF OF THE COLUMNS CONTAIN NULL VALUES AND IT IS BECAUSE GIT SERVER FORBIDDEN CALLS FROM MY API: 
    # DEBUG:urllib3.connectionpool:https://api.github.com:443 "GET /repos/sr/tasks HTTP/1.1" 403 279 DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): api.github.com:443
    df= df.dropna(how= 'all')
    logging.info('Dropping all columns')
    return df

# SENDING JSON DATA TO GOOGLE BIGQUERY
def df_to_gbq(df):
    try:
        # SERVICE ACCOUNT KEY 
        credentials = service_account.Credentials.from_service_account_file(
            'spiritual-slate-374706-a81759778bb5.json',
            scopes=["https://www.googleapis.com/auth/cloud-platform"],
        )
        logging.info('Connection Successful')

    except Exception as e:
        logging.error(f"Couldn't establish connection with BigQuery. Error: {e}")
    # UPLOADING DATAFRAME TO BIGQUERY
    df.to_gbq('marketlytics.github', project_id='spiritual-slate-374706', if_exists='append', credentials=credentials)

In [2]:
headers = {
    "Accept" : "application/vnd.github+json",
    "Authorization" : "Bearer <Token>",
    "X-GitHub-Api-Version" : "2022-11-28"
}

url= 'https://api.github.com/repositories'

In [3]:
data= get_repo_data(url, headers)

In [4]:
data[0]

{'id': 1,
 'node_id': 'MDEwOlJlcG9zaXRvcnkx',
 'name': 'grit',
 'full_name': 'mojombo/grit',
 'private': False,
 'owner': {'login': 'mojombo',
  'id': 1,
  'node_id': 'MDQ6VXNlcjE=',
  'avatar_url': 'https://avatars.githubusercontent.com/u/1?v=4',
  'gravatar_id': '',
  'url': 'https://api.github.com/users/mojombo',
  'html_url': 'https://github.com/mojombo',
  'followers_url': 'https://api.github.com/users/mojombo/followers',
  'following_url': 'https://api.github.com/users/mojombo/following{/other_user}',
  'gists_url': 'https://api.github.com/users/mojombo/gists{/gist_id}',
  'starred_url': 'https://api.github.com/users/mojombo/starred{/owner}{/repo}',
  'subscriptions_url': 'https://api.github.com/users/mojombo/subscriptions',
  'organizations_url': 'https://api.github.com/users/mojombo/orgs',
  'repos_url': 'https://api.github.com/users/mojombo/repos',
  'events_url': 'https://api.github.com/users/mojombo/events{/privacy}',
  'received_events_url': 'https://api.github.com/users/mo

In [5]:
url_list= get_repo_url(data)

In [6]:
url_list[:5]

['https://api.github.com/repos/mojombo/grit',
 'https://api.github.com/repos/wycats/merb-core',
 'https://api.github.com/repos/rubinius/rubinius',
 'https://api.github.com/repos/mojombo/god',
 'https://api.github.com/repos/vanpelt/jsawesome']

In [7]:
repo_data= get_data(url_list)

In [8]:
df= json_to_df(json.loads(repo_data))

In [9]:
df

Unnamed: 0,name,html_url,created_at,updated_at,language,watchers,forks,open_issues,subscribers_count
0,grit,https://github.com/mojombo/grit,2007-10-29T14:37:16Z,2023-01-14T23:54:00Z,Ruby,1963.0,540.0,30.0,71.0
1,merb-core,https://github.com/wycats/merb-core,2008-01-12T05:50:53Z,2023-01-09T07:03:49Z,Ruby,436.0,61.0,13.0,4.0
2,rubinius,https://github.com/rubinius/rubinius,2008-01-12T16:46:52Z,2023-01-13T01:27:20Z,C,3032.0,605.0,15.0,101.0
3,god,https://github.com/mojombo/god,2008-01-13T05:16:23Z,2023-01-06T23:15:58Z,Ruby,2202.0,515.0,248.0,91.0
4,jsawesome,https://github.com/vanpelt/jsawesome,2008-01-13T06:04:19Z,2022-12-12T07:43:26Z,JavaScript,68.0,7.0,1.0,3.0
...,...,...,...,...,...,...,...,...,...
95,,,,,,,,,
96,,,,,,,,,
97,,,,,,,,,
98,,,,,,,,,


In [10]:
df= remove_null(df)

In [11]:
df

Unnamed: 0,name,html_url,created_at,updated_at,language,watchers,forks,open_issues,subscribers_count
0,grit,https://github.com/mojombo/grit,2007-10-29T14:37:16Z,2023-01-14T23:54:00Z,Ruby,1963.0,540.0,30.0,71.0
1,merb-core,https://github.com/wycats/merb-core,2008-01-12T05:50:53Z,2023-01-09T07:03:49Z,Ruby,436.0,61.0,13.0,4.0
2,rubinius,https://github.com/rubinius/rubinius,2008-01-12T16:46:52Z,2023-01-13T01:27:20Z,C,3032.0,605.0,15.0,101.0
3,god,https://github.com/mojombo/god,2008-01-13T05:16:23Z,2023-01-06T23:15:58Z,Ruby,2202.0,515.0,248.0,91.0
4,jsawesome,https://github.com/vanpelt/jsawesome,2008-01-13T06:04:19Z,2022-12-12T07:43:26Z,JavaScript,68.0,7.0,1.0,3.0
5,jspec,https://github.com/wycats/jspec,2008-01-13T15:50:31Z,2022-12-23T09:32:22Z,JavaScript,83.0,13.0,2.0,4.0
6,exception_logger,https://github.com/defunkt/exception_logger,2008-01-14T03:32:19Z,2022-09-10T09:13:19Z,Ruby,242.0,92.0,2.0,2.0
7,ambition,https://github.com/defunkt/ambition,2008-01-14T06:28:56Z,2022-12-12T07:44:06Z,Ruby,163.0,25.0,1.0,4.0
8,restful-authentication,https://github.com/technoweenie/restful-authen...,2008-01-14T14:44:23Z,2022-12-12T07:44:07Z,Ruby,1575.0,279.0,27.0,16.0
9,attachment_fu,https://github.com/technoweenie/attachment_fu,2008-01-14T14:51:56Z,2022-11-08T14:16:49Z,Ruby,1023.0,325.0,35.0,9.0


In [12]:
df_to_gbq(df)

100%|██████████| 1/1 [00:00<00:00, 77.06it/s]
