## Extracting Data from an API Using the Requests Module

In [1]:
import requests

# List repositories, asking explicitly for the v3 JSON media type.
response = requests.get(
    'https://api.github.com/repositories',
    headers={'Accept': 'application/vnd.github.v3+json'},
)

# HTTP status of the call (200 on success).
print(response.status_code)

200


In [2]:
# Body encoding first, then two of the response headers.
print(response.encoding)
for header_name in ('Content-Type', 'server'):
    print(response.headers[header_name])

utf-8
application/json; charset=utf-8
GitHub.com


In [3]:
import json

# Pretty-print the first record of the response and show only its first 200 characters.
first_repo_pretty = json.dumps(response.json()[0], indent=2)
print(first_repo_pretty[:200])

{
  "id": 1,
  "node_id": "MDEwOlJlcG9zaXRvcnkx",
  "name": "grit",
  "full_name": "mojombo/grit",
  "private": false,
  "owner": {
    "login": "mojombo",
    "id": 1,
    "node_id": "MDQ6VXNlcjE=",



In [4]:
# Repository search; the text-match media type is requested because the
# next cells read the 'text_matches' entry of every returned item.
response = requests.get(
    'https://api.github.com/search/repositories',
    params={'q': 'data_science+language:python'},
    headers={'Accept': 'application/vnd.github.v3.text-match+json'},
)
print(response.status_code)

200


In [5]:
from IPython.display import display, Markdown

def printmd(string):
    """Display `string` in the notebook output as rendered Markdown."""
    rendered = Markdown(string)
    display(rendered)

In [6]:
# For the first five hits, show which repository property matched the query,
# the matching fragment, and the exact text that matched.
for item in response.json()['items'][:5]:
    first_match = item['text_matches'][0]
    printmd(''.join([
        '**', item['name'], '**',
        ': repository ', first_match['property'],
        ' - "*', first_match['fragment'], '*" matched with ',
        '**', first_match['matches'][0]['text'], '**',
    ]))

**awesome**: repository description - "*Awesome resources on Bioinformatics, data science, machine learning, programming language (Python, Golang, R, Perl) and miscellaneous stuff.*" matched with **data science**

**hu-dsf**: repository description - "*Introduction course to data science using the Python programming language in the form of Jupyter Notebooks*" matched with **data science**

**Python**: repository description - "*this resporatory have ml,ai,nlp,data science etc.python language related material from many websites eg. datacamp,geeksforgeeks,linkedin,youtube,udemy etc. also it include programming challange/competion solutions*" matched with **data science**

**math-server-docker**: repository description - "*The ideal multi-user Data Science server with Jupyterhub and RStudio, ready for Python, R and Julia languages.*" matched with **Data Science**

**python**: repository description - "*A short course introducing students to the Python programming language for data science*" matched with **Python**

In [7]:
# Issue comments of the pytorch repository, fetched without any paging parameters.
response = requests.get(
    'https://api.github.com/repos/pytorch/pytorch/issues/comments'
)
print('Response Code:', response.status_code)

# Number of comment records contained in this single response.
comments_page = response.json()
print('Number of comments:', len(comments_page))

Response Code: 200
Number of comments: 30


### Pagination

In [8]:
# Bare expression so the notebook displays it: the parsed pagination links of
# the last response, keyed by relation name ('next', 'last'), each with a 'url'.
response.links

{'next': {'url': 'https://api.github.com/repositories/65600975/issues/comments?page=2',
  'rel': 'next'},
 'last': {'url': 'https://api.github.com/repositories/65600975/issues/comments?page=1000',
  'rel': 'last'}}

In [9]:
import pandas as pd

def get_all_pages(url, params=None, headers=None):
    """Fetch `url` and every following page advertised through 'next' links.

    Pages are requested in a loop instead of recursively, so a result set
    spanning a thousand or more pages cannot exhaust Python's recursion limit.

    Args:
        url: Address of the first page.
        params: Optional query parameters for the first request only.
            Follow-up requests use the 'next' URL exactly as returned, because
            it is expected to carry the query string already; sending `params`
            again would duplicate every parameter.
            NOTE(review): true for GitHub's Link headers; confirm for other APIs.
        headers: Optional HTTP headers sent with every request.

    Returns:
        The decoded JSON of the first page with the decoded JSON of each
        subsequent page appended. An empty list if the first request does not
        return status 200. Paging stops silently at the first non-200
        response, so the result may be partial.
    """
    response = requests.get(url, params=params, headers=headers)
    if response.status_code != 200:
        return []

    output_json = response.json()
    while True:
        next_url = response.links.get('next', {}).get('url')
        if next_url is None:
            break
        # Follow the link verbatim: it already encodes the original query.
        response = requests.get(next_url, headers=headers)
        if response.status_code != 200:
            break
        output_json += response.json()
    return output_json

# Collect all issue comments updated since the given timestamp, newest first.
# The query key must be 'sort' (not 'sorted'): GitHub ignores unknown keys,
# and 'direction' has no effect unless 'sort' is present.
out = get_all_pages('https://api.github.com/repos/pytorch/pytorch/issues/comments',
    params={
        'since': '2020-07-01T10:00:01Z',
        'sort': 'created',
        'direction': 'desc'
    },
    headers={'Accept': 'application/vnd.github.v3+json'})
df = pd.DataFrame(out)

# Number of comments that have a body, then one random row as a spot check.
print(df['body'].count())
df[['id', 'created_at', 'body']].sample(1)

1650


Unnamed: 0,id,created_at,body
1399,2040836983,2024-04-06T01:01:59Z,```\r\nIn [1]: import torch\r\n\r\nIn [2]: a =...


### Rate Limiting

In [10]:
# A HEAD request is enough here: only the rate-limit headers are read.
response = requests.head(
    'https://api.github.com/repos/pytorch/pytorch/issues/comments')
for header_name in ('X-RateLimit-Limit', 'X-RateLimit-Remaining'):
    print(header_name + ':', response.headers[header_name])

# Turn the reset timestamp (parsed as epoch seconds) into a readable date string.
import datetime
reset_epoch = int(response.headers['X-RateLimit-Reset'])
reset_at = datetime.datetime.fromtimestamp(reset_epoch)
print('Rate Limits reset at', reset_at.strftime('%c'))

X-RateLimit-Limit: 60
X-RateLimit-Remaining: 1
Rate Limits reset at Wed Apr 10 03:04:14 2024


In [11]:
# A list of flat dicts converts directly into a DataFrame (one row per dict).
# Named `people` rather than `json` so the imported `json` module is not
# overwritten and earlier cells that call json.dumps can still be re-run.
people = [
    {
        "name": "John",
        "age": 43
    },
    {
        "name": "Peter",
        "age": 51
    }
]

pd.DataFrame(people)

Unnamed: 0,name,age
0,John,43
1,Peter,51


In [12]:
from datetime import datetime
import time

def handle_rate_limits(response):
    """Pause so the remaining request quota is spread over the current window.

    Reads 'X-RateLimit-Reset' (parsed as epoch seconds) and
    'X-RateLimit-Remaining' from `response.headers`, divides the time left
    until the reset by the remaining requests plus one, and sleeps for that
    long.

    The wait is clamped at zero: if the reset moment has already passed,
    the difference is negative and `time.sleep` would raise ValueError.
    Epoch arithmetic via `time.time()` is used so the function does not
    depend on what the global name `datetime` is bound to.

    Args:
        response: Object exposing a `headers` mapping with both headers.

    Returns:
        Always True, so the call can be used inside a boolean condition.

    Raises:
        KeyError: If either rate-limit header is missing.
    """
    reset_epoch = int(response.headers['X-RateLimit-Reset'])
    remaining_requests = int(response.headers['X-RateLimit-Remaining'])
    # Never negative, even when the window has already been reset.
    remaining_time = max(0.0, reset_epoch - time.time())
    # +1 keeps the divisor positive when no requests are left.
    intervals = remaining_time / (1.0 + remaining_requests)
    print('Sleeping for', intervals)
    time.sleep(intervals)
    return True

In [15]:
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Retry policy: at most five attempts, only for the listed 5xx statuses,
# with a backoff factor of 1 between attempts.
retry_strategy = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[500, 503, 504],
)
retry_adapter = HTTPAdapter(max_retries=retry_strategy)

# One session whose HTTPS and HTTP traffic both go through the retrying adapter.
http = requests.Session()
for scheme in ("https://", "http://"):
    http.mount(scheme, retry_adapter)

# Same repository search as before, now issued through the retrying session.
response = http.get(
    'https://api.github.com/search/repositories',
    params={'q': 'data_science+language:python'},
)

# Name and star count of the first five hits.
top_items = response.json()['items'][:5]
for item in top_items:
    print('Name:', item['name'], 'Stars:', item['stargazers_count'])

Name: awesome Stars: 584
Name: hu-dsf Stars: 25
Name: Python Stars: 71
Name: math-server-docker Stars: 71
Name: python Stars: 10


In [16]:
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Session with automatic retries (five attempts, listed 5xx statuses,
# backoff factor 1); it is used by the paging helper defined below.
retry_strategy = Retry(total=5, backoff_factor=1, status_forcelist=[500, 503, 504])
retry_adapter = HTTPAdapter(max_retries=retry_strategy)

http = requests.Session()
http.mount("https://", retry_adapter)
http.mount("http://", retry_adapter)

def get_all_pages(url, params=None, headers=None):
    """Fetch `url` and all following pages through the retrying session.

    Before each follow-up request `handle_rate_limits` is called with the
    latest response, so requests are paced against the remaining quota.
    Pages are requested in a loop instead of recursively, so a result set
    spanning a thousand or more pages cannot exhaust Python's recursion limit.

    Args:
        url: Address of the first page.
        params: Optional query parameters for the first request only.
            Follow-up requests use the 'next' URL exactly as returned, because
            it is expected to carry the query string already; sending `params`
            again would duplicate every parameter.
            NOTE(review): true for GitHub's Link headers; confirm for other APIs.
        headers: Optional HTTP headers sent with every request.

    Returns:
        The decoded JSON of the first page with the decoded JSON of each
        subsequent page appended. An empty list if the first request does not
        return status 200. Paging stops silently at the first non-200
        response, so the result may be partial.
    """
    response = http.get(url, params=params, headers=headers)
    if response.status_code != 200:
        return []

    output_json = response.json()
    while True:
        next_url = response.links.get('next', {}).get('url')
        # Pace the next request; stop if there is no next page or the
        # rate-limit handler reports that paging should not continue.
        if next_url is None or not handle_rate_limits(response):
            break
        # Follow the link verbatim: it already encodes the original query.
        response = http.get(next_url, headers=headers)
        if response.status_code != 200:
            break
        output_json += response.json()
    return output_json

## Extracting Twitter Data with Tweepy