In [None]:
import numpy as np
import pandas as pd
import json
import requests
from bs4 import BeautifulSoup

# `json_normalize` has been available at the top level of pandas since 1.0;
# the `pandas.io.json` location is deprecated and is not importable on newer
# releases. Try the supported location first and fall back for old installs,
# so the name `json_normalize` is bound either way.
try:
    from pandas import json_normalize
except ImportError:  # pandas < 1.0
    from pandas.io.json import json_normalize

# Reading data from csv files

In [None]:
# Read 'file.csv' with read_csv's default options and display the resulting frame
df = pd.read_csv('file.csv')
df

In [None]:
# header=None: do not take column names from the file; columns are numbered instead
pd.read_csv('file.csv', header=None)

In [None]:
# names=: supply the four column labels explicitly
pd.read_csv('file.csv',names=['column 1','column 2','column 3','column 4'])

In [None]:
# index_col=0: use the first column of the file as the row index
pd.read_csv('file.csv', index_col=0)

In [None]:
# Per-column dtypes of `df`
df.dtypes

In [None]:
# Request float64 for column 'b' while parsing, then show the resulting dtypes
df2 = pd.read_csv('file.csv',  dtype = { 'b' : np.float64})
df2.dtypes

In [None]:
# usecols=: load only columns 'a' and 'b'
pd.read_csv("file.csv", usecols=['a', 'b'])

# Reading data from Excel files

In [None]:
# Read 'data.xls' with read_excel's default options
pd.read_excel('data.xls')

In [None]:
# sheet_name=: pick the worksheet to read by its name
pd.read_excel('data.xls', sheet_name='Sheet2')

In [None]:
# usecols=[0,1]: keep only the first two columns, selected by position
pd.read_excel('data.xls',usecols=[0,1])

In [None]:
# Read 'frame.json' into a DataFrame
pd.read_json('frame.json')

In [None]:
# Same call on 'books.json'; the following cells load this file with the
# json module and json_normalize instead
pd.read_json('books.json')

In [None]:
# Read the raw JSON text and parse it into a Python object.
# The context manager closes the file handle as soon as the text has been
# read; a bare open() would keep it open for the lifetime of the kernel.
with open('books.json', 'r') as file:
    json_string = file.read()
dictionary = json.loads(json_string)

In [None]:
# Flatten the records found under the 'books' key into a DataFrame.
# Uses the public top-level `pd.json_normalize` (pandas >= 1.0) instead of the
# deprecated `pandas.io.json.json_normalize` alias.
pd.json_normalize(dictionary, record_path='books')

# HTML files

In [None]:
# Download an archived snapshot of the quotes page.
# requests waits indefinitely by default; the timeout makes the cell raise
# instead of hanging the kernel when the archive does not respond.
page = requests.get(
    'https://web.archive.org/web/20180908144902/http://en.proverbia.net/shortfamousquotes.asp',
    timeout=30,
)

In [None]:
# Peek at the first 100 characters of the response body
page.text[:100]

In [None]:
# HTTP status code of the response
page.status_code

In [None]:
# Parse the response body using the 'html.parser' backend
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
# Every <blockquote> element in the parsed document
quotes = soup.find_all('blockquote')

In [None]:
# Display the matched elements
quotes

In [None]:
# Text content of the first matched element
quotes[0].text

In [None]:
# Collect the text of every matched element, in document order
quote_list = [element.text for element in quotes]


In [None]:
# One-column table of the scraped quotes (this rebinds the name `df`)
df = pd.DataFrame({'Quote': quote_list})
df

In [None]:
# Every <p> element whose CSS class is "a"
authors=soup.find_all('p', class_="a")

In [None]:
# Text content of the first matched element
authors[0].text

In [None]:
# Same text with its first and last character dropped
authors[0].text[1:-1]

In [None]:
# Drop the first and last character of each matched element's text and
# attach the result to the quotes table as a new column
author_list = [element.text[1:-1] for element in authors]
df['Author'] = author_list
df

# Getting data from the web using APIs

In [None]:
import json

# Load credentials
with open('client-credentials.json') as credentials_file:
    client_credentials = json.loads(credentials_file.read())

# Show only the key names, never the secret values
print('Credentials:', list(client_credentials)) # ['client_id', 'client_secret']

In [None]:
# print(client_credentials['client_id']) # Client ID

Reaching the API using Python Requests
---

In [None]:
from urllib.parse import urlencode

# Build the link that users can copy/paste in their browser to authorize our app
oauth_params = dict(
    client_id=client_credentials['client_id'],
    scope='read_all,profile:read_all,activity:read_all',
    redirect_uri='https://localhost',
    response_type='code',
)
print(f"https://www.strava.com/oauth/authorize?{urlencode(oauth_params)}")

In [None]:
from getpass import getpass

# After authorizing the app, the user is redirected to the redirect URI.
# The full redirected URL is pasted here; getpass is used so the pasted value
# is not echoed into the notebook output.
authorization_response = getpass(prompt='Full callback URL')

In [None]:
from urllib.parse import urlparse, parse_qs

# Extract Authorization Code from URL: take the query string of the callback
# URL, parse it, and keep the first value of its 'code' parameter
callback_query = urlparse(authorization_response).query
authorization_code = parse_qs(callback_query)['code'][0]

In [None]:
# urlparse(authorization_response).query # state=&code=...&scope=...

In [None]:
# parse_qs(urlparse(authorization_response).query)

In [None]:
# print(authorization_code)

In [None]:
import requests

# Exchange Authorization Code for Access Token
# timeout: raise instead of blocking the kernel indefinitely if the server
# does not answer (requests has no default timeout)
r = requests.post('https://www.strava.com/oauth/token', data={
    'client_id': client_credentials['client_id'],
    'client_secret': client_credentials['client_secret'],
    'code': authorization_code,
    'grant_type': 'authorization_code'
}, timeout=30)
r.status_code

In [None]:
# print(r.text) # ex. {"token_type":"Bearer","expires_at": ... }

In [None]:
# r.json()

In [None]:
# Token saver
def token_saver(token_obj):
    """Serialize `token_obj` as JSON (4-space indent) into 'token.json'.

    The file is opened in write mode, so any previous content is replaced.
    """
    with open('token.json', 'w') as token_file:
        token_file.write(json.dumps(token_obj, indent=4))

# Persist the token only if the exchange succeeded. Without this check a
# rejected request would write the error payload to token.json and the cells
# that read 'expires_at' / 'refresh_token' would fail later with a KeyError.
r.raise_for_status()
token_saver(r.json())

In [None]:
# Token loader
def get_token():
    """Return the parsed JSON content of 'token.json'."""
    with open('token.json', 'r') as token_file:
        return json.loads(token_file.read())

# Read the token back from token.json and list its fields
token = get_token()
token.keys() # 'token_type', 'expires_at', 'expires_in', 'refresh_token', 'access_token', 'athlete'

In [None]:
# Raw expiry fields as stored in the token
print(f"Expires in: {token['expires_in']}") # initially: 21600 (6 hours)
print(f"Expires at: {token['expires_at']}") # in seconds

In [None]:
from datetime import datetime, timedelta

# Same two fields, converted to human-readable values
expiry_moment = datetime.fromtimestamp(token['expires_at'])
print('Expires at:', expiry_moment) # date, time
validity_span = timedelta(seconds=token['expires_in'])
print('Expires in:', validity_span) # time delta

In [None]:
# Refresh expired Access Tokens
# timeout: raise instead of blocking the kernel if the server does not answer
r = requests.post('https://www.strava.com/oauth/token', data={
    'client_id': client_credentials['client_id'],
    'client_secret': client_credentials['client_secret'],
    'refresh_token': token['refresh_token'],
    'grant_type': 'refresh_token'
}, timeout=30)
# Only overwrite token.json when the refresh succeeded; saving an error
# payload would discard the stored refresh token and break every later cell.
r.raise_for_status()
token_saver(r.json())
token = get_token()

In [None]:
# List activities
# The token is sent in the Authorization header rather than as a query
# parameter, so it does not become part of the request URL (r.url, logs,
# proxies). timeout: raise instead of hanging if the server does not answer.
r = requests.get(
    'https://www.strava.com/api/v3/athlete/activities',
    headers={'Authorization': 'Bearer ' + token['access_token']},
    timeout=30,
)
r.status_code

In [None]:
# Save activities
with open('activities.json', 'w') as activities_file:
    activities_file.write(json.dumps(r.json(), indent=4))

In [None]:
from io import StringIO

# Load data into DataFrame
# read_json expects a path or file-like object; passing the JSON text itself
# is deprecated in recent pandas releases, so wrap it in StringIO.
activities_df = pd.read_json(StringIO(r.text))
activities_df[['name', 'type', 'distance', 'elapsed_time', 'max_speed']]

With requests_oauthlib
---

In [None]:
from requests_oauthlib import OAuth2Session

# Create a session for initialization
# (client id, redirect URI and scope string match the plain-requests section)
init_session = OAuth2Session(
    client_credentials['client_id'],
    redirect_uri='https://localhost',
    scope='read_all,profile:read_all,activity:read_all'
)

# Get authorization link
# authorization_url returns a pair; `state` is not referenced again in the visible cells
user_link, state = init_session.authorization_url('https://www.strava.com/oauth/authorize')
print('Visit link:', user_link)
# Paste the full redirected URL; it is passed to fetch_token in the next cell
authorization_response = getpass(prompt='Full callback URL')

In [None]:
# Get Access Token
# The whole callback URL is handed to the library via authorization_response=;
# include_client_id=True is passed together with the client secret.
token = init_session.fetch_token(
    'https://www.strava.com/oauth/token',
    authorization_response=authorization_response,
    include_client_id=True,
    client_secret=client_credentials['client_secret']
)
# Persist the new token to token.json
token_saver(token)

In [None]:
# Create a session for reaching the API
# A second session object, seeded with the token obtained above
api_session = OAuth2Session(
    client_credentials['client_id'],
    token=token, # pass Access Token
    
    # Automatically refresh expired token
    auto_refresh_url='https://www.strava.com/oauth/token',
    # Extra fields supplied for the refresh request
    auto_refresh_kwargs={
        'client_id': client_credentials['client_id'],
        'client_secret': client_credentials['client_secret']
    },
    token_updater=token_saver # automatically saves new tokens
)

In [None]:
# List activities
# No token argument here: the request goes through the session created with token=
r = api_session.get('https://www.strava.com/api/v3/athlete/activities')
r.status_code

In [None]:
from io import StringIO

# Load the response into a DataFrame and show a few columns.
# read_json expects a path or file-like object; passing the JSON text itself
# is deprecated in recent pandas releases, so wrap it in StringIO.
activities_df = pd.read_json(StringIO(r.text))
activities_df[['name', 'type', 'distance', 'elapsed_time', 'max_speed']]

With custom libraries - stravalib
---

In [None]:
from stravalib import Client

# Create client
client = Client()

# Get Authorization URL
# Here the scopes are passed as a list instead of one comma-separated string
user_link = client.authorization_url(
    client_id=client_credentials['client_id'],
    redirect_uri='https://localhost',
    scope=['read_all', 'profile:read_all', 'activity:read_all']
)
print('Visit link:', user_link)
# Paste the full redirected URL, then pull the first 'code' value out of its query string
authorization_response = getpass(prompt='Full callback URL')
authorization_code = parse_qs(urlparse(authorization_response).query)['code'][0]

In [None]:
# Get access token
# The returned value is saved with token_saver and indexed by 'expires_at' /
# 'refresh_token' in the next cell
token = client.exchange_code_for_token(
    client_id=client_credentials['client_id'],
    client_secret=client_credentials['client_secret'],
    code=authorization_code)
token_saver(token)

In [None]:
import time

# Refresh token if necessary
# Compares the current epoch time (seconds) with the token's 'expires_at' value
if time.time() > token['expires_at']:
    token = client.refresh_access_token(
        client_id=client_credentials['client_id'],
        client_secret=client_credentials['client_secret'],
        refresh_token=token['refresh_token'])
    # Persist the refreshed token to token.json
    token_saver(token)

In [None]:
# Get activities
# limit=5 caps the number of activities requested
activities = client.get_activities(limit=5)
activities

In [None]:
# Print each activity object as returned by the client
for activity in activities:
    print(activity)

In [None]:
# NOTE(review): `activities` was already iterated by the loop in the previous
# cell; if it is a one-shot iterator, list(activities) may be empty here and
# [0] would raise IndexError — confirm against the stravalib version in use.
a = list(activities)[0] # Get the first activity

# Print a few attributes of that activity
print('Activity name:', a.name)
print('Distance:', a.distance)
print('Athlete name:', a.athlete.firstname)
print('Average heart rate:', a.average_heartrate)