In [1]:
import json
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

# Reading data from CSV files

In [2]:
# Load file.csv with no extra options and display the resulting frame.
df = pd.read_csv('file.csv')
df

Unnamed: 0,a,b,c,d
0,yellow,10,2,3.2
1,green,2,3,8.1
2,blue,7,1,0.4


In [3]:
# header=None: the file's first line is kept as a data row and the columns
# get integer labels (compare the output below).
pd.read_csv('file.csv', header=None)

Unnamed: 0,0,1,2,3
0,a,b,c,d
1,yellow,10,2,3.2
2,green,2,3,8.1
3,blue,7,1,0.4


In [4]:
# names= supplies explicit column labels; as the output shows, the file's own
# header line (a, b, c, d) then appears as the first data row.
pd.read_csv('file.csv',names=['column 1','column 2','column 3','column 4'])

Unnamed: 0,column 1,column 2,column 3,column 4
0,a,b,c,d
1,yellow,10,2,3.2
2,green,2,3,8.1
3,blue,7,1,0.4


In [5]:
# index_col=0: use the first column ('a') as the row index.
pd.read_csv('file.csv', index_col=0)

Unnamed: 0_level_0,b,c,d
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
yellow,10,2,3.2
green,2,3,8.1
blue,7,1,0.4


In [6]:
df.dtypes

a     object
b      int64
c      int64
d    float64
dtype: object

In [7]:
# Force column 'b' to float64 while parsing, then inspect the column dtypes.
df2 = pd.read_csv('file.csv', dtype={'b': np.float64})
df2.dtypes

a     object
b    float64
c      int64
d    float64
dtype: object

In [8]:
# usecols keeps only the listed columns.
pd.read_csv("file.csv", usecols=['a', 'b'])

Unnamed: 0,a,b
0,yellow,10
1,green,2
2,blue,7


# Reading data from Excel files

In [9]:
# Read data.xls without naming a worksheet.
pd.read_excel('data.xls')

Unnamed: 0,varA,varB,varC
0,0.391723,-0.155122,0.381104
1,0.575125,-0.105817,0.232245
2,0.672305,0.424688,-0.694795
3,0.766115,-0.79135,-0.028739
4,0.677259,-0.817543,-0.537088
5,-0.029702,-0.891848,-0.682719
6,-0.161366,-0.6596,-0.727898
7,0.031672,0.016607,-0.940479
8,0.833212,-0.503236,-0.88721
9,0.907753,0.265177,-0.390762


In [10]:
# Select a specific worksheet by its name.
pd.read_excel('data.xls', sheet_name='Sheet2')

Unnamed: 0,varD,varE,varF
0,0.907753,0.265177,-0.390762
1,0.755019,-0.768056,-0.528307
2,0.850692,-0.537159,-0.601387
3,0.131663,0.941327,0.240073
4,0.5744,0.091735,-0.395277
5,0.81663,0.875612,-0.880044
6,0.536732,0.175428,-0.473053
7,-0.084641,-0.042827,0.053344
8,0.268271,-0.010628,-0.090952
9,0.166792,-0.872579,-0.556899


In [11]:
# usecols with integer positions keeps the first two columns
# (varA and varB in the output below).
pd.read_excel('data.xls',usecols=[0,1])

Unnamed: 0,varA,varB
0,0.391723,-0.155122
1,0.575125,-0.105817
2,0.672305,0.424688
3,0.766115,-0.79135
4,0.677259,-0.817543
5,-0.029702,-0.891848
6,-0.161366,-0.6596
7,0.031672,0.016607
8,0.833212,-0.503236
9,0.907753,0.265177


In [12]:
# Parse frame.json directly into a DataFrame.
pd.read_json('frame.json')

Unnamed: 0,col1,col2,col3,col4
row1,0,1,2,3
row2,4,5,6,7
row3,8,9,10,11
row4,12,13,14,15


In [13]:
# books.json keeps its records under a top-level 'books' key, so read_json
# produces a single 'books' column whose cells are dicts (see the output).
pd.read_json('books.json')

Unnamed: 0,books
0,"{'isbn': '9781593275846', 'title': 'Eloquent J..."
1,"{'isbn': '9781449331818', 'title': 'Learning J..."
2,"{'isbn': '9781449365035', 'title': 'Speaking J..."


In [14]:
# Parse books.json into plain Python objects. The context manager closes the
# file handle, which the bare open() call never did.
with open('books.json', 'r') as file:
    json_string = file.read()
dictionary = json.loads(json_string)

In [15]:
# Flatten the records stored under the top-level 'books' key into columns.
# pd.json_normalize is the public entry point since pandas 1.0; the
# pandas.io.json import path is deprecated.
pd.json_normalize(dictionary, 'books')

Unnamed: 0,author,description,isbn,pages,published,publisher,subtitle,title,website
0,Marijn Haverbeke,JavaScript lies at the heart of almost every m...,9781593275846,472,2014-12-14T00:00:00.000Z,No Starch Press,A Modern Introduction to Programming,"Eloquent JavaScript, Second Edition",http://eloquentjavascript.net/
1,Addy Osmani,"With Learning JavaScript Design Patterns, you'...",9781449331818,254,2012-07-01T00:00:00.000Z,O'Reilly Media,A JavaScript and jQuery Developer's Guide,Learning JavaScript Design Patterns,http://www.addyosmani.com/resources/essentialj...
2,Axel Rauschmayer,"Like it or not, JavaScript is everywhere these...",9781449365035,460,2014-02-01T00:00:00.000Z,O'Reilly Media,An In-Depth Guide for Programmers,Speaking JavaScript,http://speakingjs.com/


# HTML files

In [16]:
# Download an archived snapshot of the quotes page. The timeout keeps the
# cell from hanging indefinitely if the archive does not respond.
page = requests.get(
    'https://web.archive.org/web/20180908144902/http://en.proverbia.net/shortfamousquotes.asp',
    timeout=30,
)

In [17]:
page.text[0:100]

'\n<!DOCTYPE html>\n\n<html lang="en" xml:lang="en">\n<head><script src="//archive.org/includes/analytics'

In [18]:
page.status_code

200

In [19]:
# Parse the downloaded HTML using the 'html.parser' backend.
soup = BeautifulSoup(page.text, 'html.parser')

In [20]:
# Collect every <blockquote> element found in the page.
quotes = soup.find_all('blockquote')

In [21]:
quotes

[<blockquote>There is a natural aristocracy among men. The grounds of this are virtue and talents. </blockquote>,
 <blockquote>All our words from loose using have lost their edge. </blockquote>,
 <blockquote>God couldn't be everywhere, so he created mothers </blockquote>,
 <blockquote>Be not afraid of going slowly, be afraid only of standing still. </blockquote>,
 <blockquote>Learn from yesterday, live for today, hope for tomorrow. </blockquote>,
 <blockquote>Do not confine your children to your own learning, for they were born in another time. </blockquote>,
 <blockquote>I hear and I forget, I see and I remember. I do and I understand. </blockquote>,
 <blockquote>In teaching others we teach ourselves. </blockquote>,
 <blockquote>Happiness will never come to those who fail to appreciate what they already have. </blockquote>,
 <blockquote>Without His love I can do nothing, with His love there is nothing I cannot do. </blockquote>]

In [22]:
quotes[0].text

'There is a natural aristocracy among men. The grounds of this are virtue and talents. '

In [23]:
# Text content of each <blockquote> tag, in page order. A comprehension avoids
# leaking the loop temporaries into the notebook namespace.
quote_list = [quote.text for quote in quotes]


In [24]:
# Build a one-column frame of quotes and display it.
# NOTE(review): this rebinds `df`, which earlier held the CSV data.
df = pd.DataFrame({'Quote': quote_list})
df

Unnamed: 0,Quote
0,There is a natural aristocracy among men. The ...
1,All our words from loose using have lost their...
2,"God couldn't be everywhere, so he created moth..."
3,"Be not afraid of going slowly, be afraid only ..."
4,"Learn from yesterday, live for today, hope for..."
5,Do not confine your children to your own learn...
6,"I hear and I forget, I see and I remember. I d..."
7,In teaching others we teach ourselves.
8,Happiness will never come to those who fail to...
9,"Without His love I can do nothing, with His lo..."


In [25]:
# Select the <p> elements with CSS class "a"; on this page they hold the
# author line of each quote (see the text of authors[0] below).
authors=soup.find_all('p', class_="a")

In [26]:
authors[0].text

'\nThomas Jefferson (1743-1826) Third president of the United States.\n'

In [27]:
authors[0].text[1:-1]

'Thomas Jefferson (1743-1826) Third president of the United States.'

In [28]:
# Author text arrives wrapped in newlines (see authors[0].text above).
# strip() removes that padding without chopping real characters when an
# entry happens to have none, which the fixed [1:-1] slice would do.
author_list = [author.text.strip() for author in authors]

# Attach the authors next to their quotes; this requires one author per quote.
df['Author'] = author_list
df

Unnamed: 0,Quote,Author
0,There is a natural aristocracy among men. The ...,Thomas Jefferson (1743-1826) Third president o...
1,All our words from loose using have lost their...,Ernest Hemingway (1898-1961) American Writer.
2,"God couldn't be everywhere, so he created moth...",Jewish proverb
3,"Be not afraid of going slowly, be afraid only ...",Chinese proverb
4,"Learn from yesterday, live for today, hope for...",Unknown Source
5,Do not confine your children to your own learn...,Chinese proverb
6,"I hear and I forget, I see and I remember. I d...",Chinese proverb
7,In teaching others we teach ourselves.,Proverb
8,Happiness will never come to those who fail to...,Unknown Source
9,"Without His love I can do nothing, with His lo...",Unknown Source


# Getting data from the web using APIs

In [29]:
import json

# Read the OAuth client credentials from a local JSON file so that no secret
# is written into the notebook itself.
with open('client-credentials.json') as file:
    client_credentials = json.load(file)

# Show only the key names, never the values.
print('Credentials:', list(client_credentials))  # ['client_id', 'client_secret']

Credentials: ['client_id', 'client_secret']


In [30]:
# print(client_credentials['client_id']) # Client ID

Reaching the API with Python Requests
---

In [31]:
from urllib.parse import urlencode

# Build the link users paste into their browser to authorize our app.
oauth_params = dict(
    client_id=client_credentials['client_id'],
    scope='read_all,profile:read_all,activity:read_all',
    redirect_uri='https://localhost',
    response_type='code',
)
print(f"https://www.strava.com/oauth/authorize?{urlencode(oauth_params)}")

https://www.strava.com/oauth/authorize?client_id=71481&scope=read_all%2Cprofile%3Aread_all%2Cactivity%3Aread_all&redirect_uri=https%3A%2F%2Flocalhost&response_type=code


In [None]:
from getpass import getpass

# After authorizing the app, the user is redirected to the redirect_uri
# (https://localhost) with the authorization code in the query string.
# The full URL is requested via getpass so that the code is not echoed
# into the notebook output.
authorization_response = getpass(prompt='Full callback URL')

In [None]:
from urllib.parse import urlparse, parse_qs

# Extract Authorization Code from URL
# parse_qs maps each query parameter to a list of values, hence the [0].
# A callback URL without a 'code' parameter raises KeyError here.
authorization_code = parse_qs(urlparse(authorization_response).query)['code'][0]

In [None]:
# urlparse(authorization_response).query # state=&code=...&scope=...

In [None]:
# parse_qs(urlparse(authorization_response).query)

In [None]:
# print(authorization_code)

In [None]:
import requests

# Exchange Authorization Code for Access Token
# (the timeout prevents a stalled connection from hanging the cell forever)
r = requests.post('https://www.strava.com/oauth/token', data={
    'client_id': client_credentials['client_id'],
    'client_secret': client_credentials['client_secret'],
    'code': authorization_code,
    'grant_type': 'authorization_code'
}, timeout=30)
r.status_code

In [None]:
# print(r.text) # ex. {"token_type":"Bearer","expires_at": ... }

In [None]:
# r.json()

In [None]:
# Token saver
def token_saver(token_obj):
    """Persist a token mapping to ``token.json`` as 4-space-indented JSON.

    The payload is serialized *before* the file is opened, so a value that
    cannot be encoded as JSON raises without truncating a previously saved
    ``token.json``.

    Args:
        token_obj: JSON-serializable object; the notebook passes the token
            dictionaries returned by the OAuth calls.

    Raises:
        TypeError: If ``token_obj`` contains a value ``json`` cannot encode.
    """
    payload = json.dumps(token_obj, indent=4)
    with open('token.json', 'w') as file:
        file.write(payload)

# Fail loudly on an unsuccessful exchange instead of persisting the error
# payload as if it were a token.
r.raise_for_status()
token_saver(r.json())

In [None]:
# Token loader
def get_token():
    """Return the object stored as JSON in ``token.json``."""
    with open('token.json', 'r') as handle:
        raw = handle.read()
    return json.loads(raw)

# Reload the token from disk and list the fields it carries.
token = get_token()
token.keys() # 'token_type', 'expires_at', 'expires_in', 'refresh_token', 'access_token', 'athlete'

In [None]:
# Raw expiry fields of the token.
print('Expires in:', token['expires_in']) # lifetime in seconds; initially 21600 (6 hours)
print('Expires at:', token['expires_at']) # Unix timestamp in seconds

In [None]:
from datetime import datetime, timedelta

# Same two fields, converted to human-readable values.
print('Expires at:', datetime.fromtimestamp(token['expires_at'])) # date and time; no tz is passed, so local time
print('Expires in:', timedelta(seconds=token['expires_in'])) # time delta

In [None]:
# Refresh expired Access Tokens
r = requests.post('https://www.strava.com/oauth/token', data={
    'client_id': client_credentials['client_id'],
    'client_secret': client_credentials['client_secret'],
    'refresh_token': token['refresh_token'],
    'grant_type': 'refresh_token'
}, timeout=30)

# Abort on failure so that an error response never overwrites the stored
# token (and with it the refresh_token needed for the next refresh).
r.raise_for_status()
token_saver(r.json())
token = get_token()

In [None]:
# List activities
# The access token travels in the Authorization header rather than in the
# query string, which keeps it out of URLs that end up in logs and history.
r = requests.get(
    'https://www.strava.com/api/v3/athlete/activities',
    headers={'Authorization': 'Bearer ' + token['access_token']},
    timeout=30,
)
r.status_code

In [None]:
# Save activities
# Writes the decoded response body to activities.json with 4-space indentation.
with open('activities.json', 'w') as file:
    json.dump(r.json(), file, indent=4)

In [None]:
# Load data into DataFrame
# The response text is wrapped in StringIO because handing read_json a
# literal JSON string is deprecated in recent pandas releases.
activities_df = pd.read_json(StringIO(r.text))
activities_df[['name', 'type', 'distance', 'elapsed_time', 'max_speed']]

With requests_oauthlib
---

In [None]:
from requests_oauthlib import OAuth2Session

# Create a session for initialization
# The scope is one comma-separated string, the same value the manually built
# authorization link in this notebook uses.
init_session = OAuth2Session(
    client_credentials['client_id'],
    redirect_uri='https://localhost',
    scope='read_all,profile:read_all,activity:read_all'
)

# Get authorization link
# NOTE(review): the returned `state` variable is not referenced again in the
# visible cells.
user_link, state = init_session.authorization_url('https://www.strava.com/oauth/authorize')
print('Visit link:', user_link)
# getpass is imported in an earlier cell of this notebook.
authorization_response = getpass(prompt='Full callback URL')

In [None]:
# Get Access Token
# The pasted callback URL is handed over as authorization_response, so the
# code is not extracted by hand here. client_id and client_secret are both
# supplied, mirroring the fields of the manual requests.post exchange.
token = init_session.fetch_token(
    'https://www.strava.com/oauth/token',
    authorization_response=authorization_response,
    include_client_id=True,
    client_secret=client_credentials['client_secret']
)
# Persist the freshly obtained token to token.json.
token_saver(token)

In [None]:
# Create a session for reaching the API
api_session = OAuth2Session(
    client_id=client_credentials['client_id'],
    token=token,  # reuse the Access Token obtained above
    # Where, and with which extra fields, an expired token gets refreshed
    auto_refresh_url='https://www.strava.com/oauth/token',
    auto_refresh_kwargs=dict(
        client_id=client_credentials['client_id'],
        client_secret=client_credentials['client_secret'],
    ),
    # Receives each refreshed token so that token.json stays current
    token_updater=token_saver,
)

In [None]:
# List activities
# No token is passed explicitly; the session was constructed with it.
r = api_session.get('https://www.strava.com/api/v3/athlete/activities')
r.status_code

In [None]:
# The response text is wrapped in StringIO because handing read_json a
# literal JSON string is deprecated in recent pandas releases.
activities_df = pd.read_json(StringIO(r.text))
activities_df[['name', 'type', 'distance', 'elapsed_time', 'max_speed']]

With custom libraries - stravalib
---

In [None]:
from stravalib import Client

# Create client
client = Client()

# Get Authorization URL
# Here the scope is passed as a list of strings rather than as one
# comma-separated string.
user_link = client.authorization_url(
    client_id=client_credentials['client_id'],
    redirect_uri='https://localhost',
    scope=['read_all', 'profile:read_all', 'activity:read_all']
)
print('Visit link:', user_link)
# getpass, urlparse and parse_qs are imported in earlier cells of this notebook.
authorization_response = getpass(prompt='Full callback URL')
# Pull the authorization code out of the callback URL's query string.
authorization_code = parse_qs(urlparse(authorization_response).query)['code'][0]

In [None]:
# Get access token
# Exchanges the authorization code for a token and stores the result in
# token.json via token_saver.
token = client.exchange_code_for_token(
    client_id=client_credentials['client_id'],
    client_secret=client_credentials['client_secret'],
    code=authorization_code)
token_saver(token)

In [None]:
import time

# Refresh token if necessary
# expires_at is compared against the current Unix time.
if time.time() > token['expires_at']:
    token = client.refresh_access_token(
        client_id=client_credentials['client_id'],
        client_secret=client_credentials['client_secret'],
        refresh_token=token['refresh_token'])
    token_saver(token)

# Hand the (possibly refreshed) access token to the client explicitly.
# NOTE(review): some stravalib releases do not store it on the client
# automatically -- confirm for the installed version; the assignment is
# harmless either way.
client.access_token = token['access_token']

In [None]:
# Get activities
activities = client.get_activities(limit=5)
activities

In [None]:
# Print each activity object returned by get_activities.
for activity in activities:
    print(activity)

In [None]:
# Materialize the activities and inspect a few attributes of the first one.
a = list(activities)[0] # Get the first activity

print('Activity name:', a.name)
print('Distance:', a.distance)
print('Athlete name:', a.athlete.firstname)
print('Average heart rate:', a.average_heartrate)