#### Pulling data from public APIs using GET request.

In [None]:
import os

# Endpoint returning the latest exchange rates.
# The access key is read from the EXCHANGERATES_API_KEY environment variable
# instead of being committed to the notebook, where anyone with the file could
# reuse it. If the variable is unset, the URL carries an empty key.
access_key = os.environ.get("EXCHANGERATES_API_KEY", "")
base_url = f"https://api.exchangeratesapi.io/v1/latest?access_key={access_key}"

In [None]:
import requests # Imports the module to talk directly with the url.

In [None]:
# Send the GET request; `response` holds the server's reply for the cells below.
# A timeout is passed because requests otherwise waits indefinitely for a reply.
response = requests.get(base_url, timeout=10)

In [None]:
# HTTP status code of the response.
response.status_code

In [None]:
# Body of the response as sent by the server.
response.content

In [None]:
# Parse the JSON body of the response into a Python object.
response.json()

##### Import the JSON package to improve the clarity of the output using loads and dumps.
* `loads` converts a JSON-formatted string to a Python object.
* `dumps` converts a Python object back to a JSON-formatted string.

In [None]:
import json

In [None]:
# Re-serialise the parsed JSON with 4-space indentation so it is easier to read.
print(json.dumps(response.json(), indent=4))

In [None]:
# Top-level keys of the parsed JSON response.
response.json().keys()

#### Specifying parameters in the base url using GET request.

In [None]:
import os

# Same endpoint, limited to a few currencies through the `symbols` query parameter.
# The access key comes from the EXCHANGERATES_API_KEY environment variable so it
# is not stored in the notebook source.
# NOTE: displaying param_url echoes the key into the cell output; clear outputs
# before sharing the notebook.
access_key = os.environ.get("EXCHANGERATES_API_KEY", "")
param_url = (
    "https://api.exchangeratesapi.io/v1/latest"
    f"?access_key={access_key}&symbols=USD,GBP,AUD,CAD,PLN,MXN"
)
param_url

In [None]:
# Request the URL with the `symbols` filter and display the response object.
# The timeout prevents the cell from waiting indefinitely on an unresponsive server.
response = requests.get(param_url, timeout=10)
response

In [None]:
# Parse the JSON body of the filtered request and display it.
data = response.json()
data

#### iTunes Search API

In [None]:
# iTunes lookup request for id 254654363 with entity=album.
# The timeout keeps the cell from hanging if the server never answers.
itunes_api = "https://itunes.apple.com/lookup?id=254654363&entity=album"
response = requests.get(itunes_api, timeout=10)

In [None]:
# HTTP status code of the iTunes lookup request.
response.status_code

In [None]:
# Parse the JSON body of the lookup response and display it.
itunes_data = response.json()
itunes_data

In [None]:
# Search endpoint and the query parameters to send with it.
base_site = "https://itunes.apple.com/search"
params = dict(term="the beatles", country="us")

In [None]:
# Send the search request; requests encodes `params` into the URL's query string.
# A timeout avoids waiting forever on an unresponsive server.
r = requests.get(base_site, params=params, timeout=10)
r.status_code

In [None]:
# Parse the search response and pretty-print it with 4-space indentation for inspection.
artist_info = r.json()
print(json.dumps(artist_info, indent=4))

In [None]:
# Top-level keys of the parsed search response.
artist_info.keys()

In [None]:
# Pretty-print only the first entry of the 'results' list.
print(json.dumps(artist_info['results'][0], indent=4))

##### Structuring and exporting the data using pandas.

In [None]:
import pandas as pd

In [None]:
# Build a DataFrame from the 'results' entries and preview the first rows.
df = pd.DataFrame(artist_info['results'])
df.head()

#### Pagination
* Dealing with websites with multiple pages

#### Web scraping using Beautiful Soup.
* Inspect the page
* obtain HTML
* Parse HTML (html.parser, lxml, html5lib)
* create a beautiful soup object
* Exporting the HTML to a file

#### Scraping the Rotten Tomatoes website using the Beautiful Soup library

In [None]:
import requests
from bs4 import BeautifulSoup # Imports the packages required.

In [None]:
# Rotten Tomatoes editorial guide page requested in the next cell.
base_url_3 = (
    "https://editorial.rottentomatoes.com"
    "/guide/140-essential-action-movies-to-watch-now/"
)

In [None]:
# Download the guide page and display the response object.
# The timeout stops the cell from blocking indefinitely if the site does not respond.
r = requests.get(base_url_3, timeout=10)
r

In [None]:
# Keep the body of the downloaded page for parsing.
html = r.content

In [None]:
# Parse the page with the lxml parser into a BeautifulSoup object.
soup = BeautifulSoup(html, 'lxml')

In [None]:
# Save a pretty-printed, UTF-8 encoded copy of the parsed page to disk.
with open('rotten_tomatoes.html', 'wb') as out_file:
    pretty_html = soup.prettify('utf-8')
    out_file.write(pretty_html)

In [None]:
# Collect every <div> carrying the countdown-item class string; later cells read one movie from each.
movie_info = soup.find_all('div', class_='col-sm-18 col-full-xs countdown-item-content')
movie_info

In [None]:
# First <h2> inside each movie container.
headings = [info_div.find("h2") for info_div in movie_info]
headings

In [None]:
# Inspect the first heading to see which tags it contains.
headings[0]

In [None]:
# The title is the string of the <a> tag inside each heading.
movie_titles = [h2.find('a').string for h2 in headings]
movie_titles

In [None]:
# The release year is the string of the 'start-year' <span> inside each heading.
years = [h2.find('span', class_='start-year').string for h2 in headings]
years

In [None]:
# Remove leading and trailing parentheses from each year string.
years = [raw_year.strip('()') for raw_year in years]
years

In [None]:
# Convert the year strings to integers.
years = list(map(int, years))
years

In [None]:
# The score is the string of the 'tMeterScore' <span> inside each heading.
scores = [h2.find('span', class_='tMeterScore').string for h2 in headings]
scores

In [None]:
# Drop the '%' sign so the scores can be converted to numbers.
scores = [raw_score.strip('%') for raw_score in scores]
scores

In [None]:
# Convert the score strings to integers.
scores = list(map(int, scores))
scores

## Extracting the rest of the information.

In [None]:
# The critics-consensus <div> of each movie container.
consensus = [info_div.find("div", class_="info critics-consensus") for info_div in movie_info]
consensus

In [None]:
# Raw text of each consensus block, before any clean-up.
[con.text for con in consensus]

In [None]:
# Same text with leading and trailing whitespace removed.
[con.text.strip() for con in consensus]

### Text processing

In [None]:
# Label to be removed from the consensus text; its length is shown for reference.
common_phrase = 'Critics Consensus: '
len(common_phrase)

In [None]:
# Number of characters in the label, used as the slice offset in the next cell.
common_len = len(common_phrase)
common_len

In [None]:
# Remove the leading "Critics Consensus: " label from every consensus text.
# Whitespace is stripped first, and the label is cut only when it is actually
# present, so padded or unlabelled text is not sliced at a fixed offset.
consensus_txt = []
for con in consensus:
    blurb = con.text.strip()
    if blurb.startswith(common_phrase):
        blurb = blurb[len(common_phrase):]
    consensus_txt.append(blurb.strip())
consensus_txt

In [None]:
# The 'director' <div> of each movie container.
directors = [info_div.find('div', class_='director') for info_div in movie_info]
directors

In [None]:
# Replace each director <div> with the string of its first <a> tag.
directors = [director_div.find('a').string for director_div in directors]
directors

In [None]:
# The 'cast' <div> of each movie container.
movie_cast = [info_div.find('div', class_='cast') for info_div in movie_info]
movie_cast

In [None]:
# All <a> tags in the cast block of the first movie on the page.
cast_crew = movie_cast[0].find_all('a')
cast_crew

In [None]:
# String of each <a> tag in the first movie's cast block.
cast_names = [link.string for link in cast_crew]
cast_names

In [None]:
# One comma-separated string of <a>-tag strings per movie, in the order of movie_cast.
casting = [
    ', '.join(link.string for link in cast_div.find_all('a'))
    for cast_div in movie_cast
]

In [None]:
# Display the joined cast strings for all movies.
casting

In [None]:
import pandas as pd

In [None]:
# Gather the scraped lists into one pandas DataFrame, one column per list.
pd.set_option('display.max_colwidth', None)  # Show full cell text without truncation.

Action_movies = pd.DataFrame({
    'Title': movie_titles,
    'ReleaseYear': years,
    'Score': scores,
    'Director': directors,
    'Synopsis': consensus_txt,
    'Cast': casting,
})

Action_movies


In [None]:
# Write the DataFrame to Action_movies.csv with a header row and without the index column.
Action_movies.to_csv("Action_movies.csv", index = False, header = True)

In [None]:
# Preview the first rows of the DataFrame.
Action_movies.head()

In [None]:
# Summary statistics of the DataFrame's columns.
Action_movies.describe()

In [None]:
# Column names, non-null counts and dtypes.
Action_movies.info()

In [None]:
# (rows, columns) of the DataFrame.
Action_movies.shape

In [None]:
# Boolean mask marking missing values in each cell.
Action_movies.isna()

## Using Pandas to extract tables.

In [None]:
# Let pandas parse the HTML tables found at the URL, then check the type of the first result.
tables = pd.read_html("https://toscrape.com/")
type(tables[0])


In [None]:
# Number of tables pandas extracted from the page.
len(tables)

In [None]:
# Display the second extracted table (index 1), the table of interest here.
tables[1]

#### Request Headers
* user-agent
* Cookie

In [None]:
# Request headers carrying a desktop Chrome user-agent string.
headers = {
    "User-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/142.0.0.0 Safari/537.36"
    ),
}

In [None]:
# Request the page with the custom headers and show the resulting status code.
# The timeout keeps the cell from blocking forever if no response arrives.
r = requests.get("https://www.youtube.com/", headers=headers, timeout=10)
r.status_code